From d8372205b1210f56483ef9c3d13803f0452276f9 Mon Sep 17 00:00:00 2001 From: Taylor Jackle Spriggs <74561858+tjs-intel@users.noreply.github.com> Date: Fri, 10 May 2024 09:58:38 -0600 Subject: [PATCH] Remove Orchestrate from sigopt-python (#456) * remove orchestrate from sigopt-python client * fix urllib patch --- .github/workflows/tests.yml | 3 +- .pre-commit-config.yaml | 1 - .vulture_allowlist | 115 +-- controller/Dockerfile | 96 --- controller/build | 6 - controller/build_packages.txt | 7 - controller/controller/__init__.py | 0 controller/controller/__main__.py | 22 - controller/controller/create_pod.py | 172 ----- controller/controller/event_repeater.py | 25 - controller/controller/k8s_constants.py | 13 - controller/controller/manage_pods.py | 173 ----- controller/controller/missing_pods.py | 9 - controller/controller/pod_status.py | 20 - controller/controller/refill_pods.py | 63 -- controller/controller/run_state.py | 98 --- controller/controller/settings.py | 71 -- controller/controller/thread.py | 24 - controller/controller/version.py | 7 - controller/controller/watch_pods.py | 46 -- controller/upgrade_packages.txt | 3 - setup.py | 15 +- sigopt/cli/arguments/__init__.py | 5 - sigopt/cli/arguments/cluster_filename.py | 18 - sigopt/cli/arguments/cluster_name.py | 7 - sigopt/cli/arguments/dockerfile.py | 12 - sigopt/cli/arguments/identifiers.py | 22 - sigopt/cli/arguments/provider.py | 14 - sigopt/cli/commands/__init__.py | 1 - sigopt/cli/commands/cluster/__init__.py | 23 - sigopt/cli/commands/cluster/base.py | 25 - sigopt/cli/commands/cluster/clean.py | 13 - sigopt/cli/commands/cluster/connect.py | 28 - sigopt/cli/commands/cluster/create.py | 24 - sigopt/cli/commands/cluster/destroy.py | 13 - sigopt/cli/commands/cluster/disconnect.py | 13 - .../cli/commands/cluster/install_plugins.py | 13 - sigopt/cli/commands/cluster/kubectl.py | 24 - sigopt/cli/commands/cluster/not_installed.py | 25 - sigopt/cli/commands/cluster/optimize.py | 30 - sigopt/cli/commands/cluster/run.py | 29 - sigopt/cli/commands/cluster/status.py | 23 - sigopt/cli/commands/cluster/stop.py | 18 - sigopt/cli/commands/cluster/test.py | 13 - sigopt/cli/commands/cluster/test_run.py | 29 - sigopt/cli/commands/cluster/update.py | 15 - sigopt/config.py | 4 - sigopt/decorators.py | 11 + sigopt/exception.py | 4 - sigopt/lib.py | 11 - sigopt/log_capture.py | 20 +- sigopt/orchestrate/__init__.py | 0 sigopt/orchestrate/aws/__init__.py | 0 sigopt/orchestrate/aws/service.py | 348 --------- sigopt/orchestrate/cloudformation/__init__.py | 0 .../cluster-autoscaler-role.yaml | 97 --- .../cloudformation/eks-cluster.yaml | 338 --------- .../cloudformation/eks-node-security.yaml | 158 ---- .../cloudformation/eks-nodegroup.yaml | 551 -------------- .../orchestrate/cloudformation/eks-vpc.yaml | 276 ------- sigopt/orchestrate/cloudformation/service.py | 492 ------------- sigopt/orchestrate/cluster/__init__.py | 0 sigopt/orchestrate/cluster/context.py | 22 - sigopt/orchestrate/cluster/errors.py | 43 -- sigopt/orchestrate/cluster/object.py | 85 --- sigopt/orchestrate/cluster/service.py | 137 ---- .../orchestrate/cluster_metadata/__init__.py | 0 sigopt/orchestrate/cluster_metadata/errors.py | 29 - .../orchestrate/cluster_metadata/service.py | 71 -- sigopt/orchestrate/common.py | 52 -- sigopt/orchestrate/controller.py | 515 ------------- sigopt/orchestrate/custom_cluster/__init__.py | 0 sigopt/orchestrate/custom_cluster/service.py | 51 -- sigopt/orchestrate/docker/__init__.py | 0 sigopt/orchestrate/docker/service.py | 218 ------ sigopt/orchestrate/ec2/__init__.py | 0 sigopt/orchestrate/ec2/service.py | 85 --- sigopt/orchestrate/ecr/__init__.py | 0 sigopt/orchestrate/ecr/service.py | 34 - sigopt/orchestrate/eks/__init__.py | 0 sigopt/orchestrate/eks/kubeconfig.yml | 15 - sigopt/orchestrate/eks/service.py | 23 - sigopt/orchestrate/exceptions.py | 71 -- .../gpu_options_validator/__init__.py | 0 .../gpu_options_validator/service.py | 19 - sigopt/orchestrate/iam/__init__.py | 0 sigopt/orchestrate/iam/service.py | 98 --- sigopt/orchestrate/identifier.py | 86 --- sigopt/orchestrate/job_runner/__init__.py | 0 sigopt/orchestrate/job_runner/service.py | 290 -------- sigopt/orchestrate/job_status/__init__.py | 0 sigopt/orchestrate/job_status/service.py | 68 -- sigopt/orchestrate/json_stream.py | 33 - sigopt/orchestrate/kubectl/__init__.py | 0 sigopt/orchestrate/kubectl/service.py | 32 - sigopt/orchestrate/kubernetes/__init__.py | 0 sigopt/orchestrate/kubernetes/http_proxy.py | 27 - sigopt/orchestrate/kubernetes/service.py | 674 ------------------ sigopt/orchestrate/lib/__init__.py | 0 sigopt/orchestrate/lib/lists.py | 58 -- sigopt/orchestrate/lib/types.py | 54 -- sigopt/orchestrate/logging/__init__.py | 0 sigopt/orchestrate/logging/service.py | 28 - sigopt/orchestrate/model_packer/__init__.py | 0 sigopt/orchestrate/model_packer/service.py | 95 --- sigopt/orchestrate/node_groups.py | 12 - .../orchestrate/options_validator/__init__.py | 0 .../orchestrate/options_validator/service.py | 112 --- sigopt/orchestrate/paths.py | 111 --- sigopt/orchestrate/plugins/__init__.py | 0 .../plugins/autoscaler-plugin-template.yml | 177 ----- sigopt/orchestrate/plugins/docker-service.yml | 14 - .../plugins/docker-statefulset.yml | 66 -- .../plugins/orchestrate-controller-roles.yml | 45 -- sigopt/orchestrate/provider/__init__.py | 0 sigopt/orchestrate/provider/broker.py | 17 - sigopt/orchestrate/provider/constants.py | 43 -- sigopt/orchestrate/provider/interface.py | 21 - sigopt/orchestrate/resource/__init__.py | 0 sigopt/orchestrate/resource/service.py | 32 - sigopt/orchestrate/s3/__init__.py | 0 sigopt/orchestrate/s3/service.py | 73 -- sigopt/orchestrate/services/__init__.py | 0 sigopt/orchestrate/services/aws_base.py | 14 - .../orchestrate/services/aws_provider_bag.py | 27 - sigopt/orchestrate/services/bag.py | 20 - sigopt/orchestrate/services/base.py | 10 - .../orchestrate/services/orchestrate_bag.py | 39 - sigopt/orchestrate/sigopt/__init__.py | 0 sigopt/orchestrate/sigopt/service.py | 95 --- sigopt/orchestrate/status.py | 131 ---- sigopt/orchestrate/stop.py | 29 - sigopt/orchestrate/sts/__init__.py | 0 sigopt/orchestrate/sts/service.py | 25 - sigopt/orchestrate/test/__init__.py | 0 sigopt/orchestrate/test/test_file.txt | 1 - sigopt/orchestrate/version.py | 7 - sigopt/orchestrate/zigopt/__init__.py | 0 sigopt/paths.py | 17 - sigopt/resource.py | 1 - sigopt/run_context.py | 13 +- sigopt/run_params.py | 3 + sigopt/utils.py | 20 - sigopt/xgboost/checkpoint_callback.py | 3 + test/cli/test_cluster_connect.py | 16 - test/cli/test_cluster_create.py | 19 - test/cli/test_cluster_destroy.py | 29 - test/cli/test_cluster_disconnect.py | 16 - test/cli/test_cluster_kubectl.py | 45 -- test/cli/test_cluster_run.py | 25 - test/cli/test_cluster_test.py | 18 - test/cli/test_files/import_hello.py | 3 + test/client/test_interface.py | 1 + test/orchestrate/__init__.py | 0 test/orchestrate/aws/__init__.py | 0 test/orchestrate/aws/service_test.py | 88 --- test/orchestrate/cluster/__init__.py | 0 test/orchestrate/cluster/service_test.py | 238 ------- test/orchestrate/cluster_metadata/__init__.py | 0 .../cluster_metadata/service_test.py | 90 --- test/orchestrate/common_test.py | 24 - test/orchestrate/docker/__init__.py | 0 test/orchestrate/docker/service_test.py | 54 -- test/orchestrate/ecr/__init__.py | 0 test/orchestrate/ecr/service_test.py | 21 - .../gpu_options_validator/__init__.py | 0 .../gpu_options_validator/service_test.py | 38 - test/orchestrate/job_runner/__init__.py | 0 test/orchestrate/job_runner/service_test.py | 78 -- test/orchestrate/job_status/__init__.py | 0 test/orchestrate/job_status/service_test.py | 65 -- test/orchestrate/kubernetes/__init__.py | 0 test/orchestrate/kubernetes/service_test.py | 130 ---- test/orchestrate/lib/__init__.py | 0 test/orchestrate/lib/lists_test.py | 77 -- test/orchestrate/lib/types_test.py | 152 ---- test/orchestrate/model_packer/__init__.py | 0 .../orchestrate/options_validator/__init__.py | 0 .../options_validator/service_test.py | 330 --------- test/orchestrate/provider/__init__.py | 0 test/orchestrate/provider/broker_test.py | 38 - test/orchestrate/resource/__init__.py | 0 test/orchestrate/resource/service_test.py | 37 - test/orchestrate/services/__init__.py | 0 .../services/aws_provider_bag_test.py | 30 - test/orchestrate/services/base_test.py | 13 - test/orchestrate/sigopt/service_test.py | 25 - test/orchestrate/sts/__init__.py | 0 test/orchestrate/sts/service_test.py | 21 - tools/generate_vulture_allowlist | 4 +- tools/run_vulture.sh | 2 +- 191 files changed, 78 insertions(+), 9576 deletions(-) delete mode 100644 controller/Dockerfile delete mode 100755 controller/build delete mode 100644 controller/build_packages.txt delete mode 100644 controller/controller/__init__.py delete mode 100644 controller/controller/__main__.py delete mode 100644 controller/controller/create_pod.py delete mode 100644 controller/controller/event_repeater.py delete mode 100644 controller/controller/k8s_constants.py delete mode 100644 controller/controller/manage_pods.py delete mode 100644 controller/controller/missing_pods.py delete mode 100644 controller/controller/pod_status.py delete mode 100644 controller/controller/refill_pods.py delete mode 100644 controller/controller/run_state.py delete mode 100644 controller/controller/settings.py delete mode 100644 controller/controller/thread.py delete mode 100644 controller/controller/version.py delete mode 100644 controller/controller/watch_pods.py delete mode 100644 controller/upgrade_packages.txt delete mode 100644 sigopt/cli/arguments/cluster_filename.py delete mode 100644 sigopt/cli/arguments/cluster_name.py delete mode 100644 sigopt/cli/arguments/dockerfile.py delete mode 100644 sigopt/cli/arguments/identifiers.py delete mode 100644 sigopt/cli/arguments/provider.py delete mode 100644 sigopt/cli/commands/cluster/__init__.py delete mode 100644 sigopt/cli/commands/cluster/base.py delete mode 100644 sigopt/cli/commands/cluster/clean.py delete mode 100644 sigopt/cli/commands/cluster/connect.py delete mode 100644 sigopt/cli/commands/cluster/create.py delete mode 100644 sigopt/cli/commands/cluster/destroy.py delete mode 100644 sigopt/cli/commands/cluster/disconnect.py delete mode 100644 sigopt/cli/commands/cluster/install_plugins.py delete mode 100644 sigopt/cli/commands/cluster/kubectl.py delete mode 100644 sigopt/cli/commands/cluster/not_installed.py delete mode 100644 sigopt/cli/commands/cluster/optimize.py delete mode 100644 sigopt/cli/commands/cluster/run.py delete mode 100644 sigopt/cli/commands/cluster/status.py delete mode 100644 sigopt/cli/commands/cluster/stop.py delete mode 100644 sigopt/cli/commands/cluster/test.py delete mode 100644 sigopt/cli/commands/cluster/test_run.py delete mode 100644 sigopt/cli/commands/cluster/update.py create mode 100644 sigopt/decorators.py delete mode 100644 sigopt/orchestrate/__init__.py delete mode 100644 sigopt/orchestrate/aws/__init__.py delete mode 100644 sigopt/orchestrate/aws/service.py delete mode 100644 sigopt/orchestrate/cloudformation/__init__.py delete mode 100644 sigopt/orchestrate/cloudformation/cluster-autoscaler-role.yaml delete mode 100644 sigopt/orchestrate/cloudformation/eks-cluster.yaml delete mode 100644 sigopt/orchestrate/cloudformation/eks-node-security.yaml delete mode 100644 sigopt/orchestrate/cloudformation/eks-nodegroup.yaml delete mode 100644 sigopt/orchestrate/cloudformation/eks-vpc.yaml delete mode 100644 sigopt/orchestrate/cloudformation/service.py delete mode 100644 sigopt/orchestrate/cluster/__init__.py delete mode 100644 sigopt/orchestrate/cluster/context.py delete mode 100644 sigopt/orchestrate/cluster/errors.py delete mode 100644 sigopt/orchestrate/cluster/object.py delete mode 100644 sigopt/orchestrate/cluster/service.py delete mode 100644 sigopt/orchestrate/cluster_metadata/__init__.py delete mode 100644 sigopt/orchestrate/cluster_metadata/errors.py delete mode 100644 sigopt/orchestrate/cluster_metadata/service.py delete mode 100644 sigopt/orchestrate/common.py delete mode 100644 sigopt/orchestrate/controller.py delete mode 100644 sigopt/orchestrate/custom_cluster/__init__.py delete mode 100644 sigopt/orchestrate/custom_cluster/service.py delete mode 100644 sigopt/orchestrate/docker/__init__.py delete mode 100644 sigopt/orchestrate/docker/service.py delete mode 100644 sigopt/orchestrate/ec2/__init__.py delete mode 100644 sigopt/orchestrate/ec2/service.py delete mode 100644 sigopt/orchestrate/ecr/__init__.py delete mode 100644 sigopt/orchestrate/ecr/service.py delete mode 100644 sigopt/orchestrate/eks/__init__.py delete mode 100644 sigopt/orchestrate/eks/kubeconfig.yml delete mode 100644 sigopt/orchestrate/eks/service.py delete mode 100644 sigopt/orchestrate/exceptions.py delete mode 100644 sigopt/orchestrate/gpu_options_validator/__init__.py delete mode 100644 sigopt/orchestrate/gpu_options_validator/service.py delete mode 100644 sigopt/orchestrate/iam/__init__.py delete mode 100644 sigopt/orchestrate/iam/service.py delete mode 100644 sigopt/orchestrate/identifier.py delete mode 100644 sigopt/orchestrate/job_runner/__init__.py delete mode 100644 sigopt/orchestrate/job_runner/service.py delete mode 100644 sigopt/orchestrate/job_status/__init__.py delete mode 100644 sigopt/orchestrate/job_status/service.py delete mode 100644 sigopt/orchestrate/json_stream.py delete mode 100644 sigopt/orchestrate/kubectl/__init__.py delete mode 100644 sigopt/orchestrate/kubectl/service.py delete mode 100644 sigopt/orchestrate/kubernetes/__init__.py delete mode 100644 sigopt/orchestrate/kubernetes/http_proxy.py delete mode 100644 sigopt/orchestrate/kubernetes/service.py delete mode 100644 sigopt/orchestrate/lib/__init__.py delete mode 100644 sigopt/orchestrate/lib/lists.py delete mode 100644 sigopt/orchestrate/lib/types.py delete mode 100644 sigopt/orchestrate/logging/__init__.py delete mode 100644 sigopt/orchestrate/logging/service.py delete mode 100644 sigopt/orchestrate/model_packer/__init__.py delete mode 100644 sigopt/orchestrate/model_packer/service.py delete mode 100644 sigopt/orchestrate/node_groups.py delete mode 100644 sigopt/orchestrate/options_validator/__init__.py delete mode 100644 sigopt/orchestrate/options_validator/service.py delete mode 100644 sigopt/orchestrate/paths.py delete mode 100644 sigopt/orchestrate/plugins/__init__.py delete mode 100644 sigopt/orchestrate/plugins/autoscaler-plugin-template.yml delete mode 100644 sigopt/orchestrate/plugins/docker-service.yml delete mode 100644 sigopt/orchestrate/plugins/docker-statefulset.yml delete mode 100644 sigopt/orchestrate/plugins/orchestrate-controller-roles.yml delete mode 100644 sigopt/orchestrate/provider/__init__.py delete mode 100644 sigopt/orchestrate/provider/broker.py delete mode 100644 sigopt/orchestrate/provider/constants.py delete mode 100644 sigopt/orchestrate/provider/interface.py delete mode 100644 sigopt/orchestrate/resource/__init__.py delete mode 100644 sigopt/orchestrate/resource/service.py delete mode 100644 sigopt/orchestrate/s3/__init__.py delete mode 100644 sigopt/orchestrate/s3/service.py delete mode 100644 sigopt/orchestrate/services/__init__.py delete mode 100644 sigopt/orchestrate/services/aws_base.py delete mode 100644 sigopt/orchestrate/services/aws_provider_bag.py delete mode 100644 sigopt/orchestrate/services/bag.py delete mode 100644 sigopt/orchestrate/services/base.py delete mode 100644 sigopt/orchestrate/services/orchestrate_bag.py delete mode 100644 sigopt/orchestrate/sigopt/__init__.py delete mode 100644 sigopt/orchestrate/sigopt/service.py delete mode 100644 sigopt/orchestrate/status.py delete mode 100644 sigopt/orchestrate/stop.py delete mode 100644 sigopt/orchestrate/sts/__init__.py delete mode 100644 sigopt/orchestrate/sts/service.py delete mode 100644 sigopt/orchestrate/test/__init__.py delete mode 100644 sigopt/orchestrate/test/test_file.txt delete mode 100644 sigopt/orchestrate/version.py delete mode 100644 sigopt/orchestrate/zigopt/__init__.py delete mode 100644 test/cli/test_cluster_connect.py delete mode 100644 test/cli/test_cluster_create.py delete mode 100644 test/cli/test_cluster_destroy.py delete mode 100644 test/cli/test_cluster_disconnect.py delete mode 100644 test/cli/test_cluster_kubectl.py delete mode 100644 test/cli/test_cluster_run.py delete mode 100644 test/cli/test_cluster_test.py delete mode 100644 test/orchestrate/__init__.py delete mode 100644 test/orchestrate/aws/__init__.py delete mode 100644 test/orchestrate/aws/service_test.py delete mode 100644 test/orchestrate/cluster/__init__.py delete mode 100644 test/orchestrate/cluster/service_test.py delete mode 100644 test/orchestrate/cluster_metadata/__init__.py delete mode 100644 test/orchestrate/cluster_metadata/service_test.py delete mode 100644 test/orchestrate/common_test.py delete mode 100644 test/orchestrate/docker/__init__.py delete mode 100644 test/orchestrate/docker/service_test.py delete mode 100644 test/orchestrate/ecr/__init__.py delete mode 100644 test/orchestrate/ecr/service_test.py delete mode 100644 test/orchestrate/gpu_options_validator/__init__.py delete mode 100644 test/orchestrate/gpu_options_validator/service_test.py delete mode 100644 test/orchestrate/job_runner/__init__.py delete mode 100644 test/orchestrate/job_runner/service_test.py delete mode 100644 test/orchestrate/job_status/__init__.py delete mode 100644 test/orchestrate/job_status/service_test.py delete mode 100644 test/orchestrate/kubernetes/__init__.py delete mode 100644 test/orchestrate/kubernetes/service_test.py delete mode 100644 test/orchestrate/lib/__init__.py delete mode 100644 test/orchestrate/lib/lists_test.py delete mode 100644 test/orchestrate/lib/types_test.py delete mode 100644 test/orchestrate/model_packer/__init__.py delete mode 100644 test/orchestrate/options_validator/__init__.py delete mode 100644 test/orchestrate/options_validator/service_test.py delete mode 100644 test/orchestrate/provider/__init__.py delete mode 100644 test/orchestrate/provider/broker_test.py delete mode 100644 test/orchestrate/resource/__init__.py delete mode 100644 test/orchestrate/resource/service_test.py delete mode 100644 test/orchestrate/services/__init__.py delete mode 100644 test/orchestrate/services/aws_provider_bag_test.py delete mode 100644 test/orchestrate/services/base_test.py delete mode 100644 test/orchestrate/sigopt/service_test.py delete mode 100644 test/orchestrate/sts/__init__.py delete mode 100644 test/orchestrate/sts/service_test.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2b6e6eb8..73cb17be 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,7 +20,6 @@ jobs: test-suite: - cli - client - - orchestrate - runs - validate - xgboost @@ -33,7 +32,7 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - run: pip install '.[orchestrate,xgboost]' -r requirements-dev.txt + - run: pip install '.[xgboost]' -r requirements-dev.txt - run: pytest -rw -v test/${{ matrix.test-suite }} pylint: runs-on: ubuntu-latest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c349a27b..3e1aead3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,6 @@ repos: - id: check-shebang-scripts-are-executable - id: check-toml - id: check-yaml - exclude: "^sigopt/orchestrate/.*$" - id: check-ast - id: debug-statements - id: end-of-file-fixer diff --git a/.vulture_allowlist b/.vulture_allowlist index b1e9ab16..b6a7325c 100644 --- a/.vulture_allowlist +++ b/.vulture_allowlist @@ -1,84 +1,33 @@ -MODIFIED # unused variable (controller/controller/k8s_constants.py:6) -signum # unused variable (controller/controller/manage_pods.py:29) -_.create_from_pod # unused method (controller/controller/run_state.py:35) load_ipython_extension # unused function (sigopt/__init__.py:43) -franke_function # unused function (sigopt/examples/franke.py:7) -FRANKE_EXPERIMENT_DEFINITION # unused variable (sigopt/examples/franke.py:17) -find # unused function (sigopt/lib.py:52) -safe_format # unused function (sigopt/lib.py:61) -_.fileno # unused method (sigopt/log_capture.py:27) -_.isatty # unused method (sigopt/log_capture.py:33) -_.readable # unused method (sigopt/log_capture.py:36) -_.seekable # unused method (sigopt/log_capture.py:48) -_.writable # unused method (sigopt/log_capture.py:54) -_.writelines # unused method (sigopt/log_capture.py:63) -trace # unused variable (sigopt/log_capture.py:80) -trace # unused variable (sigopt/log_capture.py:90) -trace # unused variable (sigopt/log_capture.py:114) -scale # unused variable (sigopt/objects.py:384) -shape_a # unused variable (sigopt/objects.py:385) -shape_b # unused variable (sigopt/objects.py:386) -grid # unused variable (sigopt/objects.py:394) -prior # unused variable (sigopt/objects.py:397) -finished_run_count # unused variable (sigopt/objects.py:412) -finished_run_count # unused variable (sigopt/objects.py:419) -lookback_checkpoints # unused variable (sigopt/objects.py:454) -min_checkpoints # unused variable (sigopt/objects.py:457) -max_checkpoints # unused variable (sigopt/objects.py:462) -early_stopping_criteria # unused variable (sigopt/objects.py:463) -num_solutions # unused variable (sigopt/objects.py:484) -observation_budget # unused variable (sigopt/objects.py:485) -training_monitor # unused variable (sigopt/objects.py:492) -num_solutions # unused variable (sigopt/objects.py:508) -should_stop # unused variable (sigopt/objects.py:539) -should_stop # unused variable (sigopt/objects.py:601) -stopping_reasons # unused variable (sigopt/objects.py:602) -training_run # unused variable (sigopt/objects.py:603) -email # unused variable (sigopt/objects.py:610) -DEFAULT_SYSTEM_NODE_GROUP_MIN_NODES # unused variable (sigopt/orchestrate/cloudformation/service.py:18) -FailedEksStackCreationError # unused class (sigopt/orchestrate/cloudformation/service.py:28) -IP_PRIVATE_PUBLIC_BITS # unused variable (sigopt/orchestrate/cloudformation/service.py:51) -_.delete_eks_cluster_autoscaler_role_stack # unused method (sigopt/orchestrate/cloudformation/service.py:100) -_.delete_eks_cluster_stack # unused method (sigopt/orchestrate/cloudformation/service.py:339) -t # unused variable (sigopt/orchestrate/cluster/context.py:15) -tb # unused variable (sigopt/orchestrate/cluster/context.py:15) -retry_with_backoff # unused function (sigopt/orchestrate/common.py:36) -load_user_options # unused function (sigopt/orchestrate/controller.py:511) -_.base_url # unused attribute (sigopt/orchestrate/docker/service.py:62) -_.untag_all # unused method (sigopt/orchestrate/docker/service.py:196) -_.image_exists_in_registry # unused method (sigopt/orchestrate/docker/service.py:200) -_.get_subnets # unused method (sigopt/orchestrate/ec2/service.py:23) -MissingGpuNodesException # unused class (sigopt/orchestrate/exceptions.py:34) -_.describe_eks_role # unused method (sigopt/orchestrate/iam/service.py:35) -_.get_runs_by_pod # unused method (sigopt/orchestrate/job_status/service.py:40) -_.parse_pod # unused method (sigopt/orchestrate/job_status/service.py:56) -_.add_headers # unused method (sigopt/orchestrate/kubernetes/http_proxy.py:15) -_.init_poolmanager # unused method (sigopt/orchestrate/kubernetes/http_proxy.py:20) -block # unused variable (sigopt/orchestrate/kubernetes/http_proxy.py:20) -_.cert_verify # unused method (sigopt/orchestrate/kubernetes/http_proxy.py:24) -cert # unused variable (sigopt/orchestrate/kubernetes/http_proxy.py:24) -verify # unused variable (sigopt/orchestrate/kubernetes/http_proxy.py:24) -_._names # unused attribute (sigopt/orchestrate/kubernetes/service.py:39) -PodNotFoundException # unused class (sigopt/orchestrate/kubernetes/service.py:64) -_.proxy # unused attribute (sigopt/orchestrate/kubernetes/service.py:88) -_.get_jobs # unused method (sigopt/orchestrate/kubernetes/service.py:107) -_.get_pod # unused method (sigopt/orchestrate/kubernetes/service.py:182) -_.CN # unused attribute (sigopt/orchestrate/kubernetes/service.py:305) -_.CN # unused attribute (sigopt/orchestrate/kubernetes/service.py:322) -_.delete_autoscaler # unused method (sigopt/orchestrate/kubernetes/service.py:610) -_.fetch_run # unused method (sigopt/orchestrate/sigopt/service.py:63) -_.pool_classes_by_scheme # unused attribute (sigopt/request_driver.py:24) -NoDefaultParameterError # unused class (sigopt/run_context.py:37) -_.set_parameter_meta # unused method (sigopt/run_context.py:103) -_.set_parameters_meta # unused method (sigopt/run_context.py:106) -tb # unused variable (sigopt/run_context.py:363) -_.setdefaults # unused method (sigopt/run_params.py:33) -ConnectionCls # unused variable (sigopt/urllib3_patch.py:44) -ConnectionCls # unused variable (sigopt/urllib3_patch.py:58) -_.after_iteration # unused method (sigopt/xgboost/checkpoint_callback.py:13) -_.after_training # unused method (sigopt/xgboost/checkpoint_callback.py:33) -print_hello # unused import (test/cli/test_files/import_hello.py:4) -autouse # unused variable (test/client/test_interface.py:15) -_.side_effect # unused attribute (test/client/test_request_driver.py:20) -_.get_option # unused attribute (test/orchestrate/cluster/service_test.py:24) -_.p1 # unused attribute (test/runs/test_factory.py:49) +franke_function # unused function (sigopt/examples/franke.py:8) +FRANKE_EXPERIMENT_DEFINITION # unused variable (sigopt/examples/franke.py:19) +_.optimize # unused method (sigopt/magics.py:108) +scale # unused variable (sigopt/objects.py:383) +shape_a # unused variable (sigopt/objects.py:384) +shape_b # unused variable (sigopt/objects.py:385) +grid # unused variable (sigopt/objects.py:393) +prior # unused variable (sigopt/objects.py:396) +active_run_count # unused variable (sigopt/objects.py:410) +finished_run_count # unused variable (sigopt/objects.py:411) +total_run_count # unused variable (sigopt/objects.py:412) +active_run_count # unused variable (sigopt/objects.py:417) +finished_run_count # unused variable (sigopt/objects.py:418) +total_run_count # unused variable (sigopt/objects.py:419) +lookback_checkpoints # unused variable (sigopt/objects.py:453) +min_checkpoints # unused variable (sigopt/objects.py:456) +max_checkpoints # unused variable (sigopt/objects.py:461) +early_stopping_criteria # unused variable (sigopt/objects.py:462) +num_solutions # unused variable (sigopt/objects.py:483) +observation_budget # unused variable (sigopt/objects.py:484) +training_monitor # unused variable (sigopt/objects.py:491) +num_solutions # unused variable (sigopt/objects.py:507) +should_stop # unused variable (sigopt/objects.py:538) +reasons # unused variable (sigopt/objects.py:539) +should_stop # unused variable (sigopt/objects.py:600) +stopping_reasons # unused variable (sigopt/objects.py:601) +training_run # unused variable (sigopt/objects.py:602) +email # unused variable (sigopt/objects.py:609) +_.pool_classes_by_scheme # unused attribute (sigopt/request_driver.py:26) +ConnectionCls # unused variable (sigopt/urllib3_patch.py:48) +ConnectionCls # unused variable (sigopt/urllib3_patch.py:63) +_.p1 # unused attribute (test/runs/test_factory.py:48) diff --git a/controller/Dockerfile b/controller/Dockerfile deleted file mode 100644 index 66fccd3b..00000000 --- a/controller/Dockerfile +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -ARG SIGOPT_GIT_HASH=d4257f9daf672139d7c3437c2f784eef4ae20f08 - - -FROM python:3.10.8-alpine3.16 AS updated-base - -SHELL ["/bin/ash", "-ex", "-o", "pipefail", "-c"] - -COPY upgrade_packages.txt /upgrade_packages.txt - -# hadolint ignore=DL3018 -RUN : \ - ; apk del --purge krb5-conf krb5-libs \ - ; apk update --no-cache \ - ; xargs apk add --upgrade --no-cache =0.0.20 so there should theoretically be no issue. -# hadolint ignore=DL3018 -RUN : \ - ; xargs apk add --no-cache /pip-reqs.txt - - -FROM python:3.10.7 AS pip-src - -SHELL ["/bin/bash", "-ex", "-o", "pipefail", "-c"] - -COPY --from=pip-reqs /pip-reqs.txt /pip-reqs.txt - -RUN : \ - ; grep -v '@' =0.16.0,<0.17.0", - "boto3>=1.16.34,<2.0.0", - "certifi>=2022.12.7", - "docker>=4.4.0,<5.0.0", - "kubernetes>=12.0.1,<13.0.0", - "pyOpenSSL>=20.0.0", -] xgboost_install_requires = ["xgboost>=1.3.1", "numpy>=1.15.0"] hyperopt_install_requires = ["hyperopt>=0.2.7"] lite_install_requires = ["sigoptlite>=0.1.1"] @@ -58,13 +50,8 @@ }, install_requires=install_requires, extras_require={ - "dev": dev_install_requires - + orchestrate_install_requires - + xgboost_install_requires - + hyperopt_install_requires - + lite_install_requires, + "dev": dev_install_requires + xgboost_install_requires + hyperopt_install_requires + lite_install_requires, "hyperopt": hyperopt_install_requires, - "orchestrate": orchestrate_install_requires, "xgboost": xgboost_install_requires, "lite": lite_install_requires, }, diff --git a/sigopt/cli/arguments/__init__.py b/sigopt/cli/arguments/__init__.py index bb923cd8..95d080dd 100644 --- a/sigopt/cli/arguments/__init__.py +++ b/sigopt/cli/arguments/__init__.py @@ -1,16 +1,11 @@ # Copyright © 2022 Intel Corporation # # SPDX-License-Identifier: MIT -from .cluster_filename import cluster_filename_option -from .cluster_name import cluster_name_option from .commands import commands_argument -from .dockerfile import dockerfile_option from .experiment_file import experiment_file_option from .experiment_id import experiment_id_argument -from .identifiers import identifiers_argument, identifiers_help from .load_yaml import load_yaml_callback from .project import project_name_option, project_option -from .provider import provider_option from .run_file import run_file_option from .source_file import source_file_option from .validate import validate_id, validate_ids diff --git a/sigopt/cli/arguments/cluster_filename.py b/sigopt/cli/arguments/cluster_filename.py deleted file mode 100644 index 81770c35..00000000 --- a/sigopt/cli/arguments/cluster_filename.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from sigopt.validate import validate_top_level_dict - -from .load_yaml import load_yaml_callback - - -cluster_filename_option = click.option( - "-f", - "--filename", - type=click.Path(exists=True), - callback=load_yaml_callback(validate_top_level_dict), - help="cluster config yaml file", - default="cluster.yml", -) diff --git a/sigopt/cli/arguments/cluster_name.py b/sigopt/cli/arguments/cluster_name.py deleted file mode 100644 index 2019a259..00000000 --- a/sigopt/cli/arguments/cluster_name.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - - -cluster_name_option = click.option("-n", "--cluster-name", required=True, help="Name of the cluster") diff --git a/sigopt/cli/arguments/dockerfile.py b/sigopt/cli/arguments/dockerfile.py deleted file mode 100644 index 818919b8..00000000 --- a/sigopt/cli/arguments/dockerfile.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - - -dockerfile_option = click.option( - "-d", - "--dockerfile", - type=click.Path(exists=True), - default="./Dockerfile", -) diff --git a/sigopt/cli/arguments/identifiers.py b/sigopt/cli/arguments/identifiers.py deleted file mode 100644 index 8d86310c..00000000 --- a/sigopt/cli/arguments/identifiers.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from sigopt.orchestrate.identifier import parse_identifier - - -def identifiers_callback(ctx, p, value): # pylint: disable=unused-argument - try: - return [parse_identifier(raw) for raw in value] - except ValueError as ve: - raise click.BadParameter(str(ve)) from ve - - -identifiers_argument = click.argument( - "identifiers", - nargs=-1, - callback=identifiers_callback, -) - -identifiers_help = "IDENTIFIERS can be the name of a run, or one of the following: experiment/[id], run/[id]" diff --git a/sigopt/cli/arguments/provider.py b/sigopt/cli/arguments/provider.py deleted file mode 100644 index 9cac2dbb..00000000 --- a/sigopt/cli/arguments/provider.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from sigopt.orchestrate.provider.constants import STRING_TO_PROVIDER - - -provider_option = click.option( - "--provider", - type=click.Choice(sorted(STRING_TO_PROVIDER.keys())), - required=True, - help="The cloud provider. Use `custom` for your own cluster.", -) diff --git a/sigopt/cli/commands/__init__.py b/sigopt/cli/commands/__init__.py index 343714f9..c7fd9c8b 100644 --- a/sigopt/cli/commands/__init__.py +++ b/sigopt/cli/commands/__init__.py @@ -1,7 +1,6 @@ # Copyright © 2022 Intel Corporation # # SPDX-License-Identifier: MIT -import sigopt.cli.commands.cluster import sigopt.cli.commands.config import sigopt.cli.commands.experiment import sigopt.cli.commands.init diff --git a/sigopt/cli/commands/cluster/__init__.py b/sigopt/cli/commands/cluster/__init__.py deleted file mode 100644 index 99c1b995..00000000 --- a/sigopt/cli/commands/cluster/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -try: - import sigopt.orchestrate.controller -except ImportError: - import sigopt.cli.commands.cluster.not_installed -else: - import sigopt.cli.commands.cluster.base - import sigopt.cli.commands.cluster.clean - import sigopt.cli.commands.cluster.connect - import sigopt.cli.commands.cluster.create - import sigopt.cli.commands.cluster.destroy - import sigopt.cli.commands.cluster.disconnect - import sigopt.cli.commands.cluster.install_plugins - import sigopt.cli.commands.cluster.kubectl - import sigopt.cli.commands.cluster.optimize - import sigopt.cli.commands.cluster.run - import sigopt.cli.commands.cluster.status - import sigopt.cli.commands.cluster.stop - import sigopt.cli.commands.cluster.test - import sigopt.cli.commands.cluster.test_run - import sigopt.cli.commands.cluster.update diff --git a/sigopt/cli/commands/cluster/base.py b/sigopt/cli/commands/cluster/base.py deleted file mode 100644 index 457aeba9..00000000 --- a/sigopt/cli/commands/cluster/base.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from sigopt.orchestrate.controller import OrchestrateController - -from ..base import sigopt_cli - - -class Context: - _controller = None - - @property - def controller(self): - if self._controller is None: - self._controller = OrchestrateController.create() - return self._controller - - -@sigopt_cli.group("cluster") -@click.pass_context -def cluster_command(ctx): - """Commands for running SigOpt on Kubernetes clusters.""" - ctx.obj = Context() diff --git a/sigopt/cli/commands/cluster/clean.py b/sigopt/cli/commands/cluster/clean.py deleted file mode 100644 index 8672c657..00000000 --- a/sigopt/cli/commands/cluster/clean.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from .base import cluster_command - - -@cluster_command.command() -@click.pass_context -def clean(ctx): - """Reclaim space for building models.""" - ctx.obj.controller.clean_images() diff --git a/sigopt/cli/commands/cluster/connect.py b/sigopt/cli/commands/cluster/connect.py deleted file mode 100644 index 61e87ed6..00000000 --- a/sigopt/cli/commands/cluster/connect.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from sigopt.orchestrate.provider.constants import Provider, provider_to_string -from sigopt.validate import validate_top_level_dict - -from ...arguments import cluster_name_option, load_yaml_callback, provider_option -from .base import cluster_command - - -@cluster_command.command() -@cluster_name_option -@provider_option -@click.option( - "--kubeconfig", - type=click.Path(exists=True), - callback=load_yaml_callback(validate_top_level_dict), - help="A kubeconfig used to connect to this cluster", -) -@click.option("--registry", help="A custom image registry (host[:port][/path])") -@click.pass_context -def connect(ctx, cluster_name, provider, kubeconfig, registry): - """Connect to an existing Kubernetes cluster.""" - if kubeconfig and provider != provider_to_string(Provider.CUSTOM): - raise click.BadParameter("Only --provider=custom is allowed with --kubeconfig") - ctx.obj.controller.connect_to_cluster(cluster_name, provider, registry, kubeconfig and kubeconfig.data) diff --git a/sigopt/cli/commands/cluster/create.py b/sigopt/cli/commands/cluster/create.py deleted file mode 100644 index f65f51c1..00000000 --- a/sigopt/cli/commands/cluster/create.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from sigopt.validate import validate_top_level_dict - -from ...arguments import load_yaml_callback -from .base import cluster_command - - -@cluster_command.command() -@click.option( - "-f", - "--filename", - type=click.Path(exists=True), - callback=load_yaml_callback(validate_top_level_dict), - help="cluster config yaml file", - default="cluster.yml", -) -@click.pass_context -def create(ctx, filename): - """Create a Kubernetes cluster.""" - ctx.obj.controller.create_cluster(filename.data) diff --git a/sigopt/cli/commands/cluster/destroy.py b/sigopt/cli/commands/cluster/destroy.py deleted file mode 100644 index 980957a8..00000000 --- a/sigopt/cli/commands/cluster/destroy.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from .base import cluster_command - - -@cluster_command.command() -@click.pass_context -def destroy(ctx): - """Destroy the connected Kubernetes cluster.""" - ctx.obj.controller.destroy_connected_cluster() diff --git a/sigopt/cli/commands/cluster/disconnect.py b/sigopt/cli/commands/cluster/disconnect.py deleted file mode 100644 index 4b88676b..00000000 --- a/sigopt/cli/commands/cluster/disconnect.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from .base import cluster_command - - -@cluster_command.command() -@click.pass_context -def disconnect(ctx): - """Disconnect from the connected Kubernetes cluster.""" - ctx.obj.controller.disconnect_from_connected_cluster() diff --git a/sigopt/cli/commands/cluster/install_plugins.py b/sigopt/cli/commands/cluster/install_plugins.py deleted file mode 100644 index a9042e37..00000000 --- a/sigopt/cli/commands/cluster/install_plugins.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from .base import cluster_command - - -@cluster_command.command() -@click.pass_context -def install_plugins(ctx): - """Install plugins on the connected Kubernetes cluster.""" - ctx.obj.controller.install_cluster_plugins() diff --git a/sigopt/cli/commands/cluster/kubectl.py b/sigopt/cli/commands/cluster/kubectl.py deleted file mode 100644 index 2cce745d..00000000 --- a/sigopt/cli/commands/cluster/kubectl.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from .base import cluster_command - - -@cluster_command.command( - add_help_option=False, - context_settings=dict( - allow_interspersed_args=False, - ignore_unknown_options=True, - ), -) -@click.pass_context -@click.argument( - "kubectl_arguments", - nargs=-1, - type=click.UNPROCESSED, -) -def kubectl(ctx, kubectl_arguments): - """Run kubectl with the connected Kubernetes cluster.""" - ctx.obj.controller.exec_kubectl(kubectl_arguments) diff --git a/sigopt/cli/commands/cluster/not_installed.py b/sigopt/cli/commands/cluster/not_installed.py deleted file mode 100644 index aa15e92f..00000000 --- a/sigopt/cli/commands/cluster/not_installed.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from ..base import sigopt_cli - - -INSTALLATION_MESSAGE = " ".join( - [ - "Orchestrate is not installed.", - "Please run the following to enable the cluster subcommand:", - "`pip install 'sigopt[orchestrate]'", - ] -) - - -@sigopt_cli.command(help=INSTALLATION_MESSAGE) -@click.argument( - "_", - nargs=-1, - type=click.UNPROCESSED, -) -def cluster(_): - raise click.ClickException(INSTALLATION_MESSAGE) diff --git a/sigopt/cli/commands/cluster/optimize.py b/sigopt/cli/commands/cluster/optimize.py deleted file mode 100644 index 667fa21c..00000000 --- a/sigopt/cli/commands/cluster/optimize.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from ...arguments import dockerfile_option, project_option -from ..optimize_base import optimize_command -from .base import cluster_command - - -@cluster_command.command( - context_settings=dict( - allow_interspersed_args=False, - ignore_unknown_options=True, - ) -) -@click.pass_context -@dockerfile_option -@optimize_command -@project_option -def optimize(ctx, command, run_options, experiment_file, dockerfile, project): - """Run an Experiment on the connected Kubernetes cluster.""" - ctx.obj.controller.optimize_on_cluster( - command=command, - run_options=run_options, - optimization_options=experiment_file.data, - silent=False, - dockerfile=dockerfile, - project_id=project, - ) diff --git a/sigopt/cli/commands/cluster/run.py b/sigopt/cli/commands/cluster/run.py deleted file mode 100644 index 67d7d106..00000000 --- a/sigopt/cli/commands/cluster/run.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from ...arguments import dockerfile_option, project_option -from ..run_base import run_command -from .base import cluster_command - - -@cluster_command.command( - context_settings=dict( - allow_interspersed_args=False, - ignore_unknown_options=True, - ) -) -@click.pass_context -@dockerfile_option -@run_command -@project_option -def run(ctx, command, run_options, dockerfile, project): - """Launch a SigOpt Run on the connected Kubernetes cluster.""" - ctx.obj.controller.run_on_cluster( - command=command, - run_options=run_options, - silent=False, - dockerfile=dockerfile, - project_id=project, - ) diff --git a/sigopt/cli/commands/cluster/status.py b/sigopt/cli/commands/cluster/status.py deleted file mode 100644 index 3df8a153..00000000 --- a/sigopt/cli/commands/cluster/status.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from ...arguments import identifiers_argument, identifiers_help -from .base import cluster_command - - -@cluster_command.command( - context_settings=dict(ignore_unknown_options=True), - help=f"""Get the status of the connected Kubernetes cluster. {identifiers_help}""", -) -@click.pass_context -@identifiers_argument -def status(ctx, identifiers): - if identifiers: - for i, identifier in enumerate(identifiers): - if i > 0: - print() - ctx.obj.controller.print_status(identifier) - else: - ctx.obj.controller.cluster_status() diff --git a/sigopt/cli/commands/cluster/stop.py b/sigopt/cli/commands/cluster/stop.py deleted file mode 100644 index aeb7756d..00000000 --- a/sigopt/cli/commands/cluster/stop.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from ...arguments import identifiers_argument, identifiers_help -from .base import cluster_command - - -@cluster_command.command(help=f"""Stop a Run or Experiment. {identifiers_help}""") -@click.pass_context -@identifiers_argument -def stop(ctx, identifiers): - if not identifiers: - print("No identifiers provided, nothing to do.") - return - for identifier in identifiers: - ctx.obj.controller.stop_by_identifier(identifier) diff --git a/sigopt/cli/commands/cluster/test.py b/sigopt/cli/commands/cluster/test.py deleted file mode 100644 index d50bb3d1..00000000 --- a/sigopt/cli/commands/cluster/test.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from .base import cluster_command - - -@cluster_command.command() -@click.pass_context -def test(ctx): - """Test the connection to the connected Kubernetes cluster.""" - ctx.obj.controller.test_cluster_connection() diff --git a/sigopt/cli/commands/cluster/test_run.py b/sigopt/cli/commands/cluster/test_run.py deleted file mode 100644 index 23964116..00000000 --- a/sigopt/cli/commands/cluster/test_run.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from ...arguments import dockerfile_option, project_option -from ..run_base import run_command -from .base import cluster_command - - -@cluster_command.command( - "test-run", - context_settings=dict( - allow_interspersed_args=False, - ignore_unknown_options=True, - ), -) -@click.pass_context -@dockerfile_option -@run_command -@project_option -def test_run(ctx, command, run_options, dockerfile, project): - """Start and debug a SigOpt Run on the connected Kubernetes cluster.""" - ctx.obj.controller.test_run_on_cluster( - command=command, - run_options=run_options, - dockerfile=dockerfile, - project_id=project, - ) diff --git a/sigopt/cli/commands/cluster/update.py b/sigopt/cli/commands/cluster/update.py deleted file mode 100644 index 5ce2db17..00000000 --- a/sigopt/cli/commands/cluster/update.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from ...arguments import cluster_filename_option -from .base import cluster_command - - -@cluster_command.command() -@cluster_filename_option -@click.pass_context -def update(ctx, filename): - """Update the connected Kuberentes cluster.""" - ctx.obj.controller.update_cluster(filename.data) diff --git a/sigopt/config.py b/sigopt/config.py index 6ed6daee..2117f095 100644 --- a/sigopt/config.py +++ b/sigopt/config.py @@ -49,10 +49,6 @@ def __init__(self): self._json_context = json.loads(decoded) self._object_context = {} - @property - def config_json_path(self): - return self._config_json_path - def get_context_data(self, entry_cls): key = entry_cls.CONFIG_CONTEXT_KEY instance = self._object_context.get(key) diff --git a/sigopt/decorators.py b/sigopt/decorators.py new file mode 100644 index 00000000..91d2290c --- /dev/null +++ b/sigopt/decorators.py @@ -0,0 +1,11 @@ +# Copyright © 2024 Intel Corporation +# +# SPDX-License-Identifier: MIT + + +def public(f): + """ + Indicates that the function or method is meant to be part of the public interface. + Ie. intended to be used outside sigopt-python. + """ + return f diff --git a/sigopt/exception.py b/sigopt/exception.py index a4ebedb8..2ab957eb 100644 --- a/sigopt/exception.py +++ b/sigopt/exception.py @@ -50,10 +50,6 @@ def to_json(self): return copy.deepcopy(self._body) -class RunException(SigOptException): - pass - - class ConflictingProjectException(SigOptException): def __init__(self, project_id): super().__init__(f"The project with id '{project_id}' already exists.") diff --git a/sigopt/lib.py b/sigopt/lib.py index 357dd25d..59045265 100644 --- a/sigopt/lib.py +++ b/sigopt/lib.py @@ -52,21 +52,10 @@ def is_string(s): return isinstance(s, str) -def find(lis, predicate): - """ - Finds the first element in lis satisfying predicate, or else None - """ - return next((item for item in lis if predicate(item)), None) - - def remove_nones(mapping): return {key: value for key, value in mapping.items() if value is not None} -def safe_format(string, *args, **kwargs): - return string.format(*args, **kwargs) - - def validate_name(warn, name): if not is_string(name): raise ValueError(f"The {warn} must be a string, not {type(name).__name__}") diff --git a/sigopt/log_capture.py b/sigopt/log_capture.py index 96116b7b..3dcb46f2 100644 --- a/sigopt/log_capture.py +++ b/sigopt/log_capture.py @@ -5,6 +5,8 @@ import sys import threading +from .decorators import public + class MonitorStream(io.IOBase): def __init__(self, original_stream): @@ -17,49 +19,63 @@ def __init__(self, original_stream): def _replace_buffer_stream(self): self.buffer_stream = io.StringIO() + @public def close(self): raise IOError("MonitorStream cannot be closed") + @public @property def closed(self): return self.original_stream.closed + @public def fileno(self): raise IOError("MonitorStream has no fileno") + @public def flush(self): return self.original_stream.flush() + @public def isatty(self): return False + @public def readable(self): return False + @public def readline(self, *args, **kwargs): return self.original_stream.readline(*args, **kwargs) + @public def readlines(self, *args, **kwargs): return self.original_stream.readlines(*args, **kwargs) + @public def seek(self, *args, **kwargs): raise IOError("MonitorStream is not seekable") + @public def seekable(self): return False + @public def tell(self, *args, **kwargs): raise IOError("MonitorStream is not seekable") + @public def writable(self): return True + @public def write(self, content): rval = self.original_stream.write(content) with self.buffer_lock: self.buffer_stream.write(content) return rval + @public def writelines(self, lines): for line in lines: self.write(line) @@ -79,6 +95,7 @@ def __enter__(self): raise NotImplementedError() def __exit__(self, typ, value, trace): + del trace raise NotImplementedError() @@ -90,7 +107,7 @@ def __enter__(self): return self def __exit__(self, typ, value, trace): - return None + del trace class SystemOutputStreamMonitor(BaseStreamMonitor): @@ -112,4 +129,5 @@ def __enter__(self): return self def __exit__(self, typ, value, trace): + del trace sys.stdout, sys.stderr = (monitor_stream.original_stream for monitor_stream in self.monitor_streams) diff --git a/sigopt/orchestrate/__init__.py b/sigopt/orchestrate/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/aws/__init__.py b/sigopt/orchestrate/aws/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/aws/service.py b/sigopt/orchestrate/aws/service.py deleted file mode 100644 index ec4deeca..00000000 --- a/sigopt/orchestrate/aws/service.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import datetime -import re -import sys -import time -import types - -import yaml -from botocore.exceptions import ClientError - -from ..cluster.object import AWSCluster -from ..eks.service import DEFAULT_KUBERNETES_VERSION, SUPPORTED_KUBERNETES_VERSIONS -from ..exceptions import AwsClusterSharePermissionError, AwsPermissionsError, ClusterDestroyError, OrchestrateException -from ..node_groups import ALL_NODE_GROUP_TYPES, NODE_GROUP_TYPE_CPU, NODE_GROUP_TYPE_GPU, NODE_GROUP_TYPE_SYSTEM -from ..paths import get_executable_path -from ..provider.constants import Provider -from ..provider.interface import ProviderInterface - - -def is_cuda_gpu_instance_type(instance_type): - prefix, _ = instance_type.split(".", 1) - return prefix in ("p4d", "p3", "p3dn", "p2", "g4dn", "g3") - - -def catch_aws_permissions_errors(func): - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except ClientError as e: - code = e.response["Error"]["Code"] - http_status_code = e.response["ResponseMetadata"]["HTTPStatusCode"] - if http_status_code == 403 or code in ( - "AccessDeniedException", - "UnauthorizedOperation", - ): - raise AwsPermissionsError(e) from e - raise - - return wrapper - - -def make_role_config_map(node_instance_role_arn, cluster_access_role_arn, cluster_access_role_name): - map_roles = [ - { - "rolearn": node_instance_role_arn, - "username": "system:node:{{EC2PrivateDNSName}}", - "groups": ["system:bootstrappers", "system:nodes"], - }, - { - "rolearn": cluster_access_role_arn, - "username": cluster_access_role_name, - "groups": ["system:masters"], - }, - ] - return { - "apiVersion": "v1", - "kind": "ConfigMap", - "metadata": { - "name": "aws-auth", - "namespace": "kube-system", - }, - "data": { - "mapRoles": yaml.dump(map_roles), - }, - } - - -class AwsService(ProviderInterface): - def __init__(self, services, aws_services): - super().__init__(services) - self.aws_services = aws_services - - def __getattribute__(self, name): - attr = super().__getattribute__(name) - if isinstance(attr, types.MethodType): - attr = catch_aws_permissions_errors(attr) - return attr - - def describe_kubernetes_cluster(self, cluster_name): - try: - return self.aws_services.eks_service.describe_cluster(cluster_name=cluster_name)["cluster"] - except self.aws_services.eks_service.client.exceptions.ResourceNotFoundException as e: - raise OrchestrateException( - f"We cannot find an EKS cluster named '{cluster_name}' using your" - " current AWS credentials. Did someone delete this cluster?" - ) from e - - def validate_cluster_options(self, cluster_name, node_groups_config, kubernetes_version): - if kubernetes_version == "latest": - kubernetes_version = DEFAULT_KUBERNETES_VERSION - if kubernetes_version: - assert kubernetes_version in SUPPORTED_KUBERNETES_VERSIONS, ( - f"Unsupported kubernetes version for EKS: {kubernetes_version}. Must be one of: {SUPPORTED_KUBERNETES_VERSIONS}" - ) - - cpu_nodes_config = node_groups_config.get(NODE_GROUP_TYPE_CPU) - gpu_nodes_config = node_groups_config.get(NODE_GROUP_TYPE_GPU) - - assert ( - cpu_nodes_config or gpu_nodes_config - ), "Looks like your cluster config file is not asking us to spin up any CPU or GPU machines." - name_regex = "^[a-zA-Z][-a-zA-Z0-9]*$" - assert cluster_name and re.match(name_regex, cluster_name), ( - "Cluster names for AWS must match the regex: /" + name_regex + "/" - ) - - if gpu_nodes_config: - gpu_instance_type = gpu_nodes_config["instance_type"] - assert is_cuda_gpu_instance_type( - gpu_instance_type - ), f"GPUs are not supported on the instance type ({gpu_instance_type})" - - def _handle_stack_event(self, _, event): - resource_status = event["ResourceStatus"] - logical_id = event["LogicalResourceId"] - print(f"{resource_status} {event['ResourceType']} {logical_id} {event['PhysicalResourceId']}") - if resource_status.endswith("_FAILED"): - print( - f"Error {resource_status}: {logical_id}: {event['ResourceStatusReason']}", - file=sys.stderr, - ) - - def get_node_groups(self, options): - return {node_group_type: options.get(node_group_type) or {} for node_group_type in ALL_NODE_GROUP_TYPES} - - def _create_or_update_kubernetes_cluster(self, options, update): - start_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) - cluster_name = options["cluster_name"] - kubernetes_version = options.get("kubernetes_version") or DEFAULT_KUBERNETES_VERSION - node_groups = self.get_node_groups(options) - self.validate_cluster_options(cluster_name, node_groups, kubernetes_version) - - aws_options = options.get("aws") or {} - additional_policies = aws_options.get("additional_policies") or [] - - common_kwargs = dict( - cluster_name=cluster_name, - system_node_config=node_groups[NODE_GROUP_TYPE_SYSTEM], - cpu_node_config=node_groups[NODE_GROUP_TYPE_CPU], - gpu_node_config=node_groups[NODE_GROUP_TYPE_GPU], - key_name=self.aws_services.ec2_service.ensure_key_pair_for_cluster(cluster_name).name, - kubernetes_version=kubernetes_version, - ) - - if update: - eks_cluster_stack = self.aws_services.cloudformation_service.update_eks_cluster_stack( - event_handler=self._handle_stack_event, - **common_kwargs, - ) - else: - try: - eks_cluster_stack = self.aws_services.cloudformation_service.ensure_eks_cluster_stack( - **common_kwargs, - ) - self.aws_services.cloudformation_service.wait_for_stack_create_complete( - eks_cluster_stack.name, - event_handler=self._handle_stack_event, - after=start_time, - ) - except Exception as e: - print("*" * 50) - print("ERROR: encountered an error creating EKS cluster; tearing down resources") - print("*" * 50) - # TODO: can we catch something more fine-grained here? - # NOTE: since we're just raising here anyway, we don't need to try-except? - self.aws_services.cloudformation_service.ensure_eks_cluster_stack_deleted( - cluster_name, - self._handle_stack_event, - ) - raise e - eks_cluster_stack.reload() - eks_cluster_stack_outputs = {o["OutputKey"]: o["OutputValue"] for o in eks_cluster_stack.outputs} - node_instance_role_arn = eks_cluster_stack_outputs["NodeInstanceRoleArn"] - - for policy_arn in additional_policies: - self.aws_services.iam_service.attach_policy(node_instance_role_arn, policy_arn) - - # NOTE: no reason to update the autoscaler role stack yet, just create it if it doesn't already exist - eks_cluster = self.aws_services.eks_service.describe_cluster(cluster_name) - self.aws_services.iam_service.ensure_eks_oidc_provider(eks_cluster) - eks_cluster_autoscaler_role_stack = ( - self.aws_services.cloudformation_service.ensure_eks_cluster_autoscaler_role_stack( - cluster_name=cluster_name, - cluster_oidc_provider_url=eks_cluster["cluster"]["identity"]["oidc"]["issuer"], - ) - ) - self.aws_services.cloudformation_service.wait_for_stack_create_complete( - eks_cluster_autoscaler_role_stack.name, - event_handler=self._handle_stack_event, - after=start_time, - ) - - if not update: - self._connect_kubernetes_cluster(cluster_name=cluster_name, ignore_role=True) - self.test_kubernetes_cluster(cluster_name=cluster_name, ignore_role=True) - - # NOTE: no reason to update the aws-auth config map yet - role_arn = eks_cluster_stack_outputs["ClusterAccessRoleArn"] - role_name = eks_cluster_stack_outputs["ClusterAccessRoleName"] - role_config_map = make_role_config_map( - node_instance_role_arn=node_instance_role_arn, - cluster_access_role_arn=role_arn, - cluster_access_role_name=role_name, - ) - self.services.kubernetes_service.ensure_config_map(role_config_map) - - self._disconnect_kubernetes_cluster(cluster_name=cluster_name) - - print("Testing your kubernetes configuration, you may see an error below but we should be able to resolve it...") - self._connect_kubernetes_cluster(cluster_name=cluster_name) - print("Successfully tested your kubernetes configuration, if you saw any errors above you may ignore them...") - self._test_cluster_access_role(cluster_name=cluster_name, retries=3) - # Note: We disconnect and reconnect to solve an intermittent issue where the kubernetes python client - # ends up with an empty api key. This is a temporary fix while we resolve the bug. This solves the issue by - # reloading the key from the config file a second time which I found out works simply by some trial and error. - self._disconnect_kubernetes_cluster(cluster_name=cluster_name) - self._connect_kubernetes_cluster(cluster_name=cluster_name) - - self.test_kubernetes_cluster(cluster_name=cluster_name) - - self.services.kubernetes_service.ensure_plugins(cluster_name, Provider.AWS) - - print(self._node_access_instructions(cluster_name)) - - return self.create_cluster_object( - services=self.services, - name=cluster_name, - registry=None, - ) - - def create_kubernetes_cluster(self, options): - return self._create_or_update_kubernetes_cluster(options, update=False) - - def update_kubernetes_cluster(self, options): - return self._create_or_update_kubernetes_cluster(options, update=True) - - def _test_cluster_access_role(self, cluster_name, retries=0, wait_time=5): - cluster_access_role_arn = self.aws_services.iam_service.get_cluster_access_role_arn(cluster_name) - for try_number in range(retries + 1): - try: - self.aws_services.sts_service.assume_role(role_arn=cluster_access_role_arn) - except ClientError as ce: - if try_number >= retries: - raise AwsClusterSharePermissionError( - "You do not have permission to use the role" - f" '{cluster_access_role_arn}' for accessing this cluster.\n" - "Please read the SigOpt documentation for sharing clusters: " - "https://docs.sigopt.com/ai-module-api-references/orchestrate/aws_cluster#share-your-kubernetes-cluster" - ) from ce - time.sleep(wait_time) - - def _connect_kubernetes_cluster(self, cluster_name, ignore_role=False): - kubeconfig = self.create_kubeconfig(cluster_name, ignore_role) - self.services.kubernetes_service.write_config( - cluster_name=cluster_name, - data=kubeconfig, - ) - - def test_kubernetes_cluster(self, cluster_name, ignore_role=False): - if not ignore_role: - self._test_cluster_access_role(cluster_name=cluster_name, retries=3) - self.services.kubernetes_service.test_config() - - def _disconnect_kubernetes_cluster(self, cluster_name): - self.services.kubernetes_service.ensure_config_deleted(cluster_name=cluster_name) - - def create_kubeconfig(self, cluster_name, ignore_role=False): - cluster = self.describe_kubernetes_cluster(cluster_name) - - if ignore_role: - cluster_access_role_arn = None - else: - cluster_access_role_arn = self.aws_services.iam_service.get_cluster_access_role_arn(cluster_name) - - # TODO: optional role_arn is NOT the role ARN used to create the cluster - # See Step 2 of https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html - - kubeconfig = self.services.resource_service.load_yaml("eks", "kubeconfig.yml") - kubeconfig["clusters"][0]["cluster"] = { - "server": cluster["endpoint"], - "certificate-authority-data": cluster["certificateAuthority"]["data"], - } - command_args = ["token", "-i", cluster_name] - if cluster_access_role_arn: - command_args.extend(["-r", cluster_access_role_arn]) - user = { - "exec": { - "apiVersion": "client.authentication.k8s.io/v1beta1", - "command": get_executable_path("aws-iam-authenticator"), - "args": command_args, - }, - } - kubeconfig["users"][0]["user"] = user - return kubeconfig - - def destroy_kubernetes_cluster(self, cluster_name): - self.services.kubernetes_service.ensure_config_deleted(cluster_name) - self.aws_services.ec2_service.ensure_key_pair_for_cluster_deleted(cluster_name) - - try: - instance_role_arn = self.aws_services.cloudformation_service.get_node_instance_role_arn(cluster_name) - if instance_role_arn: - instance_role = self.aws_services.iam_service.get_role_from_arn(instance_role_arn) - for policy in instance_role.attached_policies.all(): - instance_role.detach_policy(PolicyArn=policy.arn) - except ClientError: - pass - - try: - eks_cluster = self.aws_services.eks_service.describe_cluster(cluster_name) - self.aws_services.iam_service.ensure_eks_oidc_provider_deleted(eks_cluster) - except self.aws_services.eks_service.client.exceptions.ResourceNotFoundException: - pass - - try: - self.aws_services.cloudformation_service.ensure_eks_cluster_autoscaler_role_stack_deleted( - cluster_name, - event_handler=self._handle_stack_event, - ) - self.aws_services.cloudformation_service.ensure_eks_cluster_stack_deleted( - cluster_name, - event_handler=self._handle_stack_event, - ) - except Exception as e: - raise ClusterDestroyError from e - - def _node_access_instructions(self, cluster_name): - filename = self.aws_services.ec2_service.key_pair_location(cluster_name) - return ( - "*Optional:" - "\n\tTo ssh into any ec2 node in your cluster, use the username `ec2-user`" - " with the key pair located at:" - f"\n\t\t{filename}" - "\n\tExample:" - f"\n\t\tssh -i {filename} ec2-user@" - "\n\tYou may be required to change security groups on your ec2 instances" - "\n\tInstructions:" - " https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html" - ) - - def create_cluster_object(self, services, name, registry): - return AWSCluster( - services=services, - name=name, - registry=registry, - ) diff --git a/sigopt/orchestrate/cloudformation/__init__.py b/sigopt/orchestrate/cloudformation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/cloudformation/cluster-autoscaler-role.yaml b/sigopt/orchestrate/cloudformation/cluster-autoscaler-role.yaml deleted file mode 100644 index 3e1e8d81..00000000 --- a/sigopt/orchestrate/cloudformation/cluster-autoscaler-role.yaml +++ /dev/null @@ -1,97 +0,0 @@ -# original https://raw.githubusercontent.com/awslabs/amazon-eks-ami/0ccb71878f92f808dcc80d38e9cf8a296bb72b33/amazon-eks-nodegroup.yaml - -AWSTemplateFormatVersion: "2010-09-09" - -Description: Amazon EKS - Cluster Autoscaler Role - -Metadata: - "AWS::CloudFormation::Interface": - ParameterGroups: - - Label: - default: EKS Cluster - Parameters: - - ClusterName - - ClusterOIDCProviderURL - -Parameters: - ClusterName: - Type: String - Description: The cluster name provided when the cluster was created. If it is incorrect, nodes will not be able to join the cluster. - - ClusterOIDCProviderURL: - Type: String - Description: The OIDC provider URL for the cluster. - -Resources: - ClusterAutoscalerRole: - Type: "AWS::IAM::Role" - Properties: - AssumeRolePolicyDocument: - Fn::Sub: - - | - { - "Version": "2012-10-17", - "Statement": [{ - "Effect": "Allow", - "Principal": { - "Federated": "arn:aws:iam::${AccountId}:oidc-provider/${ClusterOIDCProvider}" - }, - "Action": "sts:AssumeRoleWithWebIdentity", - "Condition": { - "StringEquals": { - "${ClusterOIDCProvider}:sub": "system:serviceaccount:kube-system:cluster-autoscaler" - } - } - }] - } - - AccountId: !Ref AWS::AccountId - ClusterOIDCProvider: !Select [1, !Split ["https://", !Ref ClusterOIDCProviderURL]] - Path: / - - ClusterAutoscalerPolicy: - Type: "AWS::IAM::ManagedPolicy" - Properties: - Roles: - - !Ref ClusterAutoscalerRole - PolicyDocument: - Fn::Sub: - - | - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "autoscaling:DescribeAutoScalingGroups", - "autoscaling:DescribeAutoScalingInstances", - "autoscaling:DescribeLaunchConfigurations", - "autoscaling:DescribeTags", - "ec2:DescribeLaunchTemplateVersions" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "autoscaling:SetDesiredCapacity", - "autoscaling:TerminateInstanceInAutoScalingGroup" - ], - "Resource": "*", - "Condition": { - "StringEquals": { - "aws:ResourceTag/k8s.io/cluster-autoscaler/${ClusterName}": "owned" - } - } - } - ] - } - - ClusterName: !Ref ClusterName - -Outputs: - ClusterAutoscalerRole: - Description: The name of the role for the cluster autoscaler - Value: !Ref ClusterAutoscalerRole - - ClusterAutoscalerRoleArn: - Description: The arn of the role for the cluster autoscaler - Value: !GetAtt [ClusterAutoscalerRole, Arn] diff --git a/sigopt/orchestrate/cloudformation/eks-cluster.yaml b/sigopt/orchestrate/cloudformation/eks-cluster.yaml deleted file mode 100644 index f943ebc3..00000000 --- a/sigopt/orchestrate/cloudformation/eks-cluster.yaml +++ /dev/null @@ -1,338 +0,0 @@ -# original https://raw.githubusercontent.com/awslabs/amazon-eks-ami/0ccb71878f92f808dcc80d38e9cf8a296bb72b33/amazon-eks-nodegroup.yaml - -AWSTemplateFormatVersion: "2010-09-09" - -Description: Amazon EKS - Stack for SigOpt clusters - -Metadata: - "AWS::CloudFormation::Interface": - ParameterGroups: - - Label: - default: Stack template URLs - Parameters: - - NodeGroupStackTemplateURL - - NodeSecurityStackTemplateURL - - VPCStackTemplateURL - - Label: - default: User info - Parameters: - - UserArn - - Label: - default: VPC configuration - Parameters: - - AZ01 - - AZ02 - - VPCBlock - - PublicSubnet01Block - - PublicSubnet02Block - - PrivateSubnet01Block - - PrivateSubnet02Block - - Label: - default: EKS Cluster - Parameters: - - ClusterName - - KubernetesVersion - - Label: - default: Worker Node Configuration - Parameters: - - CPUNodeAutoScalingGroupDesiredCapacity - - CPUNodeAutoScalingGroupMaxSize - - CPUNodeAutoScalingGroupMinSize - - CPUNodeInstanceType - - CPUNodeVolumeSize - - GPUNodeAutoScalingGroupDesiredCapacity - - GPUNodeAutoScalingGroupMaxSize - - GPUNodeAutoScalingGroupMinSize - - GPUNodeInstanceType - - GPUNodeVolumeSize - - SSHKeyName - - SystemNodeAutoScalingGroupMaxSize - - SystemNodeInstanceType - - SystemNodeVolumeSize - -Parameters: - NodeGroupStackTemplateURL: - Type: String - Description: The URL of the node group stack template. - - NodeSecurityStackTemplateURL: - Type: String - Description: The URL of the node security stack template. - - VPCStackTemplateURL: - Type: String - Description: The URL of the VPC stack template. - - UserArn: - Type: String - Description: The ARN of the default user accessing the cluster. - - VPCBlock: - Type: String - Description: The CIDR range for the VPC. This should be a valid private (RFC 1918) CIDR range. - - AZ01: - Type: String - Description: The first availability zone to use. - - AZ02: - Type: String - Description: The second availability zone to use. - - PublicSubnet01Block: - Type: String - Description: CidrBlock for public subnet 01 within the VPC - - PublicSubnet02Block: - Type: String - Description: CidrBlock for public subnet 02 within the VPC - - PrivateSubnet01Block: - Type: String - Description: CidrBlock for private subnet 01 within the VPC - - PrivateSubnet02Block: - Type: String - Description: CidrBlock for private subnet 02 within the VPC - - ClusterName: - Type: String - Description: The cluster name provided when the cluster was created. If it is incorrect, nodes will not be able to join the cluster. - - KubernetesVersion: - Type: String - Description: The Kubernetes version, consisting of the major and minor version number. - - SSHKeyName: - Type: "AWS::EC2::KeyPair::KeyName" - Description: The EC2 Key Pair to allow SSH access to the instances - - SystemNodeAutoScalingGroupMaxSize: - Type: Number - Description: Maximum size of Node Group ASG. Set to at least 1 greater than NodeAutoScalingGroupDesiredCapacity. - - CPUNodeAutoScalingGroupDesiredCapacity: - Type: Number - Description: Desired capacity of Node Group ASG. - - CPUNodeAutoScalingGroupMaxSize: - Type: Number - Description: Maximum size of Node Group ASG. Set to at least 1 greater than NodeAutoScalingGroupDesiredCapacity. - - CPUNodeAutoScalingGroupMinSize: - Type: Number - Description: Minimum size of Node Group ASG. - - GPUNodeAutoScalingGroupDesiredCapacity: - Type: Number - Description: Desired capacity of Node Group ASG. - - GPUNodeAutoScalingGroupMaxSize: - Type: Number - Description: Maximum size of Node Group ASG. Set to at least 1 greater than NodeAutoScalingGroupDesiredCapacity. - - GPUNodeAutoScalingGroupMinSize: - Type: Number - Description: Minimum size of Node Group ASG. - - SystemNodeInstanceType: - Type: String - Description: EC2 instance type for the system node instances - - CPUNodeInstanceType: - Type: String - Description: EC2 instance type for the CPU node instances - - GPUNodeInstanceType: - Type: String - Description: EC2 instance type for the GPU node instances - - SystemNodeVolumeSize: - Type: Number - Default: 8 - Description: Node volume size for system nodes - - CPUNodeVolumeSize: - Type: Number - Default: 100 - Description: Node volume size for CPU nodes - - GPUNodeVolumeSize: - Type: Number - Default: 100 - Description: Node volume size for GPU nodes - -Conditions: - ShouldCreateCPUNodes: !Not - - !Equals - - !Ref CPUNodeAutoScalingGroupMaxSize - - "0" - - ShouldCreateGPUNodes: !Not - - !Equals - - !Ref GPUNodeAutoScalingGroupMaxSize - - "0" - -Resources: - ClusterVPC: - Type: "AWS::CloudFormation::Stack" - Properties: - TemplateURL: !Ref VPCStackTemplateURL - Parameters: - AZ01: !Ref AZ01 - AZ02: !Ref AZ02 - VPCBlock: !Ref VPCBlock - PublicSubnet01Block: !Ref PublicSubnet01Block - PublicSubnet02Block: !Ref PublicSubnet02Block - PrivateSubnet01Block: !Ref PrivateSubnet01Block - PrivateSubnet02Block: !Ref PrivateSubnet02Block - - ClusterManagementRole: - Type: "AWS::IAM::Role" - Properties: - AssumeRolePolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: Allow - Principal: - Service: eks.amazonaws.com - Action: sts:AssumeRole - ManagedPolicyArns: - - arn:aws:iam::aws:policy/AmazonEKSClusterPolicy - - arn:aws:iam::aws:policy/AmazonEKSServicePolicy - Path: / - - ClusterControlPlaneSecurityGroup: - Type: AWS::EC2::SecurityGroup - DependsOn: ClusterVPC - Properties: - GroupDescription: Cluster communication with worker nodes - VpcId: !GetAtt ClusterVPC.Outputs.VpcId - - Cluster: - Type: "AWS::EKS::Cluster" - DependsOn: ClusterVPC - Properties: - Name: !Ref ClusterName - Version: !Ref KubernetesVersion - RoleArn: !GetAtt ClusterManagementRole.Arn - ResourcesVpcConfig: - SecurityGroupIds: - - !Ref ClusterControlPlaneSecurityGroup - SubnetIds: - - !GetAtt ClusterVPC.Outputs.AZ1PublicSubnet - - !GetAtt ClusterVPC.Outputs.AZ2PublicSubnet - - !GetAtt ClusterVPC.Outputs.AZ1PrivateSubnet - - !GetAtt ClusterVPC.Outputs.AZ2PrivateSubnet - - ClusterAccessRole: - Type: "AWS::IAM::Role" - Properties: - RoleName: !Sub "${ClusterName}-k8s-access-role" - Description: !Sub "Access the kubernetes cluster: ${ClusterName}, created by SigOpt" - AssumeRolePolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: Allow - Principal: - AWS: !Ref UserArn - Action: "sts:AssumeRole" - Path: / - - NodeSecurity: - Type: "AWS::CloudFormation::Stack" - DependsOn: - - ClusterVPC - - ClusterControlPlaneSecurityGroup - Properties: - TemplateURL: !Ref NodeSecurityStackTemplateURL - Parameters: - ClusterName: !Ref ClusterName - ClusterControlPlaneSecurityGroup: !Ref ClusterControlPlaneSecurityGroup - VpcId: !GetAtt ClusterVPC.Outputs.VpcId - - SystemNodeGroup: - Type: "AWS::CloudFormation::Stack" - DependsOn: - - ClusterVPC - - NodeSecurity - Properties: - TemplateURL: !Ref NodeGroupStackTemplateURL - Parameters: - ClusterName: !Ref ClusterName - NodeGroupName: system-node-group - NodeGroupType: system - NodeSecurityGroup: !GetAtt NodeSecurity.Outputs.NodeSecurityGroup - NodeAutoScalingGroupMinSize: 1 - NodeAutoScalingGroupDesiredCapacity: 1 - NodeAutoScalingGroupMaxSize: !Ref SystemNodeAutoScalingGroupMaxSize - NodeInstanceProfileArn: !GetAtt NodeSecurity.Outputs.NodeInstanceProfileArn - NodeInstanceType: !Ref SystemNodeInstanceType - NodeImageIdSSMParam: !Sub "/aws/service/eks/optimized-ami/${KubernetesVersion}/amazon-linux-2/recommended/image_id" - NodeVolumeSize: !Ref SystemNodeVolumeSize - KeyName: !Ref SSHKeyName - VpcId: !GetAtt ClusterVPC.Outputs.VpcId - Subnets: !GetAtt ClusterVPC.Outputs.AZ1PrivateSubnet - - CPUNodeGroup: - Condition: ShouldCreateCPUNodes - Type: "AWS::CloudFormation::Stack" - DependsOn: ClusterVPC - Properties: - TemplateURL: !Ref NodeGroupStackTemplateURL - Parameters: - ClusterName: !Ref ClusterName - NodeGroupName: cpu-node-group - NodeGroupType: cpu - NodeSecurityGroup: !GetAtt NodeSecurity.Outputs.NodeSecurityGroup - NodeAutoScalingGroupMinSize: !Ref CPUNodeAutoScalingGroupMinSize - NodeAutoScalingGroupDesiredCapacity: !Ref CPUNodeAutoScalingGroupDesiredCapacity - NodeAutoScalingGroupMaxSize: !Ref CPUNodeAutoScalingGroupMaxSize - NodeInstanceProfileArn: !GetAtt NodeSecurity.Outputs.NodeInstanceProfileArn - NodeInstanceType: !Ref CPUNodeInstanceType - NodeImageIdSSMParam: !Sub "/aws/service/eks/optimized-ami/${KubernetesVersion}/amazon-linux-2/recommended/image_id" - NodeVolumeSize: !Ref CPUNodeVolumeSize - KeyName: !Ref SSHKeyName - VpcId: !GetAtt ClusterVPC.Outputs.VpcId - Subnets: !Sub - - "${AZ1PrivateSubnet},${AZ2PrivateSubnet}" - - AZ1PrivateSubnet: !GetAtt ClusterVPC.Outputs.AZ1PrivateSubnet - AZ2PrivateSubnet: !GetAtt ClusterVPC.Outputs.AZ2PrivateSubnet - - GPUNodeGroup: - Condition: ShouldCreateGPUNodes - Type: "AWS::CloudFormation::Stack" - DependsOn: ClusterVPC - Properties: - TemplateURL: !Ref NodeGroupStackTemplateURL - Parameters: - ClusterName: !Ref ClusterName - NodeGroupName: gpu-node-group - NodeGroupType: gpu - NodeSecurityGroup: !GetAtt NodeSecurity.Outputs.NodeSecurityGroup - NodeAutoScalingGroupMinSize: !Ref GPUNodeAutoScalingGroupMinSize - NodeAutoScalingGroupDesiredCapacity: !Ref GPUNodeAutoScalingGroupDesiredCapacity - NodeAutoScalingGroupMaxSize: !Ref GPUNodeAutoScalingGroupMaxSize - NodeInstanceProfileArn: !GetAtt NodeSecurity.Outputs.NodeInstanceProfileArn - NodeInstanceType: !Ref GPUNodeInstanceType - NodeImageIdSSMParam: !Sub "/aws/service/eks/optimized-ami/${KubernetesVersion}/amazon-linux-2-gpu/recommended/image_id" - NodeVolumeSize: !Ref GPUNodeVolumeSize - KeyName: !Ref SSHKeyName - VpcId: !GetAtt ClusterVPC.Outputs.VpcId - Subnets: !Sub - - "${AZ1PrivateSubnet},${AZ2PrivateSubnet}" - - AZ1PrivateSubnet: !GetAtt ClusterVPC.Outputs.AZ1PrivateSubnet - AZ2PrivateSubnet: !GetAtt ClusterVPC.Outputs.AZ2PrivateSubnet - -Outputs: - NodeInstanceRoleArn: - Description: The ARN of the node instance role. - Value: !GetAtt NodeSecurity.Outputs.NodeInstanceRoleArn - - ClusterAccessRoleArn: - Description: The ARN of the cluster access role. - Value: !GetAtt ClusterAccessRole.Arn - - ClusterAccessRoleName: - Description: The name of the cluster access role. - Value: !Ref ClusterAccessRole diff --git a/sigopt/orchestrate/cloudformation/eks-node-security.yaml b/sigopt/orchestrate/cloudformation/eks-node-security.yaml deleted file mode 100644 index 70ae7de3..00000000 --- a/sigopt/orchestrate/cloudformation/eks-node-security.yaml +++ /dev/null @@ -1,158 +0,0 @@ -# original https://raw.githubusercontent.com/awslabs/amazon-eks-ami/0ccb71878f92f808dcc80d38e9cf8a296bb72b33/amazon-eks-nodegroup.yaml - -AWSTemplateFormatVersion: "2010-09-09" - -Description: Amazon EKS - Node Security Group - -Metadata: - "AWS::CloudFormation::Interface": - ParameterGroups: - - Label: - default: EKS Cluster - Parameters: - - ClusterName - - ClusterControlPlaneSecurityGroup - - Label: - default: Worker Network Configuration - Parameters: - - VpcId - -Parameters: - ClusterControlPlaneSecurityGroup: - Type: "AWS::EC2::SecurityGroup::Id" - Description: The security group of the cluster control plane. - - ClusterName: - Type: String - Description: The cluster name provided when the cluster was created. If it is incorrect, nodes will not be able to join the cluster. - - VpcId: - Type: "AWS::EC2::VPC::Id" - Description: The VPC of the worker instances - -Mappings: - PartitionMap: - aws: - EC2ServicePrincipal: "ec2.amazonaws.com" - aws-us-gov: - EC2ServicePrincipal: "ec2.amazonaws.com" - aws-cn: - EC2ServicePrincipal: "ec2.amazonaws.com.cn" - aws-iso: - EC2ServicePrincipal: "ec2.c2s.ic.gov" - aws-iso-b: - EC2ServicePrincipal: "ec2.sc2s.sgov.gov" - -Resources: - NodeSecurityGroup: - Type: "AWS::EC2::SecurityGroup" - Properties: - GroupDescription: Security group for all nodes in the cluster - Tags: - - Key: !Sub kubernetes.io/cluster/${ClusterName} - Value: owned - VpcId: !Ref VpcId - - NodeSecurityGroupIngress: - Type: "AWS::EC2::SecurityGroupIngress" - DependsOn: NodeSecurityGroup - Properties: - Description: Allow node to communicate with each other - FromPort: 0 - GroupId: !Ref NodeSecurityGroup - IpProtocol: "-1" - SourceSecurityGroupId: !Ref NodeSecurityGroup - ToPort: 65535 - - ClusterControlPlaneSecurityGroupIngress: - Type: "AWS::EC2::SecurityGroupIngress" - DependsOn: NodeSecurityGroup - Properties: - Description: Allow pods to communicate with the cluster API Server - FromPort: 443 - GroupId: !Ref ClusterControlPlaneSecurityGroup - IpProtocol: tcp - SourceSecurityGroupId: !Ref NodeSecurityGroup - ToPort: 443 - - ControlPlaneEgressToNodeSecurityGroup: - Type: "AWS::EC2::SecurityGroupEgress" - DependsOn: NodeSecurityGroup - Properties: - Description: Allow the cluster control plane to communicate with worker Kubelet and pods - DestinationSecurityGroupId: !Ref NodeSecurityGroup - FromPort: 1025 - GroupId: !Ref ClusterControlPlaneSecurityGroup - IpProtocol: tcp - ToPort: 65535 - - ControlPlaneEgressToNodeSecurityGroupOn443: - Type: "AWS::EC2::SecurityGroupEgress" - DependsOn: NodeSecurityGroup - Properties: - Description: Allow the cluster control plane to communicate with pods running extension API servers on port 443 - DestinationSecurityGroupId: !Ref NodeSecurityGroup - FromPort: 443 - GroupId: !Ref ClusterControlPlaneSecurityGroup - IpProtocol: tcp - ToPort: 443 - - NodeSecurityGroupFromControlPlaneIngress: - Type: "AWS::EC2::SecurityGroupIngress" - DependsOn: NodeSecurityGroup - Properties: - Description: Allow worker Kubelets and pods to receive communication from the cluster control plane - FromPort: 1025 - GroupId: !Ref NodeSecurityGroup - IpProtocol: tcp - SourceSecurityGroupId: !Ref ClusterControlPlaneSecurityGroup - ToPort: 65535 - - NodeSecurityGroupFromControlPlaneOn443Ingress: - Type: "AWS::EC2::SecurityGroupIngress" - DependsOn: NodeSecurityGroup - Properties: - Description: Allow pods running extension API servers on port 443 to receive communication from cluster control plane - FromPort: 443 - GroupId: !Ref NodeSecurityGroup - IpProtocol: tcp - SourceSecurityGroupId: !Ref ClusterControlPlaneSecurityGroup - ToPort: 443 - - NodeInstanceRole: - Type: "AWS::IAM::Role" - Properties: - AssumeRolePolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: Allow - Principal: - Service: - - !FindInMap [PartitionMap, !Ref "AWS::Partition", EC2ServicePrincipal] - Action: - - "sts:AssumeRole" - ManagedPolicyArns: - - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy" - - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonEKS_CNI_Policy" - - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" - Path: / - - NodeInstanceProfile: - Type: "AWS::IAM::InstanceProfile" - Properties: - Path: / - Roles: - - !Ref NodeInstanceRole - -Outputs: - NodeSecurityGroup: - Description: The security group for the node group - Value: !Ref NodeSecurityGroup - - NodeInstanceProfileArn: - Description: The ARN of the node instance profile. - Value: !GetAtt NodeInstanceProfile.Arn - - NodeInstanceRoleArn: - Description: The ARN of the node instance role. - Value: !GetAtt NodeInstanceRole.Arn diff --git a/sigopt/orchestrate/cloudformation/eks-nodegroup.yaml b/sigopt/orchestrate/cloudformation/eks-nodegroup.yaml deleted file mode 100644 index b243aeab..00000000 --- a/sigopt/orchestrate/cloudformation/eks-nodegroup.yaml +++ /dev/null @@ -1,551 +0,0 @@ -# original https://raw.githubusercontent.com/awslabs/amazon-eks-ami/0ccb71878f92f808dcc80d38e9cf8a296bb72b33/amazon-eks-nodegroup.yaml - -AWSTemplateFormatVersion: "2010-09-09" - -Description: Amazon EKS - Node Group - -Metadata: - "AWS::CloudFormation::Interface": - ParameterGroups: - - Label: - default: EKS Cluster - Parameters: - - ClusterName - - Label: - default: Worker Node Configuration - Parameters: - - NodeGroupName - - NodeGroupType - - NodeSecurityGroup - - NodeAutoScalingGroupMinSize - - NodeAutoScalingGroupDesiredCapacity - - NodeAutoScalingGroupMaxSize - - NodeInstanceProfileArn - - NodeInstanceType - - NodeImageIdSSMParam - - NodeVolumeSize - - KeyName - - Label: - default: Worker Network Configuration - Parameters: - - VpcId - - Subnets - -Parameters: - ClusterName: - Type: String - Description: The cluster name provided when the cluster was created. If it is incorrect, nodes will not be able to join the cluster. - - KeyName: - Type: "AWS::EC2::KeyPair::KeyName" - Description: The EC2 Key Pair to allow SSH access to the instances - - NodeAutoScalingGroupDesiredCapacity: - Type: Number - Description: Desired capacity of Node Group ASG. - - NodeAutoScalingGroupMaxSize: - Type: Number - Description: Maximum size of Node Group ASG. Set to at least 1 greater than NodeAutoScalingGroupDesiredCapacity. - - NodeAutoScalingGroupMinSize: - Type: Number - Description: Minimum size of Node Group ASG. - - NodeGroupName: - Type: String - Description: Unique identifier for the Node Group. - - NodeGroupType: - Type: String - Description: The type of the node group, as defined by SigOpt. Ex. cpu, gpu, system - - NodeSecurityGroup: - Type: String - Description: ARN of the security group to apply to the nodes. - - NodeImageIdSSMParam: - Type: "AWS::SSM::Parameter::Value" - Description: AWS Systems Manager Parameter Store parameter of the AMI ID for the worker node instances. Change this value to match the version of Kubernetes you are using. - - NodeInstanceProfileArn: - Type: String - Description: The ARN of the node instance profile to use for the nodes. - - NodeInstanceType: - Type: String - AllowedValues: - - a1.2xlarge - - a1.4xlarge - - a1.large - - a1.medium - - a1.metal - - a1.xlarge - - c1.medium - - c1.xlarge - - c3.2xlarge - - c3.4xlarge - - c3.8xlarge - - c3.large - - c3.xlarge - - c4.2xlarge - - c4.4xlarge - - c4.8xlarge - - c4.large - - c4.xlarge - - c5.12xlarge - - c5.18xlarge - - c5.24xlarge - - c5.2xlarge - - c5.4xlarge - - c5.9xlarge - - c5.large - - c5.metal - - c5.xlarge - - c5a.12xlarge - - c5a.16xlarge - - c5a.24xlarge - - c5a.2xlarge - - c5a.4xlarge - - c5a.8xlarge - - c5a.large - - c5a.metal - - c5a.xlarge - - c5ad.12xlarge - - c5ad.16xlarge - - c5ad.24xlarge - - c5ad.2xlarge - - c5ad.4xlarge - - c5ad.8xlarge - - c5ad.large - - c5ad.metal - - c5ad.xlarge - - c5d.12xlarge - - c5d.18xlarge - - c5d.24xlarge - - c5d.2xlarge - - c5d.4xlarge - - c5d.9xlarge - - c5d.large - - c5d.metal - - c5d.xlarge - - c5n.18xlarge - - c5n.2xlarge - - c5n.4xlarge - - c5n.9xlarge - - c5n.large - - c5n.metal - - c5n.xlarge - - c6g.12xlarge - - c6g.16xlarge - - c6g.2xlarge - - c6g.4xlarge - - c6g.8xlarge - - c6g.large - - c6g.medium - - c6g.metal - - c6g.xlarge - - c6gd.12xlarge - - c6gd.16xlarge - - c6gd.2xlarge - - c6gd.4xlarge - - c6gd.8xlarge - - c6gd.large - - c6gd.medium - - c6gd.metal - - c6gd.xlarge - - c6gn.12xlarge - - c6gn.16xlarge - - c6gn.2xlarge - - c6gn.4xlarge - - c6gn.8xlarge - - c6gn.large - - c6gn.medium - - c6gn.xlarge - - cc2.8xlarge - - cr1.8xlarge - - d2.2xlarge - - d2.4xlarge - - d2.8xlarge - - d2.xlarge - - d3.2xlarge - - d3.4xlarge - - d3.8xlarge - - d3.xlarge - - d3en.12xlarge - - d3en.2xlarge - - d3en.4xlarge - - d3en.6xlarge - - d3en.8xlarge - - d3en.xlarge - - f1.16xlarge - - f1.2xlarge - - f1.4xlarge - - g2.2xlarge - - g2.8xlarge - - g3.16xlarge - - g3.4xlarge - - g3.8xlarge - - g3s.xlarge - - g4ad.16xlarge - - g4ad.4xlarge - - g4ad.8xlarge - - g4dn.12xlarge - - g4dn.16xlarge - - g4dn.2xlarge - - g4dn.4xlarge - - g4dn.8xlarge - - g4dn.metal - - g4dn.xlarge - - h1.16xlarge - - h1.2xlarge - - h1.4xlarge - - h1.8xlarge - - hs1.8xlarge - - i2.2xlarge - - i2.4xlarge - - i2.8xlarge - - i2.xlarge - - i3.16xlarge - - i3.2xlarge - - i3.4xlarge - - i3.8xlarge - - i3.large - - i3.metal - - i3.xlarge - - i3en.12xlarge - - i3en.24xlarge - - i3en.2xlarge - - i3en.3xlarge - - i3en.6xlarge - - i3en.large - - i3en.metal - - i3en.xlarge - - inf1.24xlarge - - inf1.2xlarge - - inf1.6xlarge - - inf1.xlarge - - m1.large - - m1.medium - - m1.small - - m1.xlarge - - m2.2xlarge - - m2.4xlarge - - m2.xlarge - - m3.2xlarge - - m3.large - - m3.medium - - m3.xlarge - - m4.10xlarge - - m4.16xlarge - - m4.2xlarge - - m4.4xlarge - - m4.large - - m4.xlarge - - m5.12xlarge - - m5.16xlarge - - m5.24xlarge - - m5.2xlarge - - m5.4xlarge - - m5.8xlarge - - m5.large - - m5.metal - - m5.xlarge - - m5a.12xlarge - - m5a.16xlarge - - m5a.24xlarge - - m5a.2xlarge - - m5a.4xlarge - - m5a.8xlarge - - m5a.large - - m5a.xlarge - - m5ad.12xlarge - - m5ad.16xlarge - - m5ad.24xlarge - - m5ad.2xlarge - - m5ad.4xlarge - - m5ad.8xlarge - - m5ad.large - - m5ad.xlarge - - m5d.12xlarge - - m5d.16xlarge - - m5d.24xlarge - - m5d.2xlarge - - m5d.4xlarge - - m5d.8xlarge - - m5d.large - - m5d.metal - - m5d.xlarge - - m5dn.12xlarge - - m5dn.16xlarge - - m5dn.24xlarge - - m5dn.2xlarge - - m5dn.4xlarge - - m5dn.8xlarge - - m5dn.large - - m5dn.xlarge - - m5n.12xlarge - - m5n.16xlarge - - m5n.24xlarge - - m5n.2xlarge - - m5n.4xlarge - - m5n.8xlarge - - m5n.large - - m5n.xlarge - - m5zn.12xlarge - - m5zn.2xlarge - - m5zn.3xlarge - - m5zn.6xlarge - - m5zn.large - - m5zn.metal - - m5zn.xlarge - - m6g.12xlarge - - m6g.16xlarge - - m6g.2xlarge - - m6g.4xlarge - - m6g.8xlarge - - m6g.large - - m6g.medium - - m6g.metal - - m6g.xlarge - - m6gd.12xlarge - - m6gd.16xlarge - - m6gd.2xlarge - - m6gd.4xlarge - - m6gd.8xlarge - - m6gd.large - - m6gd.medium - - m6gd.metal - - m6gd.xlarge - - mac1.metal - - p2.16xlarge - - p2.8xlarge - - p2.xlarge - - p3.16xlarge - - p3.2xlarge - - p3.8xlarge - - p3dn.24xlarge - - p4d.24xlarge - - r3.2xlarge - - r3.4xlarge - - r3.8xlarge - - r3.large - - r3.xlarge - - r4.16xlarge - - r4.2xlarge - - r4.4xlarge - - r4.8xlarge - - r4.large - - r4.xlarge - - r5.12xlarge - - r5.16xlarge - - r5.24xlarge - - r5.2xlarge - - r5.4xlarge - - r5.8xlarge - - r5.large - - r5.metal - - r5.xlarge - - r5a.12xlarge - - r5a.16xlarge - - r5a.24xlarge - - r5a.2xlarge - - r5a.4xlarge - - r5a.8xlarge - - r5a.large - - r5a.xlarge - - r5ad.12xlarge - - r5ad.16xlarge - - r5ad.24xlarge - - r5ad.2xlarge - - r5ad.4xlarge - - r5ad.8xlarge - - r5ad.large - - r5ad.xlarge - - r5b.12xlarge - - r5b.16xlarge - - r5b.24xlarge - - r5b.2xlarge - - r5b.4xlarge - - r5b.8xlarge - - r5b.large - - r5b.metal - - r5b.xlarge - - r5d.12xlarge - - r5d.16xlarge - - r5d.24xlarge - - r5d.2xlarge - - r5d.4xlarge - - r5d.8xlarge - - r5d.large - - r5d.metal - - r5d.xlarge - - r5dn.12xlarge - - r5dn.16xlarge - - r5dn.24xlarge - - r5dn.2xlarge - - r5dn.4xlarge - - r5dn.8xlarge - - r5dn.large - - r5dn.xlarge - - r5n.12xlarge - - r5n.16xlarge - - r5n.24xlarge - - r5n.2xlarge - - r5n.4xlarge - - r5n.8xlarge - - r5n.large - - r5n.xlarge - - r6g.12xlarge - - r6g.16xlarge - - r6g.2xlarge - - r6g.4xlarge - - r6g.8xlarge - - r6g.large - - r6g.medium - - r6g.metal - - r6g.xlarge - - r6gd.12xlarge - - r6gd.16xlarge - - r6gd.2xlarge - - r6gd.4xlarge - - r6gd.8xlarge - - r6gd.large - - r6gd.medium - - r6gd.metal - - r6gd.xlarge - - t1.micro - - t2.2xlarge - - t2.large - - t2.medium - - t2.micro - - t2.nano - - t2.small - - t2.xlarge - - t3.2xlarge - - t3.large - - t3.medium - - t3.micro - - t3.nano - - t3.small - - t3.xlarge - - t3a.2xlarge - - t3a.large - - t3a.medium - - t3a.micro - - t3a.nano - - t3a.small - - t3a.xlarge - - t4g.2xlarge - - t4g.large - - t4g.medium - - t4g.micro - - t4g.nano - - t4g.small - - t4g.xlarge - - u-12tb1.metal - - u-18tb1.metal - - u-24tb1.metal - - u-6tb1.metal - - u-9tb1.metal - - x1.16xlarge - - x1.32xlarge - - x1e.16xlarge - - x1e.2xlarge - - x1e.32xlarge - - x1e.4xlarge - - x1e.8xlarge - - x1e.xlarge - - z1d.12xlarge - - z1d.2xlarge - - z1d.3xlarge - - z1d.6xlarge - - z1d.large - - z1d.metal - - z1d.xlarge - ConstraintDescription: Must be a valid EC2 instance type - Description: EC2 instance type for the node instances - - NodeVolumeSize: - Type: Number - Description: Node volume size - - Subnets: - Type: "List" - Description: The subnets where workers can be created. - - VpcId: - Type: "AWS::EC2::VPC::Id" - Description: The VPC of the worker instances - -Resources: - NodeLaunchTemplate: - Type: "AWS::EC2::LaunchTemplate" - Properties: - LaunchTemplateData: - BlockDeviceMappings: - - DeviceName: /dev/xvda - Ebs: - DeleteOnTermination: true - VolumeSize: !Ref NodeVolumeSize - VolumeType: gp2 - IamInstanceProfile: - Arn: !Ref NodeInstanceProfileArn - ImageId: !Ref NodeImageIdSSMParam - InstanceType: !Ref NodeInstanceType - KeyName: !Ref KeyName - SecurityGroupIds: - - !Ref NodeSecurityGroup - UserData: !Base64 - "Fn::Sub": | - #!/bin/bash - set -o xtrace - /etc/eks/bootstrap.sh ${ClusterName} --kubelet-extra-args '--node-labels=orchestrate.sigopt.com/node-group-type=${NodeGroupType}' - /opt/aws/bin/cfn-signal --exit-code $? \ - --stack ${AWS::StackName} \ - --resource NodeGroup \ - --region ${AWS::Region} - MetadataOptions: - HttpPutResponseHopLimit : 2 - HttpEndpoint: enabled - HttpTokens: optional - - NodeGroup: - Type: "AWS::AutoScaling::AutoScalingGroup" - Properties: - DesiredCapacity: !Ref NodeAutoScalingGroupDesiredCapacity - LaunchTemplate: - LaunchTemplateId: !Ref NodeLaunchTemplate - Version: !GetAtt NodeLaunchTemplate.LatestVersionNumber - MaxSize: !Ref NodeAutoScalingGroupMaxSize - MinSize: !Ref NodeAutoScalingGroupMinSize - Tags: - - Key: Name - PropagateAtLaunch: true - Value: !Sub ${ClusterName}-${NodeGroupName}-Node - - Key: !Sub kubernetes.io/cluster/${ClusterName} - PropagateAtLaunch: true - Value: owned - - Key: !Sub k8s.io/cluster-autoscaler/${ClusterName} - PropagateAtLaunch: true - Value: owned - - Key: k8s.io/cluster-autoscaler/enabled - PropagateAtLaunch: true - Value: "true" - - Key: k8s.io/cluster-autoscaler/node-template/resources/ephemeral-storage - PropagateAtLaunch: false - Value: !Sub ${NodeVolumeSize}Gi - - Key: k8s.io/cluster-autoscaler/node-template/label/orchestrate.sigopt.com/node-group-type - PropagateAtLaunch: false - Value: !Ref NodeGroupType - VPCZoneIdentifier: !Ref Subnets - UpdatePolicy: - AutoScalingRollingUpdate: - MaxBatchSize: 1 - MinInstancesInService: !Ref NodeAutoScalingGroupDesiredCapacity - PauseTime: PT5M - -Outputs: - NodeAutoScalingGroup: - Description: The autoscaling group - Value: !Ref NodeGroup diff --git a/sigopt/orchestrate/cloudformation/eks-vpc.yaml b/sigopt/orchestrate/cloudformation/eks-vpc.yaml deleted file mode 100644 index 7e5e813f..00000000 --- a/sigopt/orchestrate/cloudformation/eks-vpc.yaml +++ /dev/null @@ -1,276 +0,0 @@ -# original https://s3.us-west-2.amazonaws.com/amazon-eks/cloudformation/2020-10-29/amazon-eks-vpc-private-subnets.yaml ---- -AWSTemplateFormatVersion: '2010-09-09' -Description: 'Amazon EKS Sample VPC - Private and Public subnets' - -Parameters: - - VPCBlock: - Type: String - Description: The CIDR range for the VPC. This should be a valid private (RFC 1918) CIDR range. - - AZ01: - Type: String - Description: The first availability zone to use. - - AZ02: - Type: String - Description: The second availability zone to use. - - PublicSubnet01Block: - Type: String - Description: CidrBlock for public subnet 01 within the VPC - - PublicSubnet02Block: - Type: String - Description: CidrBlock for public subnet 02 within the VPC - - PrivateSubnet01Block: - Type: String - Description: CidrBlock for private subnet 01 within the VPC - - PrivateSubnet02Block: - Type: String - Description: CidrBlock for private subnet 02 within the VPC - -Metadata: - AWS::CloudFormation::Interface: - ParameterGroups: - - - Label: - default: "Worker Network Configuration" - Parameters: - - VPCBlock - - PublicSubnet01Block - - PublicSubnet02Block - - PrivateSubnet01Block - - PrivateSubnet02Block - -Resources: - VPC: - Type: AWS::EC2::VPC - Properties: - CidrBlock: !Ref VPCBlock - EnableDnsSupport: true - EnableDnsHostnames: true - Tags: - - Key: Name - Value: !Sub '${AWS::StackName}-VPC' - - InternetGateway: - Type: "AWS::EC2::InternetGateway" - - VPCGatewayAttachment: - Type: "AWS::EC2::VPCGatewayAttachment" - Properties: - InternetGatewayId: !Ref InternetGateway - VpcId: !Ref VPC - - PublicRouteTable: - Type: AWS::EC2::RouteTable - Properties: - VpcId: !Ref VPC - Tags: - - Key: Name - Value: Public Subnets - - Key: Network - Value: Public - - PrivateRouteTable01: - Type: AWS::EC2::RouteTable - Properties: - VpcId: !Ref VPC - Tags: - - Key: Name - Value: Private Subnet AZ1 - - Key: Network - Value: Private01 - - PrivateRouteTable02: - Type: AWS::EC2::RouteTable - Properties: - VpcId: !Ref VPC - Tags: - - Key: Name - Value: Private Subnet AZ2 - - Key: Network - Value: Private02 - - PublicRoute: - DependsOn: VPCGatewayAttachment - Type: AWS::EC2::Route - Properties: - RouteTableId: !Ref PublicRouteTable - DestinationCidrBlock: 0.0.0.0/0 - GatewayId: !Ref InternetGateway - - PrivateRoute01: - DependsOn: - - VPCGatewayAttachment - - NatGateway01 - Type: AWS::EC2::Route - Properties: - RouteTableId: !Ref PrivateRouteTable01 - DestinationCidrBlock: 0.0.0.0/0 - NatGatewayId: !Ref NatGateway01 - - PrivateRoute02: - DependsOn: - - VPCGatewayAttachment - - NatGateway02 - Type: AWS::EC2::Route - Properties: - RouteTableId: !Ref PrivateRouteTable02 - DestinationCidrBlock: 0.0.0.0/0 - NatGatewayId: !Ref NatGateway02 - - NatGateway01: - DependsOn: - - NatGatewayEIP1 - - PublicSubnet01 - - VPCGatewayAttachment - Type: AWS::EC2::NatGateway - Properties: - AllocationId: !GetAtt 'NatGatewayEIP1.AllocationId' - SubnetId: !Ref PublicSubnet01 - Tags: - - Key: Name - Value: !Sub '${AWS::StackName}-NatGatewayAZ1' - - NatGateway02: - DependsOn: - - NatGatewayEIP2 - - PublicSubnet02 - - VPCGatewayAttachment - Type: AWS::EC2::NatGateway - Properties: - AllocationId: !GetAtt 'NatGatewayEIP2.AllocationId' - SubnetId: !Ref PublicSubnet02 - Tags: - - Key: Name - Value: !Sub '${AWS::StackName}-NatGatewayAZ2' - - NatGatewayEIP1: - DependsOn: - - VPCGatewayAttachment - Type: 'AWS::EC2::EIP' - Properties: - Domain: vpc - - NatGatewayEIP2: - DependsOn: - - VPCGatewayAttachment - Type: 'AWS::EC2::EIP' - Properties: - Domain: vpc - - PublicSubnet01: - Type: AWS::EC2::Subnet - Metadata: - Comment: Subnet 01 - Properties: - MapPublicIpOnLaunch: true - AvailabilityZone: !Ref AZ01 - CidrBlock: - Ref: PublicSubnet01Block - VpcId: - Ref: VPC - Tags: - - Key: Name - Value: !Sub "${AWS::StackName}-PublicSubnet01" - - Key: kubernetes.io/role/elb - Value: 1 - - PublicSubnet02: - Type: AWS::EC2::Subnet - Metadata: - Comment: Subnet 02 - Properties: - MapPublicIpOnLaunch: true - AvailabilityZone: !Ref AZ02 - CidrBlock: - Ref: PublicSubnet02Block - VpcId: - Ref: VPC - Tags: - - Key: Name - Value: !Sub "${AWS::StackName}-PublicSubnet02" - - Key: kubernetes.io/role/elb - Value: 1 - - PrivateSubnet01: - Type: AWS::EC2::Subnet - Metadata: - Comment: Subnet 03 - Properties: - AvailabilityZone: !Ref AZ01 - CidrBlock: - Ref: PrivateSubnet01Block - VpcId: - Ref: VPC - Tags: - - Key: Name - Value: !Sub "${AWS::StackName}-PrivateSubnet01" - - Key: kubernetes.io/role/internal-elb - Value: 1 - - PrivateSubnet02: - Type: AWS::EC2::Subnet - Metadata: - Comment: Private Subnet 02 - Properties: - AvailabilityZone: !Ref AZ02 - CidrBlock: - Ref: PrivateSubnet02Block - VpcId: - Ref: VPC - Tags: - - Key: Name - Value: !Sub "${AWS::StackName}-PrivateSubnet02" - - Key: kubernetes.io/role/internal-elb - Value: 1 - - PublicSubnet01RouteTableAssociation: - Type: AWS::EC2::SubnetRouteTableAssociation - Properties: - SubnetId: !Ref PublicSubnet01 - RouteTableId: !Ref PublicRouteTable - - PublicSubnet02RouteTableAssociation: - Type: AWS::EC2::SubnetRouteTableAssociation - Properties: - SubnetId: !Ref PublicSubnet02 - RouteTableId: !Ref PublicRouteTable - - PrivateSubnet01RouteTableAssociation: - Type: AWS::EC2::SubnetRouteTableAssociation - Properties: - SubnetId: !Ref PrivateSubnet01 - RouteTableId: !Ref PrivateRouteTable01 - - PrivateSubnet02RouteTableAssociation: - Type: AWS::EC2::SubnetRouteTableAssociation - Properties: - SubnetId: !Ref PrivateSubnet02 - RouteTableId: !Ref PrivateRouteTable02 - -Outputs: - AZ1PublicSubnet: - Description: Id of the public subnet in the first availability zone. - Value: !Ref PublicSubnet01 - - AZ2PublicSubnet: - Description: Id of the public subnet in the second availability zone. - Value: !Ref PublicSubnet02 - - AZ1PrivateSubnet: - Description: Id of the private subnet in the first availability zone. - Value: !Ref PrivateSubnet01 - - AZ2PrivateSubnet: - Description: Id of the private subnet in the second availability zone. - Value: !Ref PrivateSubnet02 - - VpcId: - Description: The VPC Id - Value: !Ref VPC diff --git a/sigopt/orchestrate/cloudformation/service.py b/sigopt/orchestrate/cloudformation/service.py deleted file mode 100644 index fe6323de..00000000 --- a/sigopt/orchestrate/cloudformation/service.py +++ /dev/null @@ -1,492 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import collections -import datetime -import socket -import struct -import time - -import backoff -import boto3 -import botocore - -from ..exceptions import OrchestrateException -from ..services.aws_base import AwsService - - -DEFAULT_SYSTEM_NODE_GROUP_MIN_NODES = 1 -DEFAULT_SYSTEM_NODE_GROUP_MAX_NODES = 2 -DEFAULT_SYSTEM_NODE_GROUP_INSTANCE_TYPE = "t3.large" - -_call_boto_with_backoff = backoff.on_exception( - backoff.expo, - botocore.exceptions.ClientError, - giveup=lambda ce: ce.response["Error"]["Code"] != "Throttling", -) - - -class FailedEksStackCreationError(OrchestrateException): - def __init__(self, stack_name, stack_events): - super().__init__(f"Failed to create EKS stack: {stack_name}") - self.stack_name = stack_name - self.stack_events = stack_events - - -class StackDeletedException(OrchestrateException): - pass - - -# NOTE: Anatomy of VPC addresses -# AWS doesn't support masks smaller than /16 -# ipv4: 192 .168 .0 .0 -# bits: 11000000.10101000.00000000.000000000 -# desc: aaaaaaaa.aaaaaaaa.bbcdddee.eeeeeeeee -# a: mask -# b: future use -# c: public/private -# d: availability zone number -# e: addresses available within subnet (2^10 - 1 = 1023 addresses in each subnet) - -IP_AZ_ALLOCATED_BITS = 3 -IP_PRIVATE_PUBLIC_BITS = 1 -IP_REMAINING_BITS = 10 -IP_MASK_BITS = 16 -VPC_HOST_IP = "192.168.0.0" -VPC_BLOCK = f"{VPC_HOST_IP}/{IP_MASK_BITS}" - - -class AwsCloudFormationService(AwsService): - def __init__(self, services, aws_services, **kwargs): - super().__init__(services, aws_services) - self._client = boto3.client("cloudformation", **kwargs) - self._cloudformation = boto3.resource("cloudformation", **kwargs) - self.ec2 = boto3.client("ec2", **kwargs) - - @property - def client(self): - return self._client - - @property - def cloudformation(self): - return self._cloudformation - - def eks_cluster_autoscaler_role_stack_name(self, cluster_name): - return f"{cluster_name}-eks-cluster-autoscaler-role" - - def create_eks_cluster_autoscaler_role_stack(self, cluster_name, cluster_oidc_provider_url): - sg_template = self.services.resource_service.read( - "cloudformation", - "cluster-autoscaler-role.yaml", - ).decode("utf-8") - - return self.cloudformation.create_stack( - StackName=self.eks_cluster_autoscaler_role_stack_name(cluster_name), - TemplateBody=sg_template, - Parameters=[ - dict( - ParameterKey=k, - ParameterValue=v, - ) - for (k, v) in [ - ("ClusterName", cluster_name), - ("ClusterOIDCProviderURL", cluster_oidc_provider_url), - ] - ], - Capabilities=[ - "CAPABILITY_IAM", - ], - ) - - def delete_eks_cluster_autoscaler_role_stack(self, cluster_name): - self.describe_eks_cluster_autoscaler_role_stack(cluster_name).delete() - - def describe_eks_cluster_autoscaler_role_stack(self, cluster_name): - return self.cloudformation.Stack(self.eks_cluster_autoscaler_role_stack_name(cluster_name)) - - def ensure_eks_cluster_autoscaler_role_stack(self, cluster_name, *args, **kwargs): - try: - self.create_eks_cluster_autoscaler_role_stack(cluster_name, *args, **kwargs) - except self.client.exceptions.AlreadyExistsException: - pass - - return self.describe_eks_cluster_autoscaler_role_stack(cluster_name) - - def ensure_eks_cluster_autoscaler_role_stack_deleted(self, cluster_name, event_handler=None): - self._ensure_stack_deleted( - self.eks_cluster_autoscaler_role_stack_name(cluster_name), - event_handler=event_handler, - ) - - def eks_cluster_stack_name(self, cluster_name): - return f"{cluster_name}-stack" - - def upload_stack_template(self, template_name): - return self.aws_services.s3_service.upload_resource_by_hash( - path_prefix="stack_templates", - package="cloudformation", - resource_name=template_name, - ) - - def _page_boto(self, func, params, results_key): - next_token = None - while True: - params_ = params.copy() - if next_token: - params_["NextToken"] = next_token - result = _call_boto_with_backoff(func)(**params) - yield from result[results_key] - next_token = result.get("NextToken") - if not next_token: - return - - def get_compatible_availability_zones_for_instance_types(self, instance_types, az_count, prev_azs=None): - supported_azs = set.intersection( - *( - set( - r["Location"] - for r in self._page_boto( - self.ec2.describe_instance_type_offerings, - { - "LocationType": "availability-zone", - "Filters": [ - {"Name": "instance-type", "Values": [it]}, - ], - }, - "InstanceTypeOfferings", - ) - ) - for it in instance_types - ) - ) - assert len(supported_azs) >= az_count, ( - "Not able to find enough supported availability zones for all of the" - f" provided instance types: instance types: {instance_types}, required zone" - f" count: {az_count}, supported zones: {supported_azs}" - ) - if prev_azs: - if not all(az in supported_azs for az in prev_azs): - raise ValueError("The supported availability zones are not compatible with the previous availability zones") - return prev_azs - return sorted(supported_azs)[:az_count] - - def get_cidr_block(self, public, az): - # NOTE: ">" = big endian (most significant bit first), "I" = unsigned integer - network_i = struct.unpack(">I", socket.inet_aton(VPC_HOST_IP))[0] - if not public: - network_i |= 1 << (IP_REMAINING_BITS + IP_AZ_ALLOCATED_BITS) - zone_number = ord(az[-1]) - ord("a") - assert 0 <= zone_number <= (1 << IP_AZ_ALLOCATED_BITS) - network_i |= zone_number << IP_REMAINING_BITS - network = socket.inet_ntoa(struct.pack(">I", network_i)) - return f"{network}/{32 - IP_REMAINING_BITS}" - - def get_kwargs_for_cluster_stack( - self, - cluster_name, - kubernetes_version, - key_name, - system_node_config, - cpu_node_config, - gpu_node_config, - stack=None, - ): - system_max_nodes = system_node_config.get("max_nodes", DEFAULT_SYSTEM_NODE_GROUP_MAX_NODES) - system_instance_type = system_node_config.get("instance_type", DEFAULT_SYSTEM_NODE_GROUP_INSTANCE_TYPE) - cpu_min_nodes = cpu_node_config.get("min_nodes", 0) - cpu_max_nodes = cpu_node_config.get("max_nodes", 0) - cpu_instance_type = cpu_node_config.get("instance_type", "") - cpu_node_volume_size = cpu_node_config.get("node_volume_size") - gpu_min_nodes = gpu_node_config.get("min_nodes", 0) - gpu_max_nodes = gpu_node_config.get("max_nodes", 0) - gpu_instance_type = gpu_node_config.get("instance_type", "") - gpu_node_volume_size = gpu_node_config.get("node_volume_size") - parameters = dict( - UserArn=self.aws_services.iam_service.get_user_arn(), - ClusterName=cluster_name, - KubernetesVersion=kubernetes_version, - SystemNodeAutoScalingGroupMaxSize=str(system_max_nodes), - SystemNodeInstanceType=system_instance_type, - CPUNodeAutoScalingGroupMinSize=str(cpu_min_nodes), - CPUNodeAutoScalingGroupDesiredCapacity=str(cpu_min_nodes), - CPUNodeAutoScalingGroupMaxSize=str(cpu_max_nodes), - CPUNodeInstanceType=cpu_instance_type, - GPUNodeAutoScalingGroupMinSize=str(gpu_min_nodes), - GPUNodeAutoScalingGroupDesiredCapacity=str(gpu_min_nodes), - GPUNodeAutoScalingGroupMaxSize=str(gpu_max_nodes), - GPUNodeInstanceType=gpu_instance_type, - SSHKeyName=key_name, # TODO: generate for user - ) - for volume_size, param in [ - (cpu_node_volume_size, "CPUNodeVolumeSize"), - (gpu_node_volume_size, "GPUNodeVolumeSize"), - ]: - if volume_size: - parameters[param] = str(volume_size) - - instance_types = [ - instance_type - for max_nodes, instance_type in [ - (system_max_nodes, system_instance_type), - (cpu_max_nodes, cpu_instance_type), - (gpu_max_nodes, gpu_instance_type), - ] - if max_nodes > 0 - ] - prev_azs = None - if stack: - prev_parameters = {p["ParameterKey"]: p["ParameterValue"] for p in stack.parameters} - # NOTE: Changing availability zones is extremely complicated, maybe even impossible without creating a new - # cluster. This is because the EKS cluster is created with specific subnets that can't be modified. - prev_azs = (prev_parameters["AZ01"], prev_parameters["AZ02"]) - - try: - az1, az2 = self.get_compatible_availability_zones_for_instance_types(instance_types, 2, prev_azs) - except ValueError as ve: - raise Exception( - "The requested update cannot be done in-place. Please destroy your" - " existing cluster and make a new one if you would like to proceed." - ) from ve - - for param, public, az in [ - ("PublicSubnet01Block", True, az1), - ("PublicSubnet02Block", True, az2), - ("PrivateSubnet01Block", False, az1), - ("PrivateSubnet02Block", False, az2), - ]: - parameters[param] = self.get_cidr_block(public, az) - - parameters["AZ01"] = az1 - parameters["AZ02"] = az2 - - parameters["VPCBlock"] = VPC_BLOCK - - eks_cluster_stack_template_url = self.upload_stack_template("eks-cluster.yaml") - for parameter_name, template_name in [ - ("NodeGroupStackTemplateURL", "eks-nodegroup.yaml"), - ("NodeSecurityStackTemplateURL", "eks-node-security.yaml"), - ("VPCStackTemplateURL", "eks-vpc.yaml"), - ]: - parameters[parameter_name] = self.upload_stack_template(template_name) - - return dict( - StackName=self.eks_cluster_stack_name(cluster_name), - TemplateURL=eks_cluster_stack_template_url, - Parameters=[ - dict( - ParameterKey=k, - ParameterValue=v, - ) - for (k, v) in parameters.items() - ], - Capabilities=[ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - ], - ) - - def create_eks_cluster_stack( - self, - cluster_name, - system_node_config, - cpu_node_config, - gpu_node_config, - key_name, - kubernetes_version, - ): - kwargs = self.get_kwargs_for_cluster_stack( - cluster_name=cluster_name, - kubernetes_version=kubernetes_version, - key_name=key_name, - system_node_config=system_node_config, - cpu_node_config=cpu_node_config, - gpu_node_config=gpu_node_config, - ) - return self.cloudformation.create_stack(**kwargs) - - def update_eks_cluster_stack( - self, - cluster_name, - system_node_config, - cpu_node_config, - gpu_node_config, - key_name, - kubernetes_version, - event_handler=None, - ): - try: - stack = self.cloudformation.Stack(self.eks_cluster_stack_name(cluster_name)) - except botocore.exceptions.ClientError as ce: - if ce.response["Error"]["Code"] == "ValidationError": - raise Exception(f"The stack for cluster {cluster_name} does not exist") from ce - raise - last_event_before_update = self.client.describe_stack_events(StackName=stack.stack_id)["StackEvents"][0] - kwargs = self.get_kwargs_for_cluster_stack( - cluster_name=cluster_name, - kubernetes_version=kubernetes_version, - key_name=key_name, - system_node_config=system_node_config, - cpu_node_config=cpu_node_config, - gpu_node_config=gpu_node_config, - stack=stack, - ) - self.client.update_stack(**kwargs) - stack.reload() - self.wait_for_stack_update_complete( - stack.stack_id, - event_handler=event_handler, - after=last_event_before_update["Timestamp"], - ) - stack.reload() - return stack - - def delete_eks_cluster_stack(self, cluster_name): - self.describe_eks_cluster_stack(cluster_name).delete() - - def describe_eks_cluster_stack(self, cluster_name): - return self.cloudformation.Stack(self.eks_cluster_stack_name(cluster_name)) - - def ensure_eks_cluster_stack( - self, - cluster_name, - **kwargs, - ): - try: - stack = self.create_eks_cluster_stack( - cluster_name=cluster_name, - **kwargs, - ) - except self.client.exceptions.AlreadyExistsException: - stack = self.describe_eks_cluster_stack( - cluster_name=cluster_name, - ) - - stack.reload() - return stack - - def get_stack_status(self, stack_name_or_id): - stack = self.cloudformation.Stack(stack_name_or_id) - return _call_boto_with_backoff(lambda: stack.stack_status)() - - # NOTE: if the stack was deleted then describe_stack_events raises a Throttling error. - # We need to make sure a new error gets raised to indicate that the resource does not exist anymore. - def _describe_stack_events_page(self, StackName, **kwargs): - try: - return self.client.describe_stack_events(StackName=StackName, **kwargs) - except botocore.exceptions.ClientError as ce: - try: - stack_status = self.get_stack_status(StackName) - if stack_status == "DELETE_COMPLETE": - raise StackDeletedException("Stack deleted") from ce - raise - except botocore.exceptions.ClientError as e: - raise ce from e - - def describe_stack_events(self, stack_name): - params = {"StackName": stack_name} - return self._page_boto(self._describe_stack_events_page, params, "StackEvents") - - def watch_stack_events(self, stack_name, event_handler, after=None, failures=None): - stack_id = self.cloudformation.Stack(stack_name).stack_id - event_counts = collections.defaultdict(int) - all_stacks = [stack_id] - initial_sleep_time = 1 - sleep_time = initial_sleep_time - backoff_base = 1.1 - if failures is None: - failures = [] - while all_stacks: - for stack in all_stacks: - stack_status = self.get_stack_status(stack) - try: - current_events = list(self.describe_stack_events(stack)) - except StackDeletedException: - all_stacks.remove(stack) - continue - slice_end = len(current_events) - event_counts[stack] - new_events = current_events[:slice_end][::-1] - if new_events: - sleep_time = initial_sleep_time - event_counts[stack] = len(current_events) - if event_handler: - for event in new_events: - if not after or event["Timestamp"] > after: - event_handler(stack, event) - for event in new_events: - event_status = event["ResourceStatus"] - if event_status.endswith("_FAILED"): - failures.append(event) - if event["ResourceType"] == "AWS::CloudFormation::Stack": - resource_id = event.get("PhysicalResourceId") - if not resource_id: - continue - if event_status.endswith("_IN_PROGRESS") and resource_id not in all_stacks: - all_stacks.append(resource_id) - if stack_status.endswith("_COMPLETE") or stack_status.endswith("_FAILED"): - try: - all_stacks.remove(stack) - except ValueError: - pass - if all_stacks: - time.sleep(sleep_time) - sleep_time *= backoff_base - return failures - - def _wait_for_stack_change_complete(self, stack_name, expected_status, event_handler=None, after=None): - def maybe_raise_failures(failures, initial_exc): - if failures: - failures_str = "\n".join( - f"{e['ResourceStatus']} {e['ResourceType']} {e['LogicalResourceId']}: {e['ResourceStatusReason']}" - for e in failures - ) - exc = Exception(f"Encountered failures while watching stack: {stack_name}\n{failures_str}") - if initial_exc: - raise exc from initial_exc - raise exc - if initial_exc: - raise initial_exc - - failures = [] - try: - self.watch_stack_events(stack_name, event_handler, after=after, failures=failures) - except KeyboardInterrupt as ke: - # NOTE: surface any creation errors with a keyboard interrupt, - # since the user might have interrupted the command after observing resources being deleted - maybe_raise_failures(failures, ke) - maybe_raise_failures(failures, None) - - stack = self.cloudformation.Stack(stack_name) - assert stack.stack_status == expected_status, f"Expected {expected_status}, got {stack.stack_status}" - - def wait_for_stack_create_complete(self, stack_name, **kwargs): - return self._wait_for_stack_change_complete(stack_name, "CREATE_COMPLETE", **kwargs) - - def wait_for_stack_update_complete(self, stack_name, **kwargs): - return self._wait_for_stack_change_complete(stack_name, "UPDATE_COMPLETE", **kwargs) - - def ensure_eks_cluster_stack_deleted(self, cluster_name, event_handler=None): - self._ensure_stack_deleted(self.eks_cluster_stack_name(cluster_name), event_handler=event_handler) - - def _ensure_stack_deleted(self, stack_name, event_handler=None): - stack = self.cloudformation.Stack(stack_name) - try: - stack.reload() - except botocore.exceptions.ClientError as ce: - if ce.response["Error"]["Code"] == "ValidationError": - return - raise - - after = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) - stack.delete() - self.watch_stack_events(stack.stack_id, event_handler, after=after) - self.client.get_waiter("stack_delete_complete").wait(StackName=stack.stack_id) - - def get_stack_output(self, stack, output_key): - outputs = stack.outputs - if outputs is None: - return None - return next(o["OutputValue"] for o in outputs if o["OutputKey"] == output_key) - - def get_node_instance_role_arn(self, cluster_name): - cluster_stack = self.describe_eks_cluster_stack(cluster_name) - return self.get_stack_output(cluster_stack, "NodeInstanceRoleArn") diff --git a/sigopt/orchestrate/cluster/__init__.py b/sigopt/orchestrate/cluster/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/cluster/context.py b/sigopt/orchestrate/cluster/context.py deleted file mode 100644 index 40639fc6..00000000 --- a/sigopt/orchestrate/cluster/context.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from .errors import NotConnectedError - - -class DisconnectOnException(object): - def __init__(self, cluster_name, services): - self._cluster_name = cluster_name - self._services = services - - def __enter__(self): - pass - - def __exit__(self, t, exc, tb): - if exc is not None: - try: - self._services.cluster_service.disconnect(cluster_name=self._cluster_name, disconnect_all=False) - except NotConnectedError: - pass - return False - return None diff --git a/sigopt/orchestrate/cluster/errors.py b/sigopt/orchestrate/cluster/errors.py deleted file mode 100644 index 8863e272..00000000 --- a/sigopt/orchestrate/cluster/errors.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..exceptions import OrchestrateException -from ..version import CLI_NAME - - -class ClusterError(OrchestrateException): - pass - - -class MultipleClustersConnectionError(ClusterError): - def __init__(self, connected_clusters): - safe_str = "\n\t".join(connected_clusters) - super().__init__( - "You are currently connected to more than one cluster, all of which are" - " listed below.\nPlease disconnect from some of these clusters before" - f" re-running your command.\nConnected clusters::\n\t{safe_str}" - ) - self.connected_clusters = connected_clusters - - -class PleaseDisconnectError(ClusterError): - def __init__(self, current_cluster_name): - super().__init__(f"Please disconnect from this cluster before re-running your command: {current_cluster_name}") - self.current_cluster_name = current_cluster_name - - -class NotConnectedError(ClusterError): - def __init__(self): - super().__init__("You are not currently connected to any cluster") - - -class AlreadyConnectedException(ClusterError): - def __init__(self, current_cluster_name): - super().__init__( - ( - f"You are already connected this cluster: {current_cluster_name}." - f" Please run `{CLI_NAME} cluster test` to verify the details of your" - " connection." - ), - ) - self.current_cluster_name = current_cluster_name diff --git a/sigopt/orchestrate/cluster/object.py b/sigopt/orchestrate/cluster/object.py deleted file mode 100644 index a3ab9fb7..00000000 --- a/sigopt/orchestrate/cluster/object.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import base64 - -from ..docker.service import DockerLoginCredentials -from ..provider.constants import Provider, provider_to_string - - -class Cluster(object): - def __init__(self, services, name, registry): - self.services = services - self._name = name - self._registry = registry - self._provider_service = None - - @property - def name(self): - return self._name - - @property - def provider(self): - raise NotImplementedError() - - @property - def provider_string(self): - return provider_to_string(self.provider) - - @property - def provider_service(self): - if self._provider_service is None: - self._provider_service = self.services.provider_broker.get_provider_service(self.provider) - return self._provider_service - - @property - def registry(self): - return self._registry - - def get_registry_login_credentials(self, repository): - raise NotImplementedError() - - def generate_image_tag(self, repository): - raise NotImplementedError() - - -class AWSCluster(Cluster): - @property - def provider(self): - return Provider.AWS - - def get_registry_login_credentials(self, repository): - ecr_service = self.provider_service.aws_services.ecr_service - registry_id = ecr_service.ensure_repositories([repository])["repositories"][0]["registryId"] - authorization_data = ecr_service.get_authorization_token([registry_id])["authorizationData"][0] - authorization_token = authorization_data["authorizationToken"] - decoded_bytes = base64.b64decode(authorization_token) - (username, password) = decoded_bytes.decode("utf-8").split(":") - proxy_endpoint = authorization_data["proxyEndpoint"] - return DockerLoginCredentials( - registry=proxy_endpoint, - username=username, - password=password, - ) - - def generate_image_tag(self, repository): - if self.registry is not None: - return f"{self.registry}/{repository}" - - ecr_service = self.provider_service.aws_services.ecr_service - descriptions = ecr_service.ensure_repositories([repository]) - return descriptions["repositories"][0]["repositoryUri"] - - -class CustomCluster(Cluster): - @property - def provider(self): - return Provider.CUSTOM - - def get_registry_login_credentials(self, repository): - return None - - def generate_image_tag(self, repository): - if self.registry is not None: - return f"{self.registry}/{repository}" - return repository diff --git a/sigopt/orchestrate/cluster/service.py b/sigopt/orchestrate/cluster/service.py deleted file mode 100644 index 057785bd..00000000 --- a/sigopt/orchestrate/cluster/service.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..provider.constants import Provider, string_to_provider -from ..services.base import Service -from .context import DisconnectOnException -from .errors import ( - AlreadyConnectedException, - ClusterError, - MultipleClustersConnectionError, - NotConnectedError, - PleaseDisconnectError, -) - - -class ClusterService(Service): - def connected_clusters(self): - return self.services.kubernetes_service.get_cluster_names() - - def assert_is_connected(self): - connected_clusters = self.connected_clusters() - if not connected_clusters: - raise NotConnectedError() - if len(connected_clusters) > 1: - raise MultipleClustersConnectionError(connected_clusters) - return connected_clusters[0] - - def assert_is_disconnected(self): - connected_clusters = self.connected_clusters() - if connected_clusters: - if len(connected_clusters) == 1: - raise PleaseDisconnectError(connected_clusters[0]) - raise MultipleClustersConnectionError(connected_clusters) - - def connect(self, cluster_name, provider_string, kubeconfig, registry): - try: - self.assert_is_disconnected() - except PleaseDisconnectError as e: - if e.current_cluster_name == cluster_name: - raise AlreadyConnectedException(e.current_cluster_name) from e - raise - - provider = string_to_provider(provider_string) - provider_service = self.services.provider_broker.get_provider_service(provider) - - if kubeconfig is None: - kubeconfig = provider_service.create_kubeconfig(cluster_name) - else: - assert provider == Provider.CUSTOM, "Must use --provider custom to connect with a kubeconfig" - - with DisconnectOnException(cluster_name, self.services): - self.services.kubernetes_service.write_config(cluster_name, kubeconfig) - self.services.kubernetes_service.ensure_orchestrate_namespace() - cluster = provider_service.create_cluster_object( - services=self.services, - name=cluster_name, - registry=registry, - ) - self.services.cluster_metadata_service.write_metadata(cluster) - return self.test() - - def create(self, options): - try: - self.assert_is_disconnected() - except PleaseDisconnectError as e: - if e.current_cluster_name == options.get("cluster_name", ""): - raise AlreadyConnectedException(e.current_cluster_name) from e - raise - - self.services.options_validator_service.validate_cluster_options(**options) - cluster_name = options.get("cluster_name", "") - - provider_string = options.get("provider", "") - provider = string_to_provider(provider_string) - provider_service = self.services.provider_broker.get_provider_service(provider) - - with DisconnectOnException(cluster_name, self.services): - cluster = provider_service.create_kubernetes_cluster(options) - self.services.kubernetes_service.ensure_orchestrate_namespace() - self.services.cluster_metadata_service.write_metadata(cluster) - self.services.kubernetes_service.wait_until_nodes_are_ready() - return cluster.name - - def update(self, options): - self.services.options_validator_service.validate_cluster_options(**options) - cluster_name = options.get("cluster_name", "") - - provider_string = options.get("provider", "") - provider = string_to_provider(provider_string) - provider_service = self.services.provider_broker.get_provider_service(provider) - - with DisconnectOnException(cluster_name, self.services): - cluster = provider_service.update_kubernetes_cluster(options) - self.services.kubernetes_service.ensure_orchestrate_namespace() - self.services.kubernetes_service.wait_until_nodes_are_ready() - return cluster.name - - def destroy(self, cluster_name, provider_string): - provider = string_to_provider(provider_string) - provider_service = self.services.provider_broker.get_provider_service(provider) - provider_service.destroy_kubernetes_cluster(cluster_name=cluster_name) - self.services.cluster_metadata_service.ensure_metadata_deleted(cluster_name=cluster_name) - - def disconnect(self, cluster_name, disconnect_all): - if (cluster_name and disconnect_all) or (not cluster_name and not disconnect_all): - raise ClusterError("Must provide exactly one of --cluster-name and --all") - - try: - current_cluster_name = self.assert_is_connected() - if cluster_name is not None and current_cluster_name != cluster_name: - raise PleaseDisconnectError(current_cluster_name) - except MultipleClustersConnectionError: - if not disconnect_all: - raise - - for cname in self.connected_clusters(): - try: - self.services.cluster_metadata_service.ensure_metadata_deleted(cluster_name=cname) - self.services.kubernetes_service.ensure_config_deleted(cluster_name=cname) - self.services.logging_service.warning(f"Successfully disconnected from {cname}") - except Exception as e: - raise ClusterError(f'Looks like an error occured while attempting to disconnect from cluster "{cname}".') from e - - def get_connected_cluster(self): - cluster_name = self.assert_is_connected() - return self.services.cluster_metadata_service.read_metadata(cluster_name) - - def test(self): - cluster = self.get_connected_cluster() - provider_service = self.services.provider_broker.get_provider_service(cluster.provider) - - try: - provider_service.test_kubernetes_cluster(cluster_name=cluster.name) - except Exception as e: - raise ClusterError(f'Looks like an error occured while testing cluster "{cluster.name}".') from e - - return cluster diff --git a/sigopt/orchestrate/cluster_metadata/__init__.py b/sigopt/orchestrate/cluster_metadata/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/cluster_metadata/errors.py b/sigopt/orchestrate/cluster_metadata/errors.py deleted file mode 100644 index 5c1221c7..00000000 --- a/sigopt/orchestrate/cluster_metadata/errors.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..exceptions import OrchestrateException - - -class MetadataError(OrchestrateException): - def __init__(self, cluster_name, unformatted_msg): - formatted_msg = unformatted_msg.format(cluster_name=cluster_name) - super().__init__( - f"{formatted_msg} Disconnecting and then reconnecting should resolve the issue.", - ) - self.cluster_name = cluster_name - - -class MetadataNotFoundError(MetadataError): - def __init__(self, cluster_name): - super().__init__( - cluster_name, - f"We could not find metadata for cluster {cluster_name}.", - ) - - -class MetadataAlreadyExistsError(MetadataError): - def __init__(self, cluster_name): - super().__init__( - cluster_name, - f"Looks like metadata for cluster {cluster_name} already exists.", - ) diff --git a/sigopt/orchestrate/cluster_metadata/service.py b/sigopt/orchestrate/cluster_metadata/service.py deleted file mode 100644 index 384ffa7c..00000000 --- a/sigopt/orchestrate/cluster_metadata/service.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import errno -import os - -import yaml - -from sigopt.paths import ensure_dir, get_root_subdir - -from ..provider.constants import string_to_provider -from ..services.base import Service -from .errors import MetadataAlreadyExistsError, MetadataNotFoundError - - -class ClusterMetadataService(Service): - def __init__(self, services): - super().__init__(services) - self._metadata_dir = get_root_subdir("cluster") - - def read_metadata(self, cluster_name): - metadata_path = self._cluster_metadata_path(cluster_name) - - if not os.path.isfile(metadata_path): - raise MetadataNotFoundError(cluster_name) - - with open(metadata_path, "r") as f: - data = yaml.safe_load(stream=f) - - provider = string_to_provider(data["provider"]) - provider_service = self.services.provider_broker.get_provider_service(provider) - cluster = provider_service.create_cluster_object( - services=self.services, - name=data["name"], - registry=data["registry"], - ) - return cluster - - def write_metadata(self, cluster): - data = dict( - name=cluster.name, - provider=cluster.provider_string, - registry=cluster.registry, - ) - - ensure_dir(self._metadata_dir) - metadata_path = self._cluster_metadata_path(cluster.name) - - if os.path.isfile(metadata_path): - raise MetadataAlreadyExistsError(cluster.name) - - with open(metadata_path, "w") as f: - yaml.safe_dump(data, stream=f) - - def _delete_metadata(self, cluster_name): - try: - os.remove(self._cluster_metadata_path(cluster_name)) - except OSError as e: - if e.errno == errno.ENOENT: - raise MetadataNotFoundError(cluster_name) from e - raise - - def ensure_metadata_deleted(self, cluster_name): - try: - self._delete_metadata(cluster_name) - except MetadataNotFoundError: - pass - - def _cluster_metadata_path(self, cluster_name): - filename = f"metadata-{cluster_name}" - return os.path.join(self._metadata_dir, filename) diff --git a/sigopt/orchestrate/common.py b/sigopt/orchestrate/common.py deleted file mode 100644 index 3bfc1071..00000000 --- a/sigopt/orchestrate/common.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import random -import sys -import time -from enum import Enum -from shutil import rmtree -from tempfile import mkdtemp - - -class TemporaryDirectory(object): - def __init__(self, *args, **kwargs): - self.directory = mkdtemp(*args, **kwargs) - - def __enter__(self): - return self.directory - - def __exit__(self, *args): - rmtree(self.directory) - - -class Platform(Enum): - MAC = 1 - LINUX = 2 - - -def current_platform(): - if sys.platform.startswith("linux"): - return Platform.LINUX - if sys.platform == "darwin": - return Platform.MAC - raise Exception( - "You are attempting to run SigOpt cluster features on the following platform:" - f" {sys.platform}. Currently, only Mac and Linux are supported." - ) - - -def retry_with_backoff(func): - # pylint: disable=inconsistent-return-statements - def wrapper(*args, **kwargs): - NUM_RETRIES = 5 - for i in range(NUM_RETRIES + 1): - try: - return func(*args, **kwargs) - except Exception as e: - time.sleep(2**i + random.random()) # nosec - if i == NUM_RETRIES: - raise e - - # pylint: enable=inconsistent-return-statements - return wrapper diff --git a/sigopt/orchestrate/controller.py b/sigopt/orchestrate/controller.py deleted file mode 100644 index 49a5883a..00000000 --- a/sigopt/orchestrate/controller.py +++ /dev/null @@ -1,515 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import os -import re -from collections import defaultdict - -import click -import pint -import yaml -from botocore.exceptions import NoRegionError - -from sigopt.paths import ensure_dir, get_bin_dir -from sigopt.utils import accept_sigopt_not_found - -from .cluster.errors import AlreadyConnectedException, ClusterError, MultipleClustersConnectionError, NotConnectedError -from .docker.service import DockerException, DockerService -from .exceptions import CheckExecutableError, ModelPackingError, OrchestrateException -from .identifier import IDENTIFIER_TYPE_EXPERIMENT, IDENTIFIER_TYPE_RUN, parse_identifier -from .kubernetes.service import ORCHESTRATE_NAMESPACE, CleanupFailedException -from .paths import ( - check_iam_authenticator_executable, - check_kubectl_executable, - download_iam_authenticator_executable, - download_kubectl_executable, -) -from .provider.constants import PROVIDER_TO_STRING, Provider -from .services.orchestrate_bag import OrchestrateServiceBag -from .status import print_status -from .stop import stop_experiment, stop_run - - -class _ExitException(click.ClickException): - def __init__(self, msg, exit_code=1): - super().__init__(msg) - self.exit_code = exit_code - - -def docker_login(cluster, docker_service, repository_name): - creds = cluster.get_registry_login_credentials(repository_name) - if creds is not None: - docker_service.login(creds) - - -class OrchestrateController: - def __init__(self, services): - self.services = services - - @classmethod - def create(cls): - try: - services = OrchestrateServiceBag() - except NoRegionError as e: - raise _ExitException("No default region is selected, please run `aws configure`") from e - return cls(services) - - def clean_images(self): - self.services.cluster_service.assert_is_connected() - docker_service = DockerService.create(self.services) - docker_service.prune() - - def build_and_push_image( - self, - cluster, - docker_service, - dockerfile, - run_options, - quiet, - ): - image_name = run_options.get("image") - repository_name, tag = DockerService.get_repository_and_tag(image_name) - docker_login(cluster, docker_service, repository_name) - - build_image = run_options.get("build_image", True) - - if build_image: - if not quiet: - print("Containerizing and uploading your model, this may take a few minutes...") - try: - image_tag = self.services.model_packer_service.build_image( - docker_service=docker_service, - repository=repository_name, - tag=tag, - quiet=quiet, - dockerfile=dockerfile, - ) - image = docker_service.get_image(image_tag) - except ModelPackingError as mpe: - msg = str(mpe) - match = re.search("manifest for (.*?) not found: manifest unknown: manifest unknown", msg) - if match is not None: - msg = f"Unable to find base image {match.groups()[0]} when building your docker container" - raise _ExitException(msg) from mpe - - repository_name = cluster.generate_image_tag(repository_name) - repository_image_tag = DockerService.format_image_name(repository_name, tag) - if build_image: - image.tag(repository=repository_name, tag=tag) - if not quiet: - print(f"Uploading the model environment to image registry: {repository_image_tag}") - docker_service.push(repository_name, tag=tag, quiet=quiet) - - return repository_name, tag - - def runner( - self, - run_options, - command, - cluster, - docker_service, - dockerfile, - project_id, - quiet=False, - optimize=True, - optimization_options=None, - ): - if optimize: - if not optimization_options: - raise OrchestrateException("optimize jobs require an experiment yaml file") - - repository_name, tag = self.build_and_push_image( - cluster=cluster, - docker_service=docker_service, - dockerfile=dockerfile, - run_options=run_options, - quiet=quiet, - ) - - resource_options = self.services.gpu_options_validator_service.get_resource_options(run_options) - - run_command = command - - job_type_str = "experiment" if optimize else "run" - - if not quiet: - print("Starting your {}".format(job_type_str)) - - if optimize: - return self.services.job_runner_service.start_cluster_experiment( - repository=repository_name, - tag=tag, - resource_options=resource_options, - optimization_options=optimization_options, - run_command=run_command, - project_id=project_id, - ) - return self.services.job_runner_service.start_cluster_run( - repository=repository_name, - tag=tag, - resource_options=resource_options, - run_command=run_command, - project_id=project_id, - ) - - def run_on_cluster(self, command, run_options, silent, dockerfile, project_id): - cluster = self.services.cluster_service.test() - - quiet = silent - - docker_service = DockerService.create(self.services) - identifier = self.runner( - cluster=cluster, - docker_service=docker_service, - quiet=quiet, - optimize=False, - command=command, - run_options=run_options, - dockerfile=dockerfile, - project_id=project_id, - ) - if quiet: - print(identifier) - else: - print(f'Started "{identifier}"') - - def test_run_on_cluster(self, command, run_options, dockerfile, project_id): - cluster = self.services.cluster_service.test() - - docker_service = DockerService.create(self.services) - identifier = self.runner( - cluster=cluster, - docker_service=docker_service, - quiet=False, - optimize=False, - run_options=run_options, - dockerfile=dockerfile, - command=command, - project_id=project_id, - ) - run_identifier = parse_identifier(identifier) - label_selector = run_identifier["pod_label_selector"] - print(f"View your run at https://app.sigopt.com/{identifier}") - print("waiting for controller to start...") - - def check_pod_condition(event): - if event["type"] == "DELETED": - raise Exception("The pod was deleted") - pod = event["object"] - for condition in pod.status.conditions or []: - if condition.type in ("Ready", "PodScheduled") and condition.status == "False": - print(f"Pod '{pod.metadata.name}' in bad condition: {condition.reason}: {condition.message}") - if condition.reason == "Unschedulable": - print( - "Hint: If you configured your nodes with sufficient resources" - " then you probably just need to wait for the cluster to" - " scale up" - ) - for container_status in pod.status.container_statuses or []: - waiting_state = container_status.state.waiting - if waiting_state: - print( - f"Container '{container_status.name}' in pod" - f" '{pod.metadata.name}' is waiting: {waiting_state.reason}:" - f" {waiting_state.message}" - ) - - self.services.kubernetes_service.wait_for_pod_to_start( - label_selector=run_identifier["controller_label_selector"], - event_handler=check_pod_condition, - ) - print("controller started, waiting for run to be created...") - self.services.kubernetes_service.wait_for_pod_to_exist(label_selector=label_selector) - print("run created, waiting for it to start...") - pod = self.services.kubernetes_service.wait_for_pod_to_start( - label_selector=label_selector, - event_handler=check_pod_condition, - ) - print("run started, following logs") - try: - print("*** START RUN LOGS ***") - for log_line in self.services.kubernetes_service.logs(pod.metadata.name, follow=True): - print(log_line) - print("*** END RUN LOGS ***") - except KeyboardInterrupt: - print() - print("Cleaning up") - stop_run(run_identifier, self.services) - - def stop_by_identifier(self, identifier): - identifier_type = identifier["type"] - with accept_sigopt_not_found() as wrap: - if identifier_type == IDENTIFIER_TYPE_RUN: - stop_run(identifier, self.services) - elif identifier_type == IDENTIFIER_TYPE_EXPERIMENT: - stop_experiment(identifier, self.services) - else: - raise NotImplementedError(f"Cannot stop {identifier['raw']}") - if wrap.exception: - print(f"{identifier['raw']}: {str(wrap.exception)}") - else: - print(f"{identifier['raw']}: deleted") - - def optimize_on_cluster(self, command, run_options, optimization_options, silent, dockerfile, project_id): - cluster = self.services.cluster_service.test() - - quiet = silent - - docker_service = DockerService.create(self.services) - identifier = self.runner( - cluster=cluster, - docker_service=docker_service, - quiet=quiet, - optimize=True, - command=command, - run_options=run_options, - optimization_options=optimization_options, - dockerfile=dockerfile, - project_id=project_id, - ) - if quiet: - print(identifier) - else: - print(f'Started "{identifier}"') - - def create_cluster(self, options): - print("Creating your cluster, this process may take 20-30 minutes or longer...") - - # NOTE: checks again now that we know provider, in case aws iam authenticator is needed - check_authenticator_binary(provider=options.get("provider")) - try: - cluster_name = self.services.cluster_service.create(options=options) - except ClusterError as pde: - raise _ExitException(str(pde)) from pde - - print(f"Successfully created kubernetes cluster: {cluster_name}") - - def update_cluster(self, options): - print("Updating your cluster, this process may take 5-10 minutes or longer...") - - # NOTE: checks again now that we know provider, in case aws iam authenticator is needed - check_authenticator_binary(provider=options.get("provider")) - cluster_name = self.services.cluster_service.update(options=options) - - print(f"Successfully updated kubernetes cluster: {cluster_name}") - - def destroy_connected_cluster(self): - cluster = self.services.cluster_service.get_connected_cluster() - print(f"Destroying cluster {cluster.name}, this process may take 20-30 minutes or longer...") - - try: - self.services.kubernetes_service.cleanup_for_destroy() - except CleanupFailedException as cfe: - raise _ExitException(str(cfe)) from cfe - self.services.cluster_service.destroy( - cluster_name=cluster.name, - provider_string=cluster.provider_string, - ) - print(f"Successfully destroyed kubernetes cluster: {cluster.name}") - - def connect_to_cluster(self, cluster_name, provider_string, registry, kubeconfig): - check_authenticator_binary(provider=provider_string) - - print(f"Connecting to cluster {cluster_name}...") - try: - self.services.cluster_service.connect( - cluster_name=cluster_name, - provider_string=provider_string, - kubeconfig=kubeconfig, - registry=registry, - ) - print(f"Successfully connected to kubernetes cluster: {cluster_name}") - except AlreadyConnectedException as ace: - raise _ExitException( - f"Already connected to cluster: {ace.current_cluster_name}", - ) from ace - - def disconnect_from_connected_cluster(self): - cluster = self.services.cluster_service.get_connected_cluster() - print(f"Disconnecting from cluster {cluster.name}...") - - try: - self.services.cluster_service.disconnect(cluster.name, disconnect_all=False) - except NotConnectedError: - self.services.logging_service.warning("Not connected to any clusters") - except MultipleClustersConnectionError as mcce: - cluster_names = ", ".join(mcce.connected_clusters) - self.services.logging_service.warning( - f"Connected to multiple clusters: {cluster_names}. Rerun with `disconnect --all`." - ) - except ClusterError as ce: - raise _ExitException(str(ce)) from ce - - def test_cluster_connection(self): - print("Testing if you are connected to a cluster, this may take a moment...") - try: - cluster = self.services.cluster_service.test() - except NotConnectedError as nce: - raise _ExitException( - "You are not currently connected to a cluster.", - ) from nce - - registry_str = cluster.registry if cluster.registry is not None else "default" - print( - "\nYou are connected to a cluster! Here is the info:" - f"\n\tcluster name: {cluster.name}" - f"\n\tprovider: {cluster.provider_string}" - f"\n\tregistry: {registry_str}" - ) - - try: - docker_service = DockerService.create(self.services) - docker_service.check_connection() - except DockerException as e: - raise _ExitException(str(e)) from e - - def cluster_status(self): - try: - cluster = self.services.cluster_service.test() - except NotConnectedError as nce: - raise _ExitException( - "You are not currently connected to a cluster", - ) from nce - - print(f"You are currently connected to the cluster: {cluster.name}") - all_pods = self.services.kubernetes_service.get_pods() - nodes = self.services.kubernetes_service.get_nodes() - individual_pods = [] - experiment_pods = defaultdict(list) - - def group_by_phase(pods): - pods_by_phase = defaultdict(list) - for pod in pods: - pods_by_phase[pod.status.phase].append(pod) - return pods_by_phase - - collapse_phases = ["Succeeded"] - - def print_pods(all_pods, indent): - by_phase = group_by_phase(all_pods) - tabs = "\t" * indent - for phase, pods in by_phase.items(): - print(f"{tabs}{phase}: {len(pods)} runs") - if phase not in collapse_phases: - for p in pods: - print(f"{tabs}\trun/{p.metadata.labels['run']}\t{p.metadata.name}") - - for pod in all_pods.items: - if pod.metadata.labels["type"] == "run": - try: - experiment_pods[pod.metadata.labels["experiment"]].append(pod) - except KeyError: - individual_pods.append(pod) - if individual_pods: - print(f"One-off: {len(individual_pods)} runs") - print_pods(individual_pods, 1) - if experiment_pods: - print(f"Experiments: {len(experiment_pods)} total") - for eid, exp_pods in sorted(experiment_pods.items(), key=lambda x: x[0]): - print(f"\texperiment/{eid}: {len(exp_pods)} runs") - print_pods(exp_pods, 2) - - print(f"Nodes: {len(nodes.items)} total") - running_pods_by_node = defaultdict(list) - for pod in all_pods.items: - if pod.status.phase == "Running": - running_pods_by_node[pod.spec.node_name].append(pod) - CPU = "cpu" - MEMORY = "memory" - GPU = "nvidia.com/gpu" - RESOURCE_META = ((CPU, "CPU"), (MEMORY, "B"), (GPU, "GPU")) - unit_registry = pint.UnitRegistry() - # NOTE: creates a new unit "CPU". "mCPU = milli CPU = 0.001 * CPU" - unit_registry.define("CPU = [cpu]") - unit_registry.define("GPU = [gpu]") - for node in nodes.items: - print(f"\t{node.metadata.name}:") - node_resources = [ - (c.resources.requests, c.resources.limits) - for p in running_pods_by_node[node.metadata.name] - for c in p.spec.containers - ] - # NOTE: create an inital value for each resource type for requests and limits - all_totals = tuple( - {resource_type: 0 * unit_registry(ext) for resource_type, ext in RESOURCE_META} for _ in range(2) - ) - for resources in node_resources: - for resource_allocation, totals in zip(resources, all_totals): - if not resource_allocation: - continue - for resource_type, ext in RESOURCE_META: - # NOTE: this parses the resource quantity with a magnitude and unit. - # ex. "12Mi" + "B" == "12*2^20 bytes", "100m" + "CPU" == "0.1 CPU" - totals[resource_type] += unit_registry.Quantity(resource_allocation.get(resource_type, "0") + ext) - requests_totals, limits_totals = all_totals - for resource_type, ext in RESOURCE_META: - allocatable = unit_registry.Quantity(node.status.allocatable.get(resource_type, "0") + ext) - if not allocatable: - continue - print(f"\t\t{resource_type}:") - total_request = requests_totals[resource_type] - percent_request = (100 * total_request / allocatable).to_reduced_units() - total_limit = limits_totals[resource_type] - percent_limit = (100 * total_limit / allocatable).to_reduced_units() - allocatable, total_request, total_limit = ( - value.to_compact() for value in (allocatable, total_request, total_limit) - ) - print(f"\t\t\tAllocatable: {allocatable:~.2f}") - print(f"\t\t\tRequests: {total_request:~.2f}, {percent_request:~.2f} %") - print(f"\t\t\tLimits: {total_limit:~.2f}, {percent_limit:~.2f} %") - - def print_status(self, identifier): - print(f"{identifier['raw']}:") - with accept_sigopt_not_found() as wrap: - for line in print_status(identifier, self.services): - print(f"\t{line}") - if wrap.exception: - print(f"\t{str(wrap.exception)}") - - def install_cluster_plugins(self): - cluster = self.services.cluster_service.get_connected_cluster() - print("Installing required kubernetes resources...") - self.services.kubernetes_service.ensure_plugins(cluster.name, cluster.provider) - print("Uploading required images to your registry...") - print("Finished installing plugins") - - def exec_kubectl(self, arguments): - self.services.cluster_service.assert_is_connected() - check_binary(kubectl_check) - cmd = self.services.kubectl_service.get_kubectl_command() - args = [cmd, "--namespace", ORCHESTRATE_NAMESPACE, *arguments] - os.execvpe( - cmd, - args, - env=self.services.kubectl_service.get_kubectl_env(), - ) - - -kubectl_check = (check_kubectl_executable, download_kubectl_executable, "kubernetes") -aws_iam_authenticator_check = ( - check_iam_authenticator_executable, - download_iam_authenticator_executable, - "aws iam-authentication", -) - - -def check_authenticator_binary(provider): - if provider == PROVIDER_TO_STRING[Provider.AWS]: - check_binary(aws_iam_authenticator_check) - - -def check_binary(options): - ensure_dir(get_bin_dir()) - check, download, name = options - try: - check() - except CheckExecutableError: - print(f"Downloading {name} executable, this could take some time...") - download() - check(full_check=True) - - -def load_user_options(filename): - with open(filename) as f: - options = yaml.safe_load(f) or {} - return options diff --git a/sigopt/orchestrate/custom_cluster/__init__.py b/sigopt/orchestrate/custom_cluster/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/custom_cluster/service.py b/sigopt/orchestrate/custom_cluster/service.py deleted file mode 100644 index 3d4a211f..00000000 --- a/sigopt/orchestrate/custom_cluster/service.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..cluster.object import CustomCluster -from ..exceptions import OrchestrateException -from ..provider.constants import Provider, provider_to_string -from ..provider.interface import ProviderInterface -from ..version import CLI_NAME - - -class CustomClusterService(ProviderInterface): - def create_kubernetes_cluster(self, options): - cluster_name = options.get("cluster_name", "my-cluster") - raise OrchestrateException( - f'When you use provider = "{provider_to_string(Provider.CUSTOM)}", we' - " assume that you have created your own kubernetes cluster. If you are" - " attempting to connect to a custom cluster that you have already created," - f" please use:\n{CLI_NAME} cluster connect --provider custom --kubeconfig" - f" --cluster-name {cluster_name}" - ) - - def destroy_kubernetes_cluster(self, cluster_name): - raise OrchestrateException( - f'When you use provider = "{provider_to_string(Provider.CUSTOM)}", we' - " assume that you have created your own kubernetes cluster. If you are" - " attempting to disconnect from a custom cluster that you have already" - f" created, please use:\n{CLI_NAME} cluster disconnect --cluster-name" - f" {cluster_name}" - ) - - def create_kubeconfig(self, cluster_name, ignore_role=False): - raise OrchestrateException( - f'When you use provider = "{provider_to_string(Provider.CUSTOM)}", we' - " assume that you have created your own kubernetes cluster. Additionally" - " we assume that you have a copy of the kubeconfig file that is used to" - " access the cluster. Please provide the path to the kubeconfig as an" - " argument. You will also need to provide the URL for your container" - f" registry, if using a private one.\n{CLI_NAME} cluster connect --provider" - " custom --kubeconfig --cluster-name" - f" {cluster_name} [--registry ]" - ) - - def test_kubernetes_cluster(self, cluster_name, ignore_role=False): - self.services.kubernetes_service.test_config() - - def create_cluster_object(self, services, name, registry): - return CustomCluster( - services=services, - name=name, - registry=registry, - ) diff --git a/sigopt/orchestrate/docker/__init__.py b/sigopt/orchestrate/docker/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/docker/service.py b/sigopt/orchestrate/docker/service.py deleted file mode 100644 index a53bbb0e..00000000 --- a/sigopt/orchestrate/docker/service.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import random -import re -import string -import sys -from collections import namedtuple -from tempfile import NamedTemporaryFile - -import docker -import requests -import urllib3 - -from ..exceptions import ModelPackingError, OrchestrateException -from ..json_stream import json_stream -from ..services.base import Service - - -DOCKER_TARGET_VERSION = "1.41" - - -DockerLoginCredentials = namedtuple( - "DockerLoginCredentials", - [ - "username", - "password", - "registry", - ], -) - - -class DockerException(OrchestrateException): - pass - - -class DockerInstallationError(DockerException): - pass - - -class DockerPodTimeoutError(DockerException): - pass - - -class DockerConnectionError(DockerException): - pass - - -class DockerService(Service): - @classmethod - def create(cls, services): - if not services.kubernetes_service.is_docker_installed(): - raise DockerInstallationError( - "\n".join( - [ - "Docker not found in your cluster.", - "SigOpt no longer uses your local Docker installation to build images.", - "Please install the SigOpt plugins to get Docker running on your cluster:", - "\tsigopt cluster install-plugins", - ] - ) - ) - try: - services.kubernetes_service.wait_for_docker_pod() - except TimeoutError as e: - raise DockerPodTimeoutError(str(e)) from e - client = docker.DockerClient( - # HACK: DockerClient can't accept the kubernetes proxy url if it doesn't have a port specified, so give it - # a fake url to initialize - base_url="tcp://a:1", - version=DOCKER_TARGET_VERSION, - ) - services.kubernetes_service.mount_http_proxy_adapter(client.api) - client.api.base_url = services.kubernetes_service.get_docker_connection_url() - return cls(services, client) - - def __init__(self, services, client): - super().__init__(services) - self.client = client - - def check_connection(self): - try: - self.client.images.list() - except (docker.errors.DockerException, requests.exceptions.ConnectionError) as e: - raise DockerConnectionError(f"An error occurred while checking your docker connection: {e}") from e - - def print_logs(self, logs): - for log in logs: - sys.stdout.write(log) - sys.stdout.flush() - - def stream_build_log(self, logs, dockerfile, show_all_logs): - downloading = False - for parsed_log in json_stream(logs): - if "error" in parsed_log: - if show_all_logs: - print(parsed_log["error"], file=sys.stderr) - raise ModelPackingError(parsed_log["error"], dockerfile) - if "status" in parsed_log: - if not downloading and parsed_log["status"] == "Downloading": - yield "Downloading the base image...\n" - downloading = True - elif "stream" in parsed_log: - if show_all_logs: - yield parsed_log["stream"] - downloading = False - - def build( - self, - tag=None, - dockerfile_name=None, - dockerfile_contents=None, - directory=None, - quiet=True, - build_args=None, - show_all_logs=False, - ): - if dockerfile_contents: - assert not dockerfile_name, "only one of dockerfile_name, dockerfile_contents can be provided" - with NamedTemporaryFile(mode="w", delete=False) as dockerfile_fp: - dockerfile_fp.write(dockerfile_contents) - dockerfile = dockerfile_fp.name - else: - dockerfile = dockerfile_name - try: - tag = tag or ( - "sigopt-temp:" + "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8)) # nosec - ) - if quiet: - self.client.images.build( - tag=tag, - dockerfile=dockerfile, - path=directory, - quiet=quiet, - buildargs=build_args, - rm=True, - ) - return tag - raw_logs = self.client.api.build( - tag=tag, - dockerfile=dockerfile, - path=directory, - quiet=quiet, - buildargs=build_args, - rm=True, - ) - self.print_logs(self.stream_build_log(raw_logs, dockerfile, show_all_logs)) - return tag - except docker.errors.BuildError as e: - raise ModelPackingError(str(e), dockerfile) from e - - def push(self, repository, tag=None, retries=1, quiet=True): - for try_number in range(retries + 1): - try: - for obj in json_stream(self.client.images.push(repository=repository, tag=tag, stream=True)): - if "error" in obj: - raise Exception(obj["error"]) - except urllib3.exceptions.ReadTimeoutError: - if try_number >= retries: - raise - if not quiet: - print("Docker push failed, retrying...") - - def pull(self, repository, tag="latest"): - self.client.images.pull(repository=repository, tag=tag) - - def login(self, docker_login_credentials): - creds = docker_login_credentials - response = self.client.login( - username=creds.username, - password=creds.password, - registry=creds.registry, - dockercfg_path="/dev/null", - ) - response_status = response.get("Status") - if response_status: - assert response_status == "Login Succeeded", ( - f"Docker failed logging into registry {creds.registry} with username {creds.username}", - ) - - @staticmethod - def format_image_name(repository, tag): - return f"{repository}:{tag}" if tag is not None else repository - - @staticmethod - def get_repository_and_tag(image): - image_regex = r"^([a-z0-9\_\-]+(?::[0-9]+)?\/?[a-z0-9\_\-]+)(:[a-zA-Z0-9\_\-\.]+)?$" - match = re.match(image_regex, image) - assert match, "image must match the regex: /" + image_regex + "/" - groups = match.groups() - repository = groups[0] - tag = groups[1][1:] if groups[1] else None - return repository, tag - - def get_image(self, tag): - return self.client.images.get(tag) - - def remove_tag(self, tag): - self.client.images.remove(tag) - - def untag(self, image): - for tag in image.tags: - self.client.images.remove(tag) - - def untag_all(self, label): - for image in self.client.images.list(filters={"label": label}): - self.untag(image) - - def image_exists_in_registry(self, repo, tag): - try: - for _ in self.client.api.pull(repo + ":" + tag, stream=True): - return True - except docker.errors.NotFound: - pass - return False - - def prune(self): - self.client.images.prune(filters=dict(dangling=False)) diff --git a/sigopt/orchestrate/ec2/__init__.py b/sigopt/orchestrate/ec2/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/ec2/service.py b/sigopt/orchestrate/ec2/service.py deleted file mode 100644 index d0661828..00000000 --- a/sigopt/orchestrate/ec2/service.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import errno -import os - -import boto3 -from botocore.exceptions import ClientError - -from sigopt.paths import ensure_dir, get_root_subdir - -from ..services.aws_base import AwsService - - -class AwsEc2Service(AwsService): - def __init__(self, services, aws_services, **kwargs): - super().__init__(services, aws_services) - self._ec2 = boto3.resource("ec2", **kwargs) - - @property - def ec2(self): - return self._ec2 - - def get_subnets(self, subnet_ids): - subnets = self.ec2.subnets.all() - return list(subnet for subnet in subnets if subnet.id in subnet_ids) - - def key_pair_for_cluster_name(self, cluster_name): - return f"key-pair-for-cluster-{cluster_name}" - - def describe_key_pair_for_cluster(self, cluster_name): - return self.ec2.KeyPair(self.key_pair_for_cluster_name(cluster_name)) - - def create_key_pair_for_cluster(self, cluster_name): - key_pair = self.ec2.create_key_pair(KeyName=self.key_pair_for_cluster_name(cluster_name)) - - self.ensure_key_pair_directory() - - try: - with os.fdopen( - os.open(self.key_pair_location(cluster_name), os.O_CREAT | os.O_WRONLY, 0o600), - "w", - ) as f: - f.write(key_pair.key_material) - except Exception: - # We only get one chance to read the key, so if we mess up then delete the key so we can try again next time - key_pair.delete() - raise - - return key_pair - - def ensure_key_pair_for_cluster(self, cluster_name): - try: - self.create_key_pair_for_cluster(cluster_name) - except ClientError as e: - if not e.response["Error"]["Code"] == "InvalidKeyPair.Duplicate": - raise e - - return self.describe_key_pair_for_cluster(cluster_name) - - def delete_key_pair_for_cluster(self, cluster_name): - try: - os.remove(self.key_pair_location(cluster_name)) - except OSError as e: - if e.errno != errno.ENOENT: - raise - - self.describe_key_pair_for_cluster(cluster_name).delete() - - @property - def key_pair_directory(self): - return get_root_subdir("ssh") - - def ensure_key_pair_directory(self): - ensure_dir(self.key_pair_directory) - - def key_pair_location(self, cluster_name): - key_name = self.key_pair_for_cluster_name(cluster_name) - filename = f"{key_name}.pem" - return os.path.join(self.key_pair_directory, filename) - - def ensure_key_pair_for_cluster_deleted(self, cluster_name): - # Note: under our current structure, no error occurs if we attempt to - # delete a non-existent keypair - self.delete_key_pair_for_cluster(cluster_name) diff --git a/sigopt/orchestrate/ecr/__init__.py b/sigopt/orchestrate/ecr/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/ecr/service.py b/sigopt/orchestrate/ecr/service.py deleted file mode 100644 index 0da3b78c..00000000 --- a/sigopt/orchestrate/ecr/service.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import boto3 - -from ..services.aws_base import AwsService - - -class AwsEcrService(AwsService): - def __init__(self, services, aws_services, **kwargs): - super().__init__(services, aws_services) - self._client = boto3.client("ecr", **kwargs) - - @property - def client(self): - return self._client - - def _create_repository(self, repository_name): - return self.client.create_repository(repositoryName=repository_name) - - def _describe_repositories(self, repository_names): - return self.client.describe_repositories(repositoryNames=repository_names) - - def ensure_repositories(self, repository_names): - for name in repository_names: - try: - self._create_repository(repository_name=name) - except self.client.exceptions.RepositoryAlreadyExistsException: - pass - - return self._describe_repositories(repository_names) - - def get_authorization_token(self, registry_ids): - return self.client.get_authorization_token(registryIds=registry_ids) diff --git a/sigopt/orchestrate/eks/__init__.py b/sigopt/orchestrate/eks/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/eks/kubeconfig.yml b/sigopt/orchestrate/eks/kubeconfig.yml deleted file mode 100644 index 8a1224d6..00000000 --- a/sigopt/orchestrate/eks/kubeconfig.yml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -clusters: -- cluster: - name: kubernetes -contexts: -- context: - cluster: kubernetes - user: sigopt - name: sigopt -current-context: sigopt -kind: Config -preferences: {} -users: -- name: sigopt - user: diff --git a/sigopt/orchestrate/eks/service.py b/sigopt/orchestrate/eks/service.py deleted file mode 100644 index d5e387be..00000000 --- a/sigopt/orchestrate/eks/service.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import boto3 - -from ..services.aws_base import AwsService - - -DEFAULT_KUBERNETES_VERSION = "1.23" -SUPPORTED_KUBERNETES_VERSIONS = ("1.20", "1.21", "1.22", "1.23") - - -class AwsEksService(AwsService): - def __init__(self, services, aws_services, **kwargs): - super().__init__(services, aws_services) - self._client = boto3.client("eks", **kwargs) - - @property - def client(self): - return self._client - - def describe_cluster(self, cluster_name): - return self.client.describe_cluster(name=cluster_name) diff --git a/sigopt/orchestrate/exceptions.py b/sigopt/orchestrate/exceptions.py deleted file mode 100644 index a921918d..00000000 --- a/sigopt/orchestrate/exceptions.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import click - -from .version import CLI_NAME - - -class OrchestrateException(click.ClickException): - def __init__(self, msg=None): - if msg is None: - msg = f"Uncaught exception: {type(self).__name__}" - super().__init__(msg) - - -class CheckExecutableError(OrchestrateException): - pass - - -class CheckConnectionError(OrchestrateException): - pass - - -class AwsClusterSharePermissionError(OrchestrateException): - pass - - -class AwsPermissionsError(OrchestrateException): - def __init__(self, error): - super().__init__( - "Looks like you have encountered the below AWS permissions error." - " Please check out our documentation and ensure you have granted yourself" - " the correct AWS permissions" - " to use SigOpt cluster features:" - " https://docs.sigopt.com/ai-module-api-references/orchestrate/aws_cluster#aws-configuration" - f"\n\n{error}" - ) - - -class MissingGpuNodesException(OrchestrateException): - pass - - -class ModelPackingError(OrchestrateException): - def __init__(self, error_str, dockerfile): - super().__init__( - ( - f"{error_str}\nDockerfile: {dockerfile}\nIf you suspect that you are" - f" out of space, run `{CLI_NAME} clean` and try again." - ), - ) - - -class ClusterDestroyError(OrchestrateException): - def __init__(self): - print("The following exceptions occurred during cluster destroy:\n") - super().__init__() - - -class NodesNotReadyError(OrchestrateException): - pass - - -class FileAlreadyExistsError(OrchestrateException): - def __init__(self, filename): - self.filename = filename - - super().__init__( - "We are attempting to write a file, but it already exists on your system:" - f" {filename}. Please delete the file and try your request again." - ) diff --git a/sigopt/orchestrate/gpu_options_validator/__init__.py b/sigopt/orchestrate/gpu_options_validator/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/gpu_options_validator/service.py b/sigopt/orchestrate/gpu_options_validator/service.py deleted file mode 100644 index 9f94d396..00000000 --- a/sigopt/orchestrate/gpu_options_validator/service.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..services.base import Service - - -RESOURCES_OPTION = "resources" - - -class GpuOptionsValidatorService(Service): - def get_resource_options(self, run_options): - resource_options = run_options.get(RESOURCES_OPTION, {}) - gpus = resource_options.get("gpus") - gpus = gpus and int(gpus) - - if gpus is not None: - resource_options = resource_options.copy() - resource_options["gpus"] = gpus - return resource_options diff --git a/sigopt/orchestrate/iam/__init__.py b/sigopt/orchestrate/iam/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/iam/service.py b/sigopt/orchestrate/iam/service.py deleted file mode 100644 index 4f1683f1..00000000 --- a/sigopt/orchestrate/iam/service.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import socket -from urllib.parse import urlparse - -import boto3 -import certifi -import requests -from OpenSSL import SSL - -from ..services.aws_base import AwsService - - -class AwsIamService(AwsService): - def __init__(self, services, aws_services, **kwargs): - super().__init__(services, aws_services) - self._client = boto3.client("iam", **kwargs) - self._iam = boto3.resource("iam", **kwargs) - self._sts = boto3.client("sts", **kwargs) - - @property - def client(self): - return self._client - - @property - def iam(self): - return self._iam - - def get_user_arn(self): - response = self.client.get_user() - user = response["User"] - return user["Arn"] - - def describe_eks_role(self, role_name): - return self.iam.Role(role_name) - - def get_thumbprint_from_oidc_issuer(self, oidc_url): - response = requests.get(f"{oidc_url}/.well-known/openid-configuration") - response.raise_for_status() - keys_url = response.json()["jwks_uri"] - parsed_url = urlparse(keys_url) - hostname = parsed_url.hostname - port = parsed_url.port or 443 - context = SSL.Context(method=SSL.TLSv1_METHOD) - context.load_verify_locations(cafile=certifi.where()) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - ssl_conn = SSL.Connection(context, socket=sock) - ssl_conn.connect((hostname, port)) - ssl_conn.setblocking(1) - ssl_conn.do_handshake() - ssl_conn.set_tlsext_host_name(hostname.encode()) - leaf_cert = ssl_conn.get_peer_cert_chain()[-1] - sha1_fingerprint = leaf_cert.digest("sha1").decode("ascii") - return "".join(c for c in sha1_fingerprint if c != ":") - - def ensure_eks_oidc_provider(self, eks_cluster): - url = eks_cluster["cluster"]["identity"]["oidc"]["issuer"] - client_ids = ["sts.amazonaws.com"] - thumbprint = self.get_thumbprint_from_oidc_issuer(url) - try: - self.client.create_open_id_connect_provider( - Url=url, - ClientIDList=client_ids, - ThumbprintList=[thumbprint], - ) - except self.client.exceptions.EntityAlreadyExistsException: - pass - - def get_oidc_arn(self, url): - _, provider = url.split("https://") - account_id = self._sts.get_caller_identity()["Account"] - return f"arn:aws:iam::{account_id}:oidc-provider/{provider}" - - def ensure_eks_oidc_provider_deleted(self, eks_cluster): - url = eks_cluster["cluster"]["identity"]["oidc"]["issuer"] - arn = self.get_oidc_arn(url) - try: - self.client.delete_open_id_connect_provider(OpenIDConnectProviderArn=arn) - except self.client.exceptions.NoSuchEntityException: - pass - - def _role_name_from_role_arn(self, role_arn): - return role_arn.split(":role/")[1] - - def attach_policy(self, role_arn, policy_arn): - role_name = self._role_name_from_role_arn(role_arn) - self.iam.Role(role_name).attach_policy(PolicyArn=policy_arn) - - def get_cluster_access_role_arn(self, cluster_name): - role_name = f"{cluster_name}-k8s-access-role" - return self.iam.Role(role_name).arn - - def get_role_from_arn(self, role_arn): - role_name = self._role_name_from_role_arn(role_arn) - role = self.iam.Role(role_name) - assert role.arn == role_arn - return role diff --git a/sigopt/orchestrate/identifier.py b/sigopt/orchestrate/identifier.py deleted file mode 100644 index ab07286b..00000000 --- a/sigopt/orchestrate/identifier.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -IDENTIFIER_TYPE_EXPERIMENT = "experiment" -IDENTIFIER_TYPE_RUN = "run" -IDENTIFIER_TYPE_SUGGESTION = "suggestion" -VALID_IDENTIFIER_TYPES = { - IDENTIFIER_TYPE_EXPERIMENT, - IDENTIFIER_TYPE_RUN, - IDENTIFIER_TYPE_SUGGESTION, -} -IDENTIFIER_QUERY_ID = "id" -IDENTIFIER_QUERY_NAME = "name" -IDENTIFIER_QUERY_SUGGESTION = "suggestion" - - -def parse_identifier(id_str): - if "/" not in id_str: - return { - "raw": id_str, - "type": IDENTIFIER_TYPE_RUN, - "query": IDENTIFIER_QUERY_NAME, - "value": id_str, - "pod_label_selector": f"type=run,run-name={id_str}", - "controller_label_selector": f"type=controller,run-name={id_str}", - } - _type, _id = id_str.split("/", 1) - if _type not in VALID_IDENTIFIER_TYPES: - raise ValueError(f"Invalid type: {_type}") - if not _id.isdigit(): - raise ValueError(f"Invalid id: {_id}") - return { - "raw": id_str, - "type": _type, - "query": IDENTIFIER_QUERY_ID, - "value": _id, - "pod_label_selector": f"type=run,{_type}={_id}", - "controller_label_selector": f"type=controller,{_type}={_id}", - } - - -def maybe_convert_to_run_identifier(identifier): - if identifier["type"] == IDENTIFIER_TYPE_SUGGESTION: - return { - "raw": identifier["raw"], - "type": IDENTIFIER_TYPE_RUN, - "query": IDENTIFIER_QUERY_SUGGESTION, - "value": identifier["value"], - "pod_label_selector": identifier["pod_label_selector"], - "controller_label_selector": identifier["controller_label_selector"], - } - return identifier - - -def get_run_and_pod_from_identifier(identifier, services): - identifier = maybe_convert_to_run_identifier(identifier) - assert identifier["type"] == IDENTIFIER_TYPE_RUN, f"Can't get a single run or pod from {identifier['raw']}" - run = None - run_id = None - pod = None - - # find the run from the identifier - if identifier["query"] in (IDENTIFIER_QUERY_NAME, IDENTIFIER_QUERY_SUGGESTION): - filter_field = identifier["query"] - filter_value = identifier["value"] - runs = list( - services.sigopt_service.iterate_runs_by_filters( - [{"operator": "==", "field": filter_field, "value": filter_value}], - ) - ) - if len(runs) > 1: - raise Exception(f"Multiple runs found with {filter_field}: {filter_value}") - if len(runs) == 1: - run = runs[0] - elif identifier["query"] == IDENTIFIER_QUERY_ID: - run_id = identifier["value"] - run = services.sigopt_service.conn.training_runs(run_id).fetch() - else: - raise NotImplementedError(identifier["query"]) - - pods = services.kubernetes_service.get_pods_by_label_selector(identifier["pod_label_selector"]).items - assert len(pods) < 2, f"Multiple pods found for {identifier['raw']}" - if len(pods) == 1: - pod = pods[0] - - return run, pod diff --git a/sigopt/orchestrate/job_runner/__init__.py b/sigopt/orchestrate/job_runner/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/job_runner/service.py b/sigopt/orchestrate/job_runner/service.py deleted file mode 100644 index f7e74c26..00000000 --- a/sigopt/orchestrate/job_runner/service.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import json -import os -import random -import string - -from ..docker.service import DockerService -from ..exceptions import OrchestrateException -from ..services.base import Service -from ..version import DEFAULT_CONTROLLER_IMAGE - - -def format_k8s_env_vars(env_vars): - return [dict(name=key, value=value) for key, value in env_vars.items()] - - -class JobRunnerService(Service): - DEFAULT_EPHEMERAL_STORAGE_REQUEST = "128Mi" - EXPERIMENT_ENV_KEY = "ORCHESTRATE_EXPERIMENT_ID" - - @property - def controller_image(self): - return os.environ.get("SIGOPT_CONTROLLER_IMAGE", DEFAULT_CONTROLLER_IMAGE) - - def sigopt_env_vars(self, project_id): - client, project = self.services.sigopt_service.ensure_project_exists(project_id) - return format_k8s_env_vars( - { - "SIGOPT_API_TOKEN": self.services.sigopt_service.api_token, - "SIGOPT_API_URL": self.services.sigopt_service.api_url, - "SIGOPT_PROJECT": project, - "SIGOPT_CLIENT": client, - } - ) - - def format_resources(self, resource_options): - resource_options = resource_options or {} - requests = resource_options.setdefault("requests", {}) - requests.setdefault("ephemeral-storage", self.DEFAULT_EPHEMERAL_STORAGE_REQUEST) - limits = resource_options.setdefault("limits", {}) - - if resource_options.get("gpus"): - if "nvidia.com/gpu" in limits: - raise OrchestrateException( - "The value in resources.gpus will override the value in" - " resources.limits.nvidia.com/gpu,please remove one of these" - " fields." - ) - limits["nvidia.com/gpu"] = resource_options.pop("gpus") - - def random_id_string(self): - return "".join(random.choice(string.ascii_lowercase) for _ in range(8)) # nosec - - def create_sigopt_experiment(self, optimization_options, project_id): - data = optimization_options.copy() - - metadata = data.pop("metadata", None) or {} - - cluster = self.services.cluster_service.get_connected_cluster() - metadata["cluster_name"] = cluster.name - - data["metadata"] = metadata - - experiment = self.services.sigopt_service.create_aiexperiment(data, project_id) - return experiment.id - - def create_controller( - self, - repository, - tag, - resource_options, - run_command, - controller_mode, - controller_name, - extra_labels, - extra_env_vars, - project_id, - ): - image_name = DockerService.format_image_name(repository, tag) - cluster = self.services.cluster_service.get_connected_cluster() - self.format_resources(resource_options) - job_info_path = "/etc/job-info" - job_info_volume_name = "job-info" - env_vars = [ - { - "name": "KUBE_CONFIG", - "value": "incluster", - }, - { - "name": "USER_IMAGE", - "value": image_name, - }, - { - "name": "USER_RESOURCES", - "value": json.dumps(resource_options), - }, - { - "name": "NAMESPACE", - "valueFrom": { - "fieldRef": { - "fieldPath": "metadata.namespace", - }, - }, - }, - { - "name": "CLUSTER_NAME", - "value": cluster.name, - }, - { - "name": "JOB_INFO_PATH", - "value": job_info_path, - }, - { - "name": "CONTROLLER_MODE", - "value": controller_mode, - }, - *(self.sigopt_env_vars(project_id)), - *extra_env_vars, - ] - if self.services.sigopt_service.log_collection_enabled: - env_vars.append( - { - "name": "SIGOPT_LOG_COLLECTION_ENABLED", - "value": "1", - } - ) - - labels = { - "mode": controller_mode, - "type": "controller", - **extra_labels, - } - - if not run_command: - run_command = [] - - self.services.kubernetes_service.start_job( - { - "apiVersion": "batch/v1", - "kind": "Job", - "metadata": { - "name": controller_name, - "labels": labels, - }, - "spec": { - "template": { - "metadata": { - "labels": labels, - }, - "spec": { - "serviceAccount": "controller", - "securityContext": { - "allowPrivilegeEscalation": False, - "readOnlyRootFilesystem": True, - }, - "restartPolicy": "Never", - "containers": [ - { - "image": self.controller_image, - "imagePullPolicy": "Always", - "name": "controller", - "env": env_vars, - "args": run_command, - "resources": { - "limits": { - "cpu": "100m", - "memory": "128Mi", - }, - }, - "volumeMounts": [ - { - "name": job_info_volume_name, - "mountPath": job_info_path, - }, - ], - "securityContext": { - "allowPrivilegeEscalation": False, - "readOnlyRootFilesystem": True, - }, - }, - ], - "volumes": [ - { - # NOTE: the job-info downwardAPI volume allows the controller to link newly created pods - # to the controller job so that the garbage collector will clean up dangling pods - "name": job_info_volume_name, - "downwardAPI": { - "items": [ - { - "path": "name", - "fieldRef": {"fieldPath": "metadata.labels['job-name']"}, - }, - { - "path": "uid", - "fieldRef": {"fieldPath": "metadata.labels['controller-uid']"}, - }, - ], - }, - } - ], - }, - }, - }, - } - ) - - def start_cluster_run( - self, - repository, - tag, - resource_options, - project_id, - run_command=None, - ): - self.services.kubernetes_service.check_nodes_are_ready() - cluster = self.services.cluster_service.get_connected_cluster() - - random_string = self.random_id_string() - run_name = "run-" + random_string - run = self.services.sigopt_service.create_run(run_name, cluster, project_id) - controller_name = f"run-controller-{random_string}" - labels = { - "run-name": run_name, - "run": run.id, - } - env_vars = [ - { - "name": "RUN_NAME", - "value": run_name, - }, - { - "name": "RUN_ID", - "value": run.id, - }, - ] - controller_mode = "run" - - self.create_controller( - repository, - tag, - resource_options, - run_command, - controller_mode, - controller_name, - labels, - env_vars, - project_id, - ) - - return f"run/{run.id}" - - def start_cluster_experiment( - self, - repository, - tag, - optimization_options, - resource_options, - project_id, - experiment_id=None, - run_command=None, - ): - self.services.kubernetes_service.check_nodes_are_ready() - - experiment_id = experiment_id or self.create_sigopt_experiment(optimization_options, project_id) - - controller_name = f"experiment-controller-{experiment_id}" - labels = {"experiment": str(experiment_id)} - controller_mode = "experiment" - env_vars = [ - { - "name": self.EXPERIMENT_ENV_KEY, - "value": str(experiment_id), - } - ] - - self.create_controller( - repository, - tag, - resource_options, - run_command, - controller_mode, - controller_name, - labels, - env_vars, - project_id, - ) - - return f"experiment/{experiment_id}" diff --git a/sigopt/orchestrate/job_status/__init__.py b/sigopt/orchestrate/job_status/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/job_status/service.py b/sigopt/orchestrate/job_status/service.py deleted file mode 100644 index 8505e570..00000000 --- a/sigopt/orchestrate/job_status/service.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..services.base import Service - - -class JobStatusService(Service): - def parse_job(self, job): - job_name = job.metadata.name - - conditions = [] - if job.status.conditions: - for c in job.status.conditions: - if c.status == "True": - conditions.append(c.type) - elif c.status == "False": - conditions.append(f"Not {c.type}") - else: - conditions.append(f"Maybe {c.type}") - - job_status = ", ".join(conditions) if conditions else "Not Complete" - - experiment_id = self.services.job_runner_service.experiment_id(job_name) - experiment = self.services.sigopt_service.safe_fetch_experiment(experiment_id) - - return dict( - experiment=experiment, - name=job_name, - status=job_status, - experiment_id=experiment_id or "??", - experiment_name=(experiment.name if experiment else "unknown"), - budget=(str(float(experiment.budget)) if experiment and experiment.budget is not None else "n/a"), - total_run_count=str(experiment.progress.total_run_count) if experiment else "n/a", - ) - - def get_runs_by_pod(self, experiment): - runs_by_pod = dict() - for run in self.services.sigopt_service.iterate_runs(experiment): - pod_name = run.metadata.get("pod_name") if run.metadata else "UNKNOWN" - - if pod_name not in runs_by_pod: - runs_by_pod[pod_name] = dict(success=0, failed=0) - - # TODO: Include active state in output as well - if run.state == "failed": - runs_by_pod[pod_name]["failed"] += 1 - elif run.state != "active": - runs_by_pod[pod_name]["success"] += 1 - - return runs_by_pod - - def parse_pod(self, pod, runs_by_pod): - pod_name = pod.metadata.name - runs = runs_by_pod.get(pod_name, dict(success=0, failed=0)) - - phase = pod.status.phase - status = phase - if phase in ["Pending", "Failed", "Unknown"]: - reasons = [condition.reason for condition in pod.status.conditions if condition.reason] - if reasons: - status = f'{status} - {", ".join(reasons)}' - - return dict( - name=pod_name, - success=runs["success"], - failed=runs["failed"], - status=status, - ) diff --git a/sigopt/orchestrate/json_stream.py b/sigopt/orchestrate/json_stream.py deleted file mode 100644 index b575b477..00000000 --- a/sigopt/orchestrate/json_stream.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import json - - -class JsonBuffer: - def __init__(self): - self.buffer = [] - - def consume(self, chunk): - if isinstance(chunk, bytes): - chunk = chunk.decode("utf-8") - self.buffer.append(chunk) - return self.emit_data() - - def emit_data(self): - parts = "".join(self.buffer).splitlines(True) - if not parts: - return [] - if parts[-1] and parts[-1][-1] != "\n": - # NOTE: the last line is not a whole line and should be buffered - self.buffer = [parts[-1]] - parts = parts[:-1] - else: - self.buffer = [] - return [json.loads(part) for part in parts if part.strip()] - - -def json_stream(stream): - json_buffer = JsonBuffer() - for chunk in stream: - yield from json_buffer.consume(chunk) diff --git a/sigopt/orchestrate/kubectl/__init__.py b/sigopt/orchestrate/kubectl/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/kubectl/service.py b/sigopt/orchestrate/kubectl/service.py deleted file mode 100644 index e4152a6a..00000000 --- a/sigopt/orchestrate/kubectl/service.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import os - -from sigopt.paths import get_bin_dir - -from ..paths import get_executable_path -from ..services.base import Service - - -class KubectlService(Service): - def get_kubectl_command(self): - return get_executable_path("kubectl") - - def get_kubectl_env(self): - kube_config = self.get_kube_config() - assert kube_config, "The kubectl service has no kubernetes config" - orchestrate_bin = get_bin_dir() - env = os.environ.copy() - previous_path = env.get("PATH", "") - env.update( - dict( - KUBECONFIG=kube_config, - PATH=f"{orchestrate_bin}:{previous_path}".encode(), - ) - ) - return env - - @property - def get_kube_config(self): - return self.services.kubernetes_service.kube_config diff --git a/sigopt/orchestrate/kubernetes/__init__.py b/sigopt/orchestrate/kubernetes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/kubernetes/http_proxy.py b/sigopt/orchestrate/kubernetes/http_proxy.py deleted file mode 100644 index 49162e5f..00000000 --- a/sigopt/orchestrate/kubernetes/http_proxy.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import ssl - -from kubernetes.client.rest import RESTClientObject -from requests.adapters import HTTPAdapter - - -class KubeProxyHTTPAdapter(HTTPAdapter): - def __init__(self, k8s_api_client, **kwargs): - self.k8s_api_client = k8s_api_client - super().__init__(**kwargs) - - def add_headers(self, request, **_): - query = [] - self.k8s_api_client.update_params_for_auth(request.headers, query, auth_settings=["BearerToken"]) - assert not query, "query string based auth not yet supported" - - def init_poolmanager(self, connections, maxsize, block=None, **_): - rest_client = RESTClientObject(self.k8s_api_client.configuration, pools_size=connections, maxsize=maxsize) - self.poolmanager = rest_client.pool_manager - - def cert_verify(self, conn, url, verify, cert): - # NOTE: Session.request tries to reset the certificate (why???) so just make sure certs are required and - # carry on - assert conn.cert_reqs == ssl.CERT_REQUIRED diff --git a/sigopt/orchestrate/kubernetes/service.py b/sigopt/orchestrate/kubernetes/service.py deleted file mode 100644 index 7657e9be..00000000 --- a/sigopt/orchestrate/kubernetes/service.py +++ /dev/null @@ -1,674 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import base64 -import errno -import json -import os -import random -import shutil -import tempfile -import time -import urllib -from http import client as http_client - -import backoff -import requests -import yaml -from kubernetes import client, config, utils, watch -from kubernetes.client.models.v1_container_image import V1ContainerImage -from OpenSSL import crypto - -from sigopt.paths import ensure_dir, get_root_subdir - -from ..exceptions import FileAlreadyExistsError, NodesNotReadyError, OrchestrateException -from ..provider.constants import Provider -from ..services.base import Service -from ..version import CLI_NAME -from .http_proxy import KubeProxyHTTPAdapter - - -DEFAULT_NAMESPACE = "default" -ORCHESTRATE_NAMESPACE = "orchestrate" -KUBESYSTEM_NAMESPACE = "kube-system" -NVIDIA_DEVICE_PLUGIN_URL = "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.9.0/nvidia-device-plugin.yml" - - -# NOTE: monkeypatch for containerd not naming all images (?) -# https://github.com/kubernetes-client/python/issues/895#issuecomment-515025300 -# pylint: disable=all -def names(self, names): - self._names = names - - -V1ContainerImage.names = V1ContainerImage.names.setter(names) -# pylint: enable=all - - -class NoNodesInClusterError(NodesNotReadyError): - def __init__(self): - super().__init__( - "Looks like your cluster does not have any nodes. Please check that your" - " cluster configuration file has defined either `cpu` or `gpu` nodes. For" - " AWS clusters, check that you see nodes on the EC2 console." - ) - - -class NodeStatusNotReadyError(NodesNotReadyError): - def __init__(self): - super().__init__( - f"None of your nodes are ready to go. Run `{CLI_NAME} kubectl get nodes` to see the status of your nodes." - ) - - -class KubernetesException(OrchestrateException): - pass - - -class JobNotFoundException(KubernetesException): - pass - - -class PodNotFoundException(KubernetesException): - pass - - -class StartJobException(KubernetesException): - pass - - -class CleanupFailedException(KubernetesException): - pass - - -class KubernetesService(Service): - def __init__(self, services): - super().__init__(services) - self._kube_config = None - self._kube_dir = get_root_subdir("cluster") - self._set_all_clients() - - def warmup(self): - kube_configs = self._get_config_files() - if kube_configs: - self._kube_config = os.path.join(self._kube_dir, kube_configs[0]) - try: - configuration = client.Configuration() - config.load_kube_config(self._kube_config, client_configuration=configuration) - if os.environ.get("HTTPS_PROXY"): - configuration.proxy = os.environ["HTTPS_PROXY"] - api_client = client.ApiClient(configuration) - self._set_all_clients(api_client) - except Exception as e: - if "Invalid kube-config file. No configuration found." not in str(e): - self.services.logging_service.warning( - ( - "Experienced the following error while attempting to create" - " kubernetes client from cluster" - " configuration:\n%s\nDisconnecting and reconnecting may" - " resolve the issue.\nPlease try" - f" running:\n\t{CLI_NAME} cluster disconnect -a" - ), - str(e), - ) - self._set_all_clients(None) - - @property - def kube_config(self): - return self._kube_config - - def get_jobs(self, job_name=None): - if job_name: - try: - return self._v1_batch.read_namespaced_job(job_name, ORCHESTRATE_NAMESPACE) - except client.rest.ApiException as e: - if e.status == http_client.NOT_FOUND: - raise JobNotFoundException(f"Job with name {job_name} not found") from e - else: - raise - else: - return self._v1_batch.list_namespaced_job(ORCHESTRATE_NAMESPACE, watch=False) - - def get_jobs_by_label_selector(self, label_selector): - return self._v1_batch.list_namespaced_job( - ORCHESTRATE_NAMESPACE, - watch=False, - label_selector=label_selector, - ) - - def delete_job(self, job_name, propogation_policy=None): - try: - self._v1_batch.delete_namespaced_job( - job_name, - ORCHESTRATE_NAMESPACE, - body=client.V1DeleteOptions(propagation_policy=propogation_policy), - ) - except client.rest.ApiException as e: - if e.status == http_client.NOT_FOUND: - raise JobNotFoundException(f"Job with name {job_name} not found") from e - else: - raise - - def start_job(self, job_spec_dict): - try: - return self._v1_batch.create_namespaced_job(ORCHESTRATE_NAMESPACE, job_spec_dict) - except client.rest.ApiException as e: - if e.status == http_client.BAD_REQUEST: - k8s_error_message = json.loads(e.body).get("message") - error_message = ( - "\n[ERROR]\t\tKubernetes reported a bad request this is most" - " likely from an error in the experiment configuration" - f" file.\n\t\tFormated Kubernetes Error:\n{k8s_error_message}\n" - ) - raise StartJobException(error_message) from e - else: - raise - - # TODO: control how logs are displayed, should this be sent to stdout by subprocess or by the CLI? - def logs(self, pod_name, follow=False): - if follow: - watcher = watch.Watch() - return watcher.stream( - self._v1_core.read_namespaced_pod_log, - pod_name, - ORCHESTRATE_NAMESPACE, - ) - return self._v1_core.read_namespaced_pod_log(pod_name, ORCHESTRATE_NAMESPACE) - - def pod_names(self, job_name): - data = self.get_pods(job_name=job_name) - return [item.metadata.name for item in data.items] - - def get_pods_by_label_selector(self, label_selector): - return self._v1_core.list_namespaced_pod( - ORCHESTRATE_NAMESPACE, - watch=False, - label_selector=label_selector, - ) - - def get_pods(self, job_name=None): - if job_name: - return self.get_pods_by_label_selector(label_selector=f"job-name={job_name}") - else: - return self._v1_core.list_namespaced_pod(ORCHESTRATE_NAMESPACE, watch=False) - - def get_pod(self, pod_name): - return self._v1_core.read_namespaced_pod(pod_name, ORCHESTRATE_NAMESPACE) - - def delete_pod(self, pod_name): - return self._v1_core.delete_namespaced_pod(pod_name, ORCHESTRATE_NAMESPACE, body=client.V1DeleteOptions()) - - def _watch_pod_events(self, iteratee, **kwargs): - watcher = watch.Watch() - event = None - for event in watcher.stream( - self._v1_core.list_namespaced_pod, - ORCHESTRATE_NAMESPACE, - **kwargs, - ): - if iteratee(event): - break - return event - - def wait_for_pod_to_exist(self, label_selector): - return self._watch_pod_events(lambda e: True, label_selector=label_selector)["object"] - - def wait_for_pod_to_start(self, label_selector, event_handler=None): - def iteratee(event): - if event_handler: - event_handler(event) - return event["object"].status.phase in ("Running", "Succeeded", "Failed") - - return self._watch_pod_events(iteratee, label_selector=label_selector)["object"] - - def wait_until_nodes_are_ready(self, retries=20): - for try_number in range(retries + 1): - try: - self.check_nodes_are_ready() - return - except NodesNotReadyError: - if try_number >= retries: - raise - else: - time.sleep(random.uniform(20, 40)) # nosec - - def check_nodes_are_ready(self): - nodes = self.get_nodes().items - if not nodes: - raise NoNodesInClusterError() - - any_node_ready = any(c.type == "Ready" and c.status == "True" for node in nodes for c in node.status.conditions) - if not any_node_ready: - raise NodeStatusNotReadyError() - - def ensure_config_map(self, config_map): - try: - self._v1_core.create_namespaced_config_map(KUBESYSTEM_NAMESPACE, config_map) - except client.rest.ApiException as e: - if e.status != http_client.CONFLICT: - raise - - def write_config(self, cluster_name, data): - ensure_dir(self._kube_dir) - new_file_path = self._kube_config_path(cluster_name) - if os.path.isfile(new_file_path): - raise FileAlreadyExistsError(new_file_path) - - with open(new_file_path, "w") as f: - yaml.dump(data, f) - - self.warmup() - - def test_config(self, retries=0, wait_time=5): - if self._v1_core is None: - raise OrchestrateException( - "We ran into an issue connecting to your cluster." - "\nDisconnecting and then reconnecting may resolve the issue." - "\nDisconnect by running:" - f"\n\t{CLI_NAME} cluster disconnect -a" - ) - - for try_number in range(retries + 1): - try: - return self._v1_core.list_namespaced_service(DEFAULT_NAMESPACE) - except Exception: - if try_number >= retries: - raise - else: - time.sleep(wait_time) - - def ensure_config_deleted(self, cluster_name): - try: - self._delete_config(cluster_name) - except OSError as e: - if e.errno != errno.ENOENT: - raise - - def get_cluster_names(self): - return [self._cluster_name_from_config(c) for c in self._get_config_files()] - - def ensure_plugins(self, cluster_name, provider): - with urllib.request.urlopen(NVIDIA_DEVICE_PLUGIN_URL) as nvidia_plugin_fp: - self._ensure_plugin_fp(nvidia_plugin_fp, namespace=KUBESYSTEM_NAMESPACE) - self.ensure_orchestrate_namespace() - self._ensure_plugin("orchestrate-controller-roles.yml", namespace=ORCHESTRATE_NAMESPACE) - # NOTE: disabled until remote image builds are working (consistently) - self.ensure_docker_plugin( - resources=dict( - requests=dict( - cpu="0.5", - memory="2Gi", - ), - ), - storage_capacity="512Gi", - ) - if provider == Provider.AWS: - self.create_autoscaler(cluster_name) - - def create_docker_tls_certs(self): - ten_years = 10 * 365 * 24 * 60 * 60 - outputs = {} - ca_key = crypto.PKey() - ca_key.generate_key(crypto.TYPE_RSA, 4096) - ca_cert = crypto.X509() - ca_cert.get_subject().CN = "sigopt:docker ca" - ca_cert.set_serial_number(random.getrandbits(64)) - ca_cert.set_issuer(ca_cert.get_subject()) - ca_cert.set_pubkey(ca_key) - ca_cert.gmtime_adj_notBefore(0) - ca_cert.gmtime_adj_notAfter(ten_years) - ca_cert.add_extensions( - [ - crypto.X509Extension(b"basicConstraints", True, b"CA:TRUE, pathlen:0"), - crypto.X509Extension(b"keyUsage", True, b"keyCertSign, cRLSign"), - crypto.X509Extension(b"subjectKeyIdentifier", False, b"hash", subject=ca_cert), - ] - ) - ca_cert.sign(ca_key, "sha256") - outputs["ca.pem"] = crypto.dump_certificate(crypto.FILETYPE_PEM, ca_cert).decode("ascii") - server_key = crypto.PKey() - server_key.generate_key(crypto.TYPE_RSA, 4096) - outputs["key.pem"] = crypto.dump_privatekey(crypto.FILETYPE_PEM, server_key).decode("ascii") - server_req = crypto.X509Req() - server_req.get_subject().CN = "sigopt:docker server" - server_req.set_pubkey(server_key) - server_req.sign(ca_key, "sha256") - server_cert = crypto.X509() - server_cert.set_serial_number(random.getrandbits(64)) - server_cert.gmtime_adj_notBefore(0) - server_cert.gmtime_adj_notAfter(ten_years) - server_cert.set_issuer(ca_cert.get_subject()) - server_cert.set_subject(server_req.get_subject()) - server_cert.set_pubkey(server_req.get_pubkey()) - server_cert.add_extensions( - [ - crypto.X509Extension( - b"subjectAltName", - False, - f"DNS:localhost, DNS:docker.{KUBESYSTEM_NAMESPACE}.svc.cluster.local, IP:127.0.0.1".encode(), - ), - crypto.X509Extension(b"extendedKeyUsage", False, b"serverAuth"), - ] - ) - server_cert.sign(ca_key, "sha256") - outputs["cert.pem"] = crypto.dump_certificate(crypto.FILETYPE_PEM, server_cert).decode("ascii") - return outputs - - def is_docker_installed(self): - try: - self._v1_apps.read_namespaced_stateful_set("docker", KUBESYSTEM_NAMESPACE) - return True - except client.rest.ApiException as e: - if e.status != http_client.NOT_FOUND: - raise - return False - - def wait_for_docker_pod(self, sleep_time=5, iterations=6): - for _ in range(iterations): - try: - docker_pod = self._v1_core.read_namespaced_pod("docker-0", KUBESYSTEM_NAMESPACE) - if docker_pod.status.phase == "Running": - return - except client.rest.ApiException as e: - if e.status != http_client.NOT_FOUND: - raise - time.sleep(sleep_time) - raise TimeoutError( - "\n".join( - [ - "Timed out waiting for Docker to start.", - "You can find more information by running the following:", - "\tsigopt cluster kubectl -nkube-system describe pod/docker-0", - ] - ) - ) - - def ensure_docker_plugin(self, resources, storage_capacity): - docker_certs_secret_name = "docker-certs" - try: - self._v1_core.read_namespaced_secret(docker_certs_secret_name, KUBESYSTEM_NAMESPACE) - except client.rest.ApiException as e: - if e.status != http_client.NOT_FOUND: - raise - docker_certs = self.create_docker_tls_certs() - self._v1_core.create_namespaced_secret( - KUBESYSTEM_NAMESPACE, - { - "metadata": { - "name": docker_certs_secret_name, - "labels": {"app": "docker"}, - }, - "data": {key: base64.b64encode(value.encode()).decode("ascii") for key, value in docker_certs.items()}, - "type": "Opaque", - }, - ) - - with self.services.resource_service.open("plugins", "docker-statefulset.yml") as resource_fp: - docker_stateful_set_template = yaml.safe_load(resource_fp) - docker_stateful_set_template["spec"]["template"]["spec"]["containers"][0]["resources"] = resources - docker_stateful_set_template["spec"]["volumeClaimTemplates"][0]["spec"]["resources"]["requests"][ - "storage" - ] = storage_capacity - self._apply_object( - self._v1_apps.create_namespaced_stateful_set, - self._v1_apps.patch_namespaced_stateful_set, - docker_stateful_set_template, - KUBESYSTEM_NAMESPACE, - ) - with self.services.resource_service.open("plugins", "docker-service.yml") as resource_fp: - docker_service_template = yaml.safe_load(resource_fp) - self._apply_object( - self._v1_core.create_namespaced_service, - self._v1_core.patch_namespaced_service, - docker_service_template, - KUBESYSTEM_NAMESPACE, - ) - - def mount_http_proxy_adapter(self, session): - session.mount( - self._api_client.configuration.host, - KubeProxyHTTPAdapter(k8s_api_client=self._api_client), - ) - - def get_docker_connection_url(self): - return ( - f"{self._api_client.configuration.host}" - f"/api/v1/namespaces/{KUBESYSTEM_NAMESPACE}/services/https:docker:https/proxy" - ) - - def cleanup_for_destroy(self): - try: - self._v1_apps.delete_namespaced_stateful_set("docker", KUBESYSTEM_NAMESPACE) - except client.rest.ApiException as e: - if e.status != http_client.NOT_FOUND: - raise - selector = "orchestrate/cleanup-before-destroy" - for pvc in self._v1_core.list_persistent_volume_claim_for_all_namespaces( - label_selector=selector, - ).items: - self._v1_core.delete_namespaced_persistent_volume_claim(pvc.metadata.name, pvc.metadata.namespace) - remaining_pvs = backoff.on_predicate(backoff.expo, lambda pvs: len(pvs.items) > 0, max_time=120)( - self._v1_core.list_persistent_volume - )() - if remaining_pvs.items: - raise CleanupFailedException( - "Some volumes could not be cleaned up, please remove them before destroying the cluster" - ) - - def ensure_orchestrate_namespace(self): - try: - self._v1_core.create_namespace(client.V1Namespace(metadata=client.V1ObjectMeta(name=ORCHESTRATE_NAMESPACE))) - except client.rest.ApiException as e: - if e.status != http_client.CONFLICT: - raise - - def _ensure_plugin_fp(self, fp, namespace): - with tempfile.NamedTemporaryFile("wb") as temp_fp: - shutil.copyfileobj(fp, temp_fp) - temp_fp.flush() - try: - utils.create_from_yaml(self._api_client, temp_fp.name) - except utils.FailToCreateError as fce: - if not all(exc.status == http_client.CONFLICT for exc in fce.api_exceptions): - raise - - def _ensure_plugin(self, file_name, namespace): - with self.services.resource_service.open("plugins", file_name) as file_content: - self._ensure_plugin_fp(file_content, namespace) - - def _cluster_name_from_config(self, config_name): - basename = os.path.basename(config_name) - if basename.startswith("config-"): - return basename[len("config-") :] - else: - return None - - def get_nodes(self): - return self._v1_core.list_node() - - def _delete_config(self, cluster_name): - self._kube_config = None - self._set_all_clients() - os.remove(self._kube_config_path(cluster_name)) - - def _kube_config_path(self, cluster_name): - filename = f"config-{cluster_name}" - return os.path.join(self._kube_dir, filename) - - def _get_config_files(self): - if os.path.exists(self._kube_dir): - return [config for config in os.listdir(self._kube_dir) if config.startswith("config-")] - return [] - - def _set_all_clients(self, api_client=None): - self._api_client = api_client - if api_client: - self._v1_apps = client.AppsV1Api(api_client) - self._v1_batch = client.BatchV1Api(api_client) - self._v1_core = client.CoreV1Api(api_client) - self._v1_rbac = client.RbacAuthorizationV1Api(api_client) - else: - self._v1_apps = None - self._v1_batch = None - self._v1_core = None - self._v1_rbac = None - - def _get_autoscaler_args(self, cluster_name): - aws_provider = self.services.provider_broker.get_provider_service(Provider.AWS) - aws_services = aws_provider.aws_services - autoscaler_stack = aws_services.cloudformation_service.describe_eks_cluster_autoscaler_role_stack(cluster_name) - autoscaler_role_arn = [ - out["OutputValue"] for out in autoscaler_stack.outputs if out["OutputKey"] == "ClusterAutoscalerRoleArn" - ][0] - - kubernetes_version = aws_services.eks_service.describe_cluster(cluster_name)["cluster"]["version"] - return (autoscaler_role_arn, kubernetes_version) - - def _get_autoscaler_image_version(self, kubernetes_version): - k8s_version_to_autoscaler_release = { - "1.20": "1.20.3", - "1.21": "1.21.2", - } - return k8s_version_to_autoscaler_release.get(kubernetes_version, f"{kubernetes_version}.0") - - def _parameterize_autoscaler_dicts(self, cluster_name, autoscaler_role_arn, kubernetes_version): - with self.services.resource_service.open("plugins", "autoscaler-plugin-template.yml") as fh: - objs = list(yaml.safe_load_all(fh)) - ( - service_account_dict, - cluster_role_dict, - role_dict, - cluster_role_binding_dict, - role_binding_dict, - deployment_dict, - ) = objs - - service_account_dict["metadata"]["annotations"] = { - "eks.amazonaws.com/role-arn": autoscaler_role_arn, - } - - autoscaler_version = self._get_autoscaler_image_version(kubernetes_version) - autoscaler_image = f"k8s.gcr.io/autoscaling/cluster-autoscaler:v{autoscaler_version}" - deployment_dict["spec"]["template"]["spec"]["containers"][0]["image"] = autoscaler_image - - auto_discovery_tag = f"tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{cluster_name}" - auto_discovery_arg = f"--node-group-auto-discovery=asg:{auto_discovery_tag}" - deployment_dict["spec"]["template"]["spec"]["containers"][0]["command"].append(auto_discovery_arg) - return ( - service_account_dict, - cluster_role_dict, - role_dict, - cluster_role_binding_dict, - role_binding_dict, - deployment_dict, - ) - - def _apply_object(self, create_func, patch_func, body, namespace=None): - kwargs = {"body": body} - if namespace is not None: - kwargs["namespace"] = namespace - try: - create_func(**kwargs) - except client.rest.ApiException as e: - if e.status == http_client.CONFLICT: - kwargs["name"] = body["metadata"]["name"] - patch_func(**kwargs) - else: - raise - - def create_autoscaler(self, cluster_name): - (autoscaler_role_arn, kubernetes_version) = self._get_autoscaler_args(cluster_name) - autoscaler_dicts = self._parameterize_autoscaler_dicts(cluster_name, autoscaler_role_arn, kubernetes_version) - ( - sa_dict, - cluster_role_dict, - role_dict, - crb_dict, - rb_dict, - deployment_dict, - ) = autoscaler_dicts - self._apply_object( - self._v1_core.create_namespaced_service_account, - self._v1_core.patch_namespaced_service_account, - sa_dict, - KUBESYSTEM_NAMESPACE, - ) - self._apply_object( - self._v1_rbac.create_cluster_role, - self._v1_rbac.patch_cluster_role, - cluster_role_dict, - ) - self._apply_object( - self._v1_rbac.create_namespaced_role, - self._v1_rbac.patch_namespaced_role, - role_dict, - KUBESYSTEM_NAMESPACE, - ) - self._apply_object( - self._v1_rbac.create_cluster_role_binding, - self._v1_rbac.patch_cluster_role_binding, - crb_dict, - ) - self._apply_object( - self._v1_rbac.create_namespaced_role_binding, - self._v1_rbac.patch_namespaced_role_binding, - rb_dict, - KUBESYSTEM_NAMESPACE, - ) - self._apply_object( - self._v1_apps.create_namespaced_deployment, - self._v1_apps.patch_namespaced_deployment, - deployment_dict, - KUBESYSTEM_NAMESPACE, - ) - - def _delete_autoscaler_object(self, delete_func, body, namespace=None): - kwargs = {"name": body["metadata"]["name"]} - if namespace is not None: - kwargs["namespace"] = namespace - try: - delete_func(**kwargs) - except client.rest.ApiException as e: - if e.status != http_client.NOT_FOUND: - raise - - def delete_autoscaler(self, cluster_name): - (autoscaler_role_arn, kubernetes_version) = self._get_autoscaler_args(cluster_name) - autoscaler_dicts = self._parameterize_autoscaler_dicts(cluster_name, autoscaler_role_arn, kubernetes_version) - ( - sa_dict, - cluster_role_dict, - role_dict, - crb_dict, - rb_dict, - deployment_dict, - ) = autoscaler_dicts - - self._delete_autoscaler_object( - self._v1_core.delete_namespaced_service_account, - sa_dict, - KUBESYSTEM_NAMESPACE, - ) - self._delete_autoscaler_object( - self._v1_rbac.delete_cluster_role, - cluster_role_dict, - ) - self._delete_autoscaler_object( - self._v1_rbac.delete_namespaced_role, - role_dict, - KUBESYSTEM_NAMESPACE, - ) - self._delete_autoscaler_object( - self._v1_rbac.delete_cluster_role_binding, - crb_dict, - ) - self._delete_autoscaler_object( - self._v1_rbac.delete_namespaced_role_binding, - rb_dict, - KUBESYSTEM_NAMESPACE, - ) - self._delete_autoscaler_object( - self._v1_apps.delete_namespaced_deployment, - deployment_dict, - KUBESYSTEM_NAMESPACE, - ) diff --git a/sigopt/orchestrate/lib/__init__.py b/sigopt/orchestrate/lib/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/lib/lists.py b/sigopt/orchestrate/lib/lists.py deleted file mode 100644 index e3ef9786..00000000 --- a/sigopt/orchestrate/lib/lists.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -""" -Common utility functions for working with lists -""" -from .types import is_mapping, is_sequence, is_set - - -def list_get(lis, index): - """ - Gets the list item at the provided index, or None if that index is invalid - """ - try: - return lis[index] - except IndexError: - return None - - -def remove_nones(lis): - """ - Returns a copy of this object with all `None` values removed. - """ - if is_mapping(lis): - return {k: v for k, v in lis.items() if v is not None} - if is_set(lis): - return lis - {None} - if is_sequence(lis): - return [l for l in lis if l is not None] - raise Exception(f"Unsupported type: {type(lis)}") - - -def coalesce(*args): - """ - Returns the first non-None value, or None if no such value exists - """ - return list_get(remove_nones(args), 0) - - -def partition(lis, predicate): - """ - Splits a list into two lists based on a predicate. The first list will contain - all elements of the provided list where predicate is true, and the second list - will contain the rest - """ - as_list = list(lis) - true_list = [] - false_list = [] - for l in as_list: - pred_value = predicate(l) - if pred_value is True: - true_list.append(l) - elif pred_value is False: - false_list.append(l) - else: - raise Exception("Invalid predicate") - - return true_list, false_list diff --git a/sigopt/orchestrate/lib/types.py b/sigopt/orchestrate/lib/types.py deleted file mode 100644 index 310a1da7..00000000 --- a/sigopt/orchestrate/lib/types.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from collections import abc as _abc - - -def is_sequence(val): - """ - Returns True iff this is a "list-like" type. Avoids the common error that strings - are iterable, and handles numpy and protobufs correctly - """ - return isinstance(val, _abc.Sequence) and not isinstance(val, str) - - -def is_string_sequence(val): - """ - Returns True iff this is a "list-like" type and all list elements are strings. - """ - return is_sequence(val) and all(is_string(element) for element in val) - - -def is_mapping(val): - """ - Returns True iff this is a "dict-like" type - """ - return isinstance(val, _abc.Mapping) - - -def is_set(val): - """ - Returns True iff this is a "set-like" type - """ - return isinstance(val, (frozenset, set)) - - -def is_string(val): - """ - Return True iff this is a string - """ - return isinstance(val, str) - - -def is_integer(val): - """ - Return True iff this is an integer - """ - return (val is not True) and (val is not False) and isinstance(val, int) - - -def is_boolean(val): - """ - Return True iff this is a boolean - """ - return isinstance(val, bool) diff --git a/sigopt/orchestrate/logging/__init__.py b/sigopt/orchestrate/logging/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/logging/service.py b/sigopt/orchestrate/logging/service.py deleted file mode 100644 index cde11965..00000000 --- a/sigopt/orchestrate/logging/service.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import logging - -from ..services.base import Service - - -class LoggingService(Service): - def __init__(self, services, logger_name="sigopt"): - super().__init__(services) - self._logger = logging.getLogger(logger_name) - - @property - def logger(self): - return self._logger - - def debug(self, *args, **kwargs): - return self.logger.debug(*args, **kwargs) - - def info(self, *args, **kwargs): - return self.logger.info(*args, **kwargs) - - def warning(self, *args, **kwargs): - return self.logger.warning(*args, **kwargs) - - def error(self, *args, **kwargs): - return self.logger.error(*args, **kwargs) diff --git a/sigopt/orchestrate/model_packer/__init__.py b/sigopt/orchestrate/model_packer/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/model_packer/service.py b/sigopt/orchestrate/model_packer/service.py deleted file mode 100644 index 56067025..00000000 --- a/sigopt/orchestrate/model_packer/service.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import os -import shlex -import shutil - -from sigopt.config import config as sigopt_config - -from ..common import TemporaryDirectory -from ..services.base import Service - - -def create_model_packer_dockerfile( - base_image, - verify_ssl_certs, - no_verify_ssl_certs, - sigopt_home, -): - lines = [] - lines.append(f"FROM {shlex.quote(base_image)}") - lines.append("LABEL orchestrate-user-created=true") - lines.append("COPY . /") - if verify_ssl_certs: - lines.append(f"ENV SIGOPT_API_VERIFY_SSL_CERTS {shlex.quote(verify_ssl_certs)}") - if no_verify_ssl_certs: - lines.append(f"ENV SIGOPT_API_NO_VERIFY_SSL_CERTS {shlex.quote(no_verify_ssl_certs)}") - if sigopt_home: - lines.append(f"ENV SIGOPT_HOME {shlex.quote(sigopt_home)}") - return "".join(f"{l}\n" for l in lines) - - -class ModelPackerService(Service): - def build_image( - self, - docker_service, - repository, - tag, - quiet=False, - dockerfile=None, - ): - if not os.path.isfile(dockerfile): - raise Exception("Please specify a path to a Dockerfile") - - with open(dockerfile) as dockerfile_fp: - dockerfile_contents = dockerfile_fp.read() - cwd = os.getcwd() - - user_image_tag = docker_service.build( - directory=cwd, - dockerfile_contents=dockerfile_contents, - quiet=quiet, - show_all_logs=True, - ) - - try: - with TemporaryDirectory() as root_dirname: - ssl_dirname = os.path.join(root_dirname, "etc", "ssl") - sigopt_config_dirname = os.path.join(root_dirname, "etc", "sigopt", "client") - for dirname in (ssl_dirname, sigopt_config_dirname): - os.makedirs(dirname) - - verify_ssl_certs = None - no_verify_ssl_certs = None - local_verify_ssl_certs = self.services.sigopt_service.verify_ssl_certs - # NOTE: we intentionally leave verify_ssl_certs as None in the bool/True case because - # verify_ssl_certs must refer to a file when being passed as an environment variable - if isinstance(local_verify_ssl_certs, bool): - no_verify_ssl_certs = not local_verify_ssl_certs - elif local_verify_ssl_certs is not None: - build_context_verify_ssl_certs = os.path.join(ssl_dirname, "sigopt-ca.crt") - shutil.copyfile(local_verify_ssl_certs, build_context_verify_ssl_certs) - verify_ssl_certs = build_context_verify_ssl_certs.replace(root_dirname, "/") - - sigopt_home = None - local_config_path = sigopt_config.config_json_path - if local_config_path is not None and os.path.exists(local_config_path): - build_context_config_path = os.path.join(sigopt_config_dirname, "config.json") - shutil.copyfile(local_config_path, build_context_config_path) - sigopt_home = os.path.dirname(os.path.dirname(build_context_config_path.replace(root_dirname, "/"))) - - return docker_service.build( - tag=docker_service.format_image_name(repository, tag), - directory=root_dirname, - dockerfile_contents=create_model_packer_dockerfile( - base_image=user_image_tag, - verify_ssl_certs=verify_ssl_certs, - no_verify_ssl_certs=no_verify_ssl_certs, - sigopt_home=sigopt_home, - ), - quiet=quiet, - show_all_logs=False, - ) - finally: - docker_service.remove_tag(user_image_tag) diff --git a/sigopt/orchestrate/node_groups.py b/sigopt/orchestrate/node_groups.py deleted file mode 100644 index 9e2e2cf5..00000000 --- a/sigopt/orchestrate/node_groups.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -NODE_GROUP_TYPE_CPU = "cpu" -NODE_GROUP_TYPE_GPU = "gpu" -NODE_GROUP_TYPE_SYSTEM = "system" - -ALL_NODE_GROUP_TYPES = { - NODE_GROUP_TYPE_CPU, - NODE_GROUP_TYPE_GPU, - NODE_GROUP_TYPE_SYSTEM, -} diff --git a/sigopt/orchestrate/options_validator/__init__.py b/sigopt/orchestrate/options_validator/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/options_validator/service.py b/sigopt/orchestrate/options_validator/service.py deleted file mode 100644 index ec4ff483..00000000 --- a/sigopt/orchestrate/options_validator/service.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..lib.types import is_boolean, is_integer, is_mapping, is_sequence, is_string -from ..node_groups import ALL_NODE_GROUP_TYPES, NODE_GROUP_TYPE_CPU, NODE_GROUP_TYPE_GPU, NODE_GROUP_TYPE_SYSTEM -from ..services.base import Service - - -class OptionsValidatorService(Service): - def validate_resources(self, gpus=None, requests=None, limits=None): - if gpus is not None: - assert is_integer(gpus) and gpus >= 0, f"resources.gpus is not a non-negative integer: {gpus}" - if requests is not None: - assert is_mapping(requests), f"resources.requests is not a mapping: {requests}" - if limits is not None: - assert is_mapping(limits), f"resources.limits is not a mapping: {limits}" - - def validate_aws_for_orchestrate( - self, - aws_access_key_id=None, - aws_secret_access_key=None, - ): - self.validate_aws_keys(aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) - - def validate_aws_for_cluster( - self, - aws_access_key_id=None, - aws_secret_access_key=None, - additional_policies=None, - ): - self.validate_aws_keys(aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) - - if additional_policies: - assert is_sequence(additional_policies), f"aws.additional_policies is not a list: {additional_policies}" - - def validate_aws_keys( - self, - aws_access_key_id=None, - aws_secret_access_key=None, - ): - if aws_secret_access_key is not None: - assert ( - is_string(aws_secret_access_key) and aws_secret_access_key - ), f"Please provide a string aws.aws_secret_access_key: {aws_secret_access_key}" - if aws_access_key_id is not None: - assert ( - is_string(aws_access_key_id) and aws_access_key_id - ), f"Please provide a string aws.aws_access_key_id: {aws_access_key_id}" - - def validate_sigopt(self, api_token=None, verify_ssl_certs=None): - if api_token is not None: - assert is_string(api_token) and api_token, f"Please provide a string sigopt.api_token: {api_token}" - if verify_ssl_certs is not None: - assert is_boolean(verify_ssl_certs) or ( - is_string(verify_ssl_certs) and verify_ssl_certs - ), f"Please provide a boolean or string sigopt.verify_ssl_certs: {verify_ssl_certs}" - - def validate_cluster_options( - self, - provider=None, - cluster_name=None, - aws=None, - kubernetes_version=None, - **kwargs, - ): - unknown_options = set(kwargs) - ALL_NODE_GROUP_TYPES - assert not unknown_options, f"Unknown options provided: {', '.join(unknown_options)}" - assert provider and is_string(provider), f"We need a string `provider` to create your cluster: {provider}" - - if aws is not None: - self.validate_aws_for_cluster(**aws) - - if kubernetes_version is not None: - assert is_string(kubernetes_version), "kubernetes_version should have a string value" - - assert is_string(cluster_name) and cluster_name, "We need a string `cluster_name` to create your cluster" - assert kwargs.get(NODE_GROUP_TYPE_CPU) or kwargs.get( - NODE_GROUP_TYPE_GPU - ), "Please specify some cpu or gpu (or both) nodes for your cluster" - for node_group_type in ALL_NODE_GROUP_TYPES: - node_group_options = kwargs.get(node_group_type) - if not node_group_options: - continue - assert is_mapping(node_group_options), f"{node_group_type} is not a mapping: {node_group_options}" - self.validate_worker_stack(name=node_group_type, **node_group_options) - - def validate_worker_stack( - self, - name, - instance_type=None, - max_nodes=None, - min_nodes=None, - node_volume_size=None, - ): - if name != NODE_GROUP_TYPE_SYSTEM: - assert instance_type is not None, f"Missing: {name}.instance_type" - assert max_nodes is not None, f"Missing: {name}.max_nodes (can be the same as {name}.min_nodes)" - assert min_nodes is not None, f"Missing: {name}.min_nodes (can be the same as {name}.max_nodes)" - - if instance_type is not None: - assert is_string(instance_type), f"{name}.instance_type is not a string: {instance_type}" - - if max_nodes is not None: - assert is_integer(max_nodes) and max_nodes > 0, f"{name}.max_nodes is not a positive integer: {max_nodes}" - - if min_nodes is not None: - assert is_integer(min_nodes) and min_nodes >= 0, f"{name}.min_nodes is not a non-negative integer: {min_nodes}" - - if node_volume_size is not None: - assert ( - is_integer(node_volume_size) and node_volume_size > 0 - ), f"{name}.node_volume_size is not a positive integer: {node_volume_size}" diff --git a/sigopt/orchestrate/paths.py b/sigopt/orchestrate/paths.py deleted file mode 100644 index 1fe185b7..00000000 --- a/sigopt/orchestrate/paths.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import errno -import hashlib -import os -import subprocess # nosec -from urllib.request import urlretrieve - -from sigopt.paths import get_bin_dir, get_executable_path - -from .common import Platform, current_platform -from .exceptions import CheckExecutableError - - -def check_executable(command, sha256, full_check): - exec_path = get_executable_path(command) - try: - if full_check: - with open(exec_path, "rb") as exec_fp: - contents = exec_fp.read() - file_sha256 = hashlib.sha256(contents).hexdigest() - else: - with open(f"{exec_path}.sha256", "r") as exec_sha256_fp: - file_sha256 = exec_sha256_fp.read() - except IOError as e: - if e.errno == errno.ENOENT: - raise CheckExecutableError(f"Error opening the hash files for: {command}") from e - raise - - if not sha256 == file_sha256: - filetype = "executable" if full_check else "hash file" - raise CheckExecutableError(f"the {filetype} for '{command}' does not have the expected hash") - - if not os.access(exec_path, os.X_OK): - raise CheckExecutableError(f"the file for '{command}' is not executable") - - if full_check: - with open(os.devnull, "w") as devnull: - try: - subprocess.check_call([exec_path], stdout=devnull, stderr=devnull) - subprocess.check_call( - [command], - env={"PATH": get_bin_dir()}, - stdout=devnull, - stderr=devnull, - ) - except subprocess.CalledProcessError as e: - raise CheckExecutableError(f"Exception checking the excecutable for {command}: {e}") from e - except OSError as e: - if e.errno == errno.ENOENT: - raise CheckExecutableError(f"System cannot find executable for {command}") from e - raise - - -KUBECTL_VERSION = "v1.25.2" -KUBECTL_URL_FORMAT = "https://dl.k8s.io/release/{}/bin/{}/amd64/kubectl" -KUBECTL_SHA256_LINUX = "8639f2b9c33d38910d706171ce3d25be9b19fc139d0e3d4627f38ce84f9040eb" -KUBECTL_SHA256_MAC = "b859766d7b47267af5cc1ee01a2d0c3c137dbfc53cd5be066181beed11ec7d34" - -AWS_IAM_AUTHENTICATOR_URL_FORMAT = ( - "https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.5.9/aws-iam-" - "authenticator_0.5.9_{}_amd64" -) -AWS_IAM_AUTHENTICATOR_SHA256_LINUX = "b192431c22d720c38adbf53b016c33ab17105944ee73b25f485aa52c9e9297a7" -AWS_IAM_AUTHENTICATOR_SHA256_MAC = "7656bd290a7e9cb588df1d9ccec43fab7f2447b88ed4f41d3f5092fd114b0939" - - -def check_kubectl_executable(full_check=False): - check_executable( - command="kubectl", - sha256=(KUBECTL_SHA256_MAC if current_platform() == Platform.MAC else KUBECTL_SHA256_LINUX), - full_check=full_check, - ) - - -def check_iam_authenticator_executable(full_check=False): - check_executable( - command="aws-iam-authenticator", - sha256=( - AWS_IAM_AUTHENTICATOR_SHA256_MAC if current_platform() == Platform.MAC else AWS_IAM_AUTHENTICATOR_SHA256_LINUX - ), - full_check=full_check, - ) - - -def download_executable(command, url): - executable_path = get_executable_path(command) - urlretrieve(url, executable_path) # nosec - os.chmod(executable_path, 0o700) - with open(executable_path, "rb") as exec_fp, open(f"{executable_path}.sha256", "w") as sha256_fp: - sha256_fp.write(hashlib.sha256(exec_fp.read()).hexdigest()) - - -def download_kubectl_executable(): - download_executable( - "kubectl", - KUBECTL_URL_FORMAT.format( - KUBECTL_VERSION, - ("darwin" if current_platform() == Platform.MAC else "linux"), - ), - ) - - -def download_iam_authenticator_executable(): - download_executable( - "aws-iam-authenticator", - AWS_IAM_AUTHENTICATOR_URL_FORMAT.format( - ("darwin" if current_platform() == Platform.MAC else "linux"), - ), - ) diff --git a/sigopt/orchestrate/plugins/__init__.py b/sigopt/orchestrate/plugins/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/plugins/autoscaler-plugin-template.yml b/sigopt/orchestrate/plugins/autoscaler-plugin-template.yml deleted file mode 100644 index af6fb299..00000000 --- a/sigopt/orchestrate/plugins/autoscaler-plugin-template.yml +++ /dev/null @@ -1,177 +0,0 @@ -# original https://raw.githubusercontent.com/kubernetes/autoscaler/7f6f6e12813c0f4726a3f369a407b0c64d462bc1/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - k8s-addon: cluster-autoscaler.addons.k8s.io - k8s-app: cluster-autoscaler - name: cluster-autoscaler - namespace: kube-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: cluster-autoscaler - labels: - k8s-addon: cluster-autoscaler.addons.k8s.io - k8s-app: cluster-autoscaler -rules: - - apiGroups: [""] - resources: ["events", "endpoints"] - verbs: ["create", "patch"] - - apiGroups: [""] - resources: ["pods/eviction"] - verbs: ["create"] - - apiGroups: [""] - resources: ["pods/status"] - verbs: ["update"] - - apiGroups: [""] - resources: ["endpoints"] - resourceNames: ["cluster-autoscaler"] - verbs: ["get", "update"] - - apiGroups: [""] - resources: ["nodes"] - verbs: ["watch", "list", "get", "update"] - - apiGroups: [""] - resources: - - "pods" - - "services" - - "replicationcontrollers" - - "persistentvolumeclaims" - - "persistentvolumes" - verbs: ["watch", "list", "get"] - - apiGroups: ["extensions"] - resources: ["replicasets", "daemonsets"] - verbs: ["watch", "list", "get"] - - apiGroups: ["policy"] - resources: ["poddisruptionbudgets"] - verbs: ["watch", "list"] - - apiGroups: ["apps"] - resources: ["statefulsets", "replicasets", "daemonsets"] - verbs: ["watch", "list", "get"] - - apiGroups: ["storage.k8s.io"] - resources: ["storageclasses", "csinodes"] - verbs: ["watch", "list", "get"] - - apiGroups: ["batch", "extensions"] - resources: ["jobs"] - verbs: ["get", "list", "watch", "patch"] - - apiGroups: ["coordination.k8s.io"] - resources: ["leases"] - verbs: ["create"] - - apiGroups: ["coordination.k8s.io"] - resourceNames: ["cluster-autoscaler"] - resources: ["leases"] - verbs: ["get", "update"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: cluster-autoscaler - namespace: kube-system - labels: - k8s-addon: cluster-autoscaler.addons.k8s.io - k8s-app: cluster-autoscaler -rules: - - apiGroups: [""] - resources: ["configmaps"] - verbs: ["create","list","watch"] - - apiGroups: [""] - resources: ["configmaps"] - resourceNames: ["cluster-autoscaler-status", "cluster-autoscaler-priority-expander"] - verbs: ["delete", "get", "update", "watch"] - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: cluster-autoscaler - labels: - k8s-addon: cluster-autoscaler.addons.k8s.io - k8s-app: cluster-autoscaler -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: cluster-autoscaler -subjects: - - kind: ServiceAccount - name: cluster-autoscaler - namespace: kube-system - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: cluster-autoscaler - namespace: kube-system - labels: - k8s-addon: cluster-autoscaler.addons.k8s.io - k8s-app: cluster-autoscaler -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: cluster-autoscaler -subjects: - - kind: ServiceAccount - name: cluster-autoscaler - namespace: kube-system - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: cluster-autoscaler - namespace: kube-system - labels: - app: cluster-autoscaler -spec: - replicas: 1 - selector: - matchLabels: - app: cluster-autoscaler - template: - metadata: - labels: - app: cluster-autoscaler - annotations: - prometheus.io/scrape: 'true' - prometheus.io/port: '8085' - cluster-autoscaler.kubernetes.io/safe-to-evict: 'true' - spec: - serviceAccountName: cluster-autoscaler - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: orchestrate.sigopt.com/node-group-type - operator: In - values: - - system - containers: - - name: cluster-autoscaler - resources: - limits: - cpu: 100m - memory: 300Mi - requests: - cpu: 100m - memory: 300Mi - command: - - ./cluster-autoscaler - - --v=4 - - --stderrthreshold=info - - --cloud-provider=aws - - --skip-nodes-with-local-storage=false - - --expander=least-waste - - --balance-similar-node-groups - - --skip-nodes-with-system-pods=false - volumeMounts: - - name: ssl-certs - mountPath: /etc/ssl/certs/ca-certificates.crt #/etc/ssl/certs/ca-bundle.crt for Amazon Linux Worker Nodes - readOnly: true - imagePullPolicy: "Always" - volumes: - - name: ssl-certs - hostPath: - path: "/etc/ssl/certs/ca-bundle.crt" diff --git a/sigopt/orchestrate/plugins/docker-service.yml b/sigopt/orchestrate/plugins/docker-service.yml deleted file mode 100644 index 9f70fb29..00000000 --- a/sigopt/orchestrate/plugins/docker-service.yml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: docker - labels: - app: docker -spec: - selector: - app: docker - ports: - - name: https - protocol: TCP - port: 443 - targetPort: 2376 diff --git a/sigopt/orchestrate/plugins/docker-statefulset.yml b/sigopt/orchestrate/plugins/docker-statefulset.yml deleted file mode 100644 index 8d4b5043..00000000 --- a/sigopt/orchestrate/plugins/docker-statefulset.yml +++ /dev/null @@ -1,66 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: docker - labels: - app: docker -spec: - serviceName: docker - replicas: 1 - selector: - matchLabels: - app: docker - template: - metadata: - labels: - app: docker - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: orchestrate.sigopt.com/node-group-type - operator: NotIn - values: - - cpu - - gpu - containers: - - name: docker-in-docker - image: docker:20.10.5-dind - command: - - dockerd - - --host=unix:///var/run/docker.sock - - --host=tcp://0.0.0.0:2376 - - --tlscacert=/etc/ssl/docker/ca.pem - - --tlscert=/etc/ssl/docker/cert.pem - - --tlskey=/etc/ssl/docker/key.pem - - --tlsverify=false - resources: - securityContext: - capabilities: - add: - - NET_ADMIN - - SYS_ADMIN - privileged: true - volumeMounts: - - name: layer-storage - mountPath: /var/lib/docker - - name: docker-certs - mountPath: /etc/ssl/docker - readOnly: true - volumes: - - name: docker-certs - secret: - secretName: docker-certs - volumeClaimTemplates: - - metadata: - name: layer-storage - labels: - orchestrate/cleanup-before-destroy: "" - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: diff --git a/sigopt/orchestrate/plugins/orchestrate-controller-roles.yml b/sigopt/orchestrate/plugins/orchestrate-controller-roles.yml deleted file mode 100644 index 7a6de69d..00000000 --- a/sigopt/orchestrate/plugins/orchestrate-controller-roles.yml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: orchestrate:controller -rules: - - apiGroups: [""] - resources: - - pods - verbs: - - create - - get - - list - - watch - - apiGroups: [""] - resources: - - pods/log - verbs: - - get - - list - - watch - - apiGroups: [""] - resources: - - configmaps - verbs: - - create ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: controller - namespace: orchestrate ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: orchestrate:controller - namespace: orchestrate -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: orchestrate:controller -subjects: -- kind: ServiceAccount - name: controller - namespace: orchestrate diff --git a/sigopt/orchestrate/provider/__init__.py b/sigopt/orchestrate/provider/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/provider/broker.py b/sigopt/orchestrate/provider/broker.py deleted file mode 100644 index d12bfc8d..00000000 --- a/sigopt/orchestrate/provider/broker.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..aws.service import AwsService -from ..custom_cluster.service import CustomClusterService -from ..provider.constants import Provider -from ..services.aws_provider_bag import AwsProviderServiceBag -from ..services.base import Service - - -class ProviderBroker(Service): - def get_provider_service(self, provider): - if provider == Provider.AWS: - return AwsService(self.services, AwsProviderServiceBag(self.services)) - if provider == Provider.CUSTOM: - return CustomClusterService(self.services) - raise NotImplementedError() diff --git a/sigopt/orchestrate/provider/constants.py b/sigopt/orchestrate/provider/constants.py deleted file mode 100644 index b2b28f2f..00000000 --- a/sigopt/orchestrate/provider/constants.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from enum import Enum - -from ..exceptions import OrchestrateException - - -class Provider(Enum): - AWS = 1 - CUSTOM = 2 - - -STRING_TO_PROVIDER = dict( - aws=Provider.AWS, - custom=Provider.CUSTOM, -) -PROVIDER_TO_STRING = dict((v, k) for (k, v) in STRING_TO_PROVIDER.items()) - - -class UnknownProviderStringError(OrchestrateException): - def __init__(self, provider_string): - if provider_string is None: - provider_error = "Please include a provider with your request." - else: - provider_error = f"{provider_string!r} is not a supported provider." - - super().__init__(f"{provider_error} Supported providers are: {', '.join(STRING_TO_PROVIDER)}") - self.provider_string = provider_string - - -def string_to_provider(provider_string): - try: - return STRING_TO_PROVIDER[provider_string.lower()] - except (KeyError, AttributeError) as e: - raise UnknownProviderStringError(provider_string) from e - - -def provider_to_string(provider): - try: - return PROVIDER_TO_STRING[provider] - except KeyError as e: - raise NotImplementedError() from e diff --git a/sigopt/orchestrate/provider/interface.py b/sigopt/orchestrate/provider/interface.py deleted file mode 100644 index 8c848494..00000000 --- a/sigopt/orchestrate/provider/interface.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..services.base import Service - - -class ProviderInterface(Service): - def create_kubernetes_cluster(self, options): - raise NotImplementedError() - - def destroy_kubernetes_cluster(self, cluster_name): - raise NotImplementedError() - - def create_kubeconfig(self, cluster_name, ignore_role=False): - raise NotImplementedError() - - def test_kubernetes_cluster(self, cluster_name, ignore_role=False): - raise NotImplementedError() - - def create_cluster_object(self, services, name, registry): - raise NotImplementedError() diff --git a/sigopt/orchestrate/resource/__init__.py b/sigopt/orchestrate/resource/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/resource/service.py b/sigopt/orchestrate/resource/service.py deleted file mode 100644 index 8736783c..00000000 --- a/sigopt/orchestrate/resource/service.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import shutil -import tempfile - -import pkg_resources -import yaml - -from ..services.base import Service - - -class ResourceService(Service): - def get_package_name(self, package): - return f"sigopt.orchestrate.{package}" - - def stream(self, package, resource): - return pkg_resources.resource_stream(self.get_package_name(package), resource) - - def open(self, package, resource): - contents_fp = tempfile.NamedTemporaryFile("wb+") - with self.stream(package, resource) as source: - shutil.copyfileobj(source, contents_fp) - contents_fp.seek(0) - return contents_fp - - def read(self, package, resource): - return pkg_resources.resource_string(self.get_package_name(package), resource) - - def load_yaml(self, package, resource): - with self.open(package, resource) as fp: - return yaml.safe_load(fp) diff --git a/sigopt/orchestrate/s3/__init__.py b/sigopt/orchestrate/s3/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/s3/service.py b/sigopt/orchestrate/s3/service.py deleted file mode 100644 index 4723caab..00000000 --- a/sigopt/orchestrate/s3/service.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import base64 -import hashlib - -import boto3 - -from ..services.aws_base import AwsService - - -class AwsS3Service(AwsService): - def __init__(self, services, aws_services, **kwargs): - super().__init__(services, aws_services) - self._client = boto3.client("s3", **kwargs) - self.region = boto3.session.Session().region_name - self._init_kwargs = kwargs - - @property - def client(self): - return self._client - - @property - def account_id(self): - return boto3.client("sts", **self._init_kwargs).get_caller_identity()["Account"] - - @property - def orchestrate_bucket_name(self): - return f"sigopt.{self.account_id}" - - def ensure_orchestrate_bucket(self): - create_bucket_params = dict( - ACL="private", - Bucket=self.orchestrate_bucket_name, - ) - # NOTE: LocationConstraint is required for all regions but us-east-1. - # In us-east-1 create_bucket will fail when LocationConstraint is provided. - # https://github.com/boto/boto3/issues/125 - if self.region != "us-east-1": - create_bucket_params["CreateBucketConfiguration"] = {"LocationConstraint": self.region} - try: - self.client.create_bucket(**create_bucket_params) - self.client.put_bucket_encryption( - Bucket=self.orchestrate_bucket_name, - ServerSideEncryptionConfiguration={ - "Rules": [ - { - "ApplyServerSideEncryptionByDefault": { - "SSEAlgorithm": "AES256", - }, - "BucketKeyEnabled": True, - }, - ] - }, - ) - except self.client.exceptions.BucketAlreadyOwnedByYou: - pass - return self.orchestrate_bucket_name - - def upload_resource_by_hash(self, path_prefix, package, resource_name): - resource_content = self.services.resource_service.read(package, resource_name) - md5_hash = hashlib.md5(resource_content) # nosec - md5_hex_hash = md5_hash.hexdigest() - resource_path = f"orchestrate/resources/{path_prefix}/md5-{md5_hex_hash}/{resource_name}" - md5_b64_hash = base64.b64encode(md5_hash.digest()).decode("ascii") - bucket = self.ensure_orchestrate_bucket() - self.client.put_object( - Bucket=bucket, - Key=resource_path, - Body=resource_content, - ContentMD5=md5_b64_hash, - ) - return f"https://{bucket}.s3.amazonaws.com/{resource_path}" diff --git a/sigopt/orchestrate/services/__init__.py b/sigopt/orchestrate/services/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/services/aws_base.py b/sigopt/orchestrate/services/aws_base.py deleted file mode 100644 index 3e264383..00000000 --- a/sigopt/orchestrate/services/aws_base.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..services.base import Service - - -class AwsService(Service): - """ - Base class for all AWS services. - """ - - def __init__(self, services, aws_services): - super().__init__(services) - self.aws_services = aws_services diff --git a/sigopt/orchestrate/services/aws_provider_bag.py b/sigopt/orchestrate/services/aws_provider_bag.py deleted file mode 100644 index 1f6b6a2f..00000000 --- a/sigopt/orchestrate/services/aws_provider_bag.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..cloudformation.service import AwsCloudFormationService -from ..ec2.service import AwsEc2Service -from ..ecr.service import AwsEcrService -from ..eks.service import AwsEksService -from ..iam.service import AwsIamService -from ..s3.service import AwsS3Service -from ..services.bag import ServiceBag -from ..sts.service import AwsStsService - - -class AwsProviderServiceBag(ServiceBag): - def __init__(self, orchestrate_services): - self.orchestrate_services = orchestrate_services - super().__init__() - - def _create_services(self): - super()._create_services() - self.cloudformation_service = AwsCloudFormationService(self.orchestrate_services, self) - self.ec2_service = AwsEc2Service(self.orchestrate_services, self) - self.ecr_service = AwsEcrService(self.orchestrate_services, self) - self.eks_service = AwsEksService(self.orchestrate_services, self) - self.iam_service = AwsIamService(self.orchestrate_services, self) - self.sts_service = AwsStsService(self.orchestrate_services, self) - self.s3_service = AwsS3Service(self.orchestrate_services, self) diff --git a/sigopt/orchestrate/services/bag.py b/sigopt/orchestrate/services/bag.py deleted file mode 100644 index c9e67486..00000000 --- a/sigopt/orchestrate/services/bag.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -class ServiceBag(object): - """ - A top-level container for all of our services. A service bag should be passed - around where needed to grant access to these services. This gives us - dependency injection, and lets us reuse services when they have a startup - cost (such as creating DB connections). - """ - - def __init__(self): - self._create_services() - self._warmup_services() - - def _create_services(self): - pass - - def _warmup_services(self): - pass diff --git a/sigopt/orchestrate/services/base.py b/sigopt/orchestrate/services/base.py deleted file mode 100644 index aa1e9430..00000000 --- a/sigopt/orchestrate/services/base.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -class Service(object): - """ - Base class for all services. - """ - - def __init__(self, services): - self.services = services diff --git a/sigopt/orchestrate/services/orchestrate_bag.py b/sigopt/orchestrate/services/orchestrate_bag.py deleted file mode 100644 index bd2ce66b..00000000 --- a/sigopt/orchestrate/services/orchestrate_bag.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from ..cluster.service import ClusterService -from ..cluster_metadata.service import ClusterMetadataService -from ..gpu_options_validator.service import GpuOptionsValidatorService -from ..job_runner.service import JobRunnerService -from ..job_status.service import JobStatusService -from ..kubectl.service import KubectlService -from ..kubernetes.service import KubernetesService -from ..logging.service import LoggingService -from ..model_packer.service import ModelPackerService -from ..options_validator.service import OptionsValidatorService -from ..provider.broker import ProviderBroker -from ..resource.service import ResourceService -from ..sigopt.service import SigOptService -from .bag import ServiceBag - - -class OrchestrateServiceBag(ServiceBag): - def _create_services(self): - super()._create_services() - self.resource_service = ResourceService(self) - self.provider_broker = ProviderBroker(self) - self.cluster_metadata_service = ClusterMetadataService(self) - self.cluster_service = ClusterService(self) - self.job_runner_service = JobRunnerService(self) - self.job_status_service = JobStatusService(self) - self.kubectl_service = KubectlService(self) - self.kubernetes_service = KubernetesService(self) - self.logging_service = LoggingService(self) - self.model_packer_service = ModelPackerService(self) - self.options_validator_service = OptionsValidatorService(self) - self.gpu_options_validator_service = GpuOptionsValidatorService(self) - self.sigopt_service = SigOptService(self) - - def _warmup_services(self): - super()._warmup_services() - self.kubernetes_service.warmup() diff --git a/sigopt/orchestrate/sigopt/__init__.py b/sigopt/orchestrate/sigopt/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/sigopt/service.py b/sigopt/orchestrate/sigopt/service.py deleted file mode 100644 index a25cf2c0..00000000 --- a/sigopt/orchestrate/sigopt/service.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import json -from urllib.parse import urlparse - -from sigopt.config import config -from sigopt.exception import ApiException -from sigopt.factory import SigOptFactory -from sigopt.interface import get_connection - -from ..exceptions import CheckConnectionError -from ..services.base import Service - - -class SigOptService(Service): - def __init__(self, services): - super().__init__(services) - self._conn = get_connection() - - @property - def conn(self): - return self._conn - - @property - def api_token(self): - return self.conn.impl.driver.auth.username - - @property - def api_url(self): - api_url = self.conn.impl.driver.api_url - urlparse(api_url) - return api_url - - @property - def verify_ssl_certs(self): - return self.conn.impl.driver.verify_ssl_certs - - def log_collection_enabled(self): - return config.log_collection_enabled - - def check_connection(self): - try: - self.conn.experiments().fetch(limit=1) - except ApiException as e: - raise CheckConnectionError(f"An error occured while checking your SigOpt connection: {e}") from e - - def create_aiexperiment(self, experiment_body, project_id): - factory = SigOptFactory(project_id) - return factory.create_prevalidated_aiexperiment(experiment_body) - - def fetch_experiment(self, experiment_id): - factory = SigOptFactory.from_default_project() - return factory.get_aiexperiment(experiment_id) - - def create_run(self, run_name, cluster, project_id): - factory = SigOptFactory(project_id) - return factory.create_run( - name=run_name, - metadata={"cluster_name": cluster.name}, - ) - - def fetch_run(self, run_id): - return self.conn.training_runs(run_id).fetch() - - def ensure_project_exists(self, project_id): - factory = SigOptFactory(project_id) - return factory.ensure_project_exists() - - def iterate_runs_by_filters(self, filters, project=None, client=None): - if project is None: - client, project = SigOptFactory.from_default_project().ensure_project_exists() - return ( - self.conn.clients(client).projects(project).training_runs().fetch(filters=json.dumps(filters)).iterate_pages() - ) - - def iterate_runs(self, experiment): - if experiment.project: - return self.iterate_runs_by_filters( - [{"operator": "==", "field": "experiment", "value": experiment.id}], - project=experiment.project, - client=experiment.client, - ) - # TODO: api.sigopt.com returns extended JSON for the new endpoint fetch, which we need for the state - # field. But we can only do that for experiments in projects. - # So we fall back safely here, but this can be removed in the future - return self.conn.experiments(experiment.id).training_runs().fetch().iterate_pages() - - def safe_fetch_experiment(self, experiment_id): - try: - return self.fetch_experiment(experiment_id) - except ApiException as e: - if e.status_code in [403, 404]: - return None - raise diff --git a/sigopt/orchestrate/status.py b/sigopt/orchestrate/status.py deleted file mode 100644 index 8b53d5a8..00000000 --- a/sigopt/orchestrate/status.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from .identifier import ( - IDENTIFIER_QUERY_ID, - IDENTIFIER_QUERY_NAME, - IDENTIFIER_QUERY_SUGGESTION, - IDENTIFIER_TYPE_EXPERIMENT, - IDENTIFIER_TYPE_RUN, - IDENTIFIER_TYPE_SUGGESTION, - get_run_and_pod_from_identifier, - maybe_convert_to_run_identifier, -) - - -def print_experiment_status(experiment_identifier, services): - assert experiment_identifier["type"] == IDENTIFIER_TYPE_EXPERIMENT - assert experiment_identifier["query"] == IDENTIFIER_QUERY_ID - experiment_id = experiment_identifier["value"] - experiment = services.sigopt_service.fetch_experiment(experiment_id) - - parsed_job = {} - parsed_job["experiment_id"] = experiment_id - parsed_job["experiment_name"] = experiment.name - parsed_job["budget"] = str(float(experiment.budget)) if experiment and experiment.budget is not None else "n/a" - parsed_job["total_run_count"] = str(experiment.progress.total_run_count) if experiment else "n/a" - - runs = list(services.sigopt_service.iterate_runs(experiment)) - total_failures = sum(v.state == "failed" for v in runs) - - yield "Experiment Name: {experiment_name}".format(**parsed_job) - yield "{total_run_count} / {budget} budget".format(**parsed_job) - yield f"{total_failures} Run(s) failed" - - yield "{:20}\t{:15}\t{:15}\t{:35}".format( - "Run Name", - "Pod phase", - "Status", - "Link", - ) - - pods_by_name = { - pod.metadata.name: pod - for pod in services.kubernetes_service.get_pods_by_label_selector( - experiment_identifier["pod_label_selector"], - ).items - } - runs_by_name = {run.to_json()["name"]: run for run in runs} - for run_name in sorted(set(pods_by_name) | set(runs_by_name)): - run = runs_by_name.get(run_name) - pod = pods_by_name.get(run_name) - state = run.state if run else "creating" - phase = pod.status.phase if pod else "Deleted" - url = f"https://app.sigopt.com/run/{run.id}" if run else "" - yield f"{run_name:20}\t{phase:15}\t{state:15}\t{url:35}" - - yield (f"Follow logs: sigopt cluster kubectl logs -ltype=run,experiment={experiment.id} --max-log-requests=1000 -f") - yield f"View more at: https://app.sigopt.com/aiexperiment/{experiment_id}" - - -def print_run_status(run_identifier, services): - run_identifier = maybe_convert_to_run_identifier(run_identifier) - run, pod = get_run_and_pod_from_identifier(run_identifier, services) - if not run and not pod: - yield f"Could not find a run for {run_identifier['raw']}" - return - - run_id = None - run_name = None - run_state = None - pod_phase = None - node_name = None - suggestion_id = None - observation_id = None - experiment_id = None - - # scrape info from identifier - if run_identifier["query"] == IDENTIFIER_QUERY_NAME: - run_name = run_identifier["value"] - elif run_identifier["query"] == IDENTIFIER_QUERY_SUGGESTION: - suggestion_id = run_identifier["value"] - - # scrape info from the run - if run: - run_data = run.to_json() - run_id = run.id - run_name = run_name or run_data["name"] - run_state = run_state or run.state - suggestion_id = suggestion_id or run_data.get("suggestion") - observation_id = observation_id or run_data.get("observation") - experiment_id = experiment_id or run_data.get("experiment") - - # scrape info from the pod - if pod: - run_name = run_name or pod.metadata.name - node_name = node_name or pod.spec.node_name - pod_phase = pod_phase or pod.status.phase - - # set values if still None - run_state = run_state or "creating" - pod_phase = pod_phase or "Deleted" - node_name = node_name or "unknown" - - yield f"Run Name: {run_name}" - if run_id is not None: - yield f"Link: https://app.sigopt.com/run/{run_id}" - yield f"State: {run_state}" - if experiment_id is not None: - yield f"Experiment link: https://app.sigopt.com/experiment/{experiment_id}" - if suggestion_id is not None: - yield f"Suggestion id: {suggestion_id}" - if observation_id is not None: - yield f"Observation id: {observation_id}" - yield f"Pod phase: {pod_phase}" - yield f"Node name: {node_name}" - yield (f'Follow logs: sigopt cluster kubectl logs "pod/{run_name}" -f') - - -IDENTIFIER_TYPE_TO_PRINTER = { - IDENTIFIER_TYPE_EXPERIMENT: print_experiment_status, - IDENTIFIER_TYPE_RUN: print_run_status, - IDENTIFIER_TYPE_SUGGESTION: print_run_status, -} - - -def print_status(identifier, services): - try: - printer = IDENTIFIER_TYPE_TO_PRINTER[identifier["type"]] - except KeyError as ke: - raise NotImplementedError() from ke - return printer(identifier, services) diff --git a/sigopt/orchestrate/stop.py b/sigopt/orchestrate/stop.py deleted file mode 100644 index 16e970e0..00000000 --- a/sigopt/orchestrate/stop.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from .identifier import IDENTIFIER_TYPE_EXPERIMENT, IDENTIFIER_TYPE_RUN, get_run_and_pod_from_identifier - - -def stop_experiment(experiment_identifier, services): - assert experiment_identifier["type"] == IDENTIFIER_TYPE_EXPERIMENT - experiment_jobs = services.kubernetes_service.get_jobs_by_label_selector( - experiment_identifier["controller_label_selector"], - ).items - - for job in experiment_jobs: - services.kubernetes_service.delete_job(job.metadata.name, propogation_policy="Background") - - -def stop_run(run_identifier, services): - assert run_identifier["type"] == IDENTIFIER_TYPE_RUN - - _, pod = get_run_and_pod_from_identifier(run_identifier, services) - - if pod: - services.kubernetes_service.delete_pod(pod.metadata.name) - - run_controller_jobs = services.kubernetes_service.get_jobs_by_label_selector( - run_identifier["controller_label_selector"], - ).items - for job in run_controller_jobs: - services.kubernetes_service.delete_job(job.metadata.name, propogation_policy="Background") diff --git a/sigopt/orchestrate/sts/__init__.py b/sigopt/orchestrate/sts/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/sts/service.py b/sigopt/orchestrate/sts/service.py deleted file mode 100644 index 33102c17..00000000 --- a/sigopt/orchestrate/sts/service.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import uuid - -import boto3 - -from ..services.aws_base import AwsService - - -class AwsStsService(AwsService): - def __init__(self, services, aws_services, **kwargs): - super().__init__(services, aws_services) - self._client = boto3.client("sts", **kwargs) - - @property - def client(self): - return self._client - - def assume_role(self, role_arn, duration_seconds=900): - return self.client.assume_role( - RoleArn=role_arn, - RoleSessionName=f"sigopt-{uuid.uuid4()}", - DurationSeconds=duration_seconds, - ) diff --git a/sigopt/orchestrate/test/__init__.py b/sigopt/orchestrate/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/orchestrate/test/test_file.txt b/sigopt/orchestrate/test/test_file.txt deleted file mode 100644 index 8f9427ad..00000000 --- a/sigopt/orchestrate/test/test_file.txt +++ /dev/null @@ -1 +0,0 @@ -This is a test file for testing the resource service. diff --git a/sigopt/orchestrate/version.py b/sigopt/orchestrate/version.py deleted file mode 100644 index c35a3ed8..00000000 --- a/sigopt/orchestrate/version.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -CLI_NAME = "sigopt" -CONTROLLER_IMAGE_VERSION = "2022-11-09a" -CONTROLLER_REPOSITORY = "intel/sigopt-controller" -DEFAULT_CONTROLLER_IMAGE = f"{CONTROLLER_REPOSITORY}:{CONTROLLER_IMAGE_VERSION}" diff --git a/sigopt/orchestrate/zigopt/__init__.py b/sigopt/orchestrate/zigopt/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sigopt/paths.py b/sigopt/paths.py index 20f09d9e..5c715917 100644 --- a/sigopt/paths.py +++ b/sigopt/paths.py @@ -1,7 +1,6 @@ # Copyright © 2022 Intel Corporation # # SPDX-License-Identifier: MIT -import errno import os @@ -12,19 +11,3 @@ def get_root_dir(): def get_root_subdir(dirname): return os.path.join(get_root_dir(), dirname) - - -def get_bin_dir(): - return get_root_subdir("bin") - - -def ensure_dir(path): - try: - os.makedirs(path) - except os.error as oserr: - if oserr.errno != errno.EEXIST: - raise - - -def get_executable_path(command): - return os.path.join(get_bin_dir(), command) diff --git a/sigopt/resource.py b/sigopt/resource.py index fb341023..45aac3cf 100644 --- a/sigopt/resource.py +++ b/sigopt/resource.py @@ -10,7 +10,6 @@ class BoundApiResource(object): def __init__(self, resource, id_, path): self._resource = resource - self._id = id_ self._base_path = list(path) if id_ is not _NO_ARG: diff --git a/sigopt/run_context.py b/sigopt/run_context.py index 93cc666d..81053bd5 100644 --- a/sigopt/run_context.py +++ b/sigopt/run_context.py @@ -6,7 +6,6 @@ import requests from .config import config -from .exception import RunException from .file_utils import create_api_image_payload, get_blob_properties from .interface import get_connection from .lib import is_mapping, is_string, remove_nones, sanitize_number, validate_name @@ -38,11 +37,6 @@ def maybe_truncate_log(log_content): return log_content -class NoDefaultParameterError(RunException): - def __init__(self, parameter_name): - super().__init__(f'No default provided for parameter "{parameter_name}"') - - class BaseRunContext(object): @property def id(self): @@ -105,12 +99,6 @@ def set_parameter(self, name, value): """ return self._set_parameters({name: value}) - def set_parameter_meta(self, name, value): - return self._set_parameters_meta({name: value}) - - def set_parameters_meta(self, parameters_meta): - return self._set_parameters_meta(parameters_meta) - def set_parameter_source(self, name, source): return self._set_parameters_meta({name: {"source": source}}) @@ -360,6 +348,7 @@ def __enter__(self): return self def __exit__(self, type_, value, tb): + del tb self._end(exception=value) def _end(self, exception): diff --git a/sigopt/run_params.py b/sigopt/run_params.py index 6b0cc2f1..b81f7364 100644 --- a/sigopt/run_params.py +++ b/sigopt/run_params.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: MIT from collections.abc import MutableMapping +from .decorators import public from .lib import is_string @@ -21,6 +22,7 @@ def __init__(self, run_context, fixed_items, default_items=None): _set(self, "__run_context", run_context) _set(self, "__fixed_keys", set(fixed_items.keys())) + @public def update(self, *args, **kwds): # pylint: disable=arguments-differ # this update is atomic, which reduces the number of calls to set_parameter(s) # the default implementation of update would result in a partial update if any of the setters failed @@ -31,6 +33,7 @@ def update(self, *args, **kwds): # pylint: disable=arguments-differ _get(self, "__items").update(tmp) _get(self, "__run_context").set_parameters(tmp) + @public def setdefaults(self, *args, **kwds): tmp = dict() tmp.update(*args, **kwds) diff --git a/sigopt/utils.py b/sigopt/utils.py index 47016c92..bfe8342c 100644 --- a/sigopt/utils.py +++ b/sigopt/utils.py @@ -1,26 +1,6 @@ # Copyright © 2022 Intel Corporation # # SPDX-License-Identifier: MIT -import contextlib -from http import HTTPStatus - -from sigopt.exception import ApiException - - -class HandledException: - def __init__(self): - self.exception = None - - -@contextlib.contextmanager -def accept_sigopt_not_found(): - handled = HandledException() - try: - yield handled - except ApiException as ae: - if ae.status_code != HTTPStatus.NOT_FOUND: - raise - handled.exception = ae def batcher(alist, n=1): diff --git a/sigopt/xgboost/checkpoint_callback.py b/sigopt/xgboost/checkpoint_callback.py index eb2c3052..a096a279 100644 --- a/sigopt/xgboost/checkpoint_callback.py +++ b/sigopt/xgboost/checkpoint_callback.py @@ -1,6 +1,7 @@ # Copyright © 2022 Intel Corporation # # SPDX-License-Identifier: MIT +from ..decorators import public from .compat import xgboost @@ -11,6 +12,7 @@ def __init__(self, run, period=1): self._latest = None super().__init__() + @public def after_iteration(self, model, epoch, evals_log): if not evals_log: return False @@ -31,6 +33,7 @@ def after_iteration(self, model, epoch, evals_log): return False + @public def after_training(self, model): if self._latest is not None: self.run.log_checkpoint(self._latest) diff --git a/test/cli/test_cluster_connect.py b/test/cli/test_cluster_connect.py deleted file mode 100644 index e0c407bd..00000000 --- a/test/cli/test_cluster_connect.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from click.testing import CliRunner -from mock import Mock, patch - -from sigopt.cli import cli - - -class TestClusterConnectCli(object): - def test_cluster_connect_command(self): - services = Mock() - runner = CliRunner() - with patch("sigopt.orchestrate.controller.OrchestrateServiceBag", return_value=services): - result = runner.invoke(cli, ["cluster", "connect", "-n", "foobar", "--provider", "custom"]) - assert result.exit_code == 0 diff --git a/test/cli/test_cluster_create.py b/test/cli/test_cluster_create.py deleted file mode 100644 index 9de5fca9..00000000 --- a/test/cli/test_cluster_create.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from click.testing import CliRunner -from mock import Mock, patch - -from sigopt.cli import cli - - -class TestClusterCreateCli(object): - def test_cluster_create(self): - services = Mock() - runner = CliRunner() - with runner.isolated_filesystem(), patch( - "sigopt.orchestrate.controller.OrchestrateServiceBag", return_value=services - ): - open("cluster.yml", "w").close() - result = runner.invoke(cli, ["cluster", "create"]) - assert result.exit_code == 0 diff --git a/test/cli/test_cluster_destroy.py b/test/cli/test_cluster_destroy.py deleted file mode 100644 index d7abc63d..00000000 --- a/test/cli/test_cluster_destroy.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from click.testing import CliRunner -from mock import Mock, patch - -from sigopt.cli import cli - - -class TestClusterDestroyCli(object): - def test_cluster_destroy_command(self): - services = Mock() - cluster = Mock() - cluster.name = "foobar" - cluster.provider_string = "aws" - services.cluster_service.get_connected_cluster.return_value = cluster - runner = CliRunner() - with patch("sigopt.orchestrate.controller.OrchestrateServiceBag", return_value=services): - result = runner.invoke(cli, ["cluster", "destroy"]) - services.kubernetes_service.cleanup_for_destroy.assert_called_once() - services.cluster_service.destroy.assert_called_once_with( - cluster_name="foobar", - provider_string="aws", - ) - assert result.output.splitlines() == [ - "Destroying cluster foobar, this process may take 20-30 minutes or longer...", - "Successfully destroyed kubernetes cluster: foobar", - ] - assert result.exit_code == 0 diff --git a/test/cli/test_cluster_disconnect.py b/test/cli/test_cluster_disconnect.py deleted file mode 100644 index ddb689ff..00000000 --- a/test/cli/test_cluster_disconnect.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from click.testing import CliRunner -from mock import Mock, patch - -from sigopt.cli import cli - - -class TestClusterDisconnectCli(object): - def test_cluster_disconnect_command(self): - services = Mock() - runner = CliRunner() - with patch("sigopt.orchestrate.controller.OrchestrateServiceBag", return_value=services): - result = runner.invoke(cli, ["cluster", "disconnect"]) - assert result.exit_code == 0 diff --git a/test/cli/test_cluster_kubectl.py b/test/cli/test_cluster_kubectl.py deleted file mode 100644 index cdcdb63b..00000000 --- a/test/cli/test_cluster_kubectl.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from click.testing import CliRunner -from mock import patch - -from sigopt.cli import cli -from sigopt.orchestrate.kubernetes.service import ORCHESTRATE_NAMESPACE -from sigopt.orchestrate.paths import get_executable_path - - -class TestClusterKubectlCli(object): - @pytest.mark.parametrize( - "arguments", - [ - (), - ("-h",), - ("--help",), - ("--help",), - ("get", "--help"), - ("exec", "-ti", "po/helloworld", "--", "/bin/sh"), - ], - ) - def test_cluster_kubectl_command(self, arguments): - kubectl_env_dict = { - "KUBECONFIG": "dummy_kubeconfig", - "PATH": "/dummy/bin", - } - runner = CliRunner() - with patch("os.execvpe") as mock_execvpe, patch("sigopt.orchestrate.sigopt.service.get_connection"), patch( - "sigopt.orchestrate.kubectl.service.KubectlService.get_kubectl_env", - side_effect=[kubectl_env_dict], - ), patch( - "sigopt.orchestrate.cluster.service.ClusterService.assert_is_connected", - return_value="foobar", - ): - result = runner.invoke(cli, ["cluster", "kubectl", *arguments], catch_exceptions=False) - exec_path = get_executable_path("kubectl") - mock_execvpe.assert_called_once_with( - exec_path, - [exec_path, "--namespace", ORCHESTRATE_NAMESPACE, *arguments], - env=kubectl_env_dict, - ) - assert result.exit_code == 0 diff --git a/test/cli/test_cluster_run.py b/test/cli/test_cluster_run.py deleted file mode 100644 index 61f953be..00000000 --- a/test/cli/test_cluster_run.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from click.testing import CliRunner -from mock import Mock, patch - -from sigopt.cli import cli - - -class TestRunCli(object): - def test_orchestrate_run(self): - services = Mock() - runner = CliRunner() - with runner.isolated_filesystem(): - with patch("sigopt.orchestrate.controller.OrchestrateServiceBag", return_value=services), patch( - "sigopt.orchestrate.docker.service.DockerService.create" - ), patch( - "sigopt.orchestrate.docker.service.DockerService.get_repository_and_tag", - return_value=("docker.io/test", "123"), - ): - services.cluster_service.assert_is_connected = Mock() - services.gpu_options_validator_service.get_resource_options = Mock(return_value=None) - open("Dockerfile", "w").close() - result = runner.invoke(cli, ["cluster", "run", "echo", "hello"]) - assert result.exit_code == 0 diff --git a/test/cli/test_cluster_test.py b/test/cli/test_cluster_test.py deleted file mode 100644 index 541c54fd..00000000 --- a/test/cli/test_cluster_test.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from click.testing import CliRunner -from mock import Mock, patch - -from sigopt.cli import cli - - -class TestClusterTestCli(object): - def test_cluster_test_command(self): - services = Mock() - runner = CliRunner() - with patch("sigopt.orchestrate.controller.OrchestrateServiceBag", return_value=services), patch( - "sigopt.orchestrate.controller.DockerService" - ): - result = runner.invoke(cli, ["cluster", "test"]) - assert result.exit_code == 0 diff --git a/test/cli/test_files/import_hello.py b/test/cli/test_files/import_hello.py index 16079073..5fe084bf 100644 --- a/test/cli/test_files/import_hello.py +++ b/test/cli/test_files/import_hello.py @@ -2,3 +2,6 @@ # # SPDX-License-Identifier: MIT import print_hello # pylint: disable=unused-import + + +del print_hello diff --git a/test/client/test_interface.py b/test/client/test_interface.py index 08dda7a0..dcbf97da 100644 --- a/test/client/test_interface.py +++ b/test/client/test_interface.py @@ -15,6 +15,7 @@ class TestInterface(object): @pytest.yield_fixture def config_dict(self, autouse=True): + del autouse with mock.patch.dict(config._configuration, {}): yield config._configuration diff --git a/test/orchestrate/__init__.py b/test/orchestrate/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/aws/__init__.py b/test/orchestrate/aws/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/aws/service_test.py b/test/orchestrate/aws/service_test.py deleted file mode 100644 index 4766790d..00000000 --- a/test/orchestrate/aws/service_test.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.aws.service import AwsService, is_cuda_gpu_instance_type - - -class TestAwsService(object): - @pytest.fixture() - def aws_service(self): - services = Mock() - aws_services = Mock() - return AwsService(services, aws_services) - - def test_gpu_instance_type(self): - assert is_cuda_gpu_instance_type("p4d.24xlarge") - assert is_cuda_gpu_instance_type("p3.2xlarge") - assert is_cuda_gpu_instance_type("p3dn.24xlarge") - assert is_cuda_gpu_instance_type("p2.16xlarge") - assert is_cuda_gpu_instance_type("g4dn.xlarge") - assert is_cuda_gpu_instance_type("g4dn.metal") - assert is_cuda_gpu_instance_type("g3.16xlarge") - - assert not is_cuda_gpu_instance_type("g4ad.16xlarge") - assert not is_cuda_gpu_instance_type("f1.16xlarge") - assert not is_cuda_gpu_instance_type("c5.24xlarge") - assert not is_cuda_gpu_instance_type("t2.small") - - @pytest.fixture - def cpu_config(self): - return {"min_size": 1, "max_size": 2, "instance_type": "m5.large"} - - @pytest.fixture - def gpu_config(self): - return {"min_size": 1, "max_size": 2, "instance_type": "p3.2xlarge"} - - @pytest.mark.parametrize( - "cluster_name", - [ - "", - "inval_id1-123", - "also-invalid_cluster-name", - "123", - ], - ) - def test_invalid_cluster_name(self, aws_service, cluster_name, cpu_config, gpu_config): - with pytest.raises(AssertionError): - aws_service.validate_cluster_options(cluster_name, {"cpu": cpu_config, "gpu": gpu_config}, None) - - @pytest.mark.parametrize( - "cluster_name", - [ - "valid-123", - "also-valid-cluster-name", - ], - ) - def test_valid_cluster_names(self, aws_service, cluster_name, cpu_config, gpu_config): - aws_service.validate_cluster_options(cluster_name, {"cpu": cpu_config, "gpu": gpu_config}, None) - - @pytest.mark.parametrize( - "kubernetes_version", - [ - "1.9", - "1.15", - "not-a-version", - ], - ) - def test_invalid_kubernetes_version(self, aws_service, kubernetes_version, cpu_config, gpu_config): - with pytest.raises(AssertionError): - aws_service.validate_cluster_options("cluster-name", {"cpu": cpu_config, "gpu": gpu_config}, kubernetes_version) - - @pytest.mark.parametrize( - "kubernetes_version", - [ - None, - "1.20", - "1.23", - "latest", - ], - ) - def test_valid_kubernetes_versions(self, aws_service, kubernetes_version, cpu_config, gpu_config): - aws_service.validate_cluster_options("valid-name", {"cpu": cpu_config, "gpu": gpu_config}, kubernetes_version) - - def test_create_kubernetes_cluster_fail(self, aws_service, cpu_config, gpu_config): - with pytest.raises(AssertionError): - aws_service.create_kubernetes_cluster(dict(cluster_name="44_44", cpu=cpu_config, gpu=gpu_config)) diff --git a/test/orchestrate/cluster/__init__.py b/test/orchestrate/cluster/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/cluster/service_test.py b/test/orchestrate/cluster/service_test.py deleted file mode 100644 index be872ccb..00000000 --- a/test/orchestrate/cluster/service_test.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.cluster.errors import ( - AlreadyConnectedException, - MultipleClustersConnectionError, - NotConnectedError, - PleaseDisconnectError, -) -from sigopt.orchestrate.cluster.object import AWSCluster, CustomCluster -from sigopt.orchestrate.cluster.service import ClusterService -from sigopt.orchestrate.custom_cluster.service import CustomClusterService -from sigopt.orchestrate.exceptions import OrchestrateException -from sigopt.orchestrate.provider.constants import Provider - - -class TestClusterService(object): - @pytest.fixture - def services(self): - mock_services = Mock() - mock_services.get_option = Mock(return_value="bar") - - def fake_get_provider_service(provider): - if provider == Provider.AWS: - return Mock( - create_kubernetes_cluster=Mock( - return_value=AWSCluster( - services=mock_services, - name="foobar", - registry=None, - ) - ) - ) - elif provider == Provider.CUSTOM: - return CustomClusterService(mock_services) - else: - raise NotImplementedError() - - mock_services.provider_broker.get_provider_service = fake_get_provider_service - return mock_services - - @pytest.fixture - def cluster_service(self, services): - cluster_service = ClusterService(services) - services.cluster_service = cluster_service - return cluster_service - - def test_connected_clusters(self, cluster_service): - cluster_service.services.kubernetes_service.get_cluster_names.return_value = [] - assert cluster_service.connected_clusters() == [] - cluster_service.services.kubernetes_service.get_cluster_names.return_value = ["foo"] - assert cluster_service.connected_clusters() == ["foo"] - cluster_service.services.kubernetes_service.get_cluster_names.return_value = [ - "foo", - "bar", - ] - assert sorted(cluster_service.connected_clusters()) == ["bar", "foo"] - - def test_multiple_clusters(self, cluster_service): - cluster_service.connected_clusters = Mock(return_value=["bar", "foo"]) - - with pytest.raises(MultipleClustersConnectionError): - cluster_service.assert_is_connected() - with pytest.raises(MultipleClustersConnectionError): - cluster_service.assert_is_disconnected() - with pytest.raises(MultipleClustersConnectionError): - cluster_service.connect( - cluster_name=None, - provider_string="aws", - kubeconfig=None, - registry=None, - ) - with pytest.raises(MultipleClustersConnectionError): - cluster_service.create(None) - with pytest.raises(MultipleClustersConnectionError): - cluster_service.disconnect("bar", None) - with pytest.raises(MultipleClustersConnectionError): - cluster_service.test() - - cluster_service.disconnect(cluster_name=None, disconnect_all=True) - - # TODO: decide which permissions to validate for cluster destroy - cluster_service.destroy(None, "aws") - - def test_no_clusters(self, cluster_service): - cluster_service.connected_clusters = Mock(return_value=[]) - - with pytest.raises(NotConnectedError): - cluster_service.assert_is_connected() - with pytest.raises(NotConnectedError): - cluster_service.disconnect("bar", None) - with pytest.raises(NotConnectedError): - cluster_service.disconnect(cluster_name=None, disconnect_all=True) - with pytest.raises(NotConnectedError): - cluster_service.test() - - cluster_service.assert_is_disconnected() - cluster_service.test = Mock() - cluster_service.connect( - cluster_name="cluster_name", - provider_string="aws", - kubeconfig=None, - registry=None, - ) - assert cluster_service.test.call_count == 1 - cluster_service.create(dict(provider="aws")) - - # TODO: decide which permissions to validate for cluster destroy - cluster_service.destroy(None, "aws") - - def test_one_clusters(self, cluster_service): - cluster_service.connected_clusters = Mock(return_value=["foo"]) - cluster_service.services.cluster_metadata_service.read_metadata = Mock( - return_value=CustomCluster( - services=cluster_service.services, - name="foo", - registry=None, - ) - ) - - with pytest.raises(PleaseDisconnectError): - cluster_service.assert_is_disconnected() - with pytest.raises(PleaseDisconnectError): - cluster_service.connect( - cluster_name="bar", - provider_string="aws", - kubeconfig=None, - registry=None, - ) - with pytest.raises(PleaseDisconnectError): - cluster_service.create(dict(cluster_name="bar")) - with pytest.raises(PleaseDisconnectError): - cluster_service.disconnect("bar", None) - - cluster_service.assert_is_connected() - - with pytest.raises(AlreadyConnectedException): - cluster_service.connect( - cluster_name="foo", - provider_string="aws", - kubeconfig=None, - registry=None, - ) - with pytest.raises(AlreadyConnectedException): - cluster_service.create(dict(cluster_name="foo")) - - cluster_service.disconnect("foo", None) - cluster_service.disconnect(cluster_name=None, disconnect_all=True) - cluster_service.destroy("foo", "aws") - cluster_service.test() - - # TODO: decide which permissions to validate for cluster destroy - cluster_service.destroy("bar", "aws") - - def test_create_cluster(self, cluster_service): - cluster_service.connected_clusters = Mock(return_value=[]) - cluster_name = cluster_service.create(dict(provider="aws")) - assert cluster_name == "foobar" - - def test_create_cluster_fails(self, cluster_service): - # Mock this function so that cluster create things that we are not connected, and will try to creat a cluster - cluster_service.assert_is_disconnected = Mock() - # Mock this function so that cluster disconnect will think that we are connected to foobar - cluster_service.connected_clusters = Mock(return_value=["foobar"]) - - exc = Exception() - cluster_service.services.provider_broker.get_provider_service = Mock( - return_value=Mock( - create_kubernetes_cluster=Mock(side_effect=exc), - ), - ) - with pytest.raises(Exception) as e: - cluster_service.create(dict(cluster_name="foobar", provider="aws")) - assert e.value is exc - cluster_service.services.kubernetes_service.ensure_config_deleted.assert_called_with(cluster_name="foobar") - - def test_cluster_test(self, cluster_service): - cluster_service.connected_clusters = Mock(return_value=["foobar"]) - cluster_service.services.cluster_metadata_service.read_metadata = Mock( - return_value=CustomCluster( - services=cluster_service.services, - name="foobar", - registry=None, - ) - ) - - cluster = cluster_service.test() - assert cluster.name == "foobar" - assert cluster.provider_string == "custom" - - cluster_service.connected_clusters = Mock(return_value=["bar"]) - cluster_service.services.cluster_metadata_service.read_metadata = Mock( - return_value=AWSCluster( - services=cluster_service.services, - name="bar", - registry=None, - ) - ) - - cluster = cluster_service.test() - assert cluster.name == "bar" - assert cluster.provider_string == "aws" - - def test_cluster_connect(self, cluster_service): - cluster_service.connected_clusters = Mock(return_value=[]) - cluster_service.test = Mock() - cluster_service.connect( - cluster_name="bar", - provider_string="custom", - kubeconfig="foo", - registry=None, - ) - - cluster_service.connect( - cluster_name="bar", - provider_string="aws", - kubeconfig=None, - registry=None, - ) - - with pytest.raises(AssertionError): - cluster_service.connect( - cluster_name="bar", - provider_string="aws", - kubeconfig="foo", - registry=None, - ) - - with pytest.raises(OrchestrateException): - cluster_service.connect( - cluster_name="bar", - provider_string="custom", - kubeconfig=None, - registry=None, - ) diff --git a/test/orchestrate/cluster_metadata/__init__.py b/test/orchestrate/cluster_metadata/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/cluster_metadata/service_test.py b/test/orchestrate/cluster_metadata/service_test.py deleted file mode 100644 index 6e2855d5..00000000 --- a/test/orchestrate/cluster_metadata/service_test.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.cluster.object import AWSCluster -from sigopt.orchestrate.cluster_metadata.errors import * -from sigopt.orchestrate.cluster_metadata.service import ClusterMetadataService -from sigopt.orchestrate.common import TemporaryDirectory -from sigopt.orchestrate.custom_cluster.service import CustomClusterService -from sigopt.orchestrate.provider.constants import Provider - - -# pylint: disable=protected-access - - -class TestClusterService(object): - @pytest.fixture - def services(self): - mock_services = Mock() - - def fake_create_cluster_object(services, name, registry): - return AWSCluster( - services=services, - name=name, - registry=registry, - ) - - def fake_get_provider_service(provider): - if provider == Provider.AWS: - return Mock(create_cluster_object=fake_create_cluster_object) - elif provider == Provider.CUSTOM: - return CustomClusterService(mock_services) - else: - raise NotImplementedError() - - mock_services.provider_broker.get_provider_service = fake_get_provider_service - return mock_services - - @pytest.fixture - def cluster_metadata_service(self, services): - cluster_metadata_service = ClusterMetadataService(services) - services.cluster_metadata_service = cluster_metadata_service - return cluster_metadata_service - - @pytest.mark.parametrize( - "provider", - [ - Provider.AWS, - Provider.CUSTOM, - ], - ) - def test_custom_cluster(self, cluster_metadata_service, provider): - with TemporaryDirectory() as root_dirname: - cluster_metadata_service._metadata_dir = root_dirname - - provider_service = cluster_metadata_service.services.provider_broker.get_provider_service(provider) - cluster = provider_service.create_cluster_object( - services=cluster_metadata_service.services, - name="foobar", - registry=None, - ) - cluster_metadata_service.write_metadata(cluster) - - cluster = cluster_metadata_service.read_metadata("foobar") - assert cluster.name == "foobar" - assert cluster.provider == provider - - def test_double_write(self, cluster_metadata_service): - with TemporaryDirectory() as root_dirname: - cluster_metadata_service._metadata_dir = root_dirname - - custom_cluster_service = cluster_metadata_service.services.provider_broker.get_provider_service(Provider.CUSTOM) - cluster = custom_cluster_service.create_cluster_object( - services=cluster_metadata_service.services, - name="foobar", - registry=None, - ) - cluster_metadata_service.write_metadata(cluster) - - with pytest.raises(MetadataAlreadyExistsError): - cluster_metadata_service.write_metadata(cluster) - - def test_no_metadata(self, cluster_metadata_service): - with TemporaryDirectory() as root_dirname: - cluster_metadata_service._metadata_dir = root_dirname - - with pytest.raises(Exception): - cluster_metadata_service.read_metadata("foobar") diff --git a/test/orchestrate/common_test.py b/test/orchestrate/common_test.py deleted file mode 100644 index ba67737f..00000000 --- a/test/orchestrate/common_test.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import patch - -from sigopt.orchestrate.common import Platform, current_platform - - -class TestCurrentPlatform(object): - @pytest.mark.parametrize("platform", ["foobar.linux", "foobar", ""]) - def test_bad_platform(self, platform): - with patch("sigopt.orchestrate.common.sys.platform", platform): - with pytest.raises(Exception): - current_platform() - - def test_mac_platform(self): - with patch("sigopt.orchestrate.common.sys.platform", "darwin"): - assert current_platform() == Platform.MAC - - @pytest.mark.parametrize("platform", ["linux", "linux.foobar"]) - def test_linux_platform(self, platform): - with patch("sigopt.orchestrate.common.sys.platform", platform): - assert current_platform() == Platform.LINUX diff --git a/test/orchestrate/docker/__init__.py b/test/orchestrate/docker/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/docker/service_test.py b/test/orchestrate/docker/service_test.py deleted file mode 100644 index d9ee7d00..00000000 --- a/test/orchestrate/docker/service_test.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.docker.service import DockerService - - -class TestDockerService(object): - @pytest.fixture() - def docker_service(self): - services = Mock() - return DockerService(services, Mock()) - - def test_get_repository_and_tag_bad(self, docker_service): - bad_images = [ - "my.registry.com:port/username/repo:tag", - "username//repo", - "username/repo:tag1:tag2", - "", - "username/repo:tag/test", - "USERNAME/REPO:tag", - "username/repo:", - "username/repo: ", - ] - for image in bad_images: - with pytest.raises(AssertionError): - docker_service.get_repository_and_tag(image) - - def test_get_repository_and_tag_good(self, docker_service): - image = "username/repo:tag" - repository, tag = docker_service.get_repository_and_tag(image) - assert repository == "username/repo" and tag == "tag" - - image = "repo" - repository, tag = docker_service.get_repository_and_tag(image) - assert repository == "repo" and tag is None - - image = "username_1-2/repo_1-2:tag" - repository, tag = docker_service.get_repository_and_tag(image) - assert repository == "username_1-2/repo_1-2" and tag == "tag" - - image = "username/repo:tag_1-2" - repository, tag = docker_service.get_repository_and_tag(image) - assert repository == "username/repo" and tag == "tag_1-2" - - image = "username/repo:TAG" - repository, tag = docker_service.get_repository_and_tag(image) - assert repository == "username/repo" and tag == "TAG" - - image = "username/repo:tag1.2" - repository, tag = docker_service.get_repository_and_tag(image) - assert repository == "username/repo" and tag == "tag1.2" diff --git a/test/orchestrate/ecr/__init__.py b/test/orchestrate/ecr/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/ecr/service_test.py b/test/orchestrate/ecr/service_test.py deleted file mode 100644 index 10f8ebbf..00000000 --- a/test/orchestrate/ecr/service_test.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.ecr.service import AwsEcrService - - -class TestAwsEcrService(object): - @pytest.fixture - def orchestrate_services(self): - return Mock() - - @pytest.fixture - def aws_services(self): - return Mock() - - def test_constructor(self, orchestrate_services, aws_services): - ecr_service = AwsEcrService(orchestrate_services, aws_services) - assert ecr_service.client is not None diff --git a/test/orchestrate/gpu_options_validator/__init__.py b/test/orchestrate/gpu_options_validator/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/gpu_options_validator/service_test.py b/test/orchestrate/gpu_options_validator/service_test.py deleted file mode 100644 index 6e1bb477..00000000 --- a/test/orchestrate/gpu_options_validator/service_test.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.gpu_options_validator.service import RESOURCES_OPTION, GpuOptionsValidatorService - - -class TestOptionsValidatorService(object): - @pytest.fixture() - def gpu_options_validator_service(self): - services = Mock() - return GpuOptionsValidatorService(services) - - @pytest.mark.parametrize( - "input_gpus,expected_resources", - [ - (None, {}), - (0, {"gpus": 0}), - ], - ) - def test_get_resources_without_confirmation( - self, - gpu_options_validator_service, - input_gpus, - expected_resources, - ): - options = {} - if input_gpus is not None: - options[RESOURCES_OPTION] = {"gpus": input_gpus} - resource_options = gpu_options_validator_service.get_resource_options(options) - assert resource_options == expected_resources - - def test_get_local_without_gpus(self, gpu_options_validator_service): - options = {} - resource_options = gpu_options_validator_service.get_resource_options(options) - assert RESOURCES_OPTION not in resource_options diff --git a/test/orchestrate/job_runner/__init__.py b/test/orchestrate/job_runner/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/job_runner/service_test.py b/test/orchestrate/job_runner/service_test.py deleted file mode 100644 index 83a123e4..00000000 --- a/test/orchestrate/job_runner/service_test.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.job_runner.service import JobRunnerService -from sigopt.orchestrate.resource.service import ResourceService - - -class TestJobRunnerService(object): - @pytest.fixture - def services(self): - services = Mock(sigopt_service=Mock(api_token="sigopt_api_token")) - services.resource_service = ResourceService(services) - return services - - @pytest.fixture - def job_runner_service(self, services): - return JobRunnerService(services) - - @pytest.mark.parametrize( - "resources,expected_output", - [ - ( - dict(requests={"cpu": 5}), - dict( - requests={ - "cpu": 5, - "ephemeral-storage": JobRunnerService.DEFAULT_EPHEMERAL_STORAGE_REQUEST, - }, - limits={}, - ), - ), - ( - dict(requests={"cpu": "300m"}, gpus=1), - dict( - requests={ - "cpu": "300m", - "ephemeral-storage": JobRunnerService.DEFAULT_EPHEMERAL_STORAGE_REQUEST, - }, - limits={"nvidia.com/gpu": 1}, - ), - ), - ( - dict(requests={"cpu": "1"}, limits={"memory": "2Gi", "cpu": 2}, gpus=1), - dict( - requests={ - "cpu": "1", - "ephemeral-storage": JobRunnerService.DEFAULT_EPHEMERAL_STORAGE_REQUEST, - }, - limits={ - "memory": "2Gi", - "cpu": 2, - "nvidia.com/gpu": 1, - }, - ), - ), - ( - dict( - requests={"cpu": "1", "ephemeral-storage": "1Ti"}, - limits={"memory": "2Gi", "cpu": 2}, - gpus=1, - ), - dict( - requests={"cpu": "1", "ephemeral-storage": "1Ti"}, - limits={ - "memory": "2Gi", - "cpu": 2, - "nvidia.com/gpu": 1, - }, - ), - ), - ], - ) - def test_format_resources(self, job_runner_service, resources, expected_output): - job_runner_service.format_resources(resources) - assert resources == expected_output diff --git a/test/orchestrate/job_status/__init__.py b/test/orchestrate/job_status/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/job_status/service_test.py b/test/orchestrate/job_status/service_test.py deleted file mode 100644 index 7e2bc38c..00000000 --- a/test/orchestrate/job_status/service_test.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import MagicMock, Mock - -from sigopt.orchestrate.job_status.service import JobStatusService - - -class TestJobStatusService(object): - @pytest.fixture - def mock_sigopt_experiment(self): - return Mock( - name="my experiment", - budget=50, - progress=Mock(total_run_count=100), - ) - - @pytest.fixture - def services(self, mock_sigopt_experiment): - return Mock(sigopt_service=Mock(safe_fetch_experiment=Mock(return_value=mock_sigopt_experiment))) - - @pytest.fixture - def mock_job(self): - mock = MagicMock() - mock.metadata.name = "job" - mock.status.conditions = None - return mock - - @pytest.fixture - def job_status_service(self, services): - return JobStatusService(services) - - def test_parse_job_no_conditions(self, job_status_service, mock_job): - job_status_service.parse_job(mock_job) - - @pytest.mark.parametrize( - "conditions,expected_status", - [ - ([], "Not Complete"), - ([dict(status="True", type="Complete")], "Complete"), - ([dict(status="False", type="Complete")], "Not Complete"), - ([dict(status="Unknown", type="Complete")], "Maybe Complete"), - ( - [ - dict(status="True", type="Foo"), - dict(status="False", type="Bar"), - dict(status="Unknown", type="Baz"), - ], - "Foo, Not Bar, Maybe Baz", - ), - ], - ) - def test_parse_job_conditions(self, job_status_service, mock_job, conditions, expected_status): - mock_conditions = [] - for c in conditions: - mock_conditions.append(self.get_condition_mock(c["status"], c["type"])) - mock_job.status.conditions = mock_conditions - assert job_status_service.parse_job(mock_job)["status"] == expected_status - - def get_condition_mock(self, status, cond_type): - mock = Mock() - mock.status = status - mock.type = cond_type - return mock diff --git a/test/orchestrate/kubernetes/__init__.py b/test/orchestrate/kubernetes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/kubernetes/service_test.py b/test/orchestrate/kubernetes/service_test.py deleted file mode 100644 index eecb1db4..00000000 --- a/test/orchestrate/kubernetes/service_test.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from kubernetes import client -from mock import MagicMock, Mock, patch - -from sigopt.orchestrate.exceptions import NodesNotReadyError -from sigopt.orchestrate.kubernetes.service import ORCHESTRATE_NAMESPACE, KubernetesService - - -# pylint: disable=protected-access -class TestKubernetesService(object): - @pytest.fixture() - def kubernetes_service(self): - services = Mock() - return KubernetesService(services) - - def test_delete_job(self, kubernetes_service): - kubernetes_service._v1_batch = Mock() - kubernetes_service.delete_job("test_job_name") - - kubernetes_service._v1_batch.delete_namespaced_job.assert_called_with( - "test_job_name", - ORCHESTRATE_NAMESPACE, - body=client.V1DeleteOptions(), - ) - - def test_start_job(self, kubernetes_service): - kubernetes_service._v1_batch = Mock() - kubernetes_service.start_job("test_job_spec") - - kubernetes_service._v1_batch.create_namespaced_job.assert_called_with(ORCHESTRATE_NAMESPACE, "test_job_spec") - - def test_logs(self, kubernetes_service): - kubernetes_service._v1_core = Mock() - kubernetes_service.logs("foobar") - - kubernetes_service._v1_core.read_namespaced_pod_log.assert_called_with("foobar", ORCHESTRATE_NAMESPACE) - - def test_pod_names(self, kubernetes_service): - foo_mock = Mock() - foo_mock.metadata.name = "foo" - bar_mock = Mock() - bar_mock.metadata.name = "bar" - - get_pods_result = MagicMock() - get_pods_result.items = [foo_mock, bar_mock] - kubernetes_service.get_pods = Mock(return_value=get_pods_result) - - assert kubernetes_service.pod_names("baz") == ["foo", "bar"] - kubernetes_service.get_pods.assert_called_with(job_name="baz") - - def test_get_pods(self, kubernetes_service): - kubernetes_service._v1_core = Mock() - kubernetes_service.get_pods() - - kubernetes_service._v1_core.list_namespaced_pod.assert_called_with(ORCHESTRATE_NAMESPACE, watch=False) - - def test_get_pods_with_job_name(self, kubernetes_service): - kubernetes_service._v1_core = Mock() - kubernetes_service.get_pods("test_job_name") - - kubernetes_service._v1_core.list_namespaced_pod.assert_called_with( - ORCHESTRATE_NAMESPACE, watch=False, label_selector="job-name=test_job_name" - ) - - def test_wait_until_nodes_are_ready(self, kubernetes_service): - with patch("sigopt.orchestrate.kubernetes.service.time") as mock_time: - kubernetes_service.check_nodes_are_ready = Mock( - side_effect=[ - NodesNotReadyError("not ready"), - NodesNotReadyError("not ready"), - NodesNotReadyError("not ready"), - None, - None, - ] - ) - kubernetes_service.wait_until_nodes_are_ready() - assert kubernetes_service.check_nodes_are_ready.call_count == 4 - assert mock_time.sleep.called - - def test_check_nodes_are_ready(self, kubernetes_service): - ready_true_cond = Mock(status="True", type="Ready") - foobar_true_cond = Mock(status="True", type="foobar") - foobar_false_cond = Mock(status="False", type="foobar") - - node_mock1 = Mock(status=Mock(conditions=[ready_true_cond, foobar_true_cond])) - node_mock2 = Mock(status=Mock(conditions=[ready_true_cond, foobar_false_cond])) - - kubernetes_service.get_nodes = MagicMock() - kubernetes_service.get_nodes().items = [node_mock1, node_mock2] - - kubernetes_service.check_nodes_are_ready() - - def test_check_nodes_are_not_ready_status(self, kubernetes_service): - ready_false_cond = Mock(status="False", type="Ready") - foobar_true_cond = Mock(status="True", type="foobar") - - node_mock1 = Mock(status=Mock(conditions=[ready_false_cond, foobar_true_cond])) - node_mock2 = Mock(status=Mock(conditions=[ready_false_cond, foobar_true_cond])) - - kubernetes_service.get_nodes = MagicMock() - kubernetes_service.get_nodes().items = [node_mock1, node_mock2] - - with pytest.raises(NodesNotReadyError): - kubernetes_service.check_nodes_are_ready() - - def test_check_nodes_are_not_ready_no_nodes(self, kubernetes_service): - kubernetes_service.get_nodes = MagicMock() - kubernetes_service.get_nodes().items = [] - - with pytest.raises(NodesNotReadyError): - kubernetes_service.check_nodes_are_ready() - - def test_get_nodes(self, kubernetes_service): - kubernetes_service._v1_core = Mock() - kubernetes_service.get_nodes() - - kubernetes_service._v1_core.list_node.assert_called_with() - - def test_get_cluster_names(self, kubernetes_service): - kubernetes_service._get_config_files = MagicMock(return_value=["config-test-cluster"]) - assert kubernetes_service.get_cluster_names() == ["test-cluster"] - - kubernetes_service._get_config_files = MagicMock(return_value=[]) - assert kubernetes_service.get_cluster_names() == [] - - -# pylint: enable=protected-access diff --git a/test/orchestrate/lib/__init__.py b/test/orchestrate/lib/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/lib/lists_test.py b/test/orchestrate/lib/lists_test.py deleted file mode 100644 index 5023f638..00000000 --- a/test/orchestrate/lib/lists_test.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from sigopt.orchestrate.lib.lists import * - - -class TestLists(object): - def test_remove_nones(self): - assert remove_nones([]) == [] - assert remove_nones([False, None, [], 0, {}]) == [False, [], 0, {}] - assert remove_nones([False, None, [], 0, {}, 1, 2, 3, True, [1]]) == [ - False, - [], - 0, - {}, - 1, - 2, - 3, - True, - [1], - ] - assert remove_nones({}) == {} - assert remove_nones({"a": False, "b": None, "c": [], "d": 0, "e": {},}) == { - "a": False, - "c": [], - "d": 0, - "e": {}, - } - assert remove_nones({"a": False, "b": None, "c": [], "d": 0, "e": {}, "f": 1, "g": True,}) == { - "a": False, - "c": [], - "d": 0, - "e": {}, - "f": 1, - "g": True, - } - assert remove_nones({"a": {"b": None,},}) == { - "a": { - "b": None, - }, - } - assert remove_nones(set((1, "a", None))) == set((1, "a")) - assert remove_nones(set((1, "a"))) == set((1, "a")) - assert remove_nones(set()) == set() - - def test_coalesce(self): - assert coalesce() is None - assert coalesce(None) is None - assert coalesce(None, None) is None - assert coalesce(None, None, None) is None - assert coalesce(True) is True - assert coalesce(False) is False - assert coalesce(None, 1) == 1 - assert coalesce(None, 0) == 0 - assert coalesce(None, 0, 5) == 0 - assert coalesce(None, 1, 5) == 1 - - def test_list_get(self): - assert list_get([], 0) is None - assert list_get([], 100) is None - assert list_get([], -5) is None - assert list_get([1], 0) == 1 - assert list_get([1], -1) == 1 - assert list_get([1], 100) is None - assert list_get([1, 2, 3], 0) == 1 - assert list_get([1, 2, 3], 2) == 3 - assert list_get([1, 2, 3], -1) == 3 - assert list_get([1, 2, 3], -3) == 1 - assert list_get([1, 2, 3], 100) is None - - def test_partition(self): - assert partition([], lambda x: True) == ([], []) - assert partition([], lambda x: False) == ([], []) - assert partition([1, 2], lambda x: True) == ([1, 2], []) - assert partition([1, 2], lambda x: False) == ([], [1, 2]) - assert partition([1, 2, 3, 4], lambda x: x % 2 == 0) == ([2, 4], [1, 3]) - assert partition((i for i in range(1, 5)), lambda x: x % 2 == 0) == ([2, 4], [1, 3]) diff --git a/test/orchestrate/lib/types_test.py b/test/orchestrate/lib/types_test.py deleted file mode 100644 index 9bd84785..00000000 --- a/test/orchestrate/lib/types_test.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from sigopt.orchestrate.lib.types import * - - -class TestTypes(object): - def test_is_sequence(self): - assert is_sequence([]) - assert is_sequence([1, 2, 3]) - assert is_sequence(()) - assert is_sequence((1, 2, 3)) - - assert not is_sequence(None) - assert not is_sequence(False) - assert not is_sequence(True) - assert not is_sequence(0) - assert not is_sequence(1.0) - assert not is_sequence("abc") - assert not is_sequence({}) - assert not is_sequence({"a": 123}) - assert not is_sequence(set()) - assert not is_sequence(set((1, "a"))) - assert not is_sequence({1, "a"}) - assert not is_sequence(frozenset((1, "a"))) - - def test_is_string_sequence(self): - assert is_string_sequence([]) - assert is_string_sequence(()) - assert is_string_sequence(("a", "b", "c")) - assert is_string_sequence([""]) - assert is_string_sequence(["a"]) - assert is_string_sequence(["a", "", "b"]) - - assert not is_string_sequence(None) - assert not is_string_sequence(True) - assert not is_string_sequence(False) - assert not is_string_sequence((None, "a", "b")) - assert not is_string_sequence(["a", 1]) - assert not is_string_sequence(["b", None]) - assert not is_string_sequence(["a", "b", True, "c"]) - assert not is_string_sequence("string") - assert not is_string_sequence(("a", False)) - - def test_is_mapping(self): - assert is_mapping({}) - assert is_mapping({"a": 123}) - - assert not is_mapping([]) - assert not is_mapping([1, 2, 3]) - assert not is_mapping(()) - assert not is_mapping((1, 2, 3)) - assert not is_mapping(None) - assert not is_mapping(False) - assert not is_mapping(True) - assert not is_mapping(0) - assert not is_mapping(1.0) - assert not is_mapping("abc") - assert not is_mapping(set()) - assert not is_mapping(set((1, "a"))) - assert not is_mapping({1, "a"}) - assert not is_mapping(frozenset((1, "a"))) - - def test_is_set(self): - assert is_set(set()) - assert is_set(set((1, "a"))) - assert is_set({1, "a"}) - assert is_set(frozenset((1, "a"))) - - assert not is_set({}) - assert not is_set({"a": 123}) - assert not is_set([]) - assert not is_set([1, 2, 3]) - assert not is_set(()) - assert not is_set((1, 2, 3)) - assert not is_set(None) - assert not is_set(False) - assert not is_set(True) - assert not is_set(0) - assert not is_set(1.0) - assert not is_set("abc") - - def test_is_string(self): - assert is_string("") - assert is_string("abc") - assert is_string("123") - - assert not is_string(set()) - assert not is_string(set((1, "a"))) - assert not is_string({1, "a"}) - assert not is_string(frozenset((1, "a"))) - assert not is_string({}) - assert not is_string({"a": 123}) - assert not is_string([]) - assert not is_string([1, 2, 3]) - assert not is_string(()) - assert not is_string((1, 2, 3)) - assert not is_string(None) - assert not is_string(False) - assert not is_string(True) - assert not is_string(0) - assert not is_string(1.0) - - def test_is_integer(self): - assert is_integer(0) - assert is_integer(1) - assert is_integer(-1) - - assert not is_integer(set()) - assert not is_integer(set((1, "a"))) - assert not is_integer({1, "a"}) - assert not is_integer(frozenset((1, "a"))) - assert not is_integer({}) - assert not is_integer({"a": 123}) - assert not is_integer([]) - assert not is_integer([1, 2, 3]) - assert not is_integer(()) - assert not is_integer((1, 2, 3)) - assert not is_integer(None) - assert not is_integer(False) - assert not is_integer(True) - assert not is_integer(1.0) - assert not is_integer("") - assert not is_integer("abc") - assert not is_integer("123") - - def test_is_boolean(self): - assert is_boolean(True) - assert is_boolean(False) - - assert not is_boolean(set()) - assert not is_boolean(set((True, False))) - assert not is_boolean({True, False}) - assert not is_boolean(frozenset((True, False))) - assert not is_boolean({}) - assert not is_boolean({"a": True}) - assert not is_boolean([]) - assert not is_boolean([True, False]) - assert not is_boolean(()) - assert not is_boolean((True, False)) - assert not is_boolean("") - assert not is_boolean(None) - assert not is_boolean("abc") - assert not is_boolean("123") - assert not is_boolean("True") - assert not is_boolean("False") - assert not is_boolean("true") - assert not is_boolean("false") - assert not is_boolean(0) - assert not is_boolean(1) - assert not is_boolean(1.0) - assert not is_boolean(-1) diff --git a/test/orchestrate/model_packer/__init__.py b/test/orchestrate/model_packer/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/options_validator/__init__.py b/test/orchestrate/options_validator/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/options_validator/service_test.py b/test/orchestrate/options_validator/service_test.py deleted file mode 100644 index b45f5745..00000000 --- a/test/orchestrate/options_validator/service_test.py +++ /dev/null @@ -1,330 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.options_validator.service import OptionsValidatorService - - -class TestOptionsValidatorService(object): - @pytest.fixture() - def options_validator_service(self): - services = Mock() - return OptionsValidatorService(services) - - @pytest.mark.parametrize( - "resource", - [ - {"requests": {"cpu": 1, "memory": "200Gi"}, "gpus": 1}, - {"limits": {"cpu": "200m", "memory": 200}}, - {"requests": None, "gpus": 1}, - {"requests": {"cpu": 1, "memory": "200Gi"}, "gpus": None}, - ], - ) - def test_validate_resources(self, options_validator_service, resource): - options_validator_service.validate_resources(**resource) - - @pytest.mark.parametrize( - "resource", - [ - {"requests": {"cpu": 1, "memory": "200Gi"}, "gpus": -1}, - {"limits": {"cpu": "200m", "memory": 200}, "requests": 55}, - ], - ) - def test_orchestrate_resources_bad(self, options_validator_service, resource): - with pytest.raises(AssertionError): - options_validator_service.validate_resources(**resource) - - @pytest.mark.parametrize("gpus", [-1, [], dict()]) - def test_validate_resources_wrong_type(self, options_validator_service, gpus): - with pytest.raises(AssertionError): - options_validator_service.validate_resources(gpus=gpus) - - def test_validate_aws(self, options_validator_service): - options_validator_service.validate_aws_for_orchestrate( - aws_access_key_id="foobar", - aws_secret_access_key="barfoo", - ) - - options_validator_service.validate_aws_for_cluster( - aws_access_key_id="foobar", - aws_secret_access_key="barfoo", - additional_policies=["bar"], - ) - - def test_validate_aws_simple(self, options_validator_service): - options_validator_service.validate_aws_for_orchestrate() - options_validator_service.validate_aws_for_cluster() - - def test_validate_aws_rejects_ecr(self, options_validator_service): - with pytest.raises(TypeError): - options_validator_service.validate_aws_for_cluster( - ecr=dict( - image="orchestrate/test", - ), - ) - - with pytest.raises(TypeError): - options_validator_service.validate_aws_for_orchestrate( - ecr=dict( - image="orchestrate/test", - ), - ) - - with pytest.raises(TypeError): - options_validator_service.validate_aws_for_orchestrate( - ecr=dict(), - ) - - def test_validate_aws_additional_policies(self, options_validator_service): - options_validator_service.validate_aws_for_cluster(additional_policies=[]) - options_validator_service.validate_aws_for_cluster(additional_policies=None) - - with pytest.raises(AssertionError): - options_validator_service.validate_aws_for_cluster(additional_policies="policy") - - def test_validate_sigopt(self, options_validator_service): - options_validator_service.validate_sigopt( - api_token="foobar", - ) - - def test_validate_sigopt_simple(self, options_validator_service): - options_validator_service.validate_sigopt() - - @pytest.mark.parametrize("api_token", ["", 0]) - def test_validate_sigopt_wrong_value(self, options_validator_service, api_token): - with pytest.raises(AssertionError): - options_validator_service.validate_sigopt( - api_token=api_token, - ) - - def test_validate_cluster_options(self, options_validator_service): - options_validator_service.validate_cluster_options( - provider="aws", - cluster_name="test-cluster", - cpu=dict( - instance_type="t2.small", - min_nodes=1, - max_nodes=1, - ), - gpu=dict( - instance_type="p3.2xlarge", - min_nodes=2, - max_nodes=2, - ), - system=dict( - instance_type="t3.small", - min_nodes=1, - max_nodes=2, - ), - ) - - def test_validate_cluster_options_ok_missing_values(self, options_validator_service): - options_validator_service.validate_cluster_options( - cluster_name="test-cluster", - provider="custom", - cpu=dict( - instance_type="t2.small", - min_nodes=1, - max_nodes=1, - ), - gpu=dict( - instance_type="p3.2xlarge", - min_nodes=2, - max_nodes=2, - ), - system=dict( - instance_type="t3.small", - min_nodes=1, - max_nodes=2, - ), - ) - - options_validator_service.validate_cluster_options( - provider="aws", - cluster_name="test-cluster", - gpu=dict( - instance_type="p3.2xlarge", - min_nodes=2, - max_nodes=2, - ), - system=dict( - instance_type="t3.small", - min_nodes=1, - max_nodes=2, - ), - ) - - options_validator_service.validate_cluster_options( - provider="aws", - cluster_name="test-cluster", - cpu=dict( - instance_type="t2.small", - min_nodes=1, - max_nodes=1, - ), - system=dict( - instance_type="t3.small", - min_nodes=1, - max_nodes=2, - ), - ) - - @pytest.mark.parametrize("cluster_name", ["", None, dict()]) - def test_validate_cluster_options_cluster_name(self, options_validator_service, cluster_name): - with pytest.raises(AssertionError): - options_validator_service.validate_cluster_options( - provider="aws", - cluster_name=cluster_name, - cpu=dict( - instance_type="t2.small", - min_nodes=1, - max_nodes=1, - ), - system=dict( - instance_type="t3.small", - min_nodes=1, - max_nodes=2, - ), - ) - - def test_validate_cluster_options_extra_options(self, options_validator_service): - with pytest.raises(AssertionError): - options_validator_service.validate_cluster_options( - provider="aws", - cluster_name="test-cluster", - tpu=dict( - instance_type="p3.2xlarge", - min_nodes=2, - max_nodes=2, - ), - ) - - def test_validate_cluster_options_wrong_type(self, options_validator_service): - with pytest.raises(AssertionError): - options_validator_service.validate_cluster_options( - provider="aws", - cluster_name="test-cluster", - gpu=[ - dict( - instance_type="p3.2xlarge", - min_nodes=2, - max_nodes=2, - ) - ], - ) - - with pytest.raises(AssertionError): - options_validator_service.validate_cluster_options( - provider="aws", - cluster_name="test-cluster", - cpu=[ - dict( - instance_type="t2.small", - min_nodes=1, - max_nodes=1, - ) - ], - system=dict( - instance_type="t3.small", - min_nodes=1, - max_nodes=2, - ), - ) - - def test_validate_cluster_options_ignore_values(self, options_validator_service): - options_validator_service.validate_cluster_options( - provider="aws", - cluster_name="test-cluster", - cpu=dict( - instance_type="t2.small", - min_nodes=1, - max_nodes=1, - ), - system=dict( - instance_type="t3.small", - min_nodes=1, - max_nodes=2, - ), - ) - - def test_validate_worker_stack(self, options_validator_service): - options_validator_service.validate_worker_stack( - name="cpu", - instance_type="t2.small", - min_nodes=1, - max_nodes=1, - ) - - def test_validate_worker_stack_ignores_values(self, options_validator_service): - options_validator_service.validate_worker_stack( - name="foobar", - instance_type="bazzle", - min_nodes=2, - max_nodes=19, - ) - - def test_validate_worker_stack_missing_options(self, options_validator_service): - with pytest.raises(AssertionError): - options_validator_service.validate_worker_stack( - name="cpu", - min_nodes=1, - max_nodes=1, - ) - - with pytest.raises(AssertionError): - options_validator_service.validate_worker_stack( - name="cpu", - instance_type="t2.small", - max_nodes=1, - ) - - with pytest.raises(AssertionError): - options_validator_service.validate_worker_stack( - name="cpu", - instance_type="t2.small", - min_nodes=1, - ) - - def test_validate_worker_stack_wrong_type(self, options_validator_service): - with pytest.raises(AssertionError): - options_validator_service.validate_worker_stack( - name="cpu", - instance_type=2, - min_nodes=1, - max_nodes=1, - ) - - with pytest.raises(AssertionError): - options_validator_service.validate_worker_stack( - name="cpu", - instance_type="t2.small", - min_nodes="1", - max_nodes=1, - ) - - with pytest.raises(AssertionError): - options_validator_service.validate_worker_stack( - name="cpu", - instance_type="t2.small", - min_nodes=1, - max_nodes="1", - ) - - def test_validate_worker_stack_negative(self, options_validator_service): - with pytest.raises(AssertionError): - options_validator_service.validate_worker_stack( - name="cpu", - instance_type="t2.small", - min_nodes=-1, - max_nodes=1, - ) - - with pytest.raises(AssertionError): - options_validator_service.validate_worker_stack( - name="cpu", - instance_type="t2.small", - min_nodes=1, - max_nodes=-1, - ) diff --git a/test/orchestrate/provider/__init__.py b/test/orchestrate/provider/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/provider/broker_test.py b/test/orchestrate/provider/broker_test.py deleted file mode 100644 index 78a1709a..00000000 --- a/test/orchestrate/provider/broker_test.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.aws.service import AwsService -from sigopt.orchestrate.custom_cluster.service import CustomClusterService -from sigopt.orchestrate.provider.broker import ProviderBroker -from sigopt.orchestrate.provider.constants import Provider, string_to_provider - - -class TestProviderBroker(object): - @pytest.fixture - def services(self): - return Mock( - get_option=Mock(return_value="foo"), - ) - - @pytest.fixture - def provider_broker(self, services): - return ProviderBroker(services) - - def test_get_provider_service(self, provider_broker, services): - assert isinstance(provider_broker.get_provider_service(string_to_provider("aws")), AwsService) - assert isinstance(provider_broker.get_provider_service(string_to_provider("AWS")), AwsService) - assert isinstance(provider_broker.get_provider_service(Provider.AWS), AwsService) - - def test_custom_provider(self, provider_broker, services): - assert isinstance( - provider_broker.get_provider_service(string_to_provider("custom")), - CustomClusterService, - ) - assert isinstance(provider_broker.get_provider_service(Provider.CUSTOM), CustomClusterService) - - def test_unknown_provider(self, provider_broker): - with pytest.raises(NotImplementedError): - provider_broker.get_provider_service(0) diff --git a/test/orchestrate/resource/__init__.py b/test/orchestrate/resource/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/resource/service_test.py b/test/orchestrate/resource/service_test.py deleted file mode 100644 index 498a034b..00000000 --- a/test/orchestrate/resource/service_test.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import os - -import pytest - -from sigopt.orchestrate.resource.service import ResourceService - - -TEST_MODULE = "test" -TEST_FILE = "test_file.txt" -ACTUAL_TEXT = "This is a test file for testing the resource service.\n".encode() - - -class TestResourceService(object): - @pytest.fixture - def resource_service(self): - return ResourceService(None) - - def test_resource_stream(self, resource_service): - with resource_service.stream(TEST_MODULE, TEST_FILE) as stream: - assert ACTUAL_TEXT == stream.read() - - def test_resource_open(self, resource_service): - with resource_service.open(TEST_MODULE, TEST_FILE) as test_fp: - assert ACTUAL_TEXT == test_fp.read() - with open(test_fp.name, mode="rb") as second_open: - assert ACTUAL_TEXT == second_open.read() - - def test_tempfile_removed(self, resource_service): - test_fp = resource_service.open(TEST_MODULE, TEST_FILE) - test_fp.close() - assert not os.path.isfile(test_fp.name) - - def test_resource_read(self, resource_service): - assert ACTUAL_TEXT == resource_service.read(TEST_MODULE, TEST_FILE) diff --git a/test/orchestrate/services/__init__.py b/test/orchestrate/services/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/services/aws_provider_bag_test.py b/test/orchestrate/services/aws_provider_bag_test.py deleted file mode 100644 index 068de159..00000000 --- a/test/orchestrate/services/aws_provider_bag_test.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.services.aws_provider_bag import AwsProviderServiceBag - - -class TestOrchestrateServiceBag(object): - @pytest.fixture - def orchestrate_services(self): - return Mock() - - def test_orchestrate_service_bag(self, orchestrate_services): - services = AwsProviderServiceBag(orchestrate_services) - assert services.cloudformation_service is not None - assert services.cloudformation_service.client is not None - assert services.cloudformation_service.cloudformation is not None - assert services.ec2_service is not None - assert services.ec2_service.ec2 is not None - assert services.ecr_service is not None - assert services.ecr_service.client is not None - assert services.eks_service is not None - assert services.eks_service.client is not None - assert services.iam_service is not None - assert services.iam_service.client is not None - assert services.iam_service.iam is not None - assert services.sts_service is not None - assert services.sts_service.client is not None diff --git a/test/orchestrate/services/base_test.py b/test/orchestrate/services/base_test.py deleted file mode 100644 index f00a1c83..00000000 --- a/test/orchestrate/services/base_test.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -from mock import Mock - -from sigopt.orchestrate.services.base import Service - - -class TestService(object): - def test_services(self): - mock_services = Mock() - services = Service(mock_services) - assert services.services is not None diff --git a/test/orchestrate/sigopt/service_test.py b/test/orchestrate/sigopt/service_test.py deleted file mode 100644 index f1ebea1b..00000000 --- a/test/orchestrate/sigopt/service_test.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import os - -import pytest -from mock import Mock, patch - -from sigopt.orchestrate.sigopt.service import SigOptService - - -class TestSigOptService(object): - @pytest.fixture - def services(self): - return Mock() - - def test_reads_from_environment(self, services): - with patch.dict( - os.environ, - dict(SIGOPT_API_TOKEN="foobar", SIGOPT_API_URL="https://api-env.sigopt.com"), - ): - sigopt_service = SigOptService(services) - assert sigopt_service.conn is not None - assert sigopt_service.api_token == "foobar" - assert sigopt_service.api_url == "https://api-env.sigopt.com" diff --git a/test/orchestrate/sts/__init__.py b/test/orchestrate/sts/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/orchestrate/sts/service_test.py b/test/orchestrate/sts/service_test.py deleted file mode 100644 index 3c9d3340..00000000 --- a/test/orchestrate/sts/service_test.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright © 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -import pytest -from mock import Mock - -from sigopt.orchestrate.sts.service import AwsStsService - - -class TestAwsStsService(object): - @pytest.fixture - def orchestrate_services(self): - return Mock() - - @pytest.fixture - def aws_services(self): - return Mock() - - def test_constructor(self, orchestrate_services, aws_services): - sts_service = AwsStsService(orchestrate_services, aws_services) - assert sts_service.client is not None diff --git a/tools/generate_vulture_allowlist b/tools/generate_vulture_allowlist index d1408e52..968e6bfa 100755 --- a/tools/generate_vulture_allowlist +++ b/tools/generate_vulture_allowlist @@ -4,6 +4,6 @@ import subprocess cmd = "./tools/run_vulture.sh . --make-whitelist" -out = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True).stdout.rstrip() +proc = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=False) pwd = os.getcwd() -print(out.replace(pwd + "/", "")) +print(proc.stdout.rstrip().replace(pwd + "/", "")) diff --git a/tools/run_vulture.sh b/tools/run_vulture.sh index e8134e54..b6bf5fd5 100755 --- a/tools/run_vulture.sh +++ b/tools/run_vulture.sh @@ -5,4 +5,4 @@ set -e set -o pipefail -exec vulture --exclude="build,venv" --ignore-decorators="@click.*,@sigopt_cli.*,@pytest.*" "$@" +exec vulture --exclude="build,venv" --ignore-decorators="@click.*,@sigopt_cli.*,@public,@pytest.*" --ignore-names="side_effect" "$@"