From 1b6312823c22c18fd3bb9422a9f181676f1158ab Mon Sep 17 00:00:00 2001 From: Avnish Narayan <38871737+avnishn@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:35:20 -0700 Subject: [PATCH] [RLlib] Various updates to the Release CI RLlib (#36883) Signed-off-by: Avnish --- release/rllib_tests/1gpu_16cpus.yaml | 2 +- release/rllib_tests/1gpu_16cpus_gce.yaml | 2 +- release/rllib_tests/1gpu_24cpus.yaml | 2 +- release/rllib_tests/1gpu_24cpus_gce.yaml | 2 +- release/rllib_tests/1gpu_32cpus.yaml | 22 ------------- release/rllib_tests/1gpu_32cpus_gce.yaml | 32 ------------------- release/rllib_tests/1gpu_4cpus_gce.yaml | 2 +- release/rllib_tests/4gpus_64cpus.yaml | 11 +++++-- release/rllib_tests/4gpus_64cpus_gce.yaml | 2 +- release/rllib_tests/8gpus_96cpus.yaml | 4 +-- release/rllib_tests/8gpus_96cpus_gce.yaml | 11 ++----- release/rllib_tests/app_config.yaml | 7 ++-- release/rllib_tests/debug_app_config.yaml | 2 +- ...lti_gpu_with_attention_learning_tests.yaml | 2 ++ .../multi_gpu_with_lstm_learning_tests.yaml | 31 ++---------------- 15 files changed, 27 insertions(+), 107 deletions(-) delete mode 100644 release/rllib_tests/1gpu_32cpus.yaml delete mode 100644 release/rllib_tests/1gpu_32cpus_gce.yaml diff --git a/release/rllib_tests/1gpu_16cpus.yaml b/release/rllib_tests/1gpu_16cpus.yaml index 36adc09444f7..2a0cdea1c0b3 100644 --- a/release/rllib_tests/1gpu_16cpus.yaml +++ b/release/rllib_tests/1gpu_16cpus.yaml @@ -5,7 +5,7 @@ max_workers: 0 head_node_type: name: head_node - instance_type: g3.4xlarge + instance_type: g5.4xlarge worker_node_types: [] diff --git a/release/rllib_tests/1gpu_16cpus_gce.yaml b/release/rllib_tests/1gpu_16cpus_gce.yaml index c534041d0863..f0ad9d505d4a 100644 --- a/release/rllib_tests/1gpu_16cpus_gce.yaml +++ b/release/rllib_tests/1gpu_16cpus_gce.yaml @@ -7,7 +7,7 @@ max_workers: 0 head_node_type: name: head_node - instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge + instance_type: n1-standard-16-nvidia-v100-16gb-1 worker_node_types: [] diff --git a/release/rllib_tests/1gpu_24cpus.yaml b/release/rllib_tests/1gpu_24cpus.yaml index 9a7351f981a9..af4def71489d 100644 --- a/release/rllib_tests/1gpu_24cpus.yaml +++ b/release/rllib_tests/1gpu_24cpus.yaml @@ -5,7 +5,7 @@ max_workers: 2 head_node_type: name: head_node - instance_type: g3.4xlarge + instance_type: g5.4xlarge worker_node_types: - name: worker_node diff --git a/release/rllib_tests/1gpu_24cpus_gce.yaml b/release/rllib_tests/1gpu_24cpus_gce.yaml index fdb6665c7f01..ec79552e4984 100644 --- a/release/rllib_tests/1gpu_24cpus_gce.yaml +++ b/release/rllib_tests/1gpu_24cpus_gce.yaml @@ -7,7 +7,7 @@ max_workers: 2 head_node_type: name: head_node - instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge + instance_type: n1-standard-16-nvidia-v100-16gb-1 worker_node_types: - name: worker_node diff --git a/release/rllib_tests/1gpu_32cpus.yaml b/release/rllib_tests/1gpu_32cpus.yaml deleted file mode 100644 index 88270242d131..000000000000 --- a/release/rllib_tests/1gpu_32cpus.yaml +++ /dev/null @@ -1,22 +0,0 @@ -cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west-2 - -max_workers: 7 - -head_node_type: - name: head_node - instance_type: g3s.xlarge - -worker_node_types: - - name: worker_node - instance_type: m5.xlarge - min_workers: 7 - max_workers: 7 - use_spot: false - -aws: - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - DeleteOnTermination: true - VolumeSize: 500 diff --git a/release/rllib_tests/1gpu_32cpus_gce.yaml b/release/rllib_tests/1gpu_32cpus_gce.yaml deleted file mode 100644 index 8b76571eefe2..000000000000 --- a/release/rllib_tests/1gpu_32cpus_gce.yaml +++ /dev/null @@ -1,32 +0,0 @@ -cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west1 -allowed_azs: - - us-west1-b - -max_workers: 7 - -head_node_type: - name: head_node - instance_type: n1-highmem-4-nvidia-tesla-v100-1 # g3s.xlarge - -worker_node_types: - - name: worker_node - instance_type: n2-standard-4 # m5.xlarge - min_workers: 7 - max_workers: 7 - use_spot: false - -gcp_advanced_configurations_json: - instance_properties: - disks: - - boot: true - auto_delete: true - initialize_params: - disk_size_gb: 500 - -#aws: -# BlockDeviceMappings: -# - DeviceName: /dev/sda1 -# Ebs: -# DeleteOnTermination: true -# VolumeSize: 500 diff --git a/release/rllib_tests/1gpu_4cpus_gce.yaml b/release/rllib_tests/1gpu_4cpus_gce.yaml index 156fda105107..b239d699f91c 100644 --- a/release/rllib_tests/1gpu_4cpus_gce.yaml +++ b/release/rllib_tests/1gpu_4cpus_gce.yaml @@ -7,7 +7,7 @@ max_workers: 0 head_node_type: name: head_node - instance_type: n1-standard-4-nvidia-tesla-t4-1 # p2.xlarge + instance_type: n1-standard-4-nvidia-v100-16gb-1 worker_node_types: [] diff --git a/release/rllib_tests/4gpus_64cpus.yaml b/release/rllib_tests/4gpus_64cpus.yaml index cf61a583c078..c0f4c76ee300 100644 --- a/release/rllib_tests/4gpus_64cpus.yaml +++ b/release/rllib_tests/4gpus_64cpus.yaml @@ -1,13 +1,18 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 0 +max_workers: 2 head_node_type: name: head_node - instance_type: g3.16xlarge + instance_type: g5.12xlarge -worker_node_types: [] +worker_node_types: + - name: worker_node + instance_type: m5.4xlarge + min_workers: 1 + max_workers: 1 + use_spot: false aws: BlockDeviceMappings: diff --git a/release/rllib_tests/4gpus_64cpus_gce.yaml b/release/rllib_tests/4gpus_64cpus_gce.yaml index dcddaa23c70f..82b95a8b4fdc 100644 --- a/release/rllib_tests/4gpus_64cpus_gce.yaml +++ b/release/rllib_tests/4gpus_64cpus_gce.yaml @@ -7,7 +7,7 @@ max_workers: 0 head_node_type: name: head_node - instance_type: n1-highmem-64-nvidia-tesla-v100-8 # g3.16xlarge + instance_type: n1-standard-64-nvidia-v100-16gb-4 worker_node_types: [] diff --git a/release/rllib_tests/8gpus_96cpus.yaml b/release/rllib_tests/8gpus_96cpus.yaml index 7cae28446a12..d9509c85dbda 100644 --- a/release/rllib_tests/8gpus_96cpus.yaml +++ b/release/rllib_tests/8gpus_96cpus.yaml @@ -1,11 +1,11 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 0 +max_workers: 2 head_node_type: name: head_node - instance_type: g4dn.metal + instance_type: g5.24xlarge worker_node_types: [] diff --git a/release/rllib_tests/8gpus_96cpus_gce.yaml b/release/rllib_tests/8gpus_96cpus_gce.yaml index b249a5a2ed01..ea7832ae22be 100644 --- a/release/rllib_tests/8gpus_96cpus_gce.yaml +++ b/release/rllib_tests/8gpus_96cpus_gce.yaml @@ -3,18 +3,13 @@ region: us-west1 allowed_azs: - us-west1-b -max_workers: 1 +max_workers: 0 head_node_type: name: head_node - instance_type: n1-standard-64-nvidia-tesla-t4-4 + instance_type: n1-standard-96-nvidia-v100-16gb-8 -worker_node_types: - - name: worker_node - instance_type: n1-standard-32-nvidia-tesla-t4-4 - min_workers: 1 - max_workers: 1 - use_spot: false +worker_node_types: [] gcp_advanced_configurations_json: instance_properties: diff --git a/release/rllib_tests/app_config.yaml b/release/rllib_tests/app_config.yaml index dc765eab31c3..fc3084dbd1f6 100755 --- a/release/rllib_tests/app_config.yaml +++ b/release/rllib_tests/app_config.yaml @@ -1,4 +1,4 @@ -base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }} +base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }} env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"} debian_packages: - unzip @@ -32,9 +32,8 @@ post_build_cmds: # Clone the rl-experiments repo for offline-RL files. - git clone https://github.com/ray-project/rl-experiments.git - unzip rl-experiments/halfcheetah-sac/2022-12-17/halfcheetah_1500_mean_reward_sac.zip -d ~/. - # Use torch+CUDA10.2 for our release tests. CUDA11.x has known performance issues in combination with torch+GPU+CNNs - # TODO(sven): remove once nightly image gets upgraded. - - pip3 install torch==1.12.1+cu102 torchvision==0.13.1+cu102 --extra-index-url https://download.pytorch.org/whl/cu102 + + - pip3 install torch==2.0.0+cu118 torchvision==0.15.1+cu118 --index-url https://download.pytorch.org/whl/cu118 # TODO(sven): remove once nightly image gets gymnasium and the other new dependencies. - wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz diff --git a/release/rllib_tests/debug_app_config.yaml b/release/rllib_tests/debug_app_config.yaml index 7b07f4c2eea7..aecbaa914ad5 100755 --- a/release/rllib_tests/debug_app_config.yaml +++ b/release/rllib_tests/debug_app_config.yaml @@ -1,4 +1,4 @@ -base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }} +base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }} env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"} debian_packages: - unzip diff --git a/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml b/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml index 8491f98a81f9..a5b3c3a7f51b 100644 --- a/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml +++ b/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml @@ -153,6 +153,8 @@ ppo-stateless-cartpole: attention_position_wise_mlp_dim: 32 # Double batch size (2 GPUs). train_batch_size: 8000 + _enable_learner_api: false + _enable_rl_module_api: false # TODO (Kourosh): Activate these tests back when the new modeling stack is merged # r2d2-stateless-cartpole: diff --git a/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml b/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml index 911c8ba0e8ef..e5a218a83dcc 100644 --- a/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml +++ b/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml @@ -133,32 +133,5 @@ ppo-stateless-cartpole: use_lstm: true # Double batch size (2 GPUs). train_batch_size: 8000 - -# TODO (Kourosh): Activate these tests back when the new modeling stack is merged -# r2d2-stateless-cartpole: -# env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole -# run: R2D2 -# # Minimum reward and total ts (in given time_total_s) to pass this test. -# pass_criteria: -# sampler_results/episode_reward_mean: 150.0 -# timesteps_total: 65000 -# stop: -# time_total_s: 800 -# config: -# num_gpus: 2 -# num_workers: 0 -# # R2D2 settings. -# burn_in: 20 -# zero_init_states: true -# lr: 0.0005 -# # Give some more time to explore. -# exploration_config: -# epsilon_timesteps: 50000 -# model: -# # Test w/ LSTMs. -# use_lstm: true -# lstm_cell_size: 64 -# # Use a very simple base-model. -# fcnet_hiddens: [64] -# fcnet_activation: linear -# max_seq_len: 20 + _enable_learner_api: false + _enable_rl_module_api: false