[RLlib] Various updates to the Release CI RLlib (#36883)

Signed-off-by: Avnish <avnishnarayan@gmail.com>
ray-project · Jun 30, 2023 · 1b63128
1 parent 0db7f95
commit 1b63128
Show file tree

Hide file tree

Showing 15 changed files with 27 additions and 107 deletions.
diff --git a/release/rllib_tests/1gpu_16cpus.yaml b/release/rllib_tests/1gpu_16cpus.yaml
@@ -5,7 +5,7 @@ max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: g3.4xlarge
+    instance_type: g5.4xlarge
 
 worker_node_types: []
 

diff --git a/release/rllib_tests/1gpu_16cpus_gce.yaml b/release/rllib_tests/1gpu_16cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+    instance_type: n1-standard-16-nvidia-v100-16gb-1
 
 worker_node_types: []
 

diff --git a/release/rllib_tests/1gpu_24cpus.yaml b/release/rllib_tests/1gpu_24cpus.yaml
@@ -5,7 +5,7 @@ max_workers: 2
 
 head_node_type:
     name: head_node
-    instance_type: g3.4xlarge
+    instance_type: g5.4xlarge
 
 worker_node_types:
     - name: worker_node

diff --git a/release/rllib_tests/1gpu_24cpus_gce.yaml b/release/rllib_tests/1gpu_24cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 2
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+    instance_type: n1-standard-16-nvidia-v100-16gb-1
 
 worker_node_types:
     - name: worker_node

diff --git a/release/rllib_tests/1gpu_32cpus.yaml b/release/rllib_tests/1gpu_32cpus.yaml
diff --git a/release/rllib_tests/1gpu_32cpus_gce.yaml b/release/rllib_tests/1gpu_32cpus_gce.yaml
diff --git a/release/rllib_tests/1gpu_4cpus_gce.yaml b/release/rllib_tests/1gpu_4cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-4-nvidia-tesla-t4-1 # p2.xlarge
+    instance_type: n1-standard-4-nvidia-v100-16gb-1
 
 worker_node_types: []
 

diff --git a/release/rllib_tests/4gpus_64cpus.yaml b/release/rllib_tests/4gpus_64cpus.yaml
@@ -1,13 +1,18 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 2
 
 head_node_type:
     name: head_node
-    instance_type: g3.16xlarge
+    instance_type: g5.12xlarge
 
-worker_node_types: []
+worker_node_types:
+    - name: worker_node
+      instance_type: m5.4xlarge
+      min_workers: 1
+      max_workers: 1
+      use_spot: false
 
 aws:
     BlockDeviceMappings:

diff --git a/release/rllib_tests/4gpus_64cpus_gce.yaml b/release/rllib_tests/4gpus_64cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: n1-highmem-64-nvidia-tesla-v100-8 # g3.16xlarge
+    instance_type: n1-standard-64-nvidia-v100-16gb-4
 
 worker_node_types: []
 

diff --git a/release/rllib_tests/8gpus_96cpus.yaml b/release/rllib_tests/8gpus_96cpus.yaml
@@ -1,11 +1,11 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 2
 
 head_node_type:
     name: head_node
-    instance_type: g4dn.metal
+    instance_type: g5.24xlarge
 
 worker_node_types: []
 

diff --git a/release/rllib_tests/8gpus_96cpus_gce.yaml b/release/rllib_tests/8gpus_96cpus_gce.yaml
@@ -3,18 +3,13 @@ region: us-west1
 allowed_azs:
     - us-west1-b
 
-max_workers: 1
+max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-64-nvidia-tesla-t4-4
+    instance_type: n1-standard-96-nvidia-v100-16gb-8
 
-worker_node_types:
-    - name: worker_node
-      instance_type: n1-standard-32-nvidia-tesla-t4-4
-      min_workers: 1
-      max_workers: 1
-      use_spot: false
+worker_node_types: []
 
 gcp_advanced_configurations_json:
   instance_properties:

diff --git a/release/rllib_tests/app_config.yaml b/release/rllib_tests/app_config.yaml
@@ -1,4 +1,4 @@
-base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
+base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }}
 env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"}
 debian_packages:
   - unzip
@@ -32,9 +32,8 @@ post_build_cmds:
   # Clone the rl-experiments repo for offline-RL files.
   - git clone https://github.com/ray-project/rl-experiments.git
   - unzip rl-experiments/halfcheetah-sac/2022-12-17/halfcheetah_1500_mean_reward_sac.zip -d ~/.
-  # Use torch+CUDA10.2 for our release tests. CUDA11.x has known performance issues in combination with torch+GPU+CNNs
-  # TODO(sven): remove once nightly image gets upgraded.
-  - pip3 install torch==1.12.1+cu102 torchvision==0.13.1+cu102 --extra-index-url https://download.pytorch.org/whl/cu102
+
+  - pip3 install torch==2.0.0+cu118 torchvision==0.15.1+cu118 --index-url https://download.pytorch.org/whl/cu118
 
   # TODO(sven): remove once nightly image gets gymnasium and the other new dependencies.
   - wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz

diff --git a/release/rllib_tests/debug_app_config.yaml b/release/rllib_tests/debug_app_config.yaml
@@ -1,4 +1,4 @@
-base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
+base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }}
 env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"}
 debian_packages:
   - unzip

diff --git a/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml b/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml
@@ -153,6 +153,8 @@ ppo-stateless-cartpole:
             attention_position_wise_mlp_dim: 32
         # Double batch size (2 GPUs).
         train_batch_size: 8000
+        _enable_learner_api: false
+        _enable_rl_module_api: false
 
 # TODO (Kourosh): Activate these tests back when the new modeling stack is merged
 # r2d2-stateless-cartpole:

diff --git a/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml b/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml
@@ -133,32 +133,5 @@ ppo-stateless-cartpole:
             use_lstm: true
         # Double batch size (2 GPUs).
         train_batch_size: 8000
-
-# TODO (Kourosh): Activate these tests back when the new modeling stack is merged
-# r2d2-stateless-cartpole:
-#     env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole
-#     run: R2D2
-#     # Minimum reward and total ts (in given time_total_s) to pass this test.
-#     pass_criteria:
-#         sampler_results/episode_reward_mean: 150.0
-#         timesteps_total: 65000
-#     stop:
-#         time_total_s: 800
-#     config:
-#         num_gpus: 2
-#         num_workers: 0
-#         # R2D2 settings.
-#         burn_in: 20
-#         zero_init_states: true
-#         lr: 0.0005
-#         # Give some more time to explore.
-#         exploration_config:
-#           epsilon_timesteps: 50000
-#         model:
-#             # Test w/ LSTMs.
-#             use_lstm: true
-#             lstm_cell_size: 64
-#             # Use a very simple base-model.
-#             fcnet_hiddens: [64]
-#             fcnet_activation: linear
-#             max_seq_len: 20
+        _enable_learner_api: false
+        _enable_rl_module_api: false