From 1b6312823c22c18fd3bb9422a9f181676f1158ab Mon Sep 17 00:00:00 2001
From: Avnish Narayan <38871737+avnishn@users.noreply.github.com>
Date: Fri, 30 Jun 2023 12:35:20 -0700
Subject: [PATCH] [RLlib] Various updates to the RLlib release CI (#36883)

Signed-off-by: Avnish <avnishnarayan@gmail.com>
---
 release/rllib_tests/1gpu_16cpus.yaml          |  2 +-
 release/rllib_tests/1gpu_16cpus_gce.yaml      |  2 +-
 release/rllib_tests/1gpu_24cpus.yaml          |  2 +-
 release/rllib_tests/1gpu_24cpus_gce.yaml      |  2 +-
 release/rllib_tests/1gpu_32cpus.yaml          | 22 -------------
 release/rllib_tests/1gpu_32cpus_gce.yaml      | 32 -------------------
 release/rllib_tests/1gpu_4cpus_gce.yaml       |  2 +-
 release/rllib_tests/4gpus_64cpus.yaml         | 11 +++++--
 release/rllib_tests/4gpus_64cpus_gce.yaml     |  2 +-
 release/rllib_tests/8gpus_96cpus.yaml         |  4 +--
 release/rllib_tests/8gpus_96cpus_gce.yaml     | 11 ++-----
 release/rllib_tests/app_config.yaml           |  7 ++--
 release/rllib_tests/debug_app_config.yaml     |  2 +-
 ...lti_gpu_with_attention_learning_tests.yaml |  2 ++
 .../multi_gpu_with_lstm_learning_tests.yaml   | 31 ++----------------
 15 files changed, 27 insertions(+), 107 deletions(-)
 delete mode 100644 release/rllib_tests/1gpu_32cpus.yaml
 delete mode 100644 release/rllib_tests/1gpu_32cpus_gce.yaml

diff --git a/release/rllib_tests/1gpu_16cpus.yaml b/release/rllib_tests/1gpu_16cpus.yaml
index 36adc09444f7..2a0cdea1c0b3 100644
--- a/release/rllib_tests/1gpu_16cpus.yaml
+++ b/release/rllib_tests/1gpu_16cpus.yaml
@@ -5,7 +5,7 @@ max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: g3.4xlarge
+    instance_type: g5.4xlarge
 
 worker_node_types: []
 
diff --git a/release/rllib_tests/1gpu_16cpus_gce.yaml b/release/rllib_tests/1gpu_16cpus_gce.yaml
index c534041d0863..f0ad9d505d4a 100644
--- a/release/rllib_tests/1gpu_16cpus_gce.yaml
+++ b/release/rllib_tests/1gpu_16cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+    instance_type: n1-standard-16-nvidia-v100-16gb-1 # AWS sibling config (1gpu_16cpus.yaml) uses g5.4xlarge
 
 worker_node_types: []
 
diff --git a/release/rllib_tests/1gpu_24cpus.yaml b/release/rllib_tests/1gpu_24cpus.yaml
index 9a7351f981a9..af4def71489d 100644
--- a/release/rllib_tests/1gpu_24cpus.yaml
+++ b/release/rllib_tests/1gpu_24cpus.yaml
@@ -5,7 +5,7 @@ max_workers: 2
 
 head_node_type:
     name: head_node
-    instance_type: g3.4xlarge
+    instance_type: g5.4xlarge
 
 worker_node_types:
     - name: worker_node
diff --git a/release/rllib_tests/1gpu_24cpus_gce.yaml b/release/rllib_tests/1gpu_24cpus_gce.yaml
index fdb6665c7f01..ec79552e4984 100644
--- a/release/rllib_tests/1gpu_24cpus_gce.yaml
+++ b/release/rllib_tests/1gpu_24cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 2
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+    instance_type: n1-standard-16-nvidia-v100-16gb-1 # AWS sibling config (1gpu_24cpus.yaml) uses g5.4xlarge
 
 worker_node_types:
     - name: worker_node
diff --git a/release/rllib_tests/1gpu_32cpus.yaml b/release/rllib_tests/1gpu_32cpus.yaml
deleted file mode 100644
index 88270242d131..000000000000
--- a/release/rllib_tests/1gpu_32cpus.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
-region: us-west-2
-
-max_workers: 7
-
-head_node_type:
-    name: head_node
-    instance_type: g3s.xlarge
-
-worker_node_types:
-    - name: worker_node
-      instance_type: m5.xlarge
-      min_workers: 7
-      max_workers: 7
-      use_spot: false
-
-aws:
-    BlockDeviceMappings:
-        - DeviceName: /dev/sda1
-          Ebs:
-            DeleteOnTermination: true
-            VolumeSize: 500
diff --git a/release/rllib_tests/1gpu_32cpus_gce.yaml b/release/rllib_tests/1gpu_32cpus_gce.yaml
deleted file mode 100644
index 8b76571eefe2..000000000000
--- a/release/rllib_tests/1gpu_32cpus_gce.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
-region: us-west1
-allowed_azs:
-    - us-west1-b
-
-max_workers: 7
-
-head_node_type:
-    name: head_node
-    instance_type: n1-highmem-4-nvidia-tesla-v100-1 # g3s.xlarge
-
-worker_node_types:
-    - name: worker_node
-      instance_type: n2-standard-4 # m5.xlarge
-      min_workers: 7
-      max_workers: 7
-      use_spot: false
-
-gcp_advanced_configurations_json:
-  instance_properties:
-    disks:
-      - boot: true
-        auto_delete: true
-        initialize_params:
-          disk_size_gb: 500
-
-#aws:
-#    BlockDeviceMappings:
-#        - DeviceName: /dev/sda1
-#          Ebs:
-#            DeleteOnTermination: true
-#            VolumeSize: 500
diff --git a/release/rllib_tests/1gpu_4cpus_gce.yaml b/release/rllib_tests/1gpu_4cpus_gce.yaml
index 156fda105107..b239d699f91c 100644
--- a/release/rllib_tests/1gpu_4cpus_gce.yaml
+++ b/release/rllib_tests/1gpu_4cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-4-nvidia-tesla-t4-1 # p2.xlarge
+    instance_type: n1-standard-4-nvidia-v100-16gb-1
 
 worker_node_types: []
 
diff --git a/release/rllib_tests/4gpus_64cpus.yaml b/release/rllib_tests/4gpus_64cpus.yaml
index cf61a583c078..c0f4c76ee300 100644
--- a/release/rllib_tests/4gpus_64cpus.yaml
+++ b/release/rllib_tests/4gpus_64cpus.yaml
@@ -1,13 +1,18 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 2
 
 head_node_type:
     name: head_node
-    instance_type: g3.16xlarge
+    instance_type: g5.12xlarge
 
-worker_node_types: []
+worker_node_types:
+    - name: worker_node
+      instance_type: m5.4xlarge
+      min_workers: 1
+      max_workers: 1
+      use_spot: false
 
 aws:
     BlockDeviceMappings:
diff --git a/release/rllib_tests/4gpus_64cpus_gce.yaml b/release/rllib_tests/4gpus_64cpus_gce.yaml
index dcddaa23c70f..82b95a8b4fdc 100644
--- a/release/rllib_tests/4gpus_64cpus_gce.yaml
+++ b/release/rllib_tests/4gpus_64cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: n1-highmem-64-nvidia-tesla-v100-8 # g3.16xlarge
+    instance_type: n1-standard-64-nvidia-v100-16gb-4 # AWS sibling config (4gpus_64cpus.yaml) uses g5.12xlarge
 
 worker_node_types: []
 
diff --git a/release/rllib_tests/8gpus_96cpus.yaml b/release/rllib_tests/8gpus_96cpus.yaml
index 7cae28446a12..d9509c85dbda 100644
--- a/release/rllib_tests/8gpus_96cpus.yaml
+++ b/release/rllib_tests/8gpus_96cpus.yaml
@@ -1,11 +1,11 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 2
 
 head_node_type:
     name: head_node
-    instance_type: g4dn.metal
+    instance_type: g5.48xlarge # 8 GPUs, matching this config's name; g5.24xlarge provides only 4
 
 worker_node_types: []
 
diff --git a/release/rllib_tests/8gpus_96cpus_gce.yaml b/release/rllib_tests/8gpus_96cpus_gce.yaml
index b249a5a2ed01..ea7832ae22be 100644
--- a/release/rllib_tests/8gpus_96cpus_gce.yaml
+++ b/release/rllib_tests/8gpus_96cpus_gce.yaml
@@ -3,18 +3,13 @@ region: us-west1
 allowed_azs:
     - us-west1-b
 
-max_workers: 1
+max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-64-nvidia-tesla-t4-4
+    instance_type: n1-standard-96-nvidia-v100-16gb-8
 
-worker_node_types:
-    - name: worker_node
-      instance_type: n1-standard-32-nvidia-tesla-t4-4
-      min_workers: 1
-      max_workers: 1
-      use_spot: false
+worker_node_types: []
 
 gcp_advanced_configurations_json:
   instance_properties:
diff --git a/release/rllib_tests/app_config.yaml b/release/rllib_tests/app_config.yaml
index dc765eab31c3..fc3084dbd1f6 100755
--- a/release/rllib_tests/app_config.yaml
+++ b/release/rllib_tests/app_config.yaml
@@ -1,4 +1,4 @@
-base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
+base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }}
 env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"}
 debian_packages:
   - unzip
@@ -32,9 +32,8 @@ post_build_cmds:
   # Clone the rl-experiments repo for offline-RL files.
   - git clone https://github.com/ray-project/rl-experiments.git
   - unzip rl-experiments/halfcheetah-sac/2022-12-17/halfcheetah_1500_mean_reward_sac.zip -d ~/.
-  # Use torch+CUDA10.2 for our release tests. CUDA11.x has known performance issues in combination with torch+GPU+CNNs
-  # TODO(sven): remove once nightly image gets upgraded.
-  - pip3 install torch==1.12.1+cu102 torchvision==0.13.1+cu102 --extra-index-url https://download.pytorch.org/whl/cu102
+  # Install the torch/torchvision wheels built against CUDA 11.8 from the PyTorch wheel index.
+  - pip3 install torch==2.0.0+cu118 torchvision==0.15.1+cu118 --index-url https://download.pytorch.org/whl/cu118
 
   # TODO(sven): remove once nightly image gets gymnasium and the other new dependencies.
   - wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz
diff --git a/release/rllib_tests/debug_app_config.yaml b/release/rllib_tests/debug_app_config.yaml
index 7b07f4c2eea7..aecbaa914ad5 100755
--- a/release/rllib_tests/debug_app_config.yaml
+++ b/release/rllib_tests/debug_app_config.yaml
@@ -1,4 +1,4 @@
-base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
+base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }}
 env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"}
 debian_packages:
   - unzip
diff --git a/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml b/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml
index 8491f98a81f9..a5b3c3a7f51b 100644
--- a/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml
+++ b/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml
@@ -153,6 +153,8 @@ ppo-stateless-cartpole:
             attention_position_wise_mlp_dim: 32
         # Double batch size (2 GPUs).
         train_batch_size: 8000
+        _enable_learner_api: false
+        _enable_rl_module_api: false
 
 # TODO (Kourosh): Activate these tests back when the new modeling stack is merged
 # r2d2-stateless-cartpole:
diff --git a/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml b/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml
index 911c8ba0e8ef..e5a218a83dcc 100644
--- a/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml
+++ b/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml
@@ -133,32 +133,5 @@ ppo-stateless-cartpole:
             use_lstm: true
         # Double batch size (2 GPUs).
         train_batch_size: 8000
-
-# TODO (Kourosh): Activate these tests back when the new modeling stack is merged
-# r2d2-stateless-cartpole:
-#     env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole
-#     run: R2D2
-#     # Minimum reward and total ts (in given time_total_s) to pass this test.
-#     pass_criteria:
-#         sampler_results/episode_reward_mean: 150.0
-#         timesteps_total: 65000
-#     stop:
-#         time_total_s: 800
-#     config:
-#         num_gpus: 2
-#         num_workers: 0
-#         # R2D2 settings.
-#         burn_in: 20
-#         zero_init_states: true
-#         lr: 0.0005
-#         # Give some more time to explore.
-#         exploration_config:
-#           epsilon_timesteps: 50000
-#         model:
-#             # Test w/ LSTMs.
-#             use_lstm: true
-#             lstm_cell_size: 64
-#             # Use a very simple base-model.
-#             fcnet_hiddens: [64]
-#             fcnet_activation: linear
-#             max_seq_len: 20
+        _enable_learner_api: false
+        _enable_rl_module_api: false