Skip to content

Commit

Permalink
[RLlib] Various updates to the Release CI RLlib (#36883)
Browse files (browse the repository at this point in the history)
Signed-off-by: Avnish <avnishnarayan@gmail.com>
  • Loading branch information
avnishn authored Jun 30, 2023
1 parent 0db7f95 commit 1b63128
Show file tree
Hide file tree
Showing 15 changed files with 27 additions and 107 deletions.
2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_16cpus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ max_workers: 0

head_node_type:
name: head_node
- instance_type: g3.4xlarge
+ instance_type: g5.4xlarge

worker_node_types: []

Expand Down
2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_16cpus_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ max_workers: 0

head_node_type:
name: head_node
- instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+ instance_type: n1-standard-16-nvidia-v100-16gb-1

worker_node_types: []

Expand Down
2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_24cpus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ max_workers: 2

head_node_type:
name: head_node
- instance_type: g3.4xlarge
+ instance_type: g5.4xlarge

worker_node_types:
- name: worker_node
Expand Down
2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_24cpus_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ max_workers: 2

head_node_type:
name: head_node
- instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+ instance_type: n1-standard-16-nvidia-v100-16gb-1

worker_node_types:
- name: worker_node
Expand Down
22 changes: 0 additions & 22 deletions release/rllib_tests/1gpu_32cpus.yaml

This file was deleted.

32 changes: 0 additions & 32 deletions release/rllib_tests/1gpu_32cpus_gce.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_4cpus_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ max_workers: 0

head_node_type:
name: head_node
- instance_type: n1-standard-4-nvidia-tesla-t4-1 # p2.xlarge
+ instance_type: n1-standard-4-nvidia-v100-16gb-1

worker_node_types: []

Expand Down
11 changes: 8 additions & 3 deletions release/rllib_tests/4gpus_64cpus.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

- max_workers: 0
+ max_workers: 2

head_node_type:
name: head_node
- instance_type: g3.16xlarge
+ instance_type: g5.12xlarge

- worker_node_types: []
+ worker_node_types:
+ - name: worker_node
+ instance_type: m5.4xlarge
+ min_workers: 1
+ max_workers: 1
+ use_spot: false

aws:
BlockDeviceMappings:
Expand Down
2 changes: 1 addition & 1 deletion release/rllib_tests/4gpus_64cpus_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ max_workers: 0

head_node_type:
name: head_node
- instance_type: n1-highmem-64-nvidia-tesla-v100-8 # g3.16xlarge
+ instance_type: n1-standard-64-nvidia-v100-16gb-4

worker_node_types: []

Expand Down
4 changes: 2 additions & 2 deletions release/rllib_tests/8gpus_96cpus.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

- max_workers: 0
+ max_workers: 2

head_node_type:
name: head_node
- instance_type: g4dn.metal
+ instance_type: g5.24xlarge

worker_node_types: []

Expand Down
11 changes: 3 additions & 8 deletions release/rllib_tests/8gpus_96cpus_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,13 @@ region: us-west1
allowed_azs:
- us-west1-b

- max_workers: 1
+ max_workers: 0

head_node_type:
name: head_node
- instance_type: n1-standard-64-nvidia-tesla-t4-4
+ instance_type: n1-standard-96-nvidia-v100-16gb-8

- worker_node_types:
- - name: worker_node
- instance_type: n1-standard-32-nvidia-tesla-t4-4
- min_workers: 1
- max_workers: 1
- use_spot: false
+ worker_node_types: []

gcp_advanced_configurations_json:
instance_properties:
Expand Down
7 changes: 3 additions & 4 deletions release/rllib_tests/app_config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
- base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
+ base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }}
env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"}
debian_packages:
- unzip
Expand Down Expand Up @@ -32,9 +32,8 @@ post_build_cmds:
# Clone the rl-experiments repo for offline-RL files.
- git clone https://github.com/ray-project/rl-experiments.git
- unzip rl-experiments/halfcheetah-sac/2022-12-17/halfcheetah_1500_mean_reward_sac.zip -d ~/.
- # Use torch+CUDA10.2 for our release tests. CUDA11.x has known performance issues in combination with torch+GPU+CNNs
- # TODO(sven): remove once nightly image gets upgraded.
- - pip3 install torch==1.12.1+cu102 torchvision==0.13.1+cu102 --extra-index-url https://download.pytorch.org/whl/cu102
+
+ - pip3 install torch==2.0.0+cu118 torchvision==0.15.1+cu118 --index-url https://download.pytorch.org/whl/cu118

# TODO(sven): remove once nightly image gets gymnasium and the other new dependencies.
- wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz
Expand Down
2 changes: 1 addition & 1 deletion release/rllib_tests/debug_app_config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
- base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
+ base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }}
env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"}
debian_packages:
- unzip
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ ppo-stateless-cartpole:
attention_position_wise_mlp_dim: 32
# Double batch size (2 GPUs).
train_batch_size: 8000
+ _enable_learner_api: false
+ _enable_rl_module_api: false

# TODO (Kourosh): Re-enable these tests once the new modeling stack is merged
# r2d2-stateless-cartpole:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,32 +133,5 @@ ppo-stateless-cartpole:
use_lstm: true
# Double batch size (2 GPUs).
train_batch_size: 8000
-
- # TODO (Kourosh): Activate these tests back when the new modeling stack is merged
- # r2d2-stateless-cartpole:
- # env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole
- # run: R2D2
- # # Minimum reward and total ts (in given time_total_s) to pass this test.
- # pass_criteria:
- # sampler_results/episode_reward_mean: 150.0
- # timesteps_total: 65000
- # stop:
- # time_total_s: 800
- # config:
- # num_gpus: 2
- # num_workers: 0
- # # R2D2 settings.
- # burn_in: 20
- # zero_init_states: true
- # lr: 0.0005
- # # Give some more time to explore.
- # exploration_config:
- # epsilon_timesteps: 50000
- # model:
- # # Test w/ LSTMs.
- # use_lstm: true
- # lstm_cell_size: 64
- # # Use a very simple base-model.
- # fcnet_hiddens: [64]
- # fcnet_activation: linear
- # max_seq_len: 20
+ _enable_learner_api: false
+ _enable_rl_module_api: false

0 comments on commit 1b63128

Please sign in to comment.