[RLlib] Various updates to the Release CI RLlib #36883

Merged
2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_16cpus.yaml
@@ -5,7 +5,7 @@ max_workers: 0

 head_node_type:
   name: head_node
-  instance_type: g3.4xlarge
+  instance_type: g5.4xlarge

 worker_node_types: []
2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_16cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 0

 head_node_type:
   name: head_node
-  instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+  instance_type: n1-standard-16-nvidia-v100-16gb-1

 worker_node_types: []
2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_24cpus.yaml
@@ -5,7 +5,7 @@ max_workers: 2

 head_node_type:
   name: head_node
-  instance_type: g3.4xlarge
+  instance_type: g5.4xlarge

 worker_node_types:
   - name: worker_node
2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_24cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 2

 head_node_type:
   name: head_node
-  instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+  instance_type: n1-standard-16-nvidia-v100-16gb-1

 worker_node_types:
   - name: worker_node
22 changes: 0 additions & 22 deletions release/rllib_tests/1gpu_32cpus.yaml

This file was deleted.

32 changes: 0 additions & 32 deletions release/rllib_tests/1gpu_32cpus_gce.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion release/rllib_tests/1gpu_4cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 0

 head_node_type:
   name: head_node
-  instance_type: n1-standard-4-nvidia-tesla-t4-1 # p2.xlarge
+  instance_type: n1-standard-4-nvidia-v100-16gb-1

 worker_node_types: []
11 changes: 8 additions & 3 deletions release/rllib_tests/4gpus_64cpus.yaml
@@ -1,13 +1,18 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2

-max_workers: 0
+max_workers: 2

 head_node_type:
   name: head_node
-  instance_type: g3.16xlarge
+  instance_type: g5.12xlarge

-worker_node_types: []
+worker_node_types:
+  - name: worker_node
+    instance_type: m5.4xlarge
+    min_workers: 1
+    max_workers: 1
+    use_spot: false

 aws:
   BlockDeviceMappings:
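As a quick cross-check of the new 4gpus_64cpus shape (a sketch, not part of this diff): assuming a g5.12xlarge head node (4 GPUs, 48 vCPUs) plus the single m5.4xlarge worker (16 vCPUs), the running release-test cluster should report 4 GPUs and 64 CPUs once both nodes are up:

import ray

# Attach to the already-running release-test cluster.
ray.init(address="auto")

resources = ray.cluster_resources()
print(resources.get("GPU", 0))  # expected: 4.0
print(resources.get("CPU", 0))  # expected: 64.0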
2 changes: 1 addition & 1 deletion release/rllib_tests/4gpus_64cpus_gce.yaml
@@ -7,7 +7,7 @@ max_workers: 0

 head_node_type:
   name: head_node
-  instance_type: n1-highmem-64-nvidia-tesla-v100-8 # g3.16xlarge
+  instance_type: n1-standard-64-nvidia-v100-16gb-4

 worker_node_types: []
4 changes: 2 additions & 2 deletions release/rllib_tests/8gpus_96cpus.yaml
@@ -1,11 +1,11 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2

-max_workers: 0
+max_workers: 2

 head_node_type:
   name: head_node
-  instance_type: g4dn.metal
+  instance_type: g5.24xlarge

 worker_node_types: []
11 changes: 3 additions & 8 deletions release/rllib_tests/8gpus_96cpus_gce.yaml
@@ -3,18 +3,13 @@ region: us-west1
 allowed_azs:
   - us-west1-b

-max_workers: 1
+max_workers: 0

 head_node_type:
   name: head_node
-  instance_type: n1-standard-64-nvidia-tesla-t4-4
+  instance_type: n1-standard-96-nvidia-v100-16gb-8

-worker_node_types:
-  - name: worker_node
-    instance_type: n1-standard-32-nvidia-tesla-t4-4
-    min_workers: 1
-    max_workers: 1
-    use_spot: false
+worker_node_types: []

 gcp_advanced_configurations_json:
   instance_properties:
7 changes: 3 additions & 4 deletions release/rllib_tests/app_config.yaml
@@ -1,4 +1,4 @@
-base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
+base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }}
 env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"}
 debian_packages:
   - unzip

@@ -32,9 +32,8 @@ post_build_cmds:
   # Clone the rl-experiments repo for offline-RL files.
   - git clone https://github.com/ray-project/rl-experiments.git
   - unzip rl-experiments/halfcheetah-sac/2022-12-17/halfcheetah_1500_mean_reward_sac.zip -d ~/.
-  # Use torch+CUDA10.2 for our release tests. CUDA11.x has known performance issues in combination with torch+GPU+CNNs
-  # TODO(sven): remove once nightly image gets upgraded.
-  - pip3 install torch==1.12.1+cu102 torchvision==0.13.1+cu102 --extra-index-url https://download.pytorch.org/whl/cu102
+
+  - pip3 install torch==2.0.0+cu118 torchvision==0.15.1+cu118 --index-url https://download.pytorch.org/whl/cu118

   # TODO(sven): remove once nightly image gets gymnasium and the other new dependencies.
   - wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz
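A quick way to confirm the pinned wheel above (a sketch, not part of this diff): inside the built app image, on a GPU node, torch should report the CUDA 11.8 build and see the GPU:

import torch

print(torch.__version__)          # expected: 2.0.0+cu118
print(torch.version.cuda)         # expected: 11.8
print(torch.cuda.is_available())  # expected: True on a GPU node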
2 changes: 1 addition & 1 deletion release/rllib_tests/debug_app_config.yaml
@@ -1,4 +1,4 @@
-base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
+base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py38-gpu") }}
 env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"}
 debian_packages:
   - unzip
@@ -153,6 +153,8 @@ ppo-stateless-cartpole:
       attention_position_wise_mlp_dim: 32
     # Double batch size (2 GPUs).
     train_batch_size: 8000
+    _enable_learner_api: false
+    _enable_rl_module_api: false

 # TODO (Kourosh): Activate these tests back when the new modeling stack is merged
 # r2d2-stateless-cartpole:
@@ -133,32 +133,5 @@ ppo-stateless-cartpole:
       use_lstm: true
     # Double batch size (2 GPUs).
     train_batch_size: 8000
-
-# TODO (Kourosh): Activate these tests back when the new modeling stack is merged
-# r2d2-stateless-cartpole:
-# env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole
-# run: R2D2
-# # Minimum reward and total ts (in given time_total_s) to pass this test.
-# pass_criteria:
-# sampler_results/episode_reward_mean: 150.0
-# timesteps_total: 65000
-# stop:
-# time_total_s: 800
-# config:
-# num_gpus: 2
-# num_workers: 0
-# # R2D2 settings.
-# burn_in: 20
-# zero_init_states: true
-# lr: 0.0005
-# # Give some more time to explore.
-# exploration_config:
-# epsilon_timesteps: 50000
-# model:
-# # Test w/ LSTMs.
-# use_lstm: true
-# lstm_cell_size: 64
-# # Use a very simple base-model.
-# fcnet_hiddens: [64]
-# fcnet_activation: linear
-# max_seq_len: 20
+    _enable_learner_api: false
+    _enable_rl_module_api: false
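For context (a sketch, not part of this diff, assuming the AlgorithmConfig API of this Ray release): the two flags added to the learning-test configs above roughly correspond to the following Python configuration, and keep these release tests on the old ModelV2/Policy stack rather than the still-experimental RLModule/Learner stack:

from ray.rllib.algorithms.ppo import PPOConfig

# Mirror the YAML settings: doubled batch size for 2 GPUs, old API stack.
config = (
    PPOConfig()
    .training(train_batch_size=8000, _enable_learner_api=False)
    .rl_module(_enable_rl_module_api=False)
)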