Remove smoke tests for e2e multi node trainings (ray-project#37324)
Signed-off-by: Artur Niederfahrenhorst <attaismyname@googlemail.com>
Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
ArturNiederfahrenhorst authored and arvind-chandra committed Aug 31, 2023
1 parent eb8ca4c commit 3c3b2e7
Showing 2 changed files with 0 additions and 154 deletions.
35 changes: 0 additions & 35 deletions release/release_tests.yaml
@@ -3696,41 +3696,6 @@
        cluster_env: app_config.yaml
        cluster_compute: multi_node_checkpointing_compute_config_gce.yaml

- name: rllib_multi_node_e2e_training_smoke_test
  group: RLlib tests
  working_dir: rllib_tests

  frequency: nightly
  team: rllib
  python: "3.8"
  cluster:
    byod:
      type: gpu
      post_build_script: byod_rllib_test.sh
      runtime_env:
        - RLLIB_TEST_NO_JAX_IMPORT=1
        - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
    cluster_env: app_config.yaml
    cluster_compute: multi_node_checkpointing_compute_config.yaml

  run:
    timeout: 3600
    script: pytest smoke_tests/smoke_test_basic_multi_node_training_learner.py

    wait_for_nodes:
      num_nodes: 2

  alert: default

  variations:
    - __suffix__: aws
    - __suffix__: gce
      env: gce
      frequency: manual
      cluster:
        cluster_env: app_config.yaml
        cluster_compute: multi_node_checkpointing_compute_config_gce.yaml

- name: rllib_learning_tests_a2c_tf
  group: RLlib tests
  working_dir: rllib_tests
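For context, the removed release test gated the run on two nodes joining the cluster (wait_for_nodes: num_nodes: 2) before the pytest script started. As a rough sketch only, assuming nothing beyond the public ray.nodes() API (the wait_for_nodes helper below and its timeout are illustrative, not part of this commit), a similar gate can be reproduced inside a test script:

import time

import ray


def wait_for_nodes(expected: int, timeout_s: float = 600.0) -> None:
    """Block until `expected` alive nodes have joined the cluster, or time out."""
    deadline = time.monotonic() + timeout_s
    alive = []
    while time.monotonic() < deadline:
        # ray.nodes() lists every node the GCS knows about, dead or alive.
        alive = [n for n in ray.nodes() if n["Alive"]]
        if len(alive) >= expected:
            return
        time.sleep(5)
    raise TimeoutError(f"Only {len(alive)}/{expected} nodes became alive in time.")


ray.init()
wait_for_nodes(expected=2)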
119 changes: 0 additions & 119 deletions release/rllib_tests/smoke_tests/smoke_test_basic_multi_node_training_learner.py
@@ -1,119 +0,0 @@
import ray
from ray import air, tune
from ray.rllib.algorithms.ppo import PPOConfig


def run_with_tuner_n_rollout_worker_2_gpu(config):
    """Run training with n rollout workers and 2 learner workers with gpu."""
    config = config.rollouts(num_rollout_workers=5)
    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            storage_path="/mnt/cluster_storage",
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(fail_fast=True),
        ),
    )
    tuner.fit()


def run_with_tuner_0_rollout_worker_2_gpu(config):
    """Run training with 0 rollout workers with 2 learner workers with gpu."""
    config = config.rollouts(num_rollout_workers=0)
    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            storage_path="/mnt/cluster_storage",
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(fail_fast=True),
        ),
    )
    tuner.fit()


def run_tuner_n_rollout_workers_0_gpu(config):
    """Run training with n rollout workers, multiple learner workers, and no gpu."""
    config = config.rollouts(num_rollout_workers=5)
    config = config.resources(
        num_cpus_per_learner_worker=1,
        num_learner_workers=2,
    )

    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            storage_path="/mnt/cluster_storage",
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(fail_fast=True),
        ),
    )
    tuner.fit()


def run_tuner_n_rollout_workers_1_gpu_local(config):
    """Run training with n rollout workers, local learner, and 1 gpu."""
    config = config.rollouts(num_rollout_workers=5)
    config = config.resources(
        num_gpus_per_learner_worker=1,
        num_learner_workers=0,
    )

    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            storage_path="/mnt/cluster_storage",
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(fail_fast=True),
        ),
    )
    tuner.fit()


def test_multi_node_training_smoke():
    """A smoke test to see if we can run multi node training without pg problems.

    This test is run on a 3 node cluster. The head node is a m5.xlarge (4 cpu),
    the worker nodes are 2 g4dn.xlarge (1 gpu, 4 cpu) machines.
    """

    ray.init()

    config = (
        PPOConfig()
        .training(
            _enable_learner_api=True,
            model={
                "fcnet_hiddens": [256, 256, 256],
                "fcnet_activation": "relu",
                "vf_share_layers": True,
            },
            train_batch_size=128,
        )
        .rl_module(_enable_rl_module_api=True)
        .environment("CartPole-v1")
        .resources(
            num_gpus_per_learner_worker=1,
            num_learner_workers=2,
        )
        .rollouts(num_rollout_workers=2)
        .reporting(min_time_s_per_iteration=0, min_sample_timesteps_per_iteration=10)
    )
    for fw in ["tf2", "torch"]:
        config = config.framework(fw, eager_tracing=True)

        run_with_tuner_0_rollout_worker_2_gpu(config)
        run_with_tuner_n_rollout_worker_2_gpu(config)
        run_tuner_n_rollout_workers_0_gpu(config)
        run_tuner_n_rollout_workers_1_gpu_local(config)


if __name__ == "__main__":
    import sys
    import pytest

    sys.exit(pytest.main(["-v", __file__]))

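The docstring of the deleted test names placement group ("pg") problems as the failure mode it guarded against. As a hedged sketch, not part of this commit and assuming the same Ray 2.6-era RLlib APIs the deleted file uses, the placement group that Tune would request for this resource shape can be inspected without spinning up a multi-node cluster, which is often enough to spot bundle or resource mismatches:

from ray.rllib.algorithms.ppo import PPO, PPOConfig

# Same resource shape as the removed smoke test: 2 GPU learner workers and
# 2 rollout workers on CartPole-v1.
config = (
    PPOConfig()
    .training(_enable_learner_api=True)
    .rl_module(_enable_rl_module_api=True)
    .environment("CartPole-v1")
    .resources(num_gpus_per_learner_worker=1, num_learner_workers=2)
    .rollouts(num_rollout_workers=2)
)

# default_resource_request() returns the PlacementGroupFactory that Tune uses
# to reserve bundles for the driver/learner group and the rollout workers.
pg_factory = PPO.default_resource_request(config)
print(pg_factory.bundles)             # per-bundle CPU/GPU requests
print(pg_factory.required_resources)  # aggregate resources across all bundles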