Remove smoke tests for e2e multi node trainings (ray-project#37324)
Signed-off-by: Artur Niederfahrenhorst <attaismyname@googlemail.com>
Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
ArturNiederfahrenhorst authored and arvind-chandra committed Aug 31, 2023
1 parent eb8ca4c commit 3c3b2e7
Showing 2 changed files with 0 additions and 154 deletions.
35 changes: 0 additions & 35 deletions release/release_tests.yaml
@@ -3696,41 +3696,6 @@
        cluster_env: app_config.yaml
        cluster_compute: multi_node_checkpointing_compute_config_gce.yaml

- name: rllib_multi_node_e2e_training_smoke_test
  group: RLlib tests
  working_dir: rllib_tests

  frequency: nightly
  team: rllib
  python: "3.8"
  cluster:
    byod:
      type: gpu
      post_build_script: byod_rllib_test.sh
      runtime_env:
        - RLLIB_TEST_NO_JAX_IMPORT=1
        - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin
    cluster_env: app_config.yaml
    cluster_compute: multi_node_checkpointing_compute_config.yaml

  run:
    timeout: 3600
    script: pytest smoke_tests/smoke_test_basic_multi_node_training_learner.py

    wait_for_nodes:
      num_nodes: 2

  alert: default

  variations:
    - __suffix__: aws
    - __suffix__: gce
      env: gce
      frequency: manual
      cluster:
        cluster_env: app_config.yaml
        cluster_compute: multi_node_checkpointing_compute_config_gce.yaml

- name: rllib_learning_tests_a2c_tf
  group: RLlib tests
  working_dir: rllib_tests
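For context, the removed release test gated the run on two nodes joining the cluster (wait_for_nodes: num_nodes: 2) before the pytest script started. As a rough sketch only, assuming nothing beyond the public ray.nodes() API (the wait_for_nodes helper below and its timeout are illustrative, not part of this commit), a similar gate can be reproduced inside a test script:

import time

import ray


def wait_for_nodes(expected: int, timeout_s: float = 600.0) -> None:
    """Block until `expected` alive nodes have joined the cluster, or time out."""
    deadline = time.monotonic() + timeout_s
    alive = []
    while time.monotonic() < deadline:
        # ray.nodes() lists every node the GCS knows about, dead or alive.
        alive = [n for n in ray.nodes() if n["Alive"]]
        if len(alive) >= expected:
            return
        time.sleep(5)
    raise TimeoutError(f"Only {len(alive)}/{expected} nodes became alive in time.")


ray.init()
wait_for_nodes(expected=2)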
119 changes: 0 additions & 119 deletions release/rllib_tests/smoke_tests/smoke_test_basic_multi_node_training_learner.py
@@ -1,119 +0,0 @@
import ray
from ray import air, tune
from ray.rllib.algorithms.ppo import PPOConfig


def run_with_tuner_n_rollout_worker_2_gpu(config):
    """Run training with n rollout workers and 2 learner workers with gpu."""
    config = config.rollouts(num_rollout_workers=5)
    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            storage_path="/mnt/cluster_storage",
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(fail_fast=True),
        ),
    )
    tuner.fit()


def run_with_tuner_0_rollout_worker_2_gpu(config):
    """Run training with 0 rollout workers with 2 learner workers with gpu."""
    config = config.rollouts(num_rollout_workers=0)
    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            storage_path="/mnt/cluster_storage",
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(fail_fast=True),
        ),
    )
    tuner.fit()


def run_tuner_n_rollout_workers_0_gpu(config):
    """Run training with n rollout workers, multiple learner workers, and no gpu."""
    config = config.rollouts(num_rollout_workers=5)
    config = config.resources(
        num_cpus_per_learner_worker=1,
        num_learner_workers=2,
    )

    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            storage_path="/mnt/cluster_storage",
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(fail_fast=True),
        ),
    )
    tuner.fit()


def run_tuner_n_rollout_workers_1_gpu_local(config):
    """Run training with n rollout workers, local learner, and 1 gpu."""
    config = config.rollouts(num_rollout_workers=5)
    config = config.resources(
        num_gpus_per_learner_worker=1,
        num_learner_workers=0,
    )

    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            storage_path="/mnt/cluster_storage",
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(fail_fast=True),
        ),
    )
    tuner.fit()


def test_multi_node_training_smoke():
    """A smoke test to see if we can run multi node training without pg problems.

    This test is run on a 3 node cluster. The head node is a m5.xlarge (4 cpu),
    the worker nodes are 2 g4dn.xlarge (1 gpu, 4 cpu) machines.
    """

    ray.init()

    config = (
        PPOConfig()
        .training(
            _enable_learner_api=True,
            model={
                "fcnet_hiddens": [256, 256, 256],
                "fcnet_activation": "relu",
                "vf_share_layers": True,
            },
            train_batch_size=128,
        )
        .rl_module(_enable_rl_module_api=True)
        .environment("CartPole-v1")
        .resources(
            num_gpus_per_learner_worker=1,
            num_learner_workers=2,
        )
        .rollouts(num_rollout_workers=2)
        .reporting(min_time_s_per_iteration=0, min_sample_timesteps_per_iteration=10)
    )
    for fw in ["tf2", "torch"]:
        config = config.framework(fw, eager_tracing=True)

        run_with_tuner_0_rollout_worker_2_gpu(config)
        run_with_tuner_n_rollout_worker_2_gpu(config)
        run_tuner_n_rollout_workers_0_gpu(config)
        run_tuner_n_rollout_workers_1_gpu_local(config)


if __name__ == "__main__":
    import sys
    import pytest

    sys.exit(pytest.main(["-v", __file__]))

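The docstring of the deleted test names placement group ("pg") problems as the failure mode it guarded against. As a hedged sketch, not part of this commit and assuming the same Ray 2.6-era RLlib APIs the deleted file uses, the placement group that Tune would request for this resource shape can be inspected without spinning up a multi-node cluster, which is often enough to spot bundle or resource mismatches:

from ray.rllib.algorithms.ppo import PPO, PPOConfig

# Same resource shape as the removed smoke test: 2 GPU learner workers and
# 2 rollout workers on CartPole-v1.
config = (
    PPOConfig()
    .training(_enable_learner_api=True)
    .rl_module(_enable_rl_module_api=True)
    .environment("CartPole-v1")
    .resources(num_gpus_per_learner_worker=1, num_learner_workers=2)
    .rollouts(num_rollout_workers=2)
)

# default_resource_request() returns the PlacementGroupFactory that Tune uses
# to reserve bundles for the driver/learner group and the rollout workers.
pg_factory = PPO.default_resource_request(config)
print(pg_factory.bundles)             # per-bundle CPU/GPU requests
print(pg_factory.required_resources)  # aggregate resources across all bundles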