disable GPU tests hitting OOM

pytorch · Mar 12, 2024 · e857c3e · e857c3e
1 parent cfa3f97
commit e857c3e
Showing 1 changed file with 6 additions and 3 deletions.
diff --git a/.circleci/common.sh b/.circleci/common.sh
@@ -141,13 +141,16 @@ function run_torch_xla_python_tests() {
         # single-host-multi-process
         num_devices=$(nvidia-smi --list-gpus | wc -l)
         PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$GPU_NUM_DEVICES python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
-        PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1  --num_steps=25 --model=resnet18
+        # TODO: Disable GPU test due to OOM in CI after pin update to 03/05/2024 (#6677)
+        # PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1  --num_steps=25 --model=resnet18
 
         # single-host-SPMD
         XLA_USE_SPMD=1 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 16 --model=resnet50 --sharding=batch --num_epochs=1  --num_steps=25 --model=resnet18
 
-        PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
-        PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
+        # TODO: Disable GPU test due to OOM in CI after pin update to 03/05/2024 (#6677)
+        # PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+        # TODO: Disable GPU test due to OOM in CI after pin update to 03/05/2024 (#6677)
+        # PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
         XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
         # Syncfree SGD optimizer tests
         if [ -d ./torch_xla/amp/syncfree ]; then