
Commit

[fix] Better support for rank_zero_only setting for SLURM and torchelastic (#6802)

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
ananthsub and awaelchli authored Apr 7, 2021
1 parent a2c6057 commit 86e1d9f
Showing 3 changed files with 70 additions and 1 deletion.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -176,6 +176,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))


- Sanitize `None` params during pruning ([#6836](https://github.com/PyTorchLightning/pytorch-lightning/pull/6836))


12 changes: 11 additions & 1 deletion pytorch_lightning/utilities/distributed.py
@@ -44,8 +44,18 @@ def wrapped_fn(*args, **kwargs):
    return wrapped_fn


# TODO: this should be part of the cluster environment
def _get_rank() -> int:
    rank_keys = ('RANK', 'SLURM_PROCID', 'LOCAL_RANK')
    for key in rank_keys:
        rank = os.environ.get(key)
        if rank is not None:
            return int(rank)
    return 0


# add the attribute to the function but don't overwrite in case Trainer has already set it
-rank_zero_only.rank = getattr(rank_zero_only, 'rank', int(os.environ.get('LOCAL_RANK', 0)))
+rank_zero_only.rank = getattr(rank_zero_only, 'rank', _get_rank())


def _warn(*args, **kwargs):
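Why the old default was insufficient: LOCAL_RANK is a per-node index, so on a multi-node job every node has a process with LOCAL_RANK=0, and each of those processes would have been treated as rank zero. RANK (set by torchelastic) and SLURM_PROCID (set by SLURM) are global ranks, which is why _get_rank() checks them first. A minimal sketch of the resulting precedence, not part of the commit itself (clear=True keeps ambient rank variables from leaking into the assertions):

import os
from unittest import mock

from pytorch_lightning.utilities.distributed import _get_rank

# SLURM_PROCID is a global rank: on the second node of a SLURM job the
# local rank can be 0 while the global rank is not.
with mock.patch.dict(os.environ, {"SLURM_PROCID": "4", "LOCAL_RANK": "0"}, clear=True):
    assert _get_rank() == 4

# RANK (set by torchelastic) takes precedence over SLURM_PROCID.
with mock.patch.dict(os.environ, {"RANK": "2", "SLURM_PROCID": "4"}, clear=True):
    assert _get_rank() == 2

# with none of the keys set, the rank falls back to 0
with mock.patch.dict(os.environ, {}, clear=True):
    assert _get_rank() == 0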
56 changes: 56 additions & 0 deletions tests/utilities/test_distributed.py
@@ -0,0 +1,56 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Mapping
from unittest import mock

import pytest


@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"SLURM_PROCID": "0"}])
def test_rank_zero_known_cluster_envs(env_vars: Mapping[str, str]):
""" Test that SLURM environment variables are properly checked for rank_zero_only. """
from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
rank_zero_only.rank = _get_rank()

with mock.patch.dict(os.environ, env_vars):
from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
rank_zero_only.rank = _get_rank()

@rank_zero_only
def foo(): # The return type is optional because on non-zero ranks it will not be called
return 1

x = foo()
assert x == 1


@pytest.mark.parametrize("rank_key,rank", [
("RANK", "1"),
("SLURM_PROCID", "2"),
("LOCAL_RANK", "3"),
])
def test_rank_zero_none_set(rank_key, rank):
""" Test that function is not called when rank environment variables are not global zero. """

with mock.patch.dict(os.environ, {rank_key: rank}):
from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
rank_zero_only.rank = _get_rank()

@rank_zero_only
def foo():
return 1

x = foo()
assert x is None

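For context on what the tests above exercise: rank_zero_only is the decorator Lightning uses to guard side effects such as logging so they run once per job rather than once per process. A small usage sketch (the function name is illustrative, not from the commit):

from pytorch_lightning.utilities.distributed import rank_zero_only

@rank_zero_only
def log_summary(message: str) -> None:
    # runs only when rank_zero_only.rank == 0; on every other rank the
    # wrapped function is skipped and None is returned
    print(message)

log_summary("training started")  # printed once per job, not once per process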