Add LSF support #5102

Merged
merged 41 commits on Jul 9, 2021
Changes from 24 commits
Commits
41 commits
9abe28e
add ClusterEnvironment for LSF systems
ajtritt Dec 3, 2020
f2e44c1
update init file
ajtritt Dec 4, 2020
615f08d
add available cluster environments
ajtritt Dec 4, 2020
86f2fa1
clean up LSFEnvironment
ajtritt Dec 9, 2020
b72b42d
add ddp_hpc as a distributed backend
ajtritt Dec 9, 2020
6a9a4ca
clean up SLURMEnvironment
ajtritt Dec 9, 2020
5bbba77
Merge branch 'master' into lsf_env
ajtritt Dec 9, 2020
94e4d4b
remove extra blank line
ajtritt Dec 9, 2020
113e787
init device for DDPHPCAccelerator
ajtritt Dec 10, 2020
d12d652
committing current state
ajtritt Dec 11, 2020
d0ac793
Merge branch 'master' into lsf_env
ajtritt Dec 11, 2020
b53d153
add additional methods to ClusterEnvironments
ajtritt Dec 11, 2020
0b6edfe
add NVIDIA mixin for setting up CUDA envars
ajtritt Dec 11, 2020
f7d87f6
remove troubleshooting prints
ajtritt Dec 12, 2020
3c9edf9
cleanup SLURMEnvironment
ajtritt Dec 12, 2020
77f3b71
fix docstring
ajtritt Dec 12, 2020
eb7d07c
cleanup TorchElasticEnvironment and add documentation
ajtritt Dec 12, 2020
09064e1
PEP8 puts a cork in it
ajtritt Dec 12, 2020
fb30942
Merge branch 'master' into lsf_env
ajtritt Dec 12, 2020
7be8f1d
add set_ranks_to_trainer
ajtritt Feb 11, 2021
5c04b8e
Merge remote-tracking branch 'pl/master' into lsf_env
ajtritt Feb 12, 2021
004daef
Merge remote-tracking branch 'pl/master' into lsf_env
ajtritt Feb 12, 2021
a113210
remove unused import
ajtritt Feb 12, 2021
d17281c
move to new location
ajtritt Feb 12, 2021
b4028a7
Merge branch 'master' into lsf_env
awaelchli Jul 9, 2021
7a23376
update LSF environment
awaelchli Jul 9, 2021
02410ff
remove mixin
awaelchli Jul 9, 2021
7f91740
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
1b3bc7a
changelog
awaelchli Jul 9, 2021
5ec0e9f
Merge remote-tracking branch 'ajtritt/lsf_env' into lsf_env
awaelchli Jul 9, 2021
92215ab
reset slurm env
awaelchli Jul 9, 2021
a613759
add tests
awaelchli Jul 9, 2021
f7c5e0e
add licence
awaelchli Jul 9, 2021
00de88e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
cfd59b8
test node_rank
awaelchli Jul 9, 2021
5ec99e9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
cfe544f
add lsf env to docs
awaelchli Jul 9, 2021
71569de
add auto detection for lsf environment
awaelchli Jul 9, 2021
7c26b41
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
077964d
fix is_using_lsf() and test
awaelchli Jul 9, 2021
7f127c8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
3 changes: 0 additions & 3 deletions pytorch_lightning/accelerators/accelerator_connector.py
@@ -123,9 +123,6 @@ def __init__(
         self.interactive_ddp_procs = []
         self.global_rank = 0

-        # NVIDIA setup
-        # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids)
-
         # benchmarking
         # TODO: should this be moved to GPU accelerator?
         torch.backends.cudnn.benchmark = self.benchmark
3 changes: 1 addition & 2 deletions pytorch_lightning/accelerators/legacy/ddp2_accelerator.py
@@ -150,8 +150,7 @@ def ddp_train(self, process_idx, mp_queue, model):
         model.trainer = self.trainer
         self.init_ddp_connection(
             self.trainer.global_rank,
-            self.trainer.world_size,
-            self.trainer.is_slurm_managing_tasks
+            self.trainer.world_size
         )

         if isinstance(self.ddp_plugin, RPCPlugin):
3 changes: 1 addition & 2 deletions pytorch_lightning/accelerators/legacy/ddp_accelerator.py
@@ -260,8 +260,7 @@ def ddp_train(self, process_idx, model):
         model.trainer = self.trainer
         self.init_ddp_connection(
             self.trainer.global_rank,
-            self.trainer.world_size,
-            self.trainer.is_slurm_managing_tasks
+            self.trainer.world_size
         )

         if isinstance(self.ddp_plugin, RPCPlugin):
@@ -205,8 +205,7 @@ def set_world_ranks(self, process_idx):
         self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx
         self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes

-    def model_to_device(self, model, process_idx):
-        # Todo: required argument `process_idx` is not used
+    def model_to_device(self, model):
         model.cpu()

     def get_device_ids(self):
15 changes: 11 additions & 4 deletions pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py
@@ -19,6 +19,7 @@
 from torch.nn.parallel import DistributedDataParallel

 from pytorch_lightning import _logger as log
+from pytorch_lightning.accelerators.nvidia_mixin import NVIDIAMixin
 from pytorch_lightning.accelerators.legacy.accelerator import Accelerator, ReduceOp
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.distributed.dist import LightningDistributed
@@ -29,7 +30,7 @@
 from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available


-class DDPHPCAccelerator(Accelerator):
+class DDPHPCAccelerator(Accelerator, NVIDIAMixin):

     def __init__(self,
                  trainer,
@@ -52,6 +53,10 @@ def __init__(self,

     def setup(self, model):
         self.trainer.model = model
+        # ----------------------------
+        # NVIDIA FLAGS
+        # ----------------------------
+        self.set_nvidia_flags(self.trainer.data_parallel_device_ids)
         self.task_idx = self.cluster_environment.local_rank()

     def train(self):
@@ -60,7 +65,7 @@

     def set_world_ranks(self, process_idx):
         self.trainer.local_rank = process_idx
-        self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx
+        self.trainer.global_rank = self.cluster_environment.node_rank() * self.trainer.num_processes + process_idx
         self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes

     def init_device(self, process_idx):
@@ -132,14 +137,16 @@ def ddp_train(self, process_idx, model):
         # set warning rank
         rank_zero_only.rank = self.trainer.global_rank

+        # Initialize cuda device
+        self.init_device(process_idx)
+
         # set up server using proc 0's ip address
         # try to init for 20 times at max in case ports are taken
         # where to store ip_table
         model.trainer = self.trainer
         self.init_ddp_connection(
             self.trainer.global_rank,
-            self.trainer.world_size,
-            self.trainer.is_slurm_managing_tasks
+            self.trainer.world_size
         )

         if isinstance(self.ddp_plugin, RPCPlugin):
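The rank bookkeeping above is the usual node-major layout: each node contributes num_processes consecutive global ranks, with the node index now taken from the cluster environment rather than from the trainer. A quick worked example with purely illustrative numbers (not taken from this PR):

num_nodes = 3
num_processes = 4      # processes (GPUs) per node
node_rank = 1          # reported by the cluster environment (SLURM, LSF, ...)
process_idx = 2        # local rank of this process on its node

global_rank = node_rank * num_processes + process_idx  # -> 6
world_size = num_nodes * num_processes                 # -> 12
assert 0 <= global_rank < world_size

The same mixin and cluster-environment wiring is applied to the spawn-based accelerator in the next file below.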
@@ -21,6 +21,7 @@
 from torch.nn.parallel import DistributedDataParallel

 from pytorch_lightning import _logger as log
+from pytorch_lightning.accelerators.nvidia_mixin import NVIDIAMixin
 from pytorch_lightning.accelerators.legacy.accelerator import Accelerator, ReduceOp
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.distributed import LightningDistributed
@@ -40,7 +41,7 @@
 from pytorch_lightning.utilities.seed import seed_everything


-class DDPSpawnAccelerator(Accelerator):
+class DDPSpawnAccelerator(Accelerator, NVIDIAMixin):

     def __init__(self,
                  trainer,
@@ -64,6 +65,10 @@ def __init__(self,

     def setup(self, model):
         os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port()))
+        # ----------------------------
+        # NVIDIA FLAGS
+        # ----------------------------
+        self.set_nvidia_flags(self.trainer.data_parallel_device_ids)

         # pass in a state q
         smp = mp.get_context('spawn')
7 changes: 6 additions & 1 deletion pytorch_lightning/accelerators/legacy/gpu_accelerator.py
@@ -15,13 +15,14 @@

 import torch

+from pytorch_lightning.accelerators.nvidia_mixin import NVIDIAMixin
 from pytorch_lightning.accelerators.legacy.accelerator import Accelerator, ReduceOp
 from pytorch_lightning.distributed.dist import LightningDistributed
 from pytorch_lightning.plugins.environments import ClusterEnvironment
 from pytorch_lightning.utilities import AMPType


-class GPUAccelerator(Accelerator):
+class GPUAccelerator(Accelerator, NVIDIAMixin):
     amp_backend: AMPType

     def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None):
@@ -39,6 +40,10 @@ def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] =
         self.nickname = None

     def setup(self, model):
+        # ----------------------------
+        # NVIDIA FLAGS
+        # ----------------------------
+        self.set_nvidia_flags(self.trainer.data_parallel_device_ids)

         # call setup
         self.trainer.call_setup_hook(model)
31 changes: 31 additions & 0 deletions pytorch_lightning/accelerators/nvidia_mixin.py
@@ -0,0 +1,31 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import os

import torch

from pytorch_lightning import _logger as log


class NVIDIAMixin:

    def set_nvidia_flags(self, data_parallel_device_ids):
        if data_parallel_device_ids is None:
            return

        # set the correct cuda visible devices (using pci order)
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())])
        devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids)
        log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]')
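For context, a rough sketch of how an accelerator picks up the mixin during setup; the FakeTrainer stand-in and TinyAccelerator below are illustrative only and not part of this PR:

from pytorch_lightning.accelerators.nvidia_mixin import NVIDIAMixin


class FakeTrainer:
    # Stand-in exposing the two attributes set_nvidia_flags reads.
    local_rank = 0
    data_parallel_device_ids = [0, 1]


class TinyAccelerator(NVIDIAMixin):
    def __init__(self, trainer):
        self.trainer = trainer

    def setup(self):
        # Pins PCI bus device ordering and logs CUDA_VISIBLE_DEVICES.
        self.set_nvidia_flags(self.trainer.data_parallel_device_ids)


TinyAccelerator(FakeTrainer()).setup()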
11 changes: 11 additions & 0 deletions pytorch_lightning/plugins/environments/cluster_environment.py
@@ -31,3 +31,14 @@ def world_size(self):

     def local_rank(self):
         pass
+
+    def global_rank(self):
+        pass
+
+    def node_rank(self):
+        pass
+
+    def set_ranks_to_trainer(self):
+        trainer.local_rank = self.accelerator_backend.cluster_environment.local_rank()
+        trainer.node_rank = self.accelerator_backend.cluster_environment.node_rank()
+        trainer.global_rank = self.accelerator_backend.cluster_environment.global_rank()
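Note that, as committed here, set_ranks_to_trainer refers to a bare trainer name and to self.accelerator_backend, neither of which ClusterEnvironment defines, so calling it would raise a NameError. A minimal sketch of the wiring it appears to intend, assuming the caller passes the trainer in (the signature is illustrative, not part of this commit):

def set_ranks_to_trainer(self, trainer) -> None:
    # Copy the ranks resolved by this cluster environment onto the trainer.
    trainer.local_rank = self.local_rank()
    trainer.node_rank = self.node_rank()
    trainer.global_rank = self.global_rank()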
176 changes: 176 additions & 0 deletions pytorch_lightning/plugins/environments/lsf_environment.py
@@ -0,0 +1,176 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import socket
import warnings
from pytorch_lightning import _logger as log
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment


class LSFEnvironment(ClusterEnvironment):
    """An environment for running on clusters managed by the LSF resource manager.

    It is expected that any execution using this ClusterEnvironment was launched
    through the Job Step Manager, i.e. jsrun.

    This plugin expects the following environment variables:

    LSB_JOBID
        The LSF assigned job ID

    LSB_HOSTS
        The hosts used in the job. This string is expected to have the format "batch <rank_0_host> ..."

    JSM_NAMESPACE_LOCAL_RANK
        The node local rank for the task. This environment variable is set by jsrun

    JSM_NAMESPACE_RANK
        The global rank for the task. This environment variable is set by jsrun

    JSM_NAMESPACE_SIZE
        The world size for the task. This environment variable is set by jsrun
    """

    def __init__(self):
        self._master_address = self._get_master_address()
        self._master_port = self._get_master_port()
        self._local_rank = self._get_local_rank()
        self._global_rank = self._get_global_rank()
        self._world_size = self._get_world_size()
        self._node_rank = self._get_node_rank()

        # set environment variables needed for initializing torch distributed process group
        os.environ["MASTER_ADDR"] = str(self._master_address)
        log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
        os.environ["MASTER_PORT"] = str(self._master_port)
        log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")

    def _read_hosts(self):
        var = "LSB_HOSTS"
        hosts = os.environ.get(var)
        if not hosts:
            raise ValueError("Could not find hosts -- expected in environment variable %s" % var)
        hosts = hosts.split()
        if len(hosts) < 2:
            raise ValueError("Cannot parse hosts from LSB_HOSTS environment variable -- "
                             "expected format \"batch <rank_0_host> ...\"")
        return hosts

    def _get_master_address(self):
        """A helper for getting the master address"""
        hosts = self._read_hosts()
        return hosts[1]

    def _get_master_port(self):
        """A helper for getting the master port

        Use the LSF job ID so all ranks can compute the master port
        """
        # check for user-specified master port
        port = os.environ.get("MASTER_PORT")
        if not port:
            var = "LSB_JOBID"
            jobid = os.environ.get(var)
            if not jobid:
                raise ValueError("Could not find job id -- expected in environment variable %s" % var)
            else:
                port = int(jobid)
                # all ports should be in the 10k+ range
                port = int(port) % 1000 + 10000
                log.debug("calculated master port")
        else:
            log.debug("using externally specified master port")
        return port

    def _get_global_rank(self):
        """A helper function for getting the global rank

        Read this from the environment variable JSM_NAMESPACE_RANK
        """
        var = "JSM_NAMESPACE_RANK"
        global_rank = os.environ.get(var)
        if global_rank is None:
            raise ValueError("Cannot determine global rank -- expected in %s "
                             "-- make sure you run your executable with jsrun" % var)
        return int(global_rank)

    def _get_local_rank(self):
        """A helper function for getting the local rank

        Read this from the environment variable JSM_NAMESPACE_LOCAL_RANK
        """
        var = "JSM_NAMESPACE_LOCAL_RANK"
        local_rank = os.environ.get(var)
        if local_rank is None:
            raise ValueError("Cannot determine local rank -- expected in %s "
                             "-- make sure you run your executable with jsrun" % var)
        return int(local_rank)

    def _get_world_size(self):
        """A helper function for getting the world size

        Read this from the environment variable JSM_NAMESPACE_SIZE
        """
        var = "JSM_NAMESPACE_SIZE"
        world_size = os.environ.get(var)
        if world_size is None:
            raise ValueError("Cannot determine world size -- expected in %s "
                             "-- make sure you run your executable with jsrun" % var)
        return int(world_size)

    def _get_node_rank(self):
        """A helper function for getting the node rank"""
        hosts = self._read_hosts()
        count = dict()
        for host in hosts:
            if 'batch' in host or 'login' in host:
                continue
            if host not in count:
                count[host] = len(count)
        return count[socket.gethostname()]

    def master_address(self):
        """
        Master address is read from a list of hosts contained in the environment variable *LSB_HOSTS*
        """
        return self._master_address

    def master_port(self):
        """
        Master port is calculated from the LSF job ID
        """
        return self._master_port

    def world_size(self):
        """
        World size is read from the environment variable JSM_NAMESPACE_SIZE
        """
        return self._world_size

    def local_rank(self):
        """
        The local rank is read from the environment variable JSM_NAMESPACE_LOCAL_RANK
        """
        return self._local_rank

    def node_rank(self):
        """
        Node rank is determined by the position of the current hostname in the list of hosts stored in LSB_HOSTS
        """
        return self._node_rank

    def global_rank(self):
        """
        The global rank is read from the environment variable JSM_NAMESPACE_RANK
        """
        return self._global_rank
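For orientation (not part of the diff): LSFEnvironment derives everything from variables that LSF and jsrun set per task. Below is a minimal smoke test with made-up values exercising the derivations above; the hostname patch is needed because node_rank looks up the current host in LSB_HOSTS, and the import path follows the file location shown in this diff:

import os
from unittest import mock

from pytorch_lightning.plugins.environments.lsf_environment import LSFEnvironment

# Illustrative values only: a 2-node job with 2 ranks per node.
os.environ.pop("MASTER_PORT", None)
os.environ.update({
    "LSB_JOBID": "1234",
    "LSB_HOSTS": "batch host0 host0 host1 host1",
    "JSM_NAMESPACE_RANK": "3",
    "JSM_NAMESPACE_LOCAL_RANK": "1",
    "JSM_NAMESPACE_SIZE": "4",
})

with mock.patch("socket.gethostname", return_value="host1"):
    env = LSFEnvironment()

assert env.master_address() == "host0"  # first host after the "batch" entry
assert env.master_port() == 10234       # 1234 % 1000 + 10000
assert env.global_rank() == 3
assert env.local_rank() == 1
assert env.world_size() == 4
assert env.node_rank() == 1             # host1 is the second distinct compute host

Later commits in this PR (a613759, 71569de) add real tests and automatic detection of the LSF environment; passing the environment to the Trainer explicitly through its plugins argument should also work once merged.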