ZeRO3, improved parameter all-gather operation #1188

Merged · 39 commits · Oct 31, 2021
Changes shown from 23 of the 39 commits.

Commits
1e73e75
remove norm(), avoid memcpy after allgather
zarzen Jun 25, 2021
67b3db3
WIP: wrapped ncclAllgather as customized op in DS
zarzen Jun 30, 2021
70e681f
WIP: integrated into partition_parameters
zarzen Jun 30, 2021
81b4fc4
Fix format
zarzen Jul 1, 2021
8a14e43
Merge branch 'master' into impr_allgather_params
zarzen Jul 6, 2021
32c8fa7
cleaned dead code, modified unit test
zarzen Jul 6, 2021
c4728f5
Merge branch 'master' into impr_allgather_params
zarzen Jul 6, 2021
e075fd4
Merge branch 'master' into impr_allgather_params
tjruwase Jul 14, 2021
5208508
removed customized c++ extension
zarzen Jul 23, 2021
ffd3d3b
Merge remote-tracking branch 'origin/master' into impr_allgather_params
zarzen Jul 23, 2021
1ed96ce
change torch.ones to torch empty
zarzen Jul 23, 2021
220f2e0
Merge branch 'master' into impr_allgather_params
tjruwase Jul 27, 2021
8f65594
Merge branch 'master' into impr_allgather_params
tjruwase Jul 29, 2021
0e6d8e0
typo
zarzen Aug 10, 2021
691749f
Merge branch 'master' into impr_allgather_params
tjruwase Sep 7, 2021
88e750e
Merge branch 'master' into impr_allgather_params
tjruwase Sep 8, 2021
497ee7d
Merge branch 'master' into impr_allgather_params
tjruwase Sep 9, 2021
bd8839c
Merge branch 'master' into impr_allgather_params
tjruwase Sep 9, 2021
2582910
Merge branch 'master' into impr_allgather_params
tjruwase Sep 10, 2021
4ca0d39
Merge branch 'master' into impr_allgather_params
tjruwase Sep 11, 2021
56de9ad
Merge branch 'master' into impr_allgather_params
tjruwase Oct 5, 2021
056cf10
Merge branch 'master' into impr_allgather_params
tjruwase Oct 6, 2021
aac09cd
Merge branch 'master' into impr_allgather_params
tjruwase Oct 7, 2021
6201b29
Merge branch 'master' into impr_allgather_params
tjruwase Oct 12, 2021
50a9215
warn if not cuda tensor for allgather
zarzen Oct 15, 2021
c554a58
Merge branch 'master' into impr_allgather_params
jeffra Oct 15, 2021
b7e131d
Merge branch 'master' into impr_allgather_params
tjruwase Oct 21, 2021
813cb22
fix formatting
zarzen Oct 21, 2021
588d3d0
Merge branch 'master' into impr_allgather_params
tjruwase Oct 22, 2021
eb0a540
Merge branch 'master' into impr_allgather_params
tjruwase Oct 22, 2021
c092b78
fix: move ds_tensor to cuda device
zarzen Oct 22, 2021
e73809d
Merge branch 'master' into impr_allgather_params
tjruwase Oct 27, 2021
d1d3c28
Merge branch 'master' into impr_allgather_params
tjruwase Oct 27, 2021
62cb104
Merge branch 'master' into impr_allgather_params
tjruwase Oct 27, 2021
ab64b17
Merge branch 'master' into impr_allgather_params
tjruwase Oct 28, 2021
7a80172
remove try clause on the path for fetching params
zarzen Oct 30, 2021
f01dad8
Merge branch 'microsoft:master' into impr_allgather_params
zarzen Oct 30, 2021
524e609
Merge branch 'master' into impr_allgather_params
tjruwase Oct 30, 2021
d7fff58
Merge branch 'master' into impr_allgather_params
tjruwase Oct 31, 2021
112 changes: 98 additions & 14 deletions deepspeed/runtime/zero/partition_parameters.py
@@ -11,7 +11,12 @@
import itertools

import torch
from torch.distributed.distributed_c10d import _get_global_rank
from torch.distributed.distributed_c10d import _get_global_rank, group

try:
from torch.distributed.distributed_c10d import _all_gather_base as all_gather
except:
from torch.distributed.distributed_c10d import all_gather

from .linear import LinearModuleForZeroStage3, LinearFunctionForZeroStage3
from .offload_constants import *
@@ -678,7 +683,9 @@ def _all_gather(self, param_list, async_op=False, hierarchy=None):
all_gather_list.append(param)

if not async_op:
ret_value = self._allgather_params(all_gather_list, hierarchy=hierarchy)
# ret_value = self._allgather_params(all_gather_list, hierarchy=hierarchy)
ret_value = self._allgather_params_coalesced(all_gather_list, hierarchy)

for param in all_gather_list:
param.ds_status = ZeroParamStatus.AVAILABLE
return ret_value
@@ -724,8 +731,10 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
see_memory_usage(
f'Before partitioning param {param.ds_id} {param.shape}',
force=False)

# param.data does not store anything meaningful in partitioned state
param.data = torch.ones(1, dtype=self.dtype).to(param.device)
param.data = torch.empty(1, dtype=self.dtype, device=param.device)

see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}',
force=False)

@@ -746,7 +755,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
numel=partition_size):
final_location = OFFLOAD_NVME_DEVICE
buffer = self.param_swapper.get_buffer(param, partition_size)
partitioned_tensor = torch.zeros(1,
partitioned_tensor = torch.empty(1,
dtype=param.dtype,
device=buffer.device)
partitioned_tensor.data = buffer.data
@@ -755,7 +764,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
)

else:
partitioned_tensor = torch.zeros(
partitioned_tensor = torch.empty(
partition_size,
dtype=param.dtype,
device=OFFLOAD_CPU_DEVICE
@@ -865,22 +874,97 @@ def _allgather_param(self, param, async_op=False, hierarchy=0):
# param.ds_numel).view(param.ds_shape)
# param.data = replicated_tensor.data
# return None
partitions = []
for i in range(self.world_size):
partitions.append(flat_tensor.narrow(0, partition_size * i, partition_size))
try:
# try the _all_gather_base on PyTorch master branch
handle = all_gather(flat_tensor,
param.ds_tensor,
group=self.ds_process_group,
async_op=async_op)
except:
partitions = []
for i in range(self.world_size):
partitions.append(
flat_tensor.narrow(0,
partition_size * i,
partition_size))

if i == torch.distributed.get_rank(group=self.ds_process_group):
partitions[i].data.copy_(param.ds_tensor.data, non_blocking=True)
if i == torch.distributed.get_rank(group=self.ds_process_group):
partitions[i].data.copy_(param.ds_tensor.data, non_blocking=True)

handle = torch.distributed.all_gather(partitions,
partitions[self.rank],
group=self.ds_process_group,
async_op=async_op)
handle = torch.distributed.all_gather(partitions,
partitions[self.rank],
group=self.ds_process_group,
async_op=async_op)

replicated_tensor = flat_tensor.narrow(0, 0, param.ds_numel).view(param.ds_shape)
param.data = replicated_tensor.data
return handle

def _allgather_params_coalesced(self, param_list, hierarchy=0):
""" blocking call
avoid explicit memory copy in _allgather_params
"""
if len(param_list) == 0:
return
# collect local tensors and partition sizes
partition_sizes = []
local_tensors = []
for param in param_list:
partition_sizes.append(param.ds_tensor.ds_numel)
local_tensors.append(param.ds_tensor)

# allocate memory for allgather params
allgather_params = []
for psize in partition_sizes:
tensor_size = psize * self.world_size
flat_tensor = torch.empty(tensor_size,
dtype=param_list[0].dtype,
device=self.local_device).view(-1)
flat_tensor.requires_grad = False
allgather_params.append(flat_tensor)

# launch
launch_handles = []
# backend = get_backend(self.ds_process_group)
# with _batch_p2p_manager(backend):
for param_idx, param in enumerate(param_list):
input_tensor = local_tensors[param_idx].view(-1)

try:
# try the _all_gather_base from Pytorch master
h = all_gather(allgather_params[param_idx],
input_tensor,
group=self.ds_process_group,
async_op=True)
jeffra (Collaborator) commented on Oct 7, 2021:
It seems one of the errors is coming from this line, which results in
RuntimeError: Invalid function argument. Expected parameter `tensor_list` to be of type List[torch.Tensor].
I think allgather_params[param_idx] in this case is just a Tensor, not a list of Tensors?

jeffra (Collaborator):
Reading more of the context here, sorry. I see you're trying to use _all_gather_base; which version of PyTorch introduced it? The CI runs in question here are on torch 1.8.2 LTS.

zarzen (Contributor, Author):
_all_gather_base is available on the PyTorch master branch, probably version 1.10+. That's why I used a try clause so the all-gather falls back to the distributed.all_gather call.

zarzen (Contributor, Author):
@jeffra @tjruwase Sorry for the late reply. I just confirmed this PR works with PyTorch 1.8.0. I also checked the failure log again: it does raise a RuntimeError at line 935, but that exception is caught by the except clause, which builds a list of tensors for the all_gather API.

except:
output_list = []
for i in range(self.world_size):
psize = partition_sizes[param_idx]
partition = allgather_params[param_idx].narrow(0, i * psize, psize)
output_list.append(partition)

# back to old all_gather function signature
h = all_gather(output_list,
input_tensor,
group=self.ds_process_group,
async_op=True)
launch_handles.append(h)

# Wait ensures the operation is enqueued, but not necessarily complete.
launch_handles[-1].wait()

# assign to param.data (not copy)
for i, param in enumerate(param_list):
gathered_tensor = allgather_params[i]
param.data = gathered_tensor.narrow(0,
0,
param.ds_numel).view(param.ds_shape).data

# guarantee the communication to be completed
torch.cuda.synchronize()

return None

def _allgather_params(self, param_list, hierarchy=0):
if len(param_list) == 0:
return
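
For reference, the fallback discussed in the review thread above can be written as a small standalone helper. The sketch below is not the PR's code: the helper name allgather_flat is invented for illustration, it uses hasattr feature detection where the PR uses try/except, and it assumes an already-initialized torch.distributed process group (e.g. launched via torchrun) with the shard on the right device for the backend.

```python
# Minimal sketch (not DeepSpeed code): gather each rank's flat 1-D shard into one
# contiguous output tensor, preferring _all_gather_base when PyTorch provides it.
import torch
import torch.distributed as dist


def allgather_flat(shard: torch.Tensor, group=None) -> torch.Tensor:
    """shard: this rank's 1-D partition (same numel on every rank).
    Returns a tensor of size world_size * shard.numel()."""
    world_size = dist.get_world_size(group=group)
    output = torch.empty(world_size * shard.numel(),
                         dtype=shard.dtype,
                         device=shard.device)

    if hasattr(dist, "_all_gather_base"):
        # Newer PyTorch (master / roughly 1.10+ per the thread above):
        # one flat-output all-gather, no per-rank output list or extra copies.
        dist._all_gather_base(output, shard, group=group)
    else:
        # Older PyTorch: list-based all_gather into narrow() views of the same
        # flat buffer, so the result is still contiguous with no copy afterwards.
        partitions = [output.narrow(0, r * shard.numel(), shard.numel())
                      for r in range(world_size)]
        dist.all_gather(partitions, shard, group=group)
    return output
```

Avoiding the per-rank output list and the memcpy after the gather is the same motivation as the PR's "remove norm(), avoid memcpy after allgather" commit; on older PyTorch the narrow()-view fallback produces an equivalent flat result.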
2 changes: 1 addition & 1 deletion deepspeed/runtime/zero/stage3.py
@@ -444,7 +444,7 @@ def fetch_sub_module(self, sub_module):
self.hierarchy += 1

# parameters are partitioned and need to be allgathered
self._all_gather(partitioned_params, async_op=True)
self._all_gather(partitioned_params, async_op=False)

# parameters are inflight and communication needs to be completed
if partitioned_params or params_in_flight:
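
To make the control flow of the new coalesced path easier to follow, here is a toy sketch of the same pattern with no DeepSpeed types. It is an illustration under stated assumptions, not the PR's implementation: the function name allgather_coalesced and its arguments are made up, an initialized NCCL process group with CUDA shards is assumed, and it sticks to the list-based all_gather so it also runs on PyTorch 1.8.

```python
# Toy sketch of the coalesced pattern: queue one async all-gather per parameter,
# then block once before handing the gathered tensors back.
import math

import torch
import torch.distributed as dist


def allgather_coalesced(shards, shapes, group=None):
    """shards: list of 1-D local partitions (same size on every rank);
    shapes: original shapes of the corresponding full parameters."""
    world_size = dist.get_world_size(group=group)
    outputs, handles = [], []
    for shard in shards:
        flat = torch.empty(world_size * shard.numel(),
                           dtype=shard.dtype,
                           device=shard.device)
        outputs.append(flat)
        # Gather directly into narrow() views of the flat buffer;
        # async_op=True lets the collectives for different params overlap.
        views = [flat.narrow(0, r * shard.numel(), shard.numel())
                 for r in range(world_size)]
        handles.append(dist.all_gather(views, shard, group=group, async_op=True))

    # wait() only guarantees the last op has been enqueued on the communication
    # stream; the device synchronize guarantees completion before the data is read.
    handles[-1].wait()
    torch.cuda.synchronize()

    # Trim any padding and re-view each flat buffer as the original shape.
    return [flat.narrow(0, 0, math.prod(shape)).view(shape)
            for flat, shape in zip(outputs, shapes)]
```

The PR follows the same shape: it waits on the last handle, re-points each param.data at a narrow()+view() of its gathered buffer, and then calls torch.cuda.synchronize() to guarantee the communication has completed. That blocking behavior is why fetch_sub_module in stage3.py now calls _all_gather with async_op=False, routing the fetch through the coalesced path.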