Various ZeRO Stage3 Optimizations + Improvements (including bfloat16 support) #1453

Merged
merged 91 commits on Jan 21, 2022

Changes from all commits

Commits (91)
fe26423
Changes for bfloat16 Zero2
raamjad Aug 14, 2021
8864f91
ZeRO stage3 optimizations, with some bug fixes
Sep 29, 2021
e66aedc
fix import in ut
Oct 12, 2021
350a7a0
ran yapf
Oct 12, 2021
b37a4f0
Merge branch 'master' into s3-pr
tjruwase Oct 13, 2021
f383947
improvements to cache flush warn log
Oct 13, 2021
b2a1c95
backwards compatibility with older versions of pytorch
Oct 14, 2021
d8678fa
handle edge case where reduced tensor smaller than world size
Oct 14, 2021
a0faca0
moved event synchronization to allgather handle wait() call
Oct 14, 2021
bf20c90
removed unnecessary barrier call
Oct 14, 2021
a353017
Merge branch 'master' into s3-pr
jfc4050 Oct 14, 2021
c51ba46
formatting fix after resolving merge conflict
Oct 14, 2021
ff01f5c
skip nvme prefetch when trace not complete
Oct 14, 2021
13093eb
opportunistically avoid memory allocation in allgather coalesced wher…
Oct 15, 2021
3cdcbdf
Merge branch 'master' into s3-pr
tjruwase Oct 20, 2021
64d74d1
Merge branch 'master' into s3-pr
tjruwase Oct 21, 2021
e30e6cc
Merge branch 'master' into s3-pr
tjruwase Oct 22, 2021
f19593d
fix indentation after merge
Oct 22, 2021
f72bc78
fixes to account for parameter offload
Oct 22, 2021
660df05
accounting for torch.cuda.memory_stats not being available
Oct 22, 2021
4f9477f
moved partition_all_params to optimizer step
Oct 22, 2021
818651c
Merge branch 'master' into s3-pr
jeffra Oct 26, 2021
f681201
Merge branch 'master' into s3-pr
jfc4050 Oct 26, 2021
bb34f90
allgathering on params before item gets called
Oct 25, 2021
9f3b504
fix param status checks
Oct 25, 2021
1772d41
fix grad accumulation with optimizer offload
Oct 25, 2021
5f213d8
grad norm computation fix for optimizer offload
Oct 26, 2021
3198805
change post divide in reduce-scatter to pre divide
Oct 26, 2021
2225659
fix gradient race condition w/ optimizer offload
Oct 26, 2021
5aa9bd5
improve inf/nan gradient tracking
Oct 26, 2021
a1a60ed
don't prefetch when not in training mode
Oct 26, 2021
df41659
format fix after merging
Oct 26, 2021
ab3a82a
fix prefetching issue when using NVME offload
Oct 27, 2021
025a41e
Merge branch 'master' into s3-pr
tjruwase Oct 29, 2021
6f9415b
Merge branch 'master' into s3-pr
jfc4050 Nov 1, 2021
8d12281
Merge branch 'master' into s3-pr
jfc4050 Nov 2, 2021
a26d1fb
improved defragmentation for fp16 parameters
Oct 31, 2021
937f04e
relative imports for bf16 tests
Nov 2, 2021
e74f509
changes for bwd compatibility with pytorch 1.2
Nov 2, 2021
6ee558d
remove buffered_reduce_fallback
Nov 2, 2021
14e22a2
removed unused parameter offset bookkeeping
Nov 3, 2021
16281df
fixed tracking for multiple param groups
Nov 3, 2021
38af6b1
Merge branch 'master' into s3-pr
tjruwase Nov 3, 2021
cc7011e
unbroke bfloat16 config after merge conflict
Nov 3, 2021
806b072
using base allgather params when only 1 param
Nov 3, 2021
bf0dd66
cleanup/fixes for fp16 partition defragmentation
Nov 3, 2021
73207ae
Merge branch 'master' into s3-pr
tjruwase Nov 5, 2021
d3ecb1f
Merge branch 'master' into s3-pr
tjruwase Nov 5, 2021
812fe67
Merge branch 'master' into s3-pr
tjruwase Nov 11, 2021
6dc21a6
switch to CRLF
jeffra Nov 18, 2021
2a38302
convert to same new-line style as master
jeffra Nov 18, 2021
16f1d21
align new line with master
jeffra Nov 18, 2021
11d590a
Merge branch 'master' into s3-pr
tjruwase Nov 23, 2021
2b5f6ea
Fix merge issues
tjruwase Nov 23, 2021
80b53d3
Merge branch 'master' into s3-pr
tjruwase Nov 24, 2021
6dfe693
Merge branch 'master' into s3-pr
tjruwase Nov 24, 2021
912e6f0
switch to CRLF
jeffra Nov 29, 2021
4b0133b
fix to LF line endings
jeffra Nov 30, 2021
b998206
minor merge fixes
jeffra Nov 30, 2021
d6deecb
remove extra bfloat16_enabled definition
Nov 30, 2021
2a4ef29
asserting params inflight for AllGatherHandle
Nov 30, 2021
90182b6
remove get_cuda_mem_allocated_str
Nov 30, 2021
ad847ed
Merge branch 'master' into s3-pr
tjruwase Dec 8, 2021
f590ba4
Format fixes
tjruwase Dec 8, 2021
9db815f
fix bfloat16 zero stage check (broken after merge commit)
Dec 8, 2021
259ec15
+self.communication_data_type, -self.allreduce_always_fp32; delete de…
tjruwase Dec 8, 2021
96d2247
Add self.reduce_scatter
tjruwase Dec 9, 2021
2630b75
Merge branch 'master' into s3-pr
tjruwase Dec 9, 2021
79fd42c
Merge branch 'master' into s3-pr
tjruwase Dec 11, 2021
8565e04
Merge branch 'master' into s3-pr
jeffra Dec 14, 2021
06eab1a
Merge branch 'master' into s3-pr
tjruwase Dec 30, 2021
0f8affe
Format fix
tjruwase Dec 30, 2021
3436422
Merge branch 'master' into s3-pr
tjruwase Dec 30, 2021
601d1f1
Fix merge issues
tjruwase Dec 30, 2021
5dcee36
Merge branch 's3-pr' of github.com:jfc4050/DeepSpeed into s3-pr
tjruwase Dec 30, 2021
580d25e
Merge branch 'master' into s3-pr
tjruwase Jan 3, 2022
872f451
Merge branch 'master' into s3-pr
jeffra Jan 7, 2022
e236293
Merge branch 'master' into s3-pr
tjruwase Jan 10, 2022
43b3b83
Merge branch 'master' into s3-pr
tjruwase Jan 11, 2022
83905ac
Merge branch 'master' into s3-pr
tjruwase Jan 12, 2022
31aecfc
iterate over params_to_fetch rather than make another iterator
Jan 12, 2022
8736700
add some TODOs
Jan 14, 2022
516379d
Merge branch 'master' into s3-pr
tjruwase Jan 14, 2022
0bf7bcd
remove unnecessary division by micro_step_id
Jan 19, 2022
43c00ff
rename config keys "bfloat16" -> "bf16"
Jan 19, 2022
4574bc7
rename stage3_gather_fp16_weights_on_model_save -> stage3_gather_16bi…
Jan 19, 2022
e04dc6a
add unit test to check backwards compatibility for gather_16bit_weights
Jan 19, 2022
391cecf
added test to confirm bf16 key bwd compatibility
Jan 19, 2022
3d26469
Merge branch 'master' into s3-pr
tjruwase Jan 19, 2022
536d171
Format fixes
tjruwase Jan 19, 2022
19f3538
Merge branch 'master' into s3-pr
tjruwase Jan 20, 2022
2 changes: 1 addition & 1 deletion DeepSpeedExamples
Submodule DeepSpeedExamples updated 50 files
+0 −300 HelloDeepSpeed/README.md
+0 −8 HelloDeepSpeed/requirements.txt
+0 −0 HelloDeepSpeed/tests/__init__.py
+0 −108 HelloDeepSpeed/tests/test_train_bert.py
+0 −791 HelloDeepSpeed/train_bert.py
+0 −805 HelloDeepSpeed/train_bert_ds.py
+0 −1 Megatron-LM-v1.1.5-ZeRO3/curriculum_learning/README.md
+3 −3 Megatron-LM-v1.1.5-ZeRO3/curriculum_learning/ds_pretrain_gpt2.sh
+7 −4 Megatron-LM-v1.1.5-ZeRO3/curriculum_learning/ds_train.sh
+0 −2 Megatron-LM-v1.1.5-ZeRO3/megatron/arguments.py
+1 −2 Megatron-LM-v1.1.5-ZeRO3/megatron/checkpointing.py
+15 −38 Megatron-LM-v1.1.5-ZeRO3/megatron/learning_rates.py
+1 −5 Megatron-LM-v1.1.5-ZeRO3/megatron/model/gpt2_model.py
+1 −1 Megatron-LM-v1.1.5-ZeRO3/megatron/training.py
+2 −2 Megatron-LM-v1.1.5-ZeRO3/pretrain_gpt2.py
+1 −1 MoQ/huggingface-transformers/tests/fixtures/tests_samples/GermEval/dev.txt
+0 −2 README.md
+0 −4 autotuning/.gitignore
+0 −3 autotuning/README.md
+0 −62 autotuning/hf/README.md
+0 −58 autotuning/hf/bert-base/README.md
+0 −12 autotuning/hf/bert-base/ds_config_tune.json
+0 −114 autotuning/hf/bert-base/test_tune.sh
+0 −55 autotuning/hf/bert-large/README.md
+0 −11 autotuning/hf/bert-large/ds_config_tune.json
+0 −114 autotuning/hf/bert-large/test_tune.sh
+0 −72 autotuning/hf/deberta/README.md
+0 −16 autotuning/hf/deberta/ds_config_fp16_tune.json
+0 −127 autotuning/hf/deberta/test_tune.sh
+0 −69 autotuning/hf/distilbert/README.md
+0 −12 autotuning/hf/distilbert/ds_config_tune.json
+0 −119 autotuning/hf/distilbert/test_tune.sh
+0 −15 autotuning/hf/dsconfigs/ds_config_fp16_tune.json
+0 −9 autotuning/hf/dsconfigs/ds_config_fp16_z0.json
+0 −9 autotuning/hf/dsconfigs/ds_config_fp16_z1.json
+0 −9 autotuning/hf/dsconfigs/ds_config_fp16_z2.json
+0 −9 autotuning/hf/dsconfigs/ds_config_fp16_z3.json
+0 −12 autotuning/hf/dsconfigs/ds_config_tune.json
+0 −6 autotuning/hf/dsconfigs/ds_config_z0.json
+0 −6 autotuning/hf/dsconfigs/ds_config_z1.json
+0 −6 autotuning/hf/dsconfigs/ds_config_z2.json
+0 −6 autotuning/hf/dsconfigs/ds_config_z3.json
+0 −59 autotuning/hf/gpt2-large/README.md
+0 −132 autotuning/hf/gpt2-large/test_tune.sh
+0 −57 autotuning/hf/gpt2-medium/README.md
+0 −142 autotuning/hf/gpt2-medium/test_tune.sh
+0 −56 autotuning/hf/gpt2-xl/README.md
+0 −142 autotuning/hf/gpt2-xl/test_tune.sh
+0 −59 autotuning/hf/gpt2/README.md
+0 −133 autotuning/hf/gpt2/test_tune.sh
@@ -11,7 +11,7 @@
"stage3_max_reuse_distance": 1e9,
"stage3_prefetch_bucket_size": 5e8,
"stage3_param_persistence_threshold": 1e6,
"stage3_gather_fp16_weights_on_model_save": false,
"stage3_gather_16bit_weights_on_model_save": false,
"sub_group_size": 1e12
}
}
116 changes: 116 additions & 0 deletions deepspeed/runtime/comm/coalesced_collectives.py
@@ -0,0 +1,116 @@
"""batched collective operations for overhead amortization and better
bandwidth utilization"""

import math
from typing import List

import torch
from torch import Tensor
import torch.distributed
from torch.distributed import ProcessGroup
import torch.nn.functional

from deepspeed.utils import instrument_w_nvtx
from deepspeed.utils.logging import logger

if hasattr(torch.distributed, "_reduce_scatter_base"):

def torch_reduce_scatter_fn(input_tensor: Tensor, output_tensor: Tensor, group):
instrument_w_nvtx(torch.distributed._reduce_scatter_base)(
output_tensor,
input_tensor,
group=group,
)
else:
logger.warning(
"unable to find torch.distributed._reduce_scatter_base. will fall back to "
"torch.distributed.reduce_scatter which will result in suboptimal performance. "
"please consider upgrading your pytorch installation.")

def torch_reduce_scatter_fn(input_tensor: Tensor, output_tensor: Tensor, group):
input_tensor_lst = list(
torch.chunk(input_tensor,
torch.distributed.get_world_size(group)))
instrument_w_nvtx(torch.distributed.reduce_scatter)(
output_tensor,
input_tensor_lst,
group=group,
)


@instrument_w_nvtx
@torch.no_grad()
def reduce_scatter_coalesced(
tensors: List[Tensor],
group: ProcessGroup = None,
) -> List[Tensor]:
"""simultaneously reduce-scatter a list of tensors - this can be done more
efficiently than individual reduce scatter calls

    TODO. see if PyTorch team wants a C++ version of this for ProcessGroupNCCL
"""
this_rank = torch.distributed.get_rank(group)
world_sz = torch.distributed.get_world_size(group)

partition_lst_for_each_tensor = [None] * len(tensors)
for tensor_idx, tensor in enumerate(tensors):
flattened_tensor = tensor.view(-1)
chunk_sz = math.ceil(tensor.numel() / world_sz)
partition_lst_for_each_tensor[tensor_idx] = [
flattened_tensor[rank * chunk_sz:rank * chunk_sz + chunk_sz]
for rank in range(0,
world_sz)
]

padded_partition_sz_for_each_tensor = tuple(
math.ceil(t.numel() / world_sz) for t in tensors)

if len(tensors) == 1 and tensors[0].numel() % world_sz == 0:
# if there's only one tensor being reduced and we don't need to pad
# we have an opportunity to avoid a memory allocation
tensor_partition_flat_buffer = tensors[0].view(-1)
else:
# interleave tensor partitions such that the correct reduced partitions of each tensor
# end up at each rank
tensor_partitions_lst_with_padding = []
for rank in range(world_sz):
for tensor_idx in range(len(tensors)):
# add tensor content
tensor_chunk = partition_lst_for_each_tensor[tensor_idx][rank]
tensor_partitions_lst_with_padding.append(tensor_chunk)

# add padding if necessary
padding_sz = padded_partition_sz_for_each_tensor[
tensor_idx] - tensor_chunk.numel()
if padding_sz > 0:
tensor_partitions_lst_with_padding.append(
torch.empty(padding_sz,
dtype=tensor_chunk.dtype,
device=tensor_chunk.device))

tensor_partition_flat_buffer = instrument_w_nvtx(
torch.cat)(tensor_partitions_lst_with_padding)

tensor_partition_flat_buffer.div_(world_sz) # pre-divide
tensor_partition_buffer_for_each_rank: List[Tensor] = torch.chunk(
tensor_partition_flat_buffer,
world_sz)

# batched reduce-scatter call
torch_reduce_scatter_fn(tensor_partition_flat_buffer,
tensor_partition_buffer_for_each_rank[this_rank],
group)

# reverse procedure of the interleaving done previously, done on the
# result of the batched reduce-scatter
output_lst: List[Tensor] = [None] * len(tensors)
offset = 0
for tensor_idx in range(len(tensors)):
output_lst[tensor_idx] = tensor_partition_buffer_for_each_rank[this_rank].narrow(
0,
offset,
partition_lst_for_each_tensor[tensor_idx][this_rank].numel())

offset += padded_partition_sz_for_each_tensor[tensor_idx]

return output_lst
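
A minimal usage sketch of the new collective (hypothetical tensor sizes; assumes torch.distributed has already been initialized, e.g. by launching two ranks with torchrun):

import torch
import torch.distributed
from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced

torch.distributed.init_process_group(backend="nccl")
torch.cuda.set_device(torch.distributed.get_rank())
# two gradient tensors reduced in a single batched reduce-scatter call
grads = [torch.ones(6, device="cuda"), torch.ones(10, device="cuda")]
partitions = reduce_scatter_coalesced(grads)
# with 2 ranks, each rank gets back partitions of 3 and 5 elements; the
# values come back averaged (1.0 here) because the input is pre-divided
# by the world size before the reduce-scatter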
14 changes: 7 additions & 7 deletions deepspeed/runtime/config.py
@@ -134,12 +134,12 @@ def get_fp16_enabled(param_dict):


def get_bfloat16_enabled(param_dict):
if BFLOAT16 in param_dict.keys():
return get_scalar_param(param_dict[BFLOAT16],
BFLOAT16_ENABLED,
BFLOAT16_ENABLED_DEFAULT)
else:
return False
for key in [BFLOAT16, BFLOAT16_OLD]:
if key in param_dict.keys():
return get_scalar_param(param_dict[key],
BFLOAT16_ENABLED,
BFLOAT16_ENABLED_DEFAULT)
return False


def get_fp16_master_weights_and_grads_enabled(param_dict):
@@ -899,7 +899,7 @@ def _initialize_params(self, param_dict):
self.fp16_enabled = get_fp16_enabled(param_dict)
self.bfloat16_enabled = get_bfloat16_enabled(param_dict)
assert not (self.fp16_enabled and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled'
assert not (self.bfloat16_enabled and (self.zero_optimization_stage != 2)), 'bfloat16 mode is only enabled for Zero2 currently'
assert not (self.bfloat16_enabled and (self.zero_optimization_stage not in {2, 3})), f'bfloat16 mode is only enabled for Zero 2 and 3 currently. got {self.zero_optimization_stage}'
self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled(
param_dict)
self.amp_enabled = get_amp_enabled(param_dict)
5 changes: 3 additions & 2 deletions deepspeed/runtime/constants.py
@@ -114,11 +114,12 @@
# Users can configure in ds_config.json as below example:
BFLOAT16_FORMAT = '''
BFLOAT16 parameters should be of the format:
"bfloat16": {
"bf16": {
"enabled": true
}
'''
BFLOAT16 = "bfloat16"
BFLOAT16 = "bf16"
BFLOAT16_OLD = "bfloat16" # keeping for backwards compatibility

BFLOAT16_ENABLED = "enabled"
BFLOAT16_ENABLED_DEFAULT = False
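
For illustration, a minimal ds_config.json fragment using the renamed key together with ZeRO stage 3 (illustrative batch size; the old "bfloat16" spelling is still parsed for backwards compatibility):

{
  "train_batch_size": 8,
  "bf16": {
    "enabled": true
  },
  "zero_optimization": {
    "stage": 3
  }
}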
52 changes: 33 additions & 19 deletions deepspeed/runtime/engine.py
@@ -47,7 +47,7 @@
import deepspeed.runtime.lr_schedules as lr_schedules
import deepspeed.utils.groups as groups
from deepspeed.runtime.utils import get_grad_norm
from deepspeed.utils import logger, log_dist, init_distributed
from deepspeed.utils import logger, log_dist, init_distributed, instrument_w_nvtx
from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer
from deepspeed.utils.debug import debug_extract_module_and_param_names
from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
@@ -706,8 +706,8 @@ def zero_prefetch_bucket_size(self):
def zero_param_persistence_threshold(self):
return self._config.zero_config.param_persistence_threshold

def zero_gather_fp16_weights_on_model_save(self):
return self._config.zero_config.gather_fp16_weights_on_model_save
def zero_gather_16bit_weights_on_model_save(self):
return self._config.zero_config.gather_16bit_weights_on_model_save

def zero_grad_hooks(self):
return self._config.zero_config.grad_hooks
@@ -969,6 +969,16 @@ def is_replicated(p):
self.broadcast_src_rank,
group=self.data_parallel_group)

@staticmethod
def __check_params(model: Module, dtype: torch.dtype) -> None:
if not all(param.dtype == dtype
for param in model.parameters()) and dist.get_rank() == 0:
raise ValueError(
f"{dtype} is enabled but the following parameters have dtype that is "
f"not {dtype}: "
f"{[(n, p.dtype) for n, p in model.named_parameters() if p.dtype != dtype]}"
)

def _configure_distributed_model(self, model):
self.module = model
if self.fp16_enabled():
@@ -986,17 +996,13 @@ def _configure_distributed_model(self, model):
)
self.module.half()
elif self.bfloat16_enabled():
if self.zero_optimization_partition_weights() and any(
hasattr(param,
'ds_id') for param in self.module.parameters()):
self.__check_params(self.module, torch.bfloat16)
self.module.bfloat16()
else:
if not all(
[param.dtype == torch.float for param in self.module.parameters()]):
names = [
n for n,
p in self.module.named_parameters() if p.dtype != torch.float
]
raise ValueError(
f"fp32 is enabled but the following parameters have dtype that is not fp32: {', '.join(names)}"
)
self.__check_params(self.module, torch.float)

if not self.dont_change_device:
self.module.to(self.device)
@@ -1542,6 +1548,7 @@ def _scale_loss_by_gas(self, prescaled_loss):

return scaled_loss

@instrument_w_nvtx
def forward(self, *inputs, **kwargs):
r"""Execute forward propagation
Arguments:
@@ -1637,6 +1644,7 @@ def print_forward_breakdown(self, fwd_time):
f"rank={torch.distributed.get_rank()} time (ms) | forward: {fwd_time:.2f} (forward_moe: {moe_time:.2f}, 1st alltoall: {falltoall:.2f}, 2nd alltoall: {salltoall:.2f}, top-k: {gate_time:.2f})",
ranks=[0])

@instrument_w_nvtx
def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
# Pass (PP) gas boundary flag to optimizer (required for zero)
self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary(
@@ -1654,6 +1662,7 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
else:
self.buffered_allreduce_fallback(elements_per_buffer=bucket_size)

@instrument_w_nvtx
def backward(self, loss, allreduce_gradients=True, release_loss=False):
r"""Execute backward pass on the loss

@@ -3013,7 +3022,7 @@ def _save_zero_checkpoint(self, save_path, tag):
self._copy_recovery_script(save_path)
logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name))

def _zero3_consolidated_fp16_state_dict(self):
def _zero3_consolidated_16bit_state_dict(self):
"""

Get a full non-partitioned state_dict with fp16 weights on cpu.
@@ -3082,17 +3091,22 @@ def get_layer_state_dict(module, prefix=""):
return state_dict

def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"):
r"""Save fp16 model weights
"""has been renamed to save_16bit_model, keeping this around for backwards
compatibility"""
return self.save_16bit_model(save_dir, save_filename)

def save_16bit_model(self, save_dir, save_filename="pytorch_model.bin"):
r"""Save 16bit model weights

This method saves the fp16 model weights at the desired destination.
This method saves the 16bit model weights at the desired destination.

Arguments:
save_dir: Required. Directory for saving the model
save_filename: Optional. Filename to save to. Defaults to ``pytorch_model.bin``

Returns:
``True`` when a model has been saved, ``False`` otherwise. It will not be saved if
stage3_gather_fp16_weights_on_model_save is ``False``.
stage3_gather_16bit_weights_on_model_save is ``False``.

Important: all processes must call this method and not just the process with rank 0. It is
because the processes need to work in sync to gather the weights. This method will hang
@@ -3103,13 +3117,13 @@ def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"):
path = os.path.join(save_dir, save_filename)

if self.zero_optimization_partition_weights():
if self.zero_gather_fp16_weights_on_model_save():
if self.zero_gather_16bit_weights_on_model_save():
# consolidation is expensive in time and memory and therefore isn't a default
state_dict = self._zero3_consolidated_fp16_state_dict()
state_dict = self._zero3_consolidated_16bit_state_dict()
else:
# the model will be bogus if not consolidated so don't confuse the user by saving it
logger.info(
f"Did not save the model {path} because `stage3_gather_fp16_weights_on_model_save` is False"
f"Did not save the model {path} because `stage3_gather_16bit_weights_on_model_save` is False"
)
return False
else:
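
A short usage sketch of the renamed save path (hypothetical paths; assumes `engine` is a DeepSpeed engine running ZeRO stage 3 with "stage3_gather_16bit_weights_on_model_save" enabled):

# every rank must call this; the ranks cooperate to gather the weights
saved = engine.save_16bit_model("checkpoints", "pytorch_model.bin")
# the old entry point is kept and simply forwards to the new one
saved = engine.save_fp16_model("checkpoints", "pytorch_model.bin")
# both return False (and save nothing) when the gather option is disabled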
9 changes: 9 additions & 0 deletions deepspeed/runtime/utils.py
@@ -858,3 +858,12 @@ def call_to_str(base, *args, **kwargs):
name += ', '.join(f'{key}={repr(arg)}' for key, arg in kwargs.items())
name += ')'
return name


def get_only_unique_item(items):
item_set = set(items)
if len(item_set) != 1:
raise RuntimeError(f"expected there to be only one unique element in {items}")
unique_item, = item_set

return unique_item
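
A small sketch of how this helper can be used, e.g. to assert that a group of tensors shares a single dtype (hypothetical tensors):

import torch
from deepspeed.runtime.utils import get_only_unique_item

tensors = [torch.zeros(2), torch.ones(3)]
# returns torch.float32 here; raises RuntimeError if the dtypes were mixed
common_dtype = get_only_unique_item([t.dtype for t in tensors])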
16 changes: 11 additions & 5 deletions deepspeed/runtime/zero/config.py
@@ -36,7 +36,7 @@ def __init__(self, param_dict):
self.param_persistence_threshold = None
self.max_live_parameters = None
self.max_reuse_distance = None
self.gather_fp16_weights_on_model_save = None
self.gather_16bit_weights_on_model_save = None

self.ignore_unused_parameters = None
self.round_robin_gradients = None
@@ -171,10 +171,16 @@ def _initialize(self, zero_config_dict):
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT)

self.gather_fp16_weights_on_model_save = get_scalar_param(
zero_config_dict,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT)
# config key has been renamed to use "16bit" instead of "fp16." falling back
# to old config name in order to preserve backwards compatibility
self.gather_16bit_weights_on_model_save = ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT
for key in [
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE
]:
if key in zero_config_dict:
self.gather_16bit_weights_on_model_save = zero_config_dict[key]
break

self.ignore_unused_parameters = get_scalar_param(
zero_config_dict,
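
A small illustration of the fallback (hypothetical dict standing in for the parsed "zero_optimization" section); the new key takes precedence when both are present, and the old key alone still works:

zero_config_dict = {
    "stage": 3,
    "stage3_gather_fp16_weights_on_model_save": True,  # old spelling, still honored
}
# after _initialize(), gather_16bit_weights_on_model_save == True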
7 changes: 4 additions & 3 deletions deepspeed/runtime/zero/constants.py
@@ -116,7 +116,8 @@

# gathers params for saving a model - inefficient but is required in certain situations

[Review comment] Could someone kindly help explain which situations require this (inefficient) 16-bit parameter gathering feature, given that the zero_to_fp32.py script can already save the parameters? Thanks.

[Collaborator reply] This is for those who can't be bothered with running zero_to_fp32.py and want the 16-bit model extracted on the fly - which is fine for tiny to small models but very slow for large models.

It's also the default in the HF Trainer integration of Deepspeed to make it easy for users to start and have things work transparently. But the documentation explains how to improve upon this default.
https://huggingface.co/docs/transformers/main/main_classes/deepspeed#getting-the-model-weights-out
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_fp16_weights_on_model_save'
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_16bit_weights_on_model_save'
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False

# Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload
# Enable this option to avoid:
@@ -164,8 +165,8 @@
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD:
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE:
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT,
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE:
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT,
ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS:
ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT,
ZERO_OPTIMIZATION_LEGACY_STAGE1: