Various ZeRO Stage3 Optimizations + Improvements (including bfloat16 support) #1453

Merged
91 commits merged on Jan 21, 2022

Changes from 4 commits

Commits (91)
fe26423
Changes for bfloat16 Zero2
raamjad Aug 14, 2021
8864f91
ZeRO stage3 optimizations, with some bug fixes
Sep 29, 2021
e66aedc
fix import in ut
Oct 12, 2021
350a7a0
ran yapf
Oct 12, 2021
b37a4f0
Merge branch 'master' into s3-pr
tjruwase Oct 13, 2021
f383947
improvements to cache flush warn log
Oct 13, 2021
b2a1c95
backwards compatibility with older versions of pytorch
Oct 14, 2021
d8678fa
handle edge case where reduced tensor smaller than world size
Oct 14, 2021
a0faca0
moved event synchronization to allgather handle wait() call
Oct 14, 2021
bf20c90
removed unnecessary barrier call
Oct 14, 2021
a353017
Merge branch 'master' into s3-pr
jfc4050 Oct 14, 2021
c51ba46
formatting fix after resolving merge conflict
Oct 14, 2021
ff01f5c
skip nvme prefetch when trace not complete
Oct 14, 2021
13093eb
opportunistically avoid memory allocation in allgather coalesced wher…
Oct 15, 2021
3cdcbdf
Merge branch 'master' into s3-pr
tjruwase Oct 20, 2021
64d74d1
Merge branch 'master' into s3-pr
tjruwase Oct 21, 2021
e30e6cc
Merge branch 'master' into s3-pr
tjruwase Oct 22, 2021
f19593d
fix indentation after merge
Oct 22, 2021
f72bc78
fixes to account for parameter offload
Oct 22, 2021
660df05
accounting for torch.cuda.memory_stats not being available
Oct 22, 2021
4f9477f
moved partition_all_params to optimizer step
Oct 22, 2021
818651c
Merge branch 'master' into s3-pr
jeffra Oct 26, 2021
f681201
Merge branch 'master' into s3-pr
jfc4050 Oct 26, 2021
bb34f90
allgathering on params before item gets called
Oct 25, 2021
9f3b504
fix param status checks
Oct 25, 2021
1772d41
fix grad accumulation with optimizer offload
Oct 25, 2021
5f213d8
grad norm computation fix for optimizer offload
Oct 26, 2021
3198805
change post divide in reduce-scatter to pre divide
Oct 26, 2021
2225659
fix gradient race condition w/ optimizer offload
Oct 26, 2021
5aa9bd5
improve inf/nan gradient tracking
Oct 26, 2021
a1a60ed
don't prefetch when not in training mode
Oct 26, 2021
df41659
format fix after merging
Oct 26, 2021
ab3a82a
fix prefetching issue when using NVME offload
Oct 27, 2021
025a41e
Merge branch 'master' into s3-pr
tjruwase Oct 29, 2021
6f9415b
Merge branch 'master' into s3-pr
jfc4050 Nov 1, 2021
8d12281
Merge branch 'master' into s3-pr
jfc4050 Nov 2, 2021
a26d1fb
improved defragmentation for fp16 parameters
Oct 31, 2021
937f04e
relative imports for bf16 tests
Nov 2, 2021
e74f509
changes for bwd compatibility with pytorch 1.2
Nov 2, 2021
6ee558d
remove buffered_reduce_fallback
Nov 2, 2021
14e22a2
removed unused parameter offset bookkeeping
Nov 3, 2021
16281df
fixed tracking for multiple param groups
Nov 3, 2021
38af6b1
Merge branch 'master' into s3-pr
tjruwase Nov 3, 2021
cc7011e
unbroke bfloat16 config after merge conflict
Nov 3, 2021
806b072
using base allgather params when only 1 param
Nov 3, 2021
bf0dd66
cleanup/fixes for fp16 partition defragmentation
Nov 3, 2021
73207ae
Merge branch 'master' into s3-pr
tjruwase Nov 5, 2021
d3ecb1f
Merge branch 'master' into s3-pr
tjruwase Nov 5, 2021
812fe67
Merge branch 'master' into s3-pr
tjruwase Nov 11, 2021
6dc21a6
switch to CRLF
jeffra Nov 18, 2021
2a38302
convert to same new-line style as master
jeffra Nov 18, 2021
16f1d21
align new line with master
jeffra Nov 18, 2021
11d590a
Merge branch 'master' into s3-pr
tjruwase Nov 23, 2021
2b5f6ea
Fix merge issues
tjruwase Nov 23, 2021
80b53d3
Merge branch 'master' into s3-pr
tjruwase Nov 24, 2021
6dfe693
Merge branch 'master' into s3-pr
tjruwase Nov 24, 2021
912e6f0
switch to CRLF
jeffra Nov 29, 2021
4b0133b
fix to LF line endings
jeffra Nov 30, 2021
b998206
minor merge fixes
jeffra Nov 30, 2021
d6deecb
remove extra bfloat16_enabled definition
Nov 30, 2021
2a4ef29
asserting params inflight for AllGatherHandle
Nov 30, 2021
90182b6
remove get_cuda_mem_allocated_str
Nov 30, 2021
ad847ed
Merge branch 'master' into s3-pr
tjruwase Dec 8, 2021
f590ba4
Format fixes
tjruwase Dec 8, 2021
9db815f
fix bfloat16 zero stage check (broken after merge commit)
Dec 8, 2021
259ec15
+self.communication_data_type, -self.allreduce_always_fp32; delete de…
tjruwase Dec 8, 2021
96d2247
Add self.reduce_scatter
tjruwase Dec 9, 2021
2630b75
Merge branch 'master' into s3-pr
tjruwase Dec 9, 2021
79fd42c
Merge branch 'master' into s3-pr
tjruwase Dec 11, 2021
8565e04
Merge branch 'master' into s3-pr
jeffra Dec 14, 2021
06eab1a
Merge branch 'master' into s3-pr
tjruwase Dec 30, 2021
0f8affe
Format fix
tjruwase Dec 30, 2021
3436422
Merge branch 'master' into s3-pr
tjruwase Dec 30, 2021
601d1f1
Fix merge issues
tjruwase Dec 30, 2021
5dcee36
Merge branch 's3-pr' of github.com:jfc4050/DeepSpeed into s3-pr
tjruwase Dec 30, 2021
580d25e
Merge branch 'master' into s3-pr
tjruwase Jan 3, 2022
872f451
Merge branch 'master' into s3-pr
jeffra Jan 7, 2022
e236293
Merge branch 'master' into s3-pr
tjruwase Jan 10, 2022
43b3b83
Merge branch 'master' into s3-pr
tjruwase Jan 11, 2022
83905ac
Merge branch 'master' into s3-pr
tjruwase Jan 12, 2022
31aecfc
iterate over params_to_fetch rather than make another iterator
Jan 12, 2022
8736700
add some TODOs
Jan 14, 2022
516379d
Merge branch 'master' into s3-pr
tjruwase Jan 14, 2022
0bf7bcd
remove unnecessary division by micro_step_id
Jan 19, 2022
43c00ff
rename config keys "bfloat16" -> "bf16"
Jan 19, 2022
4574bc7
rename stage3_gather_fp16_weights_on_model_save -> stage3_gather_16bi…
Jan 19, 2022
e04dc6a
add unit test to check backwards compatibility for gather_16bit_weights
Jan 19, 2022
391cecf
added test to confirm bf16 key bwd compatibility
Jan 19, 2022
3d26469
Merge branch 'master' into s3-pr
tjruwase Jan 19, 2022
536d171
Format fixes
tjruwase Jan 19, 2022
19f3538
Merge branch 'master' into s3-pr
tjruwase Jan 20, 2022
10 changes: 10 additions & 0 deletions csrc/includes/type_shim.h
@@ -26,6 +26,11 @@
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}

@@ -46,6 +51,11 @@
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}

88 changes: 88 additions & 0 deletions deepspeed/runtime/comm/coalesced_collectives.py
@@ -0,0 +1,88 @@
"""batched collective operations for overhead amortization and better
bandwidth utilization"""

import math
from typing import List

import torch
from torch import Tensor
import torch.distributed
from torch.distributed import ProcessGroup
import torch.nn.functional

from deepspeed.utils import instrument_w_nvtx


@instrument_w_nvtx
@torch.no_grad()
def reduce_scatter_coalesced(
tensors: List[Tensor],
group: ProcessGroup = None,
) -> List[Tensor]:
"""simultaneously reduce-scatter a list of tensors - this can be done more
efficiently than individual reduce scatter calls

TODO. see if PyTorch team wants a c++ verson of this for ProcessGroupNCCL
"""
this_rank = torch.distributed.get_rank(group)
world_sz = torch.distributed.get_world_size(group)

partition_lst_for_each_tensor = tuple(
torch.chunk(tensor.view(-1),
world_sz) for tensor in tensors)
jfc4050 marked this conversation as resolved.
Show resolved Hide resolved
padded_partition_sz_for_each_tensor = tuple(
math.ceil(t.numel() / world_sz) for t in tensors)

if len(tensors) == 1 and tensors[0].numel() % world_sz == 0:
# if there's only one tensor being reduced and we don't need to pad
# we have an opportunity to avoid a memory allocation
tensor_partition_flat_buffer = tensors[0].view(-1)
else:
# interleave tensor partitions such that the correct reduced partitions of each tensor
# end up at each rank
tensor_partitions_lst_with_padding = []
for rank in range(world_sz):
for tensor_idx in range(len(tensors)):
# add tensor content
tensor_chunk = partition_lst_for_each_tensor[tensor_idx][rank]
tensor_partitions_lst_with_padding.append(tensor_chunk)

# add padding if necessary
padding_sz = padded_partition_sz_for_each_tensor[
tensor_idx] - tensor_chunk.numel()
if padding_sz > 0:
tensor_partitions_lst_with_padding.append(
torch.empty(padding_sz,
dtype=tensor_chunk.dtype,
device=tensor_chunk.device))

tensor_partition_flat_buffer = instrument_w_nvtx(
torch.cat)(tensor_partitions_lst_with_padding)

tensor_partition_buffer_for_each_rank: List[Tensor] = torch.chunk(
tensor_partition_flat_buffer,
world_sz)

# batched reduce-scatter call
instrument_w_nvtx(torch.distributed._reduce_scatter_base)(
tensor_partition_buffer_for_each_rank[this_rank],
tensor_partition_flat_buffer,
group=group,
)

# post-divide
tensor_partition_buffer_for_each_rank[this_rank].div_(world_sz)

# reverse procedure of the interleaving done previously, done on the
# result of the batched reduce-scatter
output_lst: List[Tensor] = [None] * len(tensors)
offset = 0
for tensor_idx in range(len(tensors)):
output_lst[tensor_idx] = tensor_partition_buffer_for_each_rank[this_rank].narrow(
0,
offset,
partition_lst_for_each_tensor[tensor_idx][this_rank].numel())

offset += padded_partition_sz_for_each_tensor[tensor_idx]

return output_lst
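
Editor's note: for orientation, a minimal usage sketch of the new helper, assuming torch.distributed is already initialized with an NCCL backend and one CUDA device per rank; the tensor shapes and variable names are hypothetical and not part of this diff.

# hypothetical example - every rank calls this collectively with its local gradients
import torch
from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced

local_grads = [torch.randn(1024, device="cuda"), torch.randn(4000, device="cuda")]
shards = reduce_scatter_coalesced(local_grads)  # group=None -> default world group

# shards[i] is this rank's partition of local_grads[i], averaged across ranks
# (the post-divide by world size turns the reduced sum into a mean)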
18 changes: 17 additions & 1 deletion deepspeed/runtime/config.py
@@ -114,6 +114,15 @@ def get_fp16_enabled(param_dict):
return False


def get_bfloat16_enabled(param_dict):
if BFLOAT16 in param_dict.keys():
return get_scalar_param(param_dict[BFLOAT16],
BFLOAT16_ENABLED,
BFLOAT16_ENABLED_DEFAULT)
else:
return False


def get_fp16_master_weights_and_grads_enabled(param_dict):
if get_fp16_enabled(param_dict):
return get_scalar_param(param_dict[FP16],
@@ -128,6 +137,8 @@ def get_loss_scale(param_dict):
return get_scalar_param(param_dict[FP16],
FP16_LOSS_SCALE,
FP16_LOSS_SCALE_DEFAULT)
elif get_bfloat16_enabled(param_dict):
return 1.0
else:
return FP16_LOSS_SCALE_DEFAULT

@@ -137,6 +148,8 @@ def get_initial_dynamic_scale(param_dict):
initial_scale_power = get_scalar_param(param_dict[FP16],
FP16_INITIAL_SCALE_POWER,
FP16_INITIAL_SCALE_POWER_DEFAULT)
elif get_bfloat16_enabled(param_dict):
initial_scale_power = 0
else:
initial_scale_power = FP16_INITIAL_SCALE_POWER_DEFAULT

@@ -791,6 +804,9 @@ def _initialize_params(self, param_dict):
self.fp16_enabled = get_fp16_enabled(param_dict)
self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled(
param_dict)
self.bfloat16_enabled = get_bfloat16_enabled(param_dict)
assert not (self.fp16_enabled and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled'
assert not (self.bfloat16_enabled and (self.zero_optimization_stage not in {2, 3})), f"bfloat16 mode is only enabled for ZeRO 2 and 3 currently, got {self.zero_optimization_stage}"
self.amp_enabled = get_amp_enabled(param_dict)
self.amp_params = get_amp_params(param_dict)
self.loss_scale = get_loss_scale(param_dict)
@@ -964,7 +980,7 @@ def _do_error_check(self):
assert self.zero_enabled and self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "Fp16_master_weights_and_grads is only supported with ZeRO Stage 2 for now."

def _do_warning_check(self):
fp16_enabled = self.fp16_enabled or self.zero_enabled
fp16_enabled = self.fp16_enabled

vocabulary_size = self._param_dict.get(VOCABULARY_SIZE, VOCABULARY_SIZE_DEFAULT)
if vocabulary_size and vocabulary_size % TENSOR_CORE_ALIGN_SIZE != 0:
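
Editor's note: taken together, the new checks above require that fp16 and bfloat16 are not both enabled and that bfloat16 is used only with ZeRO stage 2 or 3. A minimal sketch of a consistent configuration, written as a Python dict mirroring ds_config.json; the values are illustrative only, and a later commit in this PR renames the "bfloat16" key to "bf16".

# hypothetical ds_config sketch - values are illustrative only
ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "zero_optimization": {
        "stage": 3  # bfloat16 is only allowed with ZeRO stage 2 or 3 at this point
    },
    "bfloat16": {
        "enabled": True  # mutually exclusive with "fp16": {"enabled": True}
    },
}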
16 changes: 16 additions & 0 deletions deepspeed/runtime/constants.py
@@ -107,6 +107,22 @@
SPARSE_GRADIENTS = "sparse_gradients"
SPARSE_GRADIENTS_DEFAULT = False

#########################################
# BFLOAT16 support
#########################################
# BFLOAT16 feature. By default, this feature is not enabled.
# Users can configure in ds_config.json as below example:
BFLOAT16_FORMAT = '''
BFLOAT16 parameters should be of the format:
"bfloat16": {
"enabled": true
}
'''
BFLOAT16 = "bfloat16"
stas00 (Collaborator) commented on this line, Oct 12, 2021:

Re: bfloat16 - I proposed some months back that we don't create too many config entries, but instead switch to using a new dtype block, where the user can flip between bf16, fp16, and fp32.

Especially since they are mutually exclusive.

But that discussion wasn't concluded; now is a good time to bring it back.

The PR author (Contributor) replied:

That sounds reasonable. Note that this change will be published as part of a separate PR; I just made my changes on top of it, so it ended up in here. I will rebase once that PR makes it in.

A Collaborator replied:

Please note that I'm just a contributor, so my suggestions are just that - suggestions. Therefore, in order not to waste your time, please first secure agreement from the DeepSpeed team when it comes to changing APIs.

In particular this one, as it would require back-compat code to support the current config.

A Contributor commented:

@raamjad, in the context of #1398, if you could add this dtype block, perhaps as a follow-up PR, that would be great. Thanks!

A Contributor replied:

Since the current coverage of bf16 is very limited, I thought it might be better to do this refactoring of the config later, once there is higher coverage. Do you prefer that this be done now?
@stas00, can you point me to your comments where you suggested the dtype block, so I know what shape of config changes were proposed?

stas00 (Collaborator) commented, Oct 27, 2021:

One of my proposals was very simple: add a new top-level config key dtype, drop enabled from the fp16 config, and add a bf16 block:

{
    "dtype": "bf16",
    "fp16": {
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        [...]
    },    
}

This approach allows users to keep nuanced settings for each dtype in the same config, but dtype will enable one over the others, so one can easily switch the settings in only one place.

It hasn't been approved by the Deepspeed team (as in no decision has been made about it).

@tjruwase, if you have access to the Teams log from Apr-30, that is when we discussed this. But it won't show it to me - the search only shows a snippet. Search for 'dtype mockup'.


The other proposal is to have a single dtype block that takes over the fp16 block. This would be useful if many of the config options of bf16 and fp16 overlap. So:

{
    "dtype": {
        "enabled": "fp16",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
}

and for bf16:

{
    "dtype": {
        "enabled": "bf16",
        [...]
    },
}

stas00 (Collaborator) added, Oct 27, 2021:

As a maintainer of the DeepSpeed integration in HF Transformers, I find it easiest when users can download ready-to-use config files; in my experience, having all the config sections predefined in the file makes things easier for the user. So the first approach would be preferable for that particular use case.

But of course there can be other ways...

A Contributor replied:

@stas00, thanks for reviving your awesome proposals. Sorry, mere mortals such as we can barely keep up with your genius :).

@stas00, @raamjad, is it okay if we continue this chat on #1398? I will add a link to this thread and also post the referenced Teams chat. Thanks.

stas00 (Collaborator) replied:

You're too kind, @tjruwase - it's easy to come up with ideas, it's far from easy to make them a reality. So that's where your genius comes in ;)

Thank you for finding that old discussion and repasting it here, as MSFT Teams won't let me access it.


BFLOAT16_ENABLED = "enabled"
BFLOAT16_ENABLED_DEFAULT = False

#########################################
# FP16 support
#########################################
50 changes: 29 additions & 21 deletions deepspeed/runtime/engine.py
@@ -43,7 +43,7 @@
import deepspeed.runtime.lr_schedules as lr_schedules
import deepspeed.utils.groups as groups
from deepspeed.runtime.utils import get_grad_norm
from deepspeed.utils import logger, log_dist, init_distributed
from deepspeed.utils import logger, log_dist, init_distributed, instrument_w_nvtx
from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer
from deepspeed.utils.debug import debug_extract_module_and_param_names
from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
@@ -82,6 +82,7 @@ def split_half_float_double_csr(tensors):
"torch.cuda.HalfTensor",
"torch.cuda.FloatTensor",
"torch.cuda.DoubleTensor",
"torch.cuda.BFloat16Tensor",
CSRTensor.type()
]

@@ -527,6 +528,9 @@ def fp16_enabled(self):
def fp16_master_weights_and_gradients(self):
return self._config.fp16_master_weights_and_gradients

def bfloat16_enabled(self):
return self._config.bfloat16_enabled

def amp_enabled(self):
return self._config.amp_enabled

@@ -740,32 +744,32 @@ def is_replicated(p):
self.broadcast_src_rank,
group=self.data_parallel_group)

@staticmethod
def __check_params(model: Module, dtype: torch.dtype) -> None:
if not all(param.dtype == dtype
for param in model.parameters()) and dist.get_rank() == 0:
raise ValueError(
f"{dtype} is enabled but the following parameters have dtype that is "
f"not {dtype}: "
f"{[(n, p.dtype) for n, p in model.named_parameters() if p.dtype != dtype]}"
)

def _configure_distributed_model(self, model):
self.module = model
if self.fp16_enabled():
if self.zero_optimization_partition_weights() and any(
[hasattr(param,
'ds_id') for param in self.module.parameters()]):
if not all(
[param.dtype == torch.half for param in self.module.parameters()]):
names = [
n for n,
p in self.module.named_parameters() if p.dtype != torch.half
]
raise ValueError(
f"fp16 is enabled but the following parameters have dtype that is not fp16: {', '.join(names)}"
)
self.__check_params(self.module, torch.half)
self.module.half()
elif self.bfloat16_enabled():
if self.zero_optimization_partition_weights() and any(
hasattr(param,
'ds_id') for param in self.module.parameters()):
self.__check_params(self.module, torch.bfloat16)
self.module.bfloat16()
else:
if not all(
[param.dtype == torch.float for param in self.module.parameters()]):
names = [
n for n,
p in self.module.named_parameters() if p.dtype != torch.float
]
raise ValueError(
f"fp32 is enabled but the following parameters have dtype that is not fp32: {', '.join(names)}"
)
self.__check_params(self.module, torch.float)

if not self.dont_change_device:
self.module.to(self.device)
@@ -882,7 +886,7 @@ def _configure_optimizer(self, client_optimizer, model_parameters):
)
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
elif self.amp_enabled():
assert not self.fp16_enabled(), "Cannot enable both amp with (legacy) fp16 mode"
assert not (self.fp16_enabled() or self.bfloat16_enabled()), "Cannot enable amp with either (legacy) fp16 or bfloat16 mode"
amp_params = self.amp_params()
if self.global_rank == 0:
logger.info(f"Initializing AMP with these params: {amp_params}")
@@ -1278,6 +1282,7 @@ def _scale_loss_by_gas(self, prescaled_loss):

return scaled_loss

@instrument_w_nvtx
def forward(self, *inputs, **kwargs):
r"""Execute forward propagation

Expand Down Expand Up @@ -1318,7 +1323,8 @@ def forward(self, *inputs, **kwargs):
if self.training_dataloader is None:
self.tput_timer.start()

loss = self.module(*inputs, **kwargs)
with torch.cuda.nvtx.range("DeepspeedEngine.forward::module_forward"):
loss = self.module(*inputs, **kwargs)

if self.zero_optimization_partition_weights():
# Reset the ZeRO-3 state if we are only doing forward-passes (ie evaluation).
@@ -1347,6 +1353,7 @@

return loss

@instrument_w_nvtx
def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
# Pass (PP) gas boundary flag to optimizer (required for zero)
self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary(
@@ -1364,6 +1371,7 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
else:
self.buffered_allreduce_fallback(elements_per_buffer=bucket_size)

@instrument_w_nvtx
def backward(self, loss, allreduce_gradients=True, release_loss=False):
r"""Execute backward pass on the loss

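
Editor's note: the instrument_w_nvtx decorator applied throughout this file is imported from deepspeed.utils; conceptually it records an NVTX range around the wrapped call so that forward, backward, and allreduce phases show up in Nsight traces. A rough sketch of the idea, not the actual DeepSpeed implementation:

# rough sketch only - the real decorator lives in deepspeed.utils
import functools
import torch


def instrument_w_nvtx_sketch(func):
    """Record an NVTX range around each call so it appears in profiler timelines."""
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        torch.cuda.nvtx.range_push(func.__qualname__)
        try:
            return func(*args, **kwargs)
        finally:
            torch.cuda.nvtx.range_pop()
    return wrapped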
9 changes: 9 additions & 0 deletions deepspeed/runtime/utils.py
@@ -840,3 +840,12 @@ def call_to_str(base, *args, **kwargs):
name += ', '.join(f'{key}={repr(arg)}' for key, arg in kwargs.items())
name += ')'
return name


def get_only_unique_item(items):
item_set = set(items)
if len(item_set) != 1:
raise RuntimeError(f"expected there to be only one unique element in {items}")
unique_item, = item_set

return unique_item
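
Editor's note: a hypothetical usage sketch for the new helper; it is convenient for asserting that a collection is homogeneous, for example that a bucket of gradient shards shares a single dtype.

# hypothetical example - raises RuntimeError if the dtypes differ
import torch

grads = [torch.zeros(8, dtype=torch.bfloat16), torch.zeros(4, dtype=torch.bfloat16)]
bucket_dtype = get_only_unique_item(g.dtype for g in grads)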