ZeRO-3 Slowdown #1170

Closed
wants to merge 40 commits into from

Commits (40)
9a179e4
Fix tracing; Add timers; Disable norm()
tjruwase Jun 17, 2021
038da9d
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Jun 17, 2021
8d5c784
Merge branch 'master' into olruwase/zero3_broken_tracing
jeffra Jun 17, 2021
46eb232
Simplify prefetch code
tjruwase Jun 18, 2021
e93610c
Merge branch 'olruwase/zero3_broken_tracing' of github.com:microsoft/…
tjruwase Jun 18, 2021
6f7c1c2
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Jun 19, 2021
af66609
Move available parameter numel book-keeping to param partitioner.
tjruwase Jun 19, 2021
7432867
Merge branch 'olruwase/zero3_broken_tracing' of github.com:microsoft/…
tjruwase Jun 19, 2021
11e6c8f
Merge with master
tjruwase Jun 30, 2021
eceb707
Format fixes
tjruwase Jun 30, 2021
bb43da0
Bug fix
tjruwase Jun 30, 2021
f3048f2
Avoid unneeded synch
tjruwase Jun 30, 2021
0514811
Remove assert
tjruwase Jun 30, 2021
c81422b
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Jul 1, 2021
e09b80a
Remove dead code
tjruwase Jul 1, 2021
f938a73
Merge branch 'master' of github.com:microsoft/DeepSpeed into olruwase…
tjruwase Jul 1, 2021
34ec7da
Merge branch 'olruwase/zero3_broken_tracing' of github.com:microsoft/…
tjruwase Jul 1, 2021
3c4a949
Bug fix
tjruwase Jul 2, 2021
a3d92f0
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Jul 14, 2021
bc65ea7
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Jul 16, 2021
43f4bb4
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Jul 27, 2021
f495465
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Jul 29, 2021
96c87d7
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Aug 27, 2021
b2549df
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Sep 7, 2021
c2e4826
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Sep 8, 2021
fc61a5a
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Sep 10, 2021
0ad2ffe
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Sep 10, 2021
c648f17
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Oct 1, 2021
f218446
Remove redundant conditional
tjruwase Oct 1, 2021
a75e465
Avoid multiple zero3 contexts
tjruwase Oct 9, 2021
ec3eb56
Merge
tjruwase Oct 9, 2021
d5e49d1
Merge with master
tjruwase Oct 9, 2021
5c9b188
Formatting fixes
tjruwase Oct 9, 2021
038993e
Restore tracing fixes
tjruwase Oct 10, 2021
e7832d6
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Oct 10, 2021
13c0532
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Oct 11, 2021
f4cebc0
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Oct 12, 2021
98fa12f
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Oct 14, 2021
05acc48
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Oct 19, 2021
30afb5b
Merge branch 'master' into olruwase/zero3_broken_tracing
tjruwase Oct 21, 2021
97 changes: 86 additions & 11 deletions deepspeed/runtime/zero/partition_parameters.py
@@ -479,8 +479,11 @@ def get_model():

self._validate_remote_device(remote_device, _ds_config)

self.available_parameter_numel = 0

# Remote device is the device where parameter partitions are stored
# It can be same as local_device or it could be CPU or NVMe.

self.remote_device = self.local_device if remote_device is None else remote_device
self.pin_memory = pin_memory if (
self.remote_device == OFFLOAD_CPU_DEVICE) else False
@@ -494,11 +497,14 @@ def get_model():
# If we are provided an already-allocated module to prepare.
if module is not None:
assert isinstance(module, torch.nn.Module)
for param in module.parameters(recurse=True):
if is_zero_param(param):
continue
self._convert_to_deepspeed_param(param)
param.partition()
self._convert_to_zero_parameters(module.parameters(recurse=True))

def _convert_to_zero_parameters(self, param_list):
for param in param_list:
if is_zero_param(param):
continue
self._convert_to_deepspeed_param(param)
param.partition()

def _validate_remote_device(self, remote_device, ds_config):
if ds_config is not None:
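
Editorial note on the hunk above: the inline conversion loop in `__init__` is now routed through `_convert_to_zero_parameters`, and the same helper is later exposed on each parameter as `convert_to_zero_parameters`. A minimal usage sketch follows; the toy module, the `zero.Init` arguments, and the late-created parameter are illustrative assumptions, not part of this diff.

```python
import torch
import deepspeed

# Hypothetical sketch (assumes a distributed environment is already set up):
# parameters created under zero.Init are converted and partitioned through
# _convert_to_zero_parameters instead of the old inline loop.
with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
    model = torch.nn.Linear(4096, 4096)

# A parameter allocated after Init can be converted via the helper this PR
# attaches to already-converted parameters (convert_to_zero_parameters).
late_param = torch.nn.Parameter(torch.empty(4096))
next(model.parameters()).convert_to_zero_parameters([late_param])
```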
@@ -550,6 +556,11 @@ def _convert_to_deepspeed_param(self, param):
# Stores the number of elements in the original parameter without padding
param.ds_numel = param.numel()

# Update status book keeping
self._update_param_status(new_status=ZeroParamStatus.AVAILABLE,
old_status=ZeroParamStatus.NOT_AVAILABLE,
numel=param.ds_numel)

# Stores the partitioned copy of the tensor
param.ds_tensor = None

@@ -621,6 +632,25 @@ def padding_size():
def partitioned_size():
return self._partitioned_size(param)

def update_status(new_status, param_list=None, hierarchy=0):
cls = param
if param_list is None:
param_list = [cls]
self._update_status(param_list, new_status)

def get_available_parameter_numel():
return self._get_available_parameter_numel()

def synchronize_communication(param_list=None, handle_list=None, hierarchy=0):
cls = param
if param_list is None:
param_list = [cls]

self._synchronize_communication(param_list, handle_list)

def convert_to_zero_parameters(param_list):
self._convert_to_zero_parameters(param_list)

# Collectives for gathering and partitioning parameters
param.all_gather = all_gather
param.partition = partition
@@ -634,6 +664,13 @@ def partitioned_size():
param.padding_size = padding_size
param.partitioned_size = partitioned_size

# Status utilities
param.update_status = update_status
param.get_available_parameter_numel = get_available_parameter_numel

param.synchronize_communication = synchronize_communication
param.convert_to_zero_parameters = convert_to_zero_parameters

def _aligned_size(self, param):
return param.ds_numel + self._padding_size(param)
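
Editorial note: the new closures (`update_status`, `get_available_parameter_numel`, `synchronize_communication`, `convert_to_zero_parameters`) follow the file's existing pattern of binding per-parameter helpers onto the parameter object itself. Below is a standalone sketch of that pattern with purely illustrative names; it is not the DeepSpeed API.

```python
import torch

class ToyPartitioner:
    """Illustrative stand-in for the Init context that owns the bookkeeping."""
    def __init__(self):
        self.available_parameter_numel = 0

    def _update_status(self, params, new_status):
        for p in params:
            p.ds_status = new_status  # the real code also adjusts the numel counter

def attach_helpers(param, owner):
    # Mirrors how _convert_to_deepspeed_param binds closures onto `param`,
    # so call sites can write param.update_status(...) without holding `owner`.
    def update_status(new_status, param_list=None, hierarchy=0):
        owner._update_status(param_list or [param], new_status)

    def get_available_parameter_numel():
        return owner.available_parameter_numel

    param.update_status = update_status
    param.get_available_parameter_numel = get_available_parameter_numel
    return param

p = attach_helpers(torch.nn.Parameter(torch.zeros(8)), ToyPartitioner())
p.update_status("AVAILABLE")
print(p.get_available_parameter_numel())  # 0 in this toy model
```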

@@ -672,15 +709,24 @@ def _all_gather(self, param_list, async_op=False, hierarchy=None):
handle = self._allgather_param(param,
async_op=async_op,
hierarchy=hierarchy)
param.ds_status = ZeroParamStatus.INFLIGHT # if async_op else ZeroParamStatus.AVAILABLE
param.update_status(ZeroParamStatus.INFLIGHT)
handles.append(handle)
else:
all_gather_list.append(param)

if not async_op:
ret_value = self._allgather_params(all_gather_list, hierarchy=hierarchy)
avail_params = []
status_params = []
for param in all_gather_list:
param.ds_status = ZeroParamStatus.AVAILABLE
avail_params.append(param.ds_id)
status_params.append(param.ds_status)
param.update_status(ZeroParamStatus.AVAILABLE)

print_rank_0(
f'_all_gather marks available params = {avail_params} status = {status_params}',
force=False)

return ret_value

return handles
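
To summarize the control flow this hunk changes: on the synchronous path parameters are marked AVAILABLE as soon as `_allgather_params` returns, while on the async path they are marked INFLIGHT and only become AVAILABLE once the caller drains the returned handles. A hedged sketch of that call pattern, using the helpers this PR attaches to parameters (the function and its setup are assumptions, not DeepSpeed code):

```python
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

def prefetch_params(params):
    """Hedged sketch: overlap an async all-gather with other work, then
    resolve the INFLIGHT parameters via synchronize_communication."""
    handles = params[0].all_gather(param_list=params, async_op=True)
    # every param in `params` is now ZeroParamStatus.INFLIGHT

    # ... caller overlaps compute here ...

    # wait on each handle and flip the whole list to AVAILABLE in one place
    params[0].synchronize_communication(param_list=params, handle_list=handles)
    assert all(p.ds_status == ZeroParamStatus.AVAILABLE for p in params)
```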
@@ -690,8 +736,8 @@ def _partition(self, param_list, force=False, has_been_updated=False):
#print_rank_0(f"Before Partitioning Param {param.ds_id}")
# self._param_status(param)
self._partition_param(param, has_been_updated=has_been_updated)
param.ds_status = ZeroParamStatus.NOT_AVAILABLE
# if param.ds_tensor is not None:
param.update_status(ZeroParamStatus.NOT_AVAILABLE)
#if param.ds_tensor is not None:
# assert id(param.data) == id(param.ds_tensor.data), \
# "After the parameters are initially partitioned, make sure we are not recreating the partition."
#print_rank_0(f"After Partitioning Param {param.ds_id}")
@@ -854,7 +900,8 @@ def _allgather_param(self, param, async_op=False, hierarchy=0):
f'After allocate allgather param {debug_param2name_id_shape_status(param)} {aligned_param_size} {partition_size} ',
force=False)

torch.cuda.synchronize()
if not async_op:
torch.cuda.synchronize()

print_rank_0(
f"{'--'* hierarchy}----allgather param with {debug_param2name_id_shape_status(param)} partition size={partition_size}"
@@ -1088,6 +1135,34 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False):
param.grad.data = dest_tensor_full_buffer.data
see_memory_usage("After partitioning gradients", force=False)

def _update_status(self, param_list, new_status):
if len(param_list) == 0:
return

for param in param_list:
old_status = param.ds_status
param.ds_status = new_status
self._update_param_status(new_status, old_status, param.ds_numel)

def _update_param_status(self, new_status, old_status, numel):
if old_status == ZeroParamStatus.AVAILABLE:
self.available_parameter_numel -= numel

if new_status == ZeroParamStatus.AVAILABLE:
self.available_parameter_numel += numel

assert self.available_parameter_numel >= 0, f'available_parameter_numel is negative: {self.available_parameter_numel}'

def _get_available_parameter_numel(self):
return self.available_parameter_numel

def _synchronize_communication(self, param_list, handle_list):
for param, handle in zip(param_list, handle_list):
if handle is not None:
handle.wait()

self._update_status(param_list=param_list, new_status=ZeroParamStatus.AVAILABLE)
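
A quick editorial illustration of the accounting in `_update_param_status` above: the counter grows only on transitions into AVAILABLE and shrinks only on transitions out of it, so it should never go negative. A toy, self-contained model of that invariant (the class and enum here are stand-ins, not DeepSpeed code):

```python
from enum import Enum

class Status(Enum):                 # stand-in for ZeroParamStatus in this sketch
    NOT_AVAILABLE = 0
    INFLIGHT = 1
    AVAILABLE = 2

class NumelBook:
    """Toy model of _update_param_status: track how many elements are
    currently materialized (AVAILABLE) across all parameters."""
    def __init__(self):
        self.available_parameter_numel = 0

    def update(self, new_status, old_status, numel):
        if old_status == Status.AVAILABLE:
            self.available_parameter_numel -= numel
        if new_status == Status.AVAILABLE:
            self.available_parameter_numel += numel
        assert self.available_parameter_numel >= 0

book = NumelBook()
book.update(Status.AVAILABLE, Status.NOT_AVAILABLE, 1024)   # param gathered
book.update(Status.INFLIGHT, Status.NOT_AVAILABLE, 512)     # another in flight
book.update(Status.AVAILABLE, Status.INFLIGHT, 512)         # gather completed
book.update(Status.NOT_AVAILABLE, Status.AVAILABLE, 1024)   # first param re-partitioned
assert book.available_parameter_numel == 512
```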


class GatheredParameters:
def __init__(self, params, modifier_rank=None, fwd_module=None, enabled=True):