
Various ZeRO Stage3 Optimizations + Improvements (including bfloat16 support) #1453

Merged: 91 commits, merged on Jan 21, 2022

Commits (91)
fe26423
Changes for bfloat16 Zero2
raamjad Aug 14, 2021
8864f91
ZeRO stage3 optimizations, with some bug fixes
Sep 29, 2021
e66aedc
fix import in ut
Oct 12, 2021
350a7a0
ran yapf
Oct 12, 2021
b37a4f0
Merge branch 'master' into s3-pr
tjruwase Oct 13, 2021
f383947
improvements to cache flush warn log
Oct 13, 2021
b2a1c95
backwards compatibility with older versions of pytorch
Oct 14, 2021
d8678fa
handle edge case where reduced tensor smaller than world size
Oct 14, 2021
a0faca0
moved event synchronization to allgather handle wait() call
Oct 14, 2021
bf20c90
removed unnecessary barrier call
Oct 14, 2021
a353017
Merge branch 'master' into s3-pr
jfc4050 Oct 14, 2021
c51ba46
formatting fix after resolving merge conflict
Oct 14, 2021
ff01f5c
skip nvme prefetch when trace not complete
Oct 14, 2021
13093eb
opportunistically avoid memory allocation in allgather coalesced wher…
Oct 15, 2021
3cdcbdf
Merge branch 'master' into s3-pr
tjruwase Oct 20, 2021
64d74d1
Merge branch 'master' into s3-pr
tjruwase Oct 21, 2021
e30e6cc
Merge branch 'master' into s3-pr
tjruwase Oct 22, 2021
f19593d
fix indentation after merge
Oct 22, 2021
f72bc78
fixes to account for parameter offload
Oct 22, 2021
660df05
accounting for torch.cuda.memory_stats not being available
Oct 22, 2021
4f9477f
moved partition_all_params to optimizer step
Oct 22, 2021
818651c
Merge branch 'master' into s3-pr
jeffra Oct 26, 2021
f681201
Merge branch 'master' into s3-pr
jfc4050 Oct 26, 2021
bb34f90
allgathering on params before item gets called
Oct 25, 2021
9f3b504
fix param status checks
Oct 25, 2021
1772d41
fix grad accumulation with optimizer offload
Oct 25, 2021
5f213d8
grad norm computation fix for optimizer offload
Oct 26, 2021
3198805
change post divide in reduce-scatter to pre divide
Oct 26, 2021
2225659
fix gradient race condition w/ optimizer offload
Oct 26, 2021
5aa9bd5
improve inf/nan gradient tracking
Oct 26, 2021
a1a60ed
don't prefetch when not in training mode
Oct 26, 2021
df41659
format fix after merging
Oct 26, 2021
ab3a82a
fix prefetching issue when using NVME offload
Oct 27, 2021
025a41e
Merge branch 'master' into s3-pr
tjruwase Oct 29, 2021
6f9415b
Merge branch 'master' into s3-pr
jfc4050 Nov 1, 2021
8d12281
Merge branch 'master' into s3-pr
jfc4050 Nov 2, 2021
a26d1fb
improved defragmentation for fp16 parameters
Oct 31, 2021
937f04e
relative imports for bf16 tests
Nov 2, 2021
e74f509
changes for bwd compatibility with pytorch 1.2
Nov 2, 2021
6ee558d
remove buffered_reduce_fallback
Nov 2, 2021
14e22a2
removed unused parameter offset bookkeeping
Nov 3, 2021
16281df
fixed tracking for multiple param groups
Nov 3, 2021
38af6b1
Merge branch 'master' into s3-pr
tjruwase Nov 3, 2021
cc7011e
unbroke bfloat16 config after merge conflict
Nov 3, 2021
806b072
using base allgather params when only 1 param
Nov 3, 2021
bf0dd66
cleanup/fixes for fp16 partition defragmentation
Nov 3, 2021
73207ae
Merge branch 'master' into s3-pr
tjruwase Nov 5, 2021
d3ecb1f
Merge branch 'master' into s3-pr
tjruwase Nov 5, 2021
812fe67
Merge branch 'master' into s3-pr
tjruwase Nov 11, 2021
6dc21a6
switch to CRLF
jeffra Nov 18, 2021
2a38302
convert to same new-line style as master
jeffra Nov 18, 2021
16f1d21
align new line with master
jeffra Nov 18, 2021
11d590a
Merge branch 'master' into s3-pr
tjruwase Nov 23, 2021
2b5f6ea
Fix merge issues
tjruwase Nov 23, 2021
80b53d3
Merge branch 'master' into s3-pr
tjruwase Nov 24, 2021
6dfe693
Merge branch 'master' into s3-pr
tjruwase Nov 24, 2021
912e6f0
switch to CRLF
jeffra Nov 29, 2021
4b0133b
fix to LF line endings
jeffra Nov 30, 2021
b998206
minor merge fixes
jeffra Nov 30, 2021
d6deecb
remove extra bfloat16_enabled definition
Nov 30, 2021
2a4ef29
asserting params inflight for AllGatherHandle
Nov 30, 2021
90182b6
remove get_cuda_mem_allocated_str
Nov 30, 2021
ad847ed
Merge branch 'master' into s3-pr
tjruwase Dec 8, 2021
f590ba4
Format fixes
tjruwase Dec 8, 2021
9db815f
fix bfloat16 zero stage check (broken after merge commit)
Dec 8, 2021
259ec15
+self.communication_data_type, -self.allreduce_always_fp32; delete de…
tjruwase Dec 8, 2021
96d2247
Add self.reduce_scatter
tjruwase Dec 9, 2021
2630b75
Merge branch 'master' into s3-pr
tjruwase Dec 9, 2021
79fd42c
Merge branch 'master' into s3-pr
tjruwase Dec 11, 2021
8565e04
Merge branch 'master' into s3-pr
jeffra Dec 14, 2021
06eab1a
Merge branch 'master' into s3-pr
tjruwase Dec 30, 2021
0f8affe
Format fix
tjruwase Dec 30, 2021
3436422
Merge branch 'master' into s3-pr
tjruwase Dec 30, 2021
601d1f1
Fix merge issues
tjruwase Dec 30, 2021
5dcee36
Merge branch 's3-pr' of github.com:jfc4050/DeepSpeed into s3-pr
tjruwase Dec 30, 2021
580d25e
Merge branch 'master' into s3-pr
tjruwase Jan 3, 2022
872f451
Merge branch 'master' into s3-pr
jeffra Jan 7, 2022
e236293
Merge branch 'master' into s3-pr
tjruwase Jan 10, 2022
43b3b83
Merge branch 'master' into s3-pr
tjruwase Jan 11, 2022
83905ac
Merge branch 'master' into s3-pr
tjruwase Jan 12, 2022
31aecfc
iterate over params_to_fetch rather than make another iterator
Jan 12, 2022
8736700
add some TODOs
Jan 14, 2022
516379d
Merge branch 'master' into s3-pr
tjruwase Jan 14, 2022
0bf7bcd
remove unnecessary division by micro_step_id
Jan 19, 2022
43c00ff
rename config keys "bfloat16" -> "bf16"
Jan 19, 2022
4574bc7
rename stage3_gather_fp16_weights_on_model_save -> stage3_gather_16bi…
Jan 19, 2022
e04dc6a
add unit test to check backwards compatibility for gather_16bit_weights
Jan 19, 2022
391cecf
added test to confirm bf16 key bwd compatibility
Jan 19, 2022
3d26469
Merge branch 'master' into s3-pr
tjruwase Jan 19, 2022
536d171
Format fixes
tjruwase Jan 19, 2022
19f3538
Merge branch 'master' into s3-pr
tjruwase Jan 20, 2022
2 changes: 1 addition & 1 deletion deepspeed/autotuning/config_templates/template_zero3.json
@@ -11,7 +11,7 @@
"stage3_max_reuse_distance": 1e9,
"stage3_prefetch_bucket_size": 5e8,
"stage3_param_persistence_threshold": 1e6,
"stage3_gather_fp16_weights_on_model_save": false,
"stage3_gather_16bit_weights_on_model_save": false,
"sub_group_size": 1e12
}
}
23 changes: 14 additions & 9 deletions deepspeed/runtime/engine.py
@@ -708,8 +708,8 @@ def zero_prefetch_bucket_size(self):
def zero_param_persistence_threshold(self):
return self._config.zero_config.param_persistence_threshold

def zero_gather_fp16_weights_on_model_save(self):
return self._config.zero_config.gather_fp16_weights_on_model_save
def zero_gather_16bit_weights_on_model_save(self):
return self._config.zero_config.gather_16bit_weights_on_model_save

def zero_grad_hooks(self):
return self._config.zero_config.grad_hooks
@@ -2955,7 +2955,7 @@ def _save_zero_checkpoint(self, save_path, tag):
self._copy_recovery_script(save_path)
logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name))

def _zero3_consolidated_fp16_state_dict(self):
def _zero3_consolidated_16bit_state_dict(self):
"""

Get a full non-partitioned state_dict with fp16 weights on cpu.
@@ -3024,17 +3024,22 @@ def get_layer_state_dict(module, prefix=""):
return state_dict

def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"):
r"""Save fp16 model weights
"""has been renamed to save_16bit_model, keeping this around for backwards
compatibility"""
return self.save_16bit_model(save_dir, save_filename)

This method saves the fp16 model weights at the desired destination.
def save_16bit_model(self, save_dir, save_filename="pytorch_model.bin"):
r"""Save 16bit model weights

This method saves the 16bit model weights at the desired destination.

Arguments:
save_dir: Required. Directory for saving the model
save_filename: Optional. Filename to save to. Defaults to ``pytorch_model.bin``

Returns:
``True`` when a model has been saved, ``False`` otherwise. It will not be saved if
stage3_gather_fp16_weights_on_model_save is ``False``.
stage3_gather_16bit_weights_on_model_save is ``False``.

Important: all processes must call this method and not just the process with rank 0. It is
because the processes need to work in sync to gather the weights. This method will hang
@@ -3045,13 +3050,13 @@ def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"):
path = os.path.join(save_dir, save_filename)

if self.zero_optimization_partition_weights():
if self.zero_gather_fp16_weights_on_model_save():
if self.zero_gather_16bit_weights_on_model_save():
# consolidation is expensive in time and memory and therefore isn't a default
state_dict = self._zero3_consolidated_fp16_state_dict()
state_dict = self._zero3_consolidated_16bit_state_dict()
else:
# the model will be bogus if not consolidated so don't confuse the user by saving it
logger.info(
f"Did not save the model {path} because `stage3_gather_fp16_weights_on_model_save` is False"
f"Did not save the model {path} because `stage3_gather_16bit_weights_on_model_save` is False"
)
return False
else:
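For orientation, here is a minimal usage sketch of the renamed API (not part of the diff). The dummy model, config values, and output paths are placeholders, and it assumes `deepspeed.initialize` accepts the config as a dict and that the script runs under the usual distributed launcher. The key points it illustrates are the ones stated in the docstring above: every rank must call the save method, and nothing is saved unless the gather option is enabled.

```python
# Illustrative sketch only; model, config values, and paths are placeholders.
import deepspeed
import torch

model = torch.nn.Linear(8, 8)  # stand-in for a real model

ds_config = {
    "train_batch_size": 8,
    "bf16": {"enabled": True},  # config key renamed from "bfloat16" in this PR
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "zero_optimization": {
        "stage": 3,
        "stage3_gather_16bit_weights_on_model_save": True,
    },
}

engine, _, _, _ = deepspeed.initialize(model=model,
                                       model_parameters=model.parameters(),
                                       config=ds_config)

# ... training loop ...

# Every rank must call this; ranks cooperate to gather the partitioned weights.
# Returns False (and saves nothing) if the gather option above is disabled.
saved = engine.save_16bit_model("checkpoints", "pytorch_model.bin")

# The old name is kept as a thin backwards-compatibility wrapper:
# engine.save_fp16_model("checkpoints", "pytorch_model.bin")
```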
16 changes: 11 additions & 5 deletions deepspeed/runtime/zero/config.py
@@ -36,7 +36,7 @@ def __init__(self, param_dict):
self.param_persistence_threshold = None
self.max_live_parameters = None
self.max_reuse_distance = None
self.gather_fp16_weights_on_model_save = None
self.gather_16bit_weights_on_model_save = None

self.ignore_unused_parameters = None
self.round_robin_gradients = None
@@ -171,10 +171,16 @@ def _initialize(self, zero_config_dict):
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT)

self.gather_fp16_weights_on_model_save = get_scalar_param(
zero_config_dict,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT)
# config key has been renamed to use "16bit" instead of "fp16." falling back
# to old config name in order to preserve backwards compatibility
self.gather_16bit_weights_on_model_save = ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT
for key in [
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE
]:
if key in zero_config_dict:
self.gather_16bit_weights_on_model_save = zero_config_dict[key]
break

self.ignore_unused_parameters = get_scalar_param(
zero_config_dict,
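The fallback above is small enough to restate in isolation. This sketch (illustrative, not the DeepSpeed source) shows the precedence it implements: the new key wins when both are present, the deprecated key is still honored, and the default applies otherwise.

```python
# Illustrative restatement of the config-key fallback (not the DeepSpeed source).
NEW_KEY = "stage3_gather_16bit_weights_on_model_save"
OLD_KEY = "stage3_gather_fp16_weights_on_model_save"  # deprecated spelling
DEFAULT = False

def resolve_gather_16bit(zero_config_dict):
    # New key first, then the deprecated key, otherwise the default.
    for key in (NEW_KEY, OLD_KEY):
        if key in zero_config_dict:
            return zero_config_dict[key]
    return DEFAULT

assert resolve_gather_16bit({OLD_KEY: True}) is True                   # old configs keep working
assert resolve_gather_16bit({NEW_KEY: False, OLD_KEY: True}) is False  # new key takes precedence
assert resolve_gather_16bit({}) is DEFAULT
```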
7 changes: 4 additions & 3 deletions deepspeed/runtime/zero/constants.py
@@ -113,7 +113,8 @@

# gathers params for saving a model - inefficient but is required in certain situations

Review comment on the line above:

Could someone kindly explain which situations require this 16-bit parameter gathering (inefficient) feature, given that the zero_to_fp32.py script can also save the parameters? Thanks.

Reply (collaborator):

This is for those who can't be bothered with running zero_to_fp32.py and want the 16-bit model extracted on the fly - which is fine for tiny to small models but very slow for large models.

It's also the default in the HF Trainer integration of Deepspeed to make it easy for users to start and have things work transparently. But the documentation explains how to improve upon this default.
https://huggingface.co/docs/transformers/main/main_classes/deepspeed#getting-the-model-weights-out

ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_fp16_weights_on_model_save'
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_16bit_weights_on_model_save'
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False

# Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload
# Enable this option to avoid:
@@ -161,8 +162,8 @@
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD:
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE:
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT,
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE:
ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT,
ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS:
ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT,
ZERO_OPTIMIZATION_LEGACY_STAGE1:
6 changes: 3 additions & 3 deletions docs/_pages/config-json.md
@@ -329,7 +329,7 @@ Enabling and configuring ZeRO memory optimizations
"stage3_param_persistence_threshold" : 1e6,
"sub_group_size" : 1e12,
"elastic_checkpoint" : [true|false],
"stage3_gather_fp16_weights_on_model_save": [true|false],
"stage3_gather_16bit_weights_on_model_save": [true|false],
"ignore_unused_parameters": [true|false]
"round_robin_gradients": [true|false]
}
@@ -433,11 +433,11 @@ Enabling and configuring ZeRO memory optimizations
| Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). | `1e6` |


***stage3_gather_fp16_weights_on_model_save***: [boolean]
***stage3_gather_16bit_weights_on_model_save***: [boolean]

| Description | Default |
|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------- |
| Consolidate the weights before saving the model by `save_fp16_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. | `False` |
| Consolidate the weights before saving the model by `save_16bit_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. | `False` |


***cpu_offload***: [boolean]
6 changes: 3 additions & 3 deletions docs/_tutorials/zero.md
@@ -252,19 +252,19 @@ If you need to take the pretrained weights out of Deepspeed here is what you can

```json
"zero_optimization": {
"stage3_gather_fp16_weights_on_model_save": true
"stage3_gather_16bit_weights_on_model_save": true
},
```
And then save the model using:

```python
if self.deepspeed:
self.deepspeed.save_fp16_model(output_dir, output_file)
self.deepspeed.save_16bit_model(output_dir, output_file)
```

Because it requires consolidation of the weights on one GPU it can be slow and memory demanding, so only use this feature when needed.

Note that if `stage3_gather_fp16_weights_on_model_save` is `False`, no weights will be saved (again, because `state_dict` doesn't have them).
Note that if `stage3_gather_16bit_weights_on_model_save` is `False`, no weights will be saved (again, because `state_dict` doesn't have them).
You can use this method to save ZeRO-2 weights as well.

If you'd like to get the fp32 weights, we supply a special script that can do offline consolidation. It requires no configuration files or GPUs. Here is an example of its usage:
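The rendered hunk cuts off before the tutorial's command-line example. As a hedged sketch of the offline consolidation route mentioned in the last line, assuming DeepSpeed's zero_to_fp32 helpers are importable at this path in the installed release:

```python
# Hedged sketch; assumes this helper exists at this import path in the installed release.
import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# Consolidate a partitioned ZeRO checkpoint into a single fp32 state_dict on CPU;
# no GPUs or live engine are required. "checkpoints" is a placeholder directory.
state_dict = get_fp32_state_dict_from_zero_checkpoint("checkpoints")
torch.save(state_dict, "pytorch_model_fp32.bin")
```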
2 changes: 1 addition & 1 deletion docs/code-docs/source/training.rst
@@ -35,7 +35,7 @@ Gradient Accumulation

Model Saving
------------
.. autofunction:: deepspeed.DeepSpeedEngine.save_fp16_model
.. autofunction:: deepspeed.DeepSpeedEngine.save_16bit_model


Additionally when a DeepSpeed checkpoint is created, a script ``zero_to_fp32.py`` is added there which can be used to reconstruct fp32 master weights into a single pytorch ``state_dict`` file.