From e449eaf955da87b4ef4ef67dfdcd12a94f78c7e7 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 29 Nov 2021 19:56:09 -0800 Subject: [PATCH 01/25] [WIP] add support for bf16 mode --- src/transformers/deepspeed.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index edbcbd50cca200..42a928bb4b8017 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -244,12 +244,14 @@ def trainer_config_process(self, args): self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)") self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level") - # only if we have an explicit fp16.enabled = False then it's fp32, if it's True or this - # whole config section is missing then the fallback is fp16 + # deepspeed's default mode is fp16 unless there is a config that says differently if self.is_false("fp16.enabled"): - self._dtype = torch.float32 - # later there will be other dtypes besides just fp16 and fp32 - # also not quite sure what dtype should be under apex, defaulting to fp16 for now + if self.is_true("bfloat16.enabled"): + self._dtype = torch.bfloat16 + else: + self._dtype = torch.float32 + else: + self._dtype = torch.float16 def trainer_config_finalize(self, args, model, num_training_steps): """ From 758ed4c34e914a9bd154808dd0e41e891f160b27 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 4 Dec 2021 21:41:21 -0800 Subject: [PATCH 02/25] prep for bf16 --- docs/source/main_classes/deepspeed.rst | 14 +- src/transformers/deepspeed.py | 12 +- ...g_zero2.json => ds_config_zero2_fp16.json} | 0 ...g_zero3.json => ds_config_zero3_fp16.json} | 0 tests/deepspeed/test_deepspeed.py | 149 +++++++++++------- tests/deepspeed/test_model_zoo.py | 15 +- 6 files changed, 117 insertions(+), 73 deletions(-) rename tests/deepspeed/{ds_config_zero2.json => ds_config_zero2_fp16.json} (100%) rename tests/deepspeed/{ds_config_zero3.json => ds_config_zero3_fp16.json} (100%) diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index 5b2e6e64e5c0c5..831e94fa515711 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -206,7 +206,7 @@ Here is an example of running ``run_translation.py`` under DeepSpeed deploying a .. code-block:: bash deepspeed examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config_zero3.json \ + --deepspeed tests/deepspeed/ds_config_zero3_fp16.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ @@ -233,7 +233,7 @@ To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` comma .. 
code-block:: bash deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config_zero2.json \ + --deepspeed tests/deepspeed/ds_config_zero2_fp16.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ @@ -320,7 +320,7 @@ If you're using only 1 GPU, here is how you'd have to adjust your training code os.environ['WORLD_SIZE'] = "1" # Now proceed as normal, plus pass the deepspeed config file - training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json") + training_args = TrainingArguments(..., deepspeed="ds_config_zero3_fp16.json") trainer = Trainer(...) trainer.train() @@ -336,7 +336,7 @@ cell with: .. code-block:: python %%bash - cat <<'EOT' > ds_config_zero3.json + cat <<'EOT' > ds_config_zero3_fp16.json { "fp16": { "enabled": "auto", @@ -823,7 +823,7 @@ these help you to trade scalability for speed depending on your needs. ZeRO-2 Example +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Here is a full ZeRO-2 auto-configuration file ``ds_config_zero2.json``: +Here is a full ZeRO-2 auto-configuration file ``ds_config_zero2_fp16.json``: .. code-block:: json @@ -938,7 +938,7 @@ values look like, but we highly recommend using the one with multiple ``auto`` s ZeRO-3 Example +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Here is a full ZeRO-3 auto-configuration file ``ds_config_zero3.json``: +Here is a full ZeRO-3 auto-configuration file ``ds_config_zero3_fp16.json``: .. code-block:: json @@ -1666,7 +1666,7 @@ Here is an example of running ``run_translation.py`` under DeepSpeed deploying a .. code-block:: bash deepspeed examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config_zero3.json \ + --deepspeed tests/deepspeed/ds_config_zero3_fp16.json \ --model_name_or_path t5-small --output_dir output_dir \ --do_eval --max_eval_samples 50 --warmup_steps 50 \ --max_source_length 128 --val_max_target_length 128 \ diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 42a928bb4b8017..56cfcc9269eeec 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -73,7 +73,7 @@ def __init__(self, config_file_or_dict): # zero stage - this is done as early as possible, before model is created, to allow # ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object - # during ``zero.Init()`` which needs whether fp16 is enabled, dtype, etc. + # during ``zero.Init()`` which needs to know the dtype, and some other hparams. 
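
For reference, the ``get_value`` accessor used on the next line resolves a dotted key path such as ``zero_optimization.stage`` against the nested config dict. A minimal standalone sketch of that lookup, for illustration only (the real method is defined on ``HfDeepSpeedConfig``):

.. code-block:: python

    def get_value(config: dict, ds_key_long: str, default=None):
        # walk a dotted key like "zero_optimization.stage" through nested dicts
        node = config
        for key in ds_key_long.split("."):
            if not isinstance(node, dict) or key not in node:
                return default
            node = node[key]
        return node

    assert get_value({"zero_optimization": {"stage": 3}}, "zero_optimization.stage", -1) == 3
    assert get_value({}, "fp16.enabled", -1) == -1
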
self._stage = self.get_value("zero_optimization.stage", -1) # offload @@ -171,10 +171,12 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig): def __init__(self, config_file_or_dict): super().__init__(config_file_or_dict) - self._dtype = torch.float16 + self._dtype = None self.mismatches = [] def dtype(self): + if self._dtype is None: + raise ValueError("trainer_config_process() wasn't called yet to tell dtype") return self._dtype def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True): @@ -230,20 +232,22 @@ def trainer_config_process(self, args): # total_num_steps - will get set in trainer_config_finalize # fp16 - if args.fp16: + if args.fp16 or args.fp16_full_eval: fp16_backend = "apex" if args.fp16_backend == "apex" else "amp" else: fp16_backend = None # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set # any here unless the user did the work - self.fill_match("fp16.enabled", fp16_backend == "amp", "fp16+fp16_backend(amp)") + self.fill_match("fp16.enabled", fp16_backend == "amp", "fp16|fp16_full_eval+fp16_backend(amp)") # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any # ZeRO features self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)") self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level") + self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval") + # deepspeed's default mode is fp16 unless there is a config that says differently if self.is_false("fp16.enabled"): if self.is_true("bfloat16.enabled"): diff --git a/tests/deepspeed/ds_config_zero2.json b/tests/deepspeed/ds_config_zero2_fp16.json similarity index 100% rename from tests/deepspeed/ds_config_zero2.json rename to tests/deepspeed/ds_config_zero2_fp16.json diff --git a/tests/deepspeed/ds_config_zero3.json b/tests/deepspeed/ds_config_zero3_fp16.json similarity index 100% rename from tests/deepspeed/ds_config_zero3.json rename to tests/deepspeed/ds_config_zero3_fp16.json diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 8e7587235df08e..360a9b8f4d0f98 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -120,7 +120,20 @@ def get_launcher(distributed=False): ZERO2 = "zero2" ZERO3 = "zero3" + +FP16 = "fp16" +BF16 = "bf16" + +ZERO2_FP16 = "zero2_fp16" +ZERO3_FP16 = "zero3_fp16" +ZERO2_BF16 = "zero2_bf16" +ZERO3_BF16 = "zero3_bf16" + +stages_fp16 = [ZERO2_FP16, ZERO3_FP16] +stages_bf16 = [ZERO2_BF16, ZERO3_BF16] + stages = [ZERO2, ZERO3] +dtypes = [FP16, BF16] @require_deepspeed @@ -138,8 +151,8 @@ def setUp(self): MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" ) - def test_init_zero3(self): - # test that zero.Init() works correctly under zero3 + def test_init_zero3_fp16(self): + # test that zero.Init() works correctly under zero3_fp16 ds_config = { "train_batch_size": 1, "zero_optimization": { @@ -209,25 +222,39 @@ def setUp(self): ) self.ds_config_file = dict( - zero2=f"{self.test_file_dir_str}/ds_config_zero2.json", - zero3=f"{self.test_file_dir_str}/ds_config_zero3.json", + zero2_fp16=f"{self.test_file_dir_str}/ds_config_zero2_fp16.json", + zero3_fp16=f"{self.test_file_dir_str}/ds_config_zero3_fp16.json", + zero2_bf16=f"{self.test_file_dir_str}/ds_config_zero2_bf16.json", + zero3_bf16=f"{self.test_file_dir_str}/ds_config_zero3_bf16.json", ) # use self.get_config_dict(stage) to use these to ensure the original is not modified - with 
io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
- config_zero2 = json.load(f)
+ with io.open(self.ds_config_file[ZERO2_FP16], "r", encoding="utf-8") as f:
+ config_zero2_fp16 = json.load(f)
# by default use fp16
- config_zero2["fp16"]["enabled"] = True
- with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
- config_zero3 = json.load(f)
+ config_zero2_fp16["fp16"]["enabled"] = True
+ with io.open(self.ds_config_file[ZERO3_FP16], "r", encoding="utf-8") as f:
+ config_zero3_fp16 = json.load(f)
# by default use fp16
- config_zero3["fp16"]["enabled"] = True
+ config_zero3_fp16["fp16"]["enabled"] = True
# This setting slows things down, so don't enable it by default unless needed by a test.
# It's in the file as a demo for users since we want everything to work out of the box even if slower.
- config_zero3["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
+ config_zero3_fp16["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
+
+ with io.open(self.ds_config_file[ZERO2_BF16], "r", encoding="utf-8") as f:
+ config_zero2_bf16 = json.load(f)
+ # by default use bf16
+ config_zero2_bf16["bf16"]["enabled"] = True
+ with io.open(self.ds_config_file[ZERO3_BF16], "r", encoding="utf-8") as f:
+ config_zero3_bf16 = json.load(f)
+ # by default use bf16
+ config_zero3_bf16["bf16"]["enabled"] = True
+
self.ds_config_dict = dict(
- zero2=config_zero2,
- zero3=config_zero3,
+ zero2_fp16=config_zero2_fp16,
+ zero3_fp16=config_zero3_fp16,
+ zero2_bf16=config_zero2_bf16,
+ zero3_bf16=config_zero3_bf16,
)

def get_config_dict(self, stage):
@@ -238,7 +265,7 @@ def get_config_dict(self, stage):

def test_hf_ds_config_mismatch(self):

- ds_config = self.get_config_dict(ZERO2)
+ ds_config = self.get_config_dict(ZERO2_FP16)

# Purposefully configure these values to mismatch TrainingArguments values. 
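
The four config dicts built in ``setUp`` above differ only in ZeRO stage and dtype section. A sketch of how such a stage-by-dtype matrix could be constructed programmatically; ``base`` is a hypothetical stand-in for the real JSON files, which the tests load instead:

.. code-block:: python

    from copy import deepcopy

    base = {"zero_optimization": {"stage": 2}}  # stand-in for the real config file

    def make_config(stage: int, dtype: str) -> dict:
        # dtype is "fp16" or "bf16"; deepspeed used the config key "bfloat16"
        # for bf16 at the time of this patch series
        cfg = deepcopy(base)
        cfg["zero_optimization"]["stage"] = stage
        cfg["bfloat16" if dtype == "bf16" else "fp16"] = {"enabled": True}
        return cfg

    configs = {f"zero{s}_{d}": make_config(s, d) for s in (2, 3) for d in ("fp16", "bf16")}
    assert sorted(configs) == ["zero2_bf16", "zero2_fp16", "zero3_bf16", "zero3_fp16"]
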
# This currently doesn't cover all keys (but it could) @@ -297,12 +324,12 @@ def test_hf_ds_config_mismatch(self): def test_hf_scheduler_hf_optimizer(self): a = 0 with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero2_dict = self.get_config_dict(ZERO2) - del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer - del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler - ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" - ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) + ds_config_zero2_fp16_dict = self.get_config_dict(ZERO2_FP16) + del ds_config_zero2_fp16_dict["optimizer"] # force default HF Trainer optimizer + del ds_config_zero2_fp16_dict["scheduler"] # force default HF Trainer scheduler + ds_config_zero2_fp16_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" + ds_config_zero2_fp16_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_fp16_dict) trainer.train() new_a = trainer.model.a.item() self.assertNotEqual(new_a, a) @@ -310,11 +337,11 @@ def test_hf_scheduler_hf_optimizer(self): def test_ds_scheduler_hf_optimizer(self): a = 0 with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero2_dict = self.get_config_dict(ZERO2) - del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer - ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" - ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) + ds_config_zero2_fp16_dict = self.get_config_dict(ZERO2_FP16) + del ds_config_zero2_fp16_dict["optimizer"] # force default HF Trainer optimizer + ds_config_zero2_fp16_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" + ds_config_zero2_fp16_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_fp16_dict) trainer.train() new_a = trainer.model.a.item() self.assertNotEqual(new_a, a) @@ -322,11 +349,11 @@ def test_ds_scheduler_hf_optimizer(self): def test_hf_scheduler_ds_optimizer(self): a = 0 with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero2_dict = self.get_config_dict(ZERO2) - del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler - ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" - ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) + ds_config_zero2_fp16_dict = self.get_config_dict(ZERO2_FP16) + del ds_config_zero2_fp16_dict["scheduler"] # force default HF Trainer scheduler + ds_config_zero2_fp16_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" + ds_config_zero2_fp16_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_fp16_dict) trainer.train() new_a = trainer.model.a.item() self.assertNotEqual(new_a, a) @@ -338,17 +365,17 @@ def test_stage3_nvme_offload(self): # runs a simple check that we can use some directory as if it were NVMe 
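
The NVMe offload exercised in the test below is a ZeRO-3 feature: both optimizer states and parameters can be staged to disk. For reference, the config fragment the test assembles ends up shaped roughly like this (values illustrative only; any writable directory can stand in for a real NVMe mount in a smoke test, it will just be slow):

.. code-block:: python

    # illustrative only: point both offload sections at a directory that
    # plays the role of an NVMe drive
    nvme_config = {"device": "nvme", "nvme_path": "/tmp/nvme-offload"}

    ds_config = {
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": dict(nvme_config),
            "offload_param": dict(nvme_config),
        }
    }
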
nvme_path = self.get_auto_remove_tmp_dir() nvme_config = dict(device="nvme", nvme_path=nvme_path) - ds_config_zero3_dict = self.get_config_dict(ZERO3) - ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config - ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config - trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict) + ds_config_zero3_fp16_dict = self.get_config_dict(ZERO3_FP16) + ds_config_zero3_fp16_dict["zero_optimization"]["offload_optimizer"] = nvme_config + ds_config_zero3_fp16_dict["zero_optimization"]["offload_param"] = nvme_config + trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_fp16_dict) with CaptureLogger(deepspeed_logger) as cl: trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") # --- These tests need to run on both zero stages --- # - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_hf_optimizer_with_offload(self, stage): # non-DS optimizers can be used with ZERO-offload (as long as they have both CPU and GPU implementation (except LAMB)) ds_config_dict = self.get_config_dict(stage) @@ -361,7 +388,7 @@ def test_hf_optimizer_with_offload(self, stage): trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_fake_notebook_no_launcher(self, stage): # this setup emulates a notebook where a launcher needs to be emulated by hand @@ -375,7 +402,7 @@ def test_fake_notebook_no_launcher(self, stage): trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_early_get_last_lr(self, stage): # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may # not run for the first few dozen steps while loss scale is too large, and thus during @@ -404,14 +431,14 @@ def test_early_get_last_lr(self, stage): # print(trainer.model.a.item()) # print(trainer.model.b.item()) # need to investigate at some point - if stage == ZERO3: + if stage == ZERO3_FP16: return # it's enough that train didn't fail for this test, but we must check that # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing) self.assertEqual(post_train_a, a) - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_gradient_accumulation(self, stage): # this test measures that we get identical weights and similar loss with: # 1. 
per_device_train_batch_size=8, gradient_accumulation_steps=1 @@ -475,9 +502,9 @@ def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"] - if stage == ZERO2: + if stage == ZERO2_FP16: ds_file_list = ["mp_rank_00_model_states.pt"] - elif stage == ZERO3: + elif stage == ZERO3_FP16: ds_file_list = ["zero_pp_rank_0_mp_rank_00_model_states.pt"] else: raise ValueError(f"unknown stage {stage}") @@ -509,7 +536,7 @@ def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): path = os.path.join(ds_path, filename) self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found") - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_save_checkpoints(self, stage): # adapted from TrainerIntegrationTest.test_save_checkpoints @@ -517,7 +544,7 @@ def test_save_checkpoints(self, stage): output_dir = self.get_auto_remove_tmp_dir() ds_config_dict = self.get_config_dict(stage) ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - if stage == ZERO3: + if stage == ZERO3_FP16: ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True # save checkpoints @@ -533,7 +560,7 @@ def test_save_checkpoints(self, stage): total = int(self.n_epochs * 64 / self.batch_size) self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage) - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_can_resume_training_errors(self, stage): with mockenv_context(**self.dist_env_1_gpu): @@ -557,14 +584,14 @@ def test_can_resume_training_errors(self, stage): "Can't find a valid checkpoint at" in str(context.exception), f"got exception: {context.exception}" ) - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_can_resume_training_normal(self, stage): # adapted from TrainerIntegrationTest.test_can_resume_training # test normal resume for each stage separately, error-handling is tested in a different test output_dir = self.get_auto_remove_tmp_dir() ds_config_dict = self.get_config_dict(stage) ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - if stage == ZERO3: + if stage == ZERO3_FP16: ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True kwargs = dict( @@ -602,7 +629,7 @@ def test_can_resume_training_normal(self, stage): self.assertEqual(b, b1) self.check_trainer_state_are_the_same(state, state1) - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_load_state_dict_from_zero_checkpoint(self, stage): # test that we can load fp32 weights directly from the zero checkpoint into the current model @@ -643,20 +670,20 @@ def test_config_object(self): output_dir = self.get_auto_remove_tmp_dir() kwargs = dict(output_dir=output_dir, train_len=8, fp16=True) - ds_config_zero3_dict = self.get_config_dict("zero3") - ds_config_zero2_dict = self.get_config_dict("zero2") + ds_config_zero3_fp16_dict = self.get_config_dict("zero3_fp16") + ds_config_zero2_fp16_dict = self.get_config_dict("zero2_fp16") with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) + trainer = get_regression_trainer(deepspeed=ds_config_zero3_fp16_dict, **kwargs) self.assertTrue(is_deepspeed_zero3_enabled()) # test we can repeat that and with train this time - trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) + trainer = 
get_regression_trainer(deepspeed=ds_config_zero3_fp16_dict, **kwargs) trainer.train() self.assertTrue(is_deepspeed_zero3_enabled()) # test zero3 is disabled - trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs) + trainer = get_regression_trainer(deepspeed=ds_config_zero2_fp16_dict, **kwargs) self.assertFalse(is_deepspeed_zero3_enabled()) # check config obj @@ -693,21 +720,21 @@ class TestDeepSpeedWithLauncher(TestCasePlus): # @require_torch_multi_gpu - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_basic_distributed(self, stage): self.run_and_check(stage=stage, distributed=True) def test_do_eval_no_train(self): # testing only zero3 since zero2 makes no sense with inference self.run_and_check( - stage=ZERO3, + stage=ZERO3_FP16, eval_steps=1, distributed=False, do_train=False, do_eval=True, ) - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_fp32_non_distributed(self, stage): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just basic completion checks are done @@ -722,7 +749,7 @@ def test_fp32_non_distributed(self, stage): ) @require_torch_multi_gpu - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_fp32_distributed(self, stage): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just basic completion checks are done @@ -736,7 +763,7 @@ def test_fp32_distributed(self, stage): fp16=False, ) - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_resume_train_not_from_ds_checkpoint(self, stage): # do normal training and then resume not from the deepspeed checkpoint but explicitly from # the saved model dir @@ -761,7 +788,7 @@ def test_inference(self, dtype): # it only works for z3 (makes no sense with z1-z2) fp16 = True if dtype == "fp16" else False self.run_and_check( - stage=ZERO3, + stage=ZERO3_FP16, model_name=T5_TINY, distributed=True, do_train=False, @@ -901,7 +928,7 @@ def run_trainer( return output_dir - @parameterized.expand(stages) + @parameterized.expand(stages_fp16) def test_clm(self, stage): # this test exercises model.resize_token_embeddings() which requires param gathering outside # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py` @@ -936,7 +963,7 @@ def test_clm(self, stage): # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die execute_subprocess_async(cmd, env=self.get_env()) - def test_clm_from_config_zero3(self): + def test_clm_from_config_zero3_fp16(self): # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called data_dir = self.tests_dir / "fixtures" @@ -958,7 +985,7 @@ def test_clm_from_config_zero3(self): --report_to none """.split() - ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3.json".split() + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3_fp16.json".split() script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"] launcher = get_launcher(distributed=True) diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index 321e8b2bf0b5a0..5cd4a3f131e3f5 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -203,7 +203,20 @@ def make_task_cmds(): ZERO2 = "zero2" ZERO3 = "zero3" + +FP16 = "fp16" +BF16 = "bf16" + +ZERO2_FP16 = "zero2_fp16" +ZERO3_FP16 = "zero3_fp16" +ZERO2_BF16 = "zero2_bf16" +ZERO3_BF16 = "zero3_bf16" + 
+stages_fp16 = [ZERO2_FP16, ZERO3_FP16] +stages_bf16 = [ZERO2_BF16, ZERO3_BF16] + stages = [ZERO2, ZERO3] +dtypes = [FP16, BF16] def parameterized_custom_name_func(func, param_num, param): @@ -214,7 +227,7 @@ def parameterized_custom_name_func(func, param_num, param): # Cartesian-product of zero stages with models to test -params = list(itertools.product(stages, task_cmds.keys())) +params = list(itertools.product(stages_fp16, task_cmds.keys())) @slow From 013e44dd14bdc2a26aae2346cb8952684bd3538d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 4 Dec 2021 22:28:30 -0800 Subject: [PATCH 03/25] prep for bf16 --- tests/deepspeed/test_deepspeed.py | 143 +++++++++++++++++------------- 1 file changed, 79 insertions(+), 64 deletions(-) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 360a9b8f4d0f98..509c6c370e942a 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -64,6 +64,16 @@ def load_json(path): return json.load(f) +def stage_dtype_to_dtype(stage_dtype): + stage, dtype = stage_dtype.split("_") + return dtype + + +def stage_dtype_to_dtype_kwargs(stage_dtype): + stage, dtype = stage_dtype.split("_") + return dtype, True + + def get_master_port(real_launcher=False): """ When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed) @@ -132,6 +142,8 @@ def get_launcher(distributed=False): stages_fp16 = [ZERO2_FP16, ZERO3_FP16] stages_bf16 = [ZERO2_BF16, ZERO3_BF16] +stages_all = stages_fp16 + stages_bf16 + stages = [ZERO2, ZERO3] dtypes = [FP16, BF16] @@ -257,9 +269,9 @@ def setUp(self): zero3_bf16=config_zero3_bf16, ) - def get_config_dict(self, stage): + def get_config_dict(self, stage_dtype): # As some tests modify the dict, always make a copy - return deepcopy(self.ds_config_dict[stage]) + return deepcopy(self.ds_config_dict[stage_dtype]) # --- These tests are enough to run on one of zero stages --- # @@ -376,9 +388,9 @@ def test_stage3_nvme_offload(self): # --- These tests need to run on both zero stages --- # @parameterized.expand(stages_fp16) - def test_hf_optimizer_with_offload(self, stage): + def test_hf_optimizer_with_offload(self, stage_dtype): # non-DS optimizers can be used with ZERO-offload (as long as they have both CPU and GPU implementation (except LAMB)) - ds_config_dict = self.get_config_dict(stage) + ds_config_dict = self.get_config_dict(stage_dtype) del ds_config_dict["optimizer"] # force default HF Trainer optimizer # force cpu offload ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" @@ -389,7 +401,7 @@ def test_hf_optimizer_with_offload(self, stage): self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") @parameterized.expand(stages_fp16) - def test_fake_notebook_no_launcher(self, stage): + def test_fake_notebook_no_launcher(self, stage_dtype): # this setup emulates a notebook where a launcher needs to be emulated by hand # note that unittest resets sys.stdout each test, so `CaptureStd` will work here to capture @@ -397,13 +409,13 @@ def test_fake_notebook_no_launcher(self, stage): # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger. 
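
The launcher emulation these tests rely on comes down to setting the distributed environment variables by hand before DeepSpeed initializes. A minimal sketch of a ``mockenv_context``-style helper (the real one lives in ``transformers.testing_utils``; the port value here is just an example):

.. code-block:: python

    import os
    from contextlib import contextmanager

    @contextmanager
    def mockenv_context(**kwargs):
        # temporarily override os.environ, restoring the previous values on exit
        old = {k: os.environ.get(k) for k in kwargs}
        os.environ.update({k: str(v) for k, v in kwargs.items()})
        try:
            yield
        finally:
            for k, v in old.items():
                if v is None:
                    os.environ.pop(k, None)
                else:
                    os.environ[k] = v

    with mockenv_context(MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"):
        assert os.environ["WORLD_SIZE"] == "1"
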
with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=self.get_config_dict(stage)) + trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=self.get_config_dict(stage_dtype)) with CaptureLogger(deepspeed_logger) as cl: trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") @parameterized.expand(stages_fp16) - def test_early_get_last_lr(self, stage): + def test_early_get_last_lr(self, stage_dtype): # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may # not run for the first few dozen steps while loss scale is too large, and thus during # that time `get_last_lr` will fail if called during that warm up stage, @@ -418,7 +430,7 @@ def test_early_get_last_lr(self, stage): local_rank=0, train_len=8, fp16=True, - deepspeed=self.get_config_dict(stage), + deepspeed=self.get_config_dict(stage_dtype), per_device_train_batch_size=8, logging_steps=1, ) @@ -431,7 +443,7 @@ def test_early_get_last_lr(self, stage): # print(trainer.model.a.item()) # print(trainer.model.b.item()) # need to investigate at some point - if stage == ZERO3_FP16: + if stage_dtype == ZERO3_FP16: return # it's enough that train didn't fail for this test, but we must check that @@ -439,7 +451,7 @@ def test_early_get_last_lr(self, stage): self.assertEqual(post_train_a, a) @parameterized.expand(stages_fp16) - def test_gradient_accumulation(self, stage): + def test_gradient_accumulation(self, stage_dtype): # this test measures that we get identical weights and similar loss with: # 1. per_device_train_batch_size=8, gradient_accumulation_steps=1 # 2. per_device_train_batch_size=4, gradient_accumulation_steps=2 @@ -460,9 +472,9 @@ def test_gradient_accumulation(self, stage): b=b, local_rank=0, train_len=train_len, - fp16=True, - deepspeed=self.get_config_dict(stage), + deepspeed=self.get_config_dict(stage_dtype), ) + kwargs[stage_dtype_to_dtype(stage_dtype)] = True with mockenv_context(**self.dist_env_1_gpu): no_grad_accum_trainer = get_regression_trainer( @@ -497,17 +509,17 @@ def test_gradient_accumulation(self, stage): # see the note above how to get identical loss on a small bs self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2) - def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): + def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage_dtype): # adapted from TrainerIntegrationCommon.check_saved_checkpoints file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"] - if stage == ZERO2_FP16: + if stage_dtype == ZERO2_FP16: ds_file_list = ["mp_rank_00_model_states.pt"] - elif stage == ZERO3_FP16: + elif stage_dtype == ZERO3_FP16: ds_file_list = ["zero_pp_rank_0_mp_rank_00_model_states.pt"] else: - raise ValueError(f"unknown stage {stage}") + raise ValueError(f"unknown stage_dtype {stage_dtype}") # XXX: this can be recoded and then removed once we require deepspeed>0.3.13 from packaging import version @@ -521,12 +533,12 @@ def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): for step in range(freq, total, freq): checkpoint = os.path.join(output_dir, f"checkpoint-{step}") - self.assertTrue(os.path.isdir(checkpoint), f"[{stage}] {checkpoint} dir is not found") + self.assertTrue(os.path.isdir(checkpoint), f"[{stage_dtype}] {checkpoint} dir is not found") # common files for filename in file_list: path = os.path.join(checkpoint, filename) - 
self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found") + self.assertTrue(os.path.isfile(path), f"[{stage_dtype}] {path} is not found") # ds files ds_path = os.path.join(checkpoint, f"global_step{step}") @@ -534,39 +546,42 @@ def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): # filename = os.path.join(path, filename) # print(filename) path = os.path.join(ds_path, filename) - self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found") + self.assertTrue(os.path.isfile(path), f"[{stage_dtype}] {path} is not found") @parameterized.expand(stages_fp16) - def test_save_checkpoints(self, stage): + def test_save_checkpoints(self, stage_dtype): # adapted from TrainerIntegrationTest.test_save_checkpoints freq = 5 output_dir = self.get_auto_remove_tmp_dir() - ds_config_dict = self.get_config_dict(stage) + ds_config_dict = self.get_config_dict(stage_dtype) ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - if stage == ZERO3_FP16: + if stage_dtype == ZERO3_FP16: ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True # save checkpoints with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer( + kwargs = dict( output_dir=output_dir, save_steps=freq, - fp16=True, deepspeed=ds_config_dict, ) + kwargs[stage_dtype_to_dtype(stage_dtype)] = True + trainer = get_regression_trainer(**kwargs) trainer.train() total = int(self.n_epochs * 64 / self.batch_size) - self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage) + self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage_dtype) @parameterized.expand(stages_fp16) - def test_can_resume_training_errors(self, stage): + def test_can_resume_training_errors(self, stage_dtype): with mockenv_context(**self.dist_env_1_gpu): - ds_config_dict = self.get_config_dict(stage) + ds_config_dict = self.get_config_dict(stage_dtype) output_dir = self.get_auto_remove_tmp_dir() - trainer = get_regression_trainer(output_dir=output_dir, fp16=True, deepspeed=ds_config_dict) + kwargs = dict(output_dir=output_dir, deepspeed=ds_config_dict) + kwargs[stage_dtype_to_dtype(stage_dtype)] = True + trainer = get_regression_trainer(**kwargs) # 1. 
fail to find any checkpoint - due a fresh output_dir with self.assertRaises(Exception) as context: @@ -585,18 +600,17 @@ def test_can_resume_training_errors(self, stage): ) @parameterized.expand(stages_fp16) - def test_can_resume_training_normal(self, stage): + def test_can_resume_training_normal(self, stage_dtype): # adapted from TrainerIntegrationTest.test_can_resume_training # test normal resume for each stage separately, error-handling is tested in a different test output_dir = self.get_auto_remove_tmp_dir() - ds_config_dict = self.get_config_dict(stage) + ds_config_dict = self.get_config_dict(stage_dtype) ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - if stage == ZERO3_FP16: + if stage_dtype == ZERO3_FP16: ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True - kwargs = dict( - output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, fp16=True, deepspeed=ds_config_dict - ) + kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict) + kwargs[stage_dtype_to_dtype(stage_dtype)] = True with mockenv_context(**self.dist_env_1_gpu): trainer = get_regression_trainer(**kwargs) @@ -630,12 +644,12 @@ def test_can_resume_training_normal(self, stage): self.check_trainer_state_are_the_same(state, state1) @parameterized.expand(stages_fp16) - def test_load_state_dict_from_zero_checkpoint(self, stage): + def test_load_state_dict_from_zero_checkpoint(self, stage_dtype): # test that we can load fp32 weights directly from the zero checkpoint into the current model output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False, before=False) - ds_config_dict = self.get_config_dict(stage) + ds_config_dict = self.get_config_dict(stage_dtype) kwargs = dict( output_dir=output_dir, @@ -645,9 +659,9 @@ def test_load_state_dict_from_zero_checkpoint(self, stage): save_strategy="steps", save_steps=1, learning_rate=0.1, - fp16=True, deepspeed=ds_config_dict, ) + kwargs[stage_dtype_to_dtype(stage_dtype)] = True with mockenv_context(**self.dist_env_1_gpu): trainer = get_regression_trainer(**kwargs) @@ -670,8 +684,8 @@ def test_config_object(self): output_dir = self.get_auto_remove_tmp_dir() kwargs = dict(output_dir=output_dir, train_len=8, fp16=True) - ds_config_zero3_fp16_dict = self.get_config_dict("zero3_fp16") - ds_config_zero2_fp16_dict = self.get_config_dict("zero2_fp16") + ds_config_zero3_fp16_dict = self.get_config_dict(ZERO3_FP16) + ds_config_zero2_fp16_dict = self.get_config_dict(ZERO2_FP16) with mockenv_context(**self.dist_env_1_gpu): trainer = get_regression_trainer(deepspeed=ds_config_zero3_fp16_dict, **kwargs) @@ -721,13 +735,13 @@ class TestDeepSpeedWithLauncher(TestCasePlus): @require_torch_multi_gpu @parameterized.expand(stages_fp16) - def test_basic_distributed(self, stage): - self.run_and_check(stage=stage, distributed=True) + def test_basic_distributed(self, stage_dtype): + self.run_and_check(stage_dtype=stage_dtype, distributed=True) def test_do_eval_no_train(self): # testing only zero3 since zero2 makes no sense with inference self.run_and_check( - stage=ZERO3_FP16, + stage_dtype=ZERO3_FP16, eval_steps=1, distributed=False, do_train=False, @@ -735,42 +749,42 @@ def test_do_eval_no_train(self): ) @parameterized.expand(stages_fp16) - def test_fp32_non_distributed(self, stage): + def test_fp32_non_distributed(self, stage_dtype): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just 
basic completion checks are done self.run_and_check( - stage=stage, + stage_dtype=stage_dtype, model_name=T5_TINY, distributed=False, do_train=True, do_eval=True, quality_checks=False, - fp16=False, + fp32=True, ) @require_torch_multi_gpu @parameterized.expand(stages_fp16) - def test_fp32_distributed(self, stage): + def test_fp32_distributed(self, stage_dtype): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just basic completion checks are done self.run_and_check( - stage=stage, + stage_dtype=stage_dtype, model_name=T5_TINY, distributed=True, do_train=True, do_eval=True, quality_checks=False, - fp16=False, + fp32=True, ) @parameterized.expand(stages_fp16) - def test_resume_train_not_from_ds_checkpoint(self, stage): + def test_resume_train_not_from_ds_checkpoint(self, stage_dtype): # do normal training and then resume not from the deepspeed checkpoint but explicitly from # the saved model dir do_train = True do_eval = False - kwargs = dict(stage=stage, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval) + kwargs = dict(stage_dtype=stage_dtype, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval) # 1. normal training output_dir = self.run_and_check(**kwargs) @@ -786,15 +800,15 @@ def test_resume_train_not_from_ds_checkpoint(self, stage): def test_inference(self, dtype): # this is just inference, so no optimizer should be loaded # it only works for z3 (makes no sense with z1-z2) - fp16 = True if dtype == "fp16" else False + fp32 = True if dtype == "fp32" else False self.run_and_check( - stage=ZERO3_FP16, + stage_dtype=ZERO3_FP16, model_name=T5_TINY, distributed=True, do_train=False, do_eval=True, quality_checks=False, - fp16=fp16, + fp32=fp32, ) def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True): @@ -814,28 +828,28 @@ def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True # XXX: need to do better validation beyond just that the run was successful def run_and_check( self, - stage, + stage_dtype, model_name: str = T5_SMALL, eval_steps: int = 10, distributed: bool = True, do_train: bool = True, do_eval: bool = True, quality_checks: bool = True, - fp16: bool = True, + fp32: bool = False, extra_args_str: str = None, remove_args_str: str = None, ): # we are doing quality testing so using a small real model output_dir = self.run_trainer( - stage=stage, + stage_dtype=stage_dtype, model_name=model_name, eval_steps=eval_steps, num_train_epochs=1, do_train=do_train, do_eval=do_eval, distributed=distributed, - fp16=fp16, + fp32=fp32, extra_args_str=extra_args_str, remove_args_str=remove_args_str, ) @@ -846,14 +860,14 @@ def run_and_check( def run_trainer( self, - stage: str, + stage_dtype: str, model_name: str, eval_steps: int = 10, num_train_epochs: int = 1, do_train: bool = False, do_eval: bool = True, distributed: bool = True, - fp16: bool = True, + fp32: bool = False, extra_args_str: str = None, remove_args_str: str = None, ): @@ -881,8 +895,9 @@ def run_trainer( """.split() args.extend(["--source_prefix", '"translate English to Romanian: "']) - if fp16: - args.extend(["--fp16"]) + if not fp32: + dtype = stage_dtype_to_dtype(stage_dtype) + args.extend([f"--{dtype}"]) actions = 0 if do_train: @@ -917,7 +932,7 @@ def run_trainer( remove_args = remove_args_str.split() args = [x for x in args if x not in remove_args] - ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() + ds_args = f"--deepspeed 
{self.test_file_dir_str}/ds_config_{stage_dtype}.json".split()
script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"]
launcher = get_launcher(distributed)

@@ -929,7 +944,7 @@ def run_trainer(

return output_dir

@parameterized.expand(stages_fp16)
- def test_clm(self, stage):
+ def test_clm(self, stage_dtype):
# this test exercises model.resize_token_embeddings() which requires param gathering outside
# of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`

@@ -954,7 +969,7 @@ def test_clm(self, stage):
--report_to none
""".split()

- ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
+ ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage_dtype}.json".split()
script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"]
launcher = get_launcher(distributed=True)

From 4cba76bf53cb84c5a56bb8706589efd0b816eb35 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Sat, 4 Dec 2021 23:03:11 -0800
Subject: [PATCH 04/25] fix; zero2/bf16 is ok

---
src/transformers/deepspeed.py | 23 ++++++++++++++++-------
tests/deepspeed/test_deepspeed.py | 16 ++++++++++------
2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py
index 56cfcc9269eeec..6482e7d88fb282 100644
--- a/src/transformers/deepspeed.py
+++ b/src/transformers/deepspeed.py
@@ -239,24 +239,33 @@ def trainer_config_process(self, args):

# amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
# any here unless the user did the work
- self.fill_match("fp16.enabled", fp16_backend == "amp", "fp16|fp16_full_eval+fp16_backend(amp)")
+ self.fill_match(
+ "fp16.enabled",
+ ((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"),
+ "fp16|fp16_full_eval+fp16_backend(amp)",
+ )

# apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
# ZeRO features
self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")

- self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")
+ self.fill_match("bfloat16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")

# deepspeed's default mode is fp16 unless there is a config that says differently
- if self.is_false("fp16.enabled"):
- if self.is_true("bfloat16.enabled"):
- self._dtype = torch.bfloat16
- else:
- self._dtype = torch.float32
+ if self.is_true("bfloat16.enabled"):
+ self._dtype = torch.bfloat16
+ elif self.is_false("fp16.enabled"):
+ self._dtype = torch.float32
else:
self._dtype = torch.float16

+ # print(self._dtype)
+ # print(self.get_value("fp16.enabled"))
+ # print(self.get_value("bf16.enabled"))
+ # print(self.config)
+ # die
+
def trainer_config_finalize(self, args, model, num_training_steps):
"""
This stage is run after we have the model and know num_training_steps. 
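
The dtype resolution added above is the subtle part of this commit: bf16 wins when explicitly enabled, an explicit ``fp16.enabled: false`` means fp32, and everything else falls back to DeepSpeed's fp16 default. Here is the same precedence isolated on a plain dict, assuming the ``auto`` placeholders were already resolved to real booleans by the ``fill_match`` calls (a sketch, not the actual implementation):

.. code-block:: python

    import torch

    def dtype_from_ds_config(config: dict) -> torch.dtype:
        # same precedence as trainer_config_process above
        if config.get("bfloat16", {}).get("enabled") is True:
            return torch.bfloat16
        if config.get("fp16", {}).get("enabled") is False:
            return torch.float32
        return torch.float16

    assert dtype_from_ds_config({"bfloat16": {"enabled": True}}) is torch.bfloat16
    assert dtype_from_ds_config({"fp16": {"enabled": False}}) is torch.float32
    assert dtype_from_ds_config({}) is torch.float16
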
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 509c6c370e942a..5a3689390d8739 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -33,6 +33,7 @@ get_gpu_count, mockenv_context, require_deepspeed, + require_torch_bf16, require_torch_gpu, require_torch_multi_gpu, slow, @@ -140,7 +141,10 @@ def get_launcher(distributed=False): ZERO3_BF16 = "zero3_bf16" stages_fp16 = [ZERO2_FP16, ZERO3_FP16] -stages_bf16 = [ZERO2_BF16, ZERO3_BF16] + +# XXX: for now only zero2 is supported +# stages_bf16 = [ZERO2_BF16, ZERO3_BF16] +stages_bf16 = [ZERO2_BF16] stages_all = stages_fp16 + stages_bf16 @@ -734,7 +738,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): # @require_torch_multi_gpu - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_basic_distributed(self, stage_dtype): self.run_and_check(stage_dtype=stage_dtype, distributed=True) @@ -748,7 +752,7 @@ def test_do_eval_no_train(self): do_eval=True, ) - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_fp32_non_distributed(self, stage_dtype): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just basic completion checks are done @@ -763,7 +767,7 @@ def test_fp32_non_distributed(self, stage_dtype): ) @require_torch_multi_gpu - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_fp32_distributed(self, stage_dtype): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just basic completion checks are done @@ -777,7 +781,7 @@ def test_fp32_distributed(self, stage_dtype): fp32=True, ) - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_resume_train_not_from_ds_checkpoint(self, stage_dtype): # do normal training and then resume not from the deepspeed checkpoint but explicitly from # the saved model dir @@ -943,7 +947,7 @@ def run_trainer( return output_dir - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_clm(self, stage_dtype): # this test exercises model.resize_token_embeddings() which requires param gathering outside # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py` From 85b16d193530f54a1c5efa267892490a67f89063 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 4 Dec 2021 23:08:03 -0800 Subject: [PATCH 05/25] check bf16 is available --- tests/deepspeed/test_deepspeed.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 5a3689390d8739..ab863334b0fa82 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -22,7 +22,7 @@ from parameterized import parameterized from transformers import AutoModel, TrainingArguments, is_torch_available, logging from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available -from transformers.file_utils import WEIGHTS_NAME +from transformers.file_utils import WEIGHTS_NAME, is_torch_bf16_available from transformers.testing_utils import ( CaptureLogger, CaptureStderr, @@ -33,7 +33,6 @@ get_gpu_count, mockenv_context, require_deepspeed, - require_torch_bf16, require_torch_gpu, require_torch_multi_gpu, slow, @@ -142,9 +141,12 @@ def get_launcher(distributed=False): stages_fp16 = [ZERO2_FP16, ZERO3_FP16] -# XXX: for now only zero2 is supported -# stages_bf16 = [ZERO2_BF16, ZERO3_BF16] 
-stages_bf16 = [ZERO2_BF16]
+if is_torch_bf16_available():
+ # XXX: for now only zero2 is supported
+ # stages_bf16 = [ZERO2_BF16, ZERO3_BF16]
+ stages_bf16 = [ZERO2_BF16]
+else:
+ stages_bf16 = []

stages_all = stages_fp16 + stages_bf16

From e8b69a6e2bdce5599dadb549556b65b64da173b8 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 8 Dec 2021 17:02:40 -0800
Subject: [PATCH 06/25] test fixes

---
tests/deepspeed/test_deepspeed.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 5944e9dbd8818f..56fad5a49b2942 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -26,6 +26,7 @@
from transformers.testing_utils import (
CaptureLogger,
CaptureStderr,
+ CaptureStdout,
ExtendSysPath,
LoggingLevel,
TestCasePlus,
@@ -1022,8 +1023,8 @@ def test_clm_from_config_zero3_fp16(self):
execute_subprocess_async(cmd, env=self.get_env())
assert "Detected DeepSpeed ZeRO-3" in cs.err

- @parameterized.expand(stages)
- def test_load_best_model(self, stage):
+ @parameterized.expand(stages_all)
+ def test_load_best_model(self, stage_dtype):
# this test exercises --load_best_model_at_end - the key is being able to resume after some training

data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro"
@@ -1056,14 +1057,14 @@ def test_load_best_model(self, stage):
""".split()
args.extend(["--source_prefix", "translate English to Romanian: "])

- ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3.json".split()
+ ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage_dtype}.json".split()
script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"]
launcher = get_launcher(distributed=False)

cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
- with CaptureStderr() as cs:
+ with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
- # enough to test it didn't fail
- assert "Detected DeepSpeed ZeRO-3" in cs.err
+ # enough to test deepspeed was invoked and it didn't fail
+ assert "DeepSpeed info" in cs.out

From 087ba85826dc20acaf6502eaaea2028d9c3f38d6 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 8 Dec 2021 17:30:04 -0800
Subject: [PATCH 07/25] enable zero3_bf16

---
tests/deepspeed/test_deepspeed.py | 64 +++++++++++++++++--------------
1 file changed, 36 insertions(+), 28 deletions(-)

diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 56fad5a49b2942..682a60c05cd6f3 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -70,11 +70,6 @@ def stage_dtype_to_dtype(stage_dtype):
return dtype


-def stage_dtype_to_dtype_kwargs(stage_dtype):
- stage, dtype = stage_dtype.split("_")
- return dtype, True
-
-
def get_master_port(real_launcher=False):
"""
When using a single gpu launcher emulation (i.e. 
not deepspeed or python -m torch.distributed) @@ -140,14 +135,18 @@ def get_launcher(distributed=False): ZERO2_BF16 = "zero2_bf16" ZERO3_BF16 = "zero3_bf16" +stages_zero2 = [ZERO2_FP16] +stages_zero3 = [ZERO3_FP16] + stages_fp16 = [ZERO2_FP16, ZERO3_FP16] +stages_bf16 = [] if is_torch_bf16_available(): + stages_bf16 = [ZERO2_BF16, ZERO3_BF16] # XXX: for now only zero2 is supported - # stages_bf16 = [ZERO2_BF16, ZERO3_BF16] - stages_bf16 = [ZERO2_BF16] -else: - stages_bf16 = [] + # stages_bf16 = [ZERO2_BF16] + stages_zero2 += [ZERO2_BF16] + stages_zero3 += [ZERO3_BF16] stages_all = stages_fp16 + stages_bf16 @@ -263,11 +262,11 @@ def setUp(self): with io.open(self.ds_config_file[ZERO2_BF16], "r", encoding="utf-8") as f: config_zero2_bf16 = json.load(f) # by default use fp16 - config_zero2_bf16["bf16"]["enabled"] = True + config_zero2_bf16["bfloat16"]["enabled"] = True with io.open(self.ds_config_file[ZERO3_BF16], "r", encoding="utf-8") as f: config_zero3_bf16 = json.load(f) # by default use fp16 - config_zero3_bf16["bf16"]["enabled"] = True + config_zero3_bf16["bfloat16"]["enabled"] = True self.ds_config_dict = dict( zero2_fp16=config_zero2_fp16, @@ -394,7 +393,7 @@ def test_stage3_nvme_offload(self): # --- These tests need to run on both zero stages --- # - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_hf_optimizer_with_offload(self, stage_dtype): # non-DS optimizers can be used with ZERO-offload (as long as they have both CPU and GPU implementation (except LAMB)) ds_config_dict = self.get_config_dict(stage_dtype) @@ -402,12 +401,14 @@ def test_hf_optimizer_with_offload(self, stage_dtype): # force cpu offload ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict) + kwargs = dict(local_rank=0, deepspeed=ds_config_dict) + kwargs[stage_dtype_to_dtype(stage_dtype)] = True + trainer = get_regression_trainer(**kwargs) with CaptureLogger(deepspeed_logger) as cl: trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_fake_notebook_no_launcher(self, stage_dtype): # this setup emulates a notebook where a launcher needs to be emulated by hand @@ -416,12 +417,15 @@ def test_fake_notebook_no_launcher(self, stage_dtype): # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger. 
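
The recurring refactor in the diff just below replaces hardcoded ``fp16=True`` arguments with a flag derived from the parameterized ``stage_dtype`` name, so that each test builds the right ``TrainingArguments`` switch for its dtype. The pattern reduces to this (using the ``stage_dtype_to_dtype`` helper defined earlier in this test file):

.. code-block:: python

    def stage_dtype_to_dtype(stage_dtype: str) -> str:
        # "zero3_bf16" -> "bf16"
        stage, dtype = stage_dtype.split("_")
        return dtype

    kwargs = {"local_rank": 0}
    kwargs[stage_dtype_to_dtype("zero3_bf16")] = True  # i.e. bf16=True
    assert kwargs == {"local_rank": 0, "bf16": True}
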
with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=self.get_config_dict(stage_dtype)) + kwargs = dict(local_rank=0, deepspeed=self.get_config_dict(stage_dtype)) + kwargs[stage_dtype_to_dtype(stage_dtype)] = True + trainer = get_regression_trainer(**kwargs) + with CaptureLogger(deepspeed_logger) as cl: trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_early_get_last_lr(self, stage_dtype): # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may # not run for the first few dozen steps while loss scale is too large, and thus during @@ -431,16 +435,18 @@ def test_early_get_last_lr(self, stage_dtype): # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step. with mockenv_context(**self.dist_env_1_gpu): a = b = 0.0 - trainer = get_regression_trainer( + kwargs = dict( a=a, b=b, local_rank=0, train_len=8, - fp16=True, deepspeed=self.get_config_dict(stage_dtype), per_device_train_batch_size=8, logging_steps=1, ) + kwargs[stage_dtype_to_dtype(stage_dtype)] = True + trainer = get_regression_trainer(**kwargs) + trainer.train() post_train_a = trainer.model.a.item() @@ -450,14 +456,14 @@ def test_early_get_last_lr(self, stage_dtype): # print(trainer.model.a.item()) # print(trainer.model.b.item()) # need to investigate at some point - if stage_dtype == ZERO3_FP16: + if stage_dtype in stages_zero3: return # it's enough that train didn't fail for this test, but we must check that # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing) self.assertEqual(post_train_a, a) - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_gradient_accumulation(self, stage_dtype): # this test measures that we get identical weights and similar loss with: # 1. 
per_device_train_batch_size=8, gradient_accumulation_steps=1 @@ -521,9 +527,9 @@ def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage_dtype file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"] - if stage_dtype == ZERO2_FP16: + if stage_dtype in stages_zero2: ds_file_list = ["mp_rank_00_model_states.pt"] - elif stage_dtype == ZERO3_FP16: + elif stage_dtype in stages_zero3: ds_file_list = ["zero_pp_rank_0_mp_rank_00_model_states.pt"] else: raise ValueError(f"unknown stage_dtype {stage_dtype}") @@ -555,14 +561,15 @@ def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage_dtype path = os.path.join(ds_path, filename) self.assertTrue(os.path.isfile(path), f"[{stage_dtype}] {path} is not found") - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_save_checkpoints(self, stage_dtype): # adapted from TrainerIntegrationTest.test_save_checkpoints freq = 5 output_dir = self.get_auto_remove_tmp_dir() ds_config_dict = self.get_config_dict(stage_dtype) - ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + if stage_dtype in stages_fp16: + ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step if stage_dtype == ZERO3_FP16: ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True @@ -580,7 +587,7 @@ def test_save_checkpoints(self, stage_dtype): total = int(self.n_epochs * 64 / self.batch_size) self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage_dtype) - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_can_resume_training_errors(self, stage_dtype): with mockenv_context(**self.dist_env_1_gpu): @@ -606,13 +613,14 @@ def test_can_resume_training_errors(self, stage_dtype): "Can't find a valid checkpoint at" in str(context.exception), f"got exception: {context.exception}" ) - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_can_resume_training_normal(self, stage_dtype): # adapted from TrainerIntegrationTest.test_can_resume_training # test normal resume for each stage separately, error-handling is tested in a different test output_dir = self.get_auto_remove_tmp_dir() ds_config_dict = self.get_config_dict(stage_dtype) - ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + if stage_dtype in stages_fp16: + ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step if stage_dtype == ZERO3_FP16: ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True @@ -655,7 +663,7 @@ def test_can_resume_training_normal(self, stage_dtype): # trainer.train(resume_from_checkpoint=checkpoint) # a workaround needs to be used that re-creates the deepspeed engine - @parameterized.expand(stages_fp16) + @parameterized.expand(stages_all) def test_load_state_dict_from_zero_checkpoint(self, stage_dtype): # test that we can load fp32 weights directly from the zero checkpoint into the current model From a9c1721ff6eeba4729ba7760c6ec809c72935b2b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 8 Dec 2021 17:39:34 -0800 Subject: [PATCH 08/25] config files --- tests/deepspeed/ds_config_zero2_bf16.json | 45 ++++++++++++++++++++ tests/deepspeed/ds_config_zero3_bf16.json | 52 +++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 tests/deepspeed/ds_config_zero2_bf16.json create mode 100644 tests/deepspeed/ds_config_zero3_bf16.json diff --git 
a/tests/deepspeed/ds_config_zero2_bf16.json b/tests/deepspeed/ds_config_zero2_bf16.json new file mode 100644 index 00000000000000..2dd534d0163173 --- /dev/null +++ b/tests/deepspeed/ds_config_zero2_bf16.json @@ -0,0 +1,45 @@ +{ + "bfloat16": { + "enabled": "auto" + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/tests/deepspeed/ds_config_zero3_bf16.json b/tests/deepspeed/ds_config_zero3_bf16.json new file mode 100644 index 00000000000000..5e78c00189d778 --- /dev/null +++ b/tests/deepspeed/ds_config_zero3_bf16.json @@ -0,0 +1,52 @@ +{ + "bfloat16": { + "enabled": "auto" + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} From 967f383f7eaf1c2db4cdcf0520830642bdac200a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 8 Dec 2021 18:07:47 -0800 Subject: [PATCH 09/25] docs --- docs/source/main_classes/deepspeed.rst | 39 ++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index 831e94fa515711..a243db763317a7 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -1267,7 +1267,8 @@ benchmarks, please, see `TensorFloat-32(TF32) on Ampere devices `__. The document includes instructions on how to disable this automatic conversion if for some reason you prefer not to use it. - +With the 🤗 Trainer you can use `--tf32` to enable it, or disable it with `--tf32 0` or `--no_tf32`. By default the +PyTorch default is used. .. _deepspeed-amp: @@ -1277,6 +1278,10 @@ Automatic Mixed Precision You can use automatic mixed precision with either a pytorch-like AMP way or the apex-like way: +fp16 / float16 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + To configure pytorch AMP-like mode set: .. 
code-block:: json @@ -1295,7 +1300,7 @@ To configure pytorch AMP-like mode set: and the :class:`~transformers.Trainer` will automatically enable or disable it based on the value of ``args.fp16_backend``. The rest of config values are up to you. -This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed. +This mode gets enabled when ``--fp16 --fp16_backend amp`` or ``--fp16_full_eval`` command line args are passed. You can also enable/disable this mode explicitly: @@ -1317,6 +1322,36 @@ configuration. Here is the `documentation `__. +bf16 / bfloat16 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +If bf16 is desired instead of fp16 then the following configuration section is to be used: + +.. code-block:: json + + { + "bfloat16": { + "enabled": auto, + } + } + +bf16 has the same dynamic range as fp32 and thus doesn't require loss scaling. + +This mode gets enabled when ``--bf16`` or ``--bf16_full_eval`` command line args are passed. + +You can also enable/disable this mode explicitly: + +.. code-block:: json + + { + "bfloat16": { + "enabled": true, + } + } + +apex ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + To configure apex AMP-like mode set: .. code-block:: json From c9e16e675f6d07db089cb4862670f686189127e6 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 8 Dec 2021 21:21:22 -0800 Subject: [PATCH 10/25] split stage_dtype; merge back to non-dtype-specific config file --- docs/source/main_classes/deepspeed.rst | 14 +- ...g_zero2_fp16.json => ds_config_zero2.json} | 4 + tests/deepspeed/ds_config_zero2_bf16.json | 45 --- ...g_zero3_fp16.json => ds_config_zero3.json} | 4 + tests/deepspeed/ds_config_zero3_bf16.json | 52 --- tests/deepspeed/test_deepspeed.py | 367 +++++++++--------- tests/deepspeed/test_model_zoo.py | 17 +- 7 files changed, 195 insertions(+), 308 deletions(-) rename tests/deepspeed/{ds_config_zero2_fp16.json => ds_config_zero2.json} (95%) delete mode 100644 tests/deepspeed/ds_config_zero2_bf16.json rename tests/deepspeed/{ds_config_zero3_fp16.json => ds_config_zero3.json} (96%) delete mode 100644 tests/deepspeed/ds_config_zero3_bf16.json diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index a243db763317a7..9430d8390c4fb7 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -206,7 +206,7 @@ Here is an example of running ``run_translation.py`` under DeepSpeed deploying a .. code-block:: bash deepspeed examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config_zero3_fp16.json \ + --deepspeed tests/deepspeed/ds_config_zero3.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ @@ -233,7 +233,7 @@ To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` comma .. 
code-block:: bash deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config_zero2_fp16.json \ + --deepspeed tests/deepspeed/ds_config_zero2.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ @@ -320,7 +320,7 @@ If you're using only 1 GPU, here is how you'd have to adjust your training code os.environ['WORLD_SIZE'] = "1" # Now proceed as normal, plus pass the deepspeed config file - training_args = TrainingArguments(..., deepspeed="ds_config_zero3_fp16.json") + training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json") trainer = Trainer(...) trainer.train() @@ -336,7 +336,7 @@ cell with: .. code-block:: python %%bash - cat <<'EOT' > ds_config_zero3_fp16.json + cat <<'EOT' > ds_config_zero3.json { "fp16": { "enabled": "auto", @@ -823,7 +823,7 @@ these help you to trade scalability for speed depending on your needs. ZeRO-2 Example +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Here is a full ZeRO-2 auto-configuration file ``ds_config_zero2_fp16.json``: +Here is a full ZeRO-2 auto-configuration file ``ds_config_zero2.json``: .. code-block:: json @@ -938,7 +938,7 @@ values look like, but we highly recommend using the one with multiple ``auto`` s ZeRO-3 Example +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Here is a full ZeRO-3 auto-configuration file ``ds_config_zero3_fp16.json``: +Here is a full ZeRO-3 auto-configuration file ``ds_config_zero3.json``: .. code-block:: json @@ -1701,7 +1701,7 @@ Here is an example of running ``run_translation.py`` under DeepSpeed deploying a .. 
code-block:: bash deepspeed examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config_zero3_fp16.json \ + --deepspeed tests/deepspeed/ds_config_zero3.json \ --model_name_or_path t5-small --output_dir output_dir \ --do_eval --max_eval_samples 50 --warmup_steps 50 \ --max_source_length 128 --val_max_target_length 128 \ diff --git a/tests/deepspeed/ds_config_zero2_fp16.json b/tests/deepspeed/ds_config_zero2.json similarity index 95% rename from tests/deepspeed/ds_config_zero2_fp16.json rename to tests/deepspeed/ds_config_zero2.json index dec097dd19887f..b16ec70ca90d56 100644 --- a/tests/deepspeed/ds_config_zero2_fp16.json +++ b/tests/deepspeed/ds_config_zero2.json @@ -8,6 +8,10 @@ "min_loss_scale": 1 }, + "bfloat16": { + "enabled": "auto" + }, + "optimizer": { "type": "AdamW", "params": { diff --git a/tests/deepspeed/ds_config_zero2_bf16.json b/tests/deepspeed/ds_config_zero2_bf16.json deleted file mode 100644 index 2dd534d0163173..00000000000000 --- a/tests/deepspeed/ds_config_zero2_bf16.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "bfloat16": { - "enabled": "auto" - }, - - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - }, - - "zero_optimization": { - "stage": 2, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "contiguous_gradients": true - }, - - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 2000, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false -} diff --git a/tests/deepspeed/ds_config_zero3_fp16.json b/tests/deepspeed/ds_config_zero3.json similarity index 96% rename from tests/deepspeed/ds_config_zero3_fp16.json rename to tests/deepspeed/ds_config_zero3.json index a80a173b7a9704..f3397da2b08387 100644 --- a/tests/deepspeed/ds_config_zero3_fp16.json +++ b/tests/deepspeed/ds_config_zero3.json @@ -8,6 +8,10 @@ "min_loss_scale": 1 }, + "bfloat16": { + "enabled": "auto" + }, + "optimizer": { "type": "AdamW", "params": { diff --git a/tests/deepspeed/ds_config_zero3_bf16.json b/tests/deepspeed/ds_config_zero3_bf16.json deleted file mode 100644 index 5e78c00189d778..00000000000000 --- a/tests/deepspeed/ds_config_zero3_bf16.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "bfloat16": { - "enabled": "auto" - }, - - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - }, - - "zero_optimization": { - "stage": 3, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "offload_param": { - "device": "cpu", - "pin_memory": true - }, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e9, - "reduce_bucket_size": "auto", - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true - }, - - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 2000, 
- "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false -} diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 682a60c05cd6f3..60fc817c2e6659 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -14,6 +14,7 @@ import dataclasses import io +import itertools import json import os import unittest @@ -65,11 +66,6 @@ def load_json(path): return json.load(f) -def stage_dtype_to_dtype(stage_dtype): - stage, dtype = stage_dtype.split("_") - return dtype - - def get_master_port(real_launcher=False): """ When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed) @@ -130,28 +126,22 @@ def get_launcher(distributed=False): FP16 = "fp16" BF16 = "bf16" -ZERO2_FP16 = "zero2_fp16" -ZERO3_FP16 = "zero3_fp16" -ZERO2_BF16 = "zero2_bf16" -ZERO3_BF16 = "zero3_bf16" +stages = [ZERO2, ZERO3] +if is_torch_bf16_available(): + dtypes = [FP16, BF16] +else: + dtypes = [FP16] -stages_zero2 = [ZERO2_FP16] -stages_zero3 = [ZERO3_FP16] -stages_fp16 = [ZERO2_FP16, ZERO3_FP16] -stages_bf16 = [] +def parameterized_custom_name_func(func, param_num, param): + # customize the test name generator function as we want both params to appear in the sub-test + # name, as by default it shows only the first param + param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args)) + return f"{func.__name__}_{param_based_name}" -if is_torch_bf16_available(): - stages_bf16 = [ZERO2_BF16, ZERO3_BF16] - # XXX: for now only zero2 is supported - # stages_bf16 = [ZERO2_BF16] - stages_zero2 += [ZERO2_BF16] - stages_zero3 += [ZERO3_BF16] -stages_all = stages_fp16 + stages_bf16 - -stages = [ZERO2, ZERO3] -dtypes = [FP16, BF16] +# Cartesian-product of zero stages with models to test +params = list(itertools.product(stages, dtypes)) @require_deepspeed @@ -170,7 +160,7 @@ def setUp(self): ) def test_init_zero3_fp16(self): - # test that zero.Init() works correctly under zero3_fp16 + # test that zero.Init() works correctly under zero3/fp16 ds_config = { "train_batch_size": 1, "zero_optimization": { @@ -240,50 +230,37 @@ def setUp(self): ) self.ds_config_file = dict( - zero2_fp16=f"{self.test_file_dir_str}/ds_config_zero2_fp16.json", - zero3_fp16=f"{self.test_file_dir_str}/ds_config_zero3_fp16.json", - zero2_bf16=f"{self.test_file_dir_str}/ds_config_zero2_bf16.json", - zero3_bf16=f"{self.test_file_dir_str}/ds_config_zero3_bf16.json", + zero2=f"{self.test_file_dir_str}/ds_config_zero2.json", + zero3=f"{self.test_file_dir_str}/ds_config_zero3.json", ) # use self.get_config_dict(stage) to use these to ensure the original is not modified - with io.open(self.ds_config_file[ZERO2_FP16], "r", encoding="utf-8") as f: - config_zero2_fp16 = json.load(f) + with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f: + config_zero2 = json.load(f) # by default use fp16 - config_zero2_fp16["fp16"]["enabled"] = True - with io.open(self.ds_config_file[ZERO3_FP16], "r", encoding="utf-8") as f: - config_zero3_fp16 = json.load(f) + # config_zero2["fp16"]["enabled"] = True + with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f: + config_zero3 = json.load(f) # by default use fp16 - config_zero3_fp16["fp16"]["enabled"] = True + # config_zero3["fp16"]["enabled"] = True # This setting slows things down, so don't enable it by default unless needed by a test. # It's in the file as a demo for users since we want everything to work out of the box even if slower. 
- config_zero3_fp16["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False - - with io.open(self.ds_config_file[ZERO2_BF16], "r", encoding="utf-8") as f: - config_zero2_bf16 = json.load(f) - # by default use fp16 - config_zero2_bf16["bfloat16"]["enabled"] = True - with io.open(self.ds_config_file[ZERO3_BF16], "r", encoding="utf-8") as f: - config_zero3_bf16 = json.load(f) - # by default use fp16 - config_zero3_bf16["bfloat16"]["enabled"] = True + config_zero3["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False self.ds_config_dict = dict( - zero2_fp16=config_zero2_fp16, - zero3_fp16=config_zero3_fp16, - zero2_bf16=config_zero2_bf16, - zero3_bf16=config_zero3_bf16, + zero2=config_zero2, + zero3=config_zero3, ) - def get_config_dict(self, stage_dtype): + def get_config_dict(self, stage): # As some tests modify the dict, always make a copy - return deepcopy(self.ds_config_dict[stage_dtype]) + return deepcopy(self.ds_config_dict[stage]) # --- These tests are enough to run on one of zero stages --- # def test_hf_ds_config_mismatch(self): - ds_config = self.get_config_dict(ZERO2_FP16) + ds_config = self.get_config_dict(ZERO2) # Purposefully configure these values to mismatch TrainingArguments values. # This currently doesn't cover all keys (but it could) @@ -342,12 +319,12 @@ def test_hf_ds_config_mismatch(self): def test_hf_scheduler_hf_optimizer(self): a = 0 with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero2_fp16_dict = self.get_config_dict(ZERO2_FP16) - del ds_config_zero2_fp16_dict["optimizer"] # force default HF Trainer optimizer - del ds_config_zero2_fp16_dict["scheduler"] # force default HF Trainer scheduler - ds_config_zero2_fp16_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" - ds_config_zero2_fp16_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_fp16_dict) + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer + del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler + ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" + ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) trainer.train() new_a = trainer.model.a.item() self.assertNotEqual(new_a, a) @@ -355,11 +332,11 @@ def test_hf_scheduler_hf_optimizer(self): def test_ds_scheduler_hf_optimizer(self): a = 0 with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero2_fp16_dict = self.get_config_dict(ZERO2_FP16) - del ds_config_zero2_fp16_dict["optimizer"] # force default HF Trainer optimizer - ds_config_zero2_fp16_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" - ds_config_zero2_fp16_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_fp16_dict) + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer + ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" + ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) trainer.train() new_a 
= trainer.model.a.item() self.assertNotEqual(new_a, a) @@ -367,11 +344,11 @@ def test_ds_scheduler_hf_optimizer(self): def test_hf_scheduler_ds_optimizer(self): a = 0 with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero2_fp16_dict = self.get_config_dict(ZERO2_FP16) - del ds_config_zero2_fp16_dict["scheduler"] # force default HF Trainer scheduler - ds_config_zero2_fp16_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" - ds_config_zero2_fp16_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_fp16_dict) + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler + ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" + ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) trainer.train() new_a = trainer.model.a.item() self.assertNotEqual(new_a, a) @@ -383,33 +360,33 @@ def test_stage3_nvme_offload(self): # runs a simple check that we can use some directory as if it were NVMe nvme_path = self.get_auto_remove_tmp_dir() nvme_config = dict(device="nvme", nvme_path=nvme_path) - ds_config_zero3_fp16_dict = self.get_config_dict(ZERO3_FP16) - ds_config_zero3_fp16_dict["zero_optimization"]["offload_optimizer"] = nvme_config - ds_config_zero3_fp16_dict["zero_optimization"]["offload_param"] = nvme_config - trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_fp16_dict) + ds_config_zero3_dict = self.get_config_dict(ZERO3) + ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config + ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config + trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict) with CaptureLogger(deepspeed_logger) as cl: trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") # --- These tests need to run on both zero stages --- # - @parameterized.expand(stages_all) - def test_hf_optimizer_with_offload(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_hf_optimizer_with_offload(self, stage, dtype): # non-DS optimizers can be used with ZERO-offload (as long as they have both CPU and GPU implementation (except LAMB)) - ds_config_dict = self.get_config_dict(stage_dtype) + ds_config_dict = self.get_config_dict(stage) del ds_config_dict["optimizer"] # force default HF Trainer optimizer # force cpu offload ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" with mockenv_context(**self.dist_env_1_gpu): kwargs = dict(local_rank=0, deepspeed=ds_config_dict) - kwargs[stage_dtype_to_dtype(stage_dtype)] = True + kwargs[dtype] = True trainer = get_regression_trainer(**kwargs) with CaptureLogger(deepspeed_logger) as cl: trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") - @parameterized.expand(stages_all) - def test_fake_notebook_no_launcher(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_fake_notebook_no_launcher(self, stage, dtype): # this setup emulates a notebook where a launcher needs to be emulated by hand # note that unittest resets sys.stdout each test, so `CaptureStd` will work here to capture @@ -417,16 
+394,16 @@ def test_fake_notebook_no_launcher(self, stage_dtype): # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger. with mockenv_context(**self.dist_env_1_gpu): - kwargs = dict(local_rank=0, deepspeed=self.get_config_dict(stage_dtype)) - kwargs[stage_dtype_to_dtype(stage_dtype)] = True + kwargs = dict(local_rank=0, deepspeed=self.get_config_dict(stage)) + kwargs[dtype] = True trainer = get_regression_trainer(**kwargs) with CaptureLogger(deepspeed_logger) as cl: trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") - @parameterized.expand(stages_all) - def test_early_get_last_lr(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_early_get_last_lr(self, stage, dtype): # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may # not run for the first few dozen steps while loss scale is too large, and thus during # that time `get_last_lr` will fail if called during that warm up stage, @@ -440,31 +417,31 @@ def test_early_get_last_lr(self, stage_dtype): b=b, local_rank=0, train_len=8, - deepspeed=self.get_config_dict(stage_dtype), + deepspeed=self.get_config_dict(stage), per_device_train_batch_size=8, logging_steps=1, ) - kwargs[stage_dtype_to_dtype(stage_dtype)] = True + kwargs[dtype] = True trainer = get_regression_trainer(**kwargs) trainer.train() post_train_a = trainer.model.a.item() - # XXX: for some reason the following check fails with zero3 - not a broken but a - # different qualitative outcome - as if optimizer did run + # XXX: for some reason the following check fails with zero3/fp16 and zero2/bf16 - not a + # broken but a different qualitative outcome - as if optimizer did run # oddly getting 1.0 for both a and b from 0.0 - there is a bug somewhere # print(trainer.model.a.item()) # print(trainer.model.b.item()) # need to investigate at some point - if stage_dtype in stages_zero3: + if (stage == ZERO3 and dtype == FP16) or (stage == ZERO2 and dtype == BF16): return # it's enough that train didn't fail for this test, but we must check that # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing) self.assertEqual(post_train_a, a) - @parameterized.expand(stages_all) - def test_gradient_accumulation(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_gradient_accumulation(self, stage, dtype): # this test measures that we get identical weights and similar loss with: # 1. per_device_train_batch_size=8, gradient_accumulation_steps=1 # 2. 
per_device_train_batch_size=4, gradient_accumulation_steps=2 @@ -485,9 +462,9 @@ def test_gradient_accumulation(self, stage_dtype): b=b, local_rank=0, train_len=train_len, - deepspeed=self.get_config_dict(stage_dtype), + deepspeed=self.get_config_dict(stage), ) - kwargs[stage_dtype_to_dtype(stage_dtype)] = True + kwargs[dtype] = True with mockenv_context(**self.dist_env_1_gpu): no_grad_accum_trainer = get_regression_trainer( @@ -522,36 +499,28 @@ def test_gradient_accumulation(self, stage_dtype): # see the note above how to get identical loss on a small bs self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2) - def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage_dtype): + def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): # adapted from TrainerIntegrationCommon.check_saved_checkpoints file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"] - if stage_dtype in stages_zero2: + if stage == ZERO2: ds_file_list = ["mp_rank_00_model_states.pt"] - elif stage_dtype in stages_zero3: + elif stage == ZERO3: ds_file_list = ["zero_pp_rank_0_mp_rank_00_model_states.pt"] else: - raise ValueError(f"unknown stage_dtype {stage_dtype}") - - # XXX: this can be recoded and then removed once we require deepspeed>0.3.13 - from packaging import version + raise ValueError(f"unknown stage {stage}") - import deepspeed - - if version.parse(deepspeed.__version__) > version.parse("0.3.13"): - ds_file_list.append("zero_pp_rank_0_mp_rank_00_optim_states.pt") - else: - ds_file_list.append("zero_pp_rank_0_mp_rank_00optim_states.pt") + ds_file_list.append("zero_pp_rank_0_mp_rank_00_optim_states.pt") for step in range(freq, total, freq): checkpoint = os.path.join(output_dir, f"checkpoint-{step}") - self.assertTrue(os.path.isdir(checkpoint), f"[{stage_dtype}] {checkpoint} dir is not found") + self.assertTrue(os.path.isdir(checkpoint), f"[{stage}] {checkpoint} dir is not found") # common files for filename in file_list: path = os.path.join(checkpoint, filename) - self.assertTrue(os.path.isfile(path), f"[{stage_dtype}] {path} is not found") + self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found") # ds files ds_path = os.path.join(checkpoint, f"global_step{step}") @@ -559,18 +528,19 @@ def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage_dtype # filename = os.path.join(path, filename) # print(filename) path = os.path.join(ds_path, filename) - self.assertTrue(os.path.isfile(path), f"[{stage_dtype}] {path} is not found") + self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found") - @parameterized.expand(stages_all) - def test_save_checkpoints(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_save_checkpoints(self, stage, dtype): # adapted from TrainerIntegrationTest.test_save_checkpoints freq = 5 output_dir = self.get_auto_remove_tmp_dir() - ds_config_dict = self.get_config_dict(stage_dtype) - if stage_dtype in stages_fp16: + ds_config_dict = self.get_config_dict(stage) + if dtype == FP16: ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - if stage_dtype == ZERO3_FP16: + # XXX: + if stage == ZERO3: # and dtype == FP16: ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True # save checkpoints @@ -580,21 +550,21 @@ def test_save_checkpoints(self, stage_dtype): save_steps=freq, deepspeed=ds_config_dict, ) - kwargs[stage_dtype_to_dtype(stage_dtype)] = True + 
kwargs[dtype] = True trainer = get_regression_trainer(**kwargs) trainer.train() total = int(self.n_epochs * 64 / self.batch_size) - self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage_dtype) + self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage) - @parameterized.expand(stages_all) - def test_can_resume_training_errors(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_can_resume_training_errors(self, stage, dtype): with mockenv_context(**self.dist_env_1_gpu): - ds_config_dict = self.get_config_dict(stage_dtype) + ds_config_dict = self.get_config_dict(stage) output_dir = self.get_auto_remove_tmp_dir() kwargs = dict(output_dir=output_dir, deepspeed=ds_config_dict) - kwargs[stage_dtype_to_dtype(stage_dtype)] = True + kwargs[dtype] = True trainer = get_regression_trainer(**kwargs) # 1. fail to find any checkpoint - due a fresh output_dir @@ -613,19 +583,20 @@ def test_can_resume_training_errors(self, stage_dtype): "Can't find a valid checkpoint at" in str(context.exception), f"got exception: {context.exception}" ) - @parameterized.expand(stages_all) - def test_can_resume_training_normal(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_can_resume_training_normal(self, stage, dtype): # adapted from TrainerIntegrationTest.test_can_resume_training # test normal resume for each stage separately, error-handling is tested in a different test - output_dir = self.get_auto_remove_tmp_dir() - ds_config_dict = self.get_config_dict(stage_dtype) - if stage_dtype in stages_fp16: + output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False) + ds_config_dict = self.get_config_dict(stage) + if dtype == FP16: ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - if stage_dtype == ZERO3_FP16: + # XXX: + if stage == ZERO3: # and dtype == FP16: ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict) - kwargs[stage_dtype_to_dtype(stage_dtype)] = True + kwargs[dtype] = True with mockenv_context(**self.dist_env_1_gpu): trainer = get_regression_trainer(**kwargs) @@ -663,13 +634,13 @@ def test_can_resume_training_normal(self, stage_dtype): # trainer.train(resume_from_checkpoint=checkpoint) # a workaround needs to be used that re-creates the deepspeed engine - @parameterized.expand(stages_all) - def test_load_state_dict_from_zero_checkpoint(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_load_state_dict_from_zero_checkpoint(self, stage, dtype): # test that we can load fp32 weights directly from the zero checkpoint into the current model output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False, before=False) - ds_config_dict = self.get_config_dict(stage_dtype) + ds_config_dict = self.get_config_dict(stage) kwargs = dict( output_dir=output_dir, @@ -681,7 +652,7 @@ def test_load_state_dict_from_zero_checkpoint(self, stage_dtype): learning_rate=0.1, deepspeed=ds_config_dict, ) - kwargs[stage_dtype_to_dtype(stage_dtype)] = True + kwargs[dtype] = True with mockenv_context(**self.dist_env_1_gpu): trainer = get_regression_trainer(**kwargs) @@ -704,20 +675,20 @@ def test_config_object(self): output_dir = self.get_auto_remove_tmp_dir() kwargs = dict(output_dir=output_dir, train_len=8, fp16=True) - ds_config_zero3_fp16_dict = 
self.get_config_dict(ZERO3_FP16) - ds_config_zero2_fp16_dict = self.get_config_dict(ZERO2_FP16) + ds_config_zero3_dict = self.get_config_dict(ZERO3) + ds_config_zero2_dict = self.get_config_dict(ZERO2) with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(deepspeed=ds_config_zero3_fp16_dict, **kwargs) + trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) self.assertTrue(is_deepspeed_zero3_enabled()) # test we can repeat that and with train this time - trainer = get_regression_trainer(deepspeed=ds_config_zero3_fp16_dict, **kwargs) + trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) trainer.train() self.assertTrue(is_deepspeed_zero3_enabled()) # test zero3 is disabled - trainer = get_regression_trainer(deepspeed=ds_config_zero2_fp16_dict, **kwargs) + trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs) self.assertFalse(is_deepspeed_zero3_enabled()) # check config obj @@ -754,26 +725,28 @@ class TestDeepSpeedWithLauncher(TestCasePlus): # @require_torch_multi_gpu - @parameterized.expand(stages_all) - def test_basic_distributed(self, stage_dtype): - self.run_and_check(stage_dtype=stage_dtype, distributed=True) + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_basic_distributed(self, stage, dtype): + self.run_and_check(stage=stage, dtype=dtype, distributed=True) def test_do_eval_no_train(self): # testing only zero3 since zero2 makes no sense with inference self.run_and_check( - stage_dtype=ZERO3_FP16, + stage=ZERO3, + dtype=FP16, eval_steps=1, distributed=False, do_train=False, do_eval=True, ) - @parameterized.expand(stages_all) - def test_fp32_non_distributed(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_fp32_non_distributed(self, stage, dtype): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just basic completion checks are done self.run_and_check( - stage_dtype=stage_dtype, + stage=stage, + dtype=dtype, model_name=T5_TINY, distributed=False, do_train=True, @@ -783,12 +756,13 @@ def test_fp32_non_distributed(self, stage_dtype): ) @require_torch_multi_gpu - @parameterized.expand(stages_all) - def test_fp32_distributed(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_fp32_distributed(self, stage, dtype): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just basic completion checks are done self.run_and_check( - stage_dtype=stage_dtype, + stage=stage, + dtype=dtype, model_name=T5_TINY, distributed=True, do_train=True, @@ -797,14 +771,14 @@ def test_fp32_distributed(self, stage_dtype): fp32=True, ) - @parameterized.expand(stages_all) - def test_resume_train_not_from_ds_checkpoint(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_resume_train_not_from_ds_checkpoint(self, stage, dtype): # do normal training and then resume not from the deepspeed checkpoint but explicitly from # the saved model dir do_train = True do_eval = False - kwargs = dict(stage_dtype=stage_dtype, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval) + kwargs = dict(stage=stage, dtype=dtype, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval) # 1. 
normal training output_dir = self.run_and_check(**kwargs) @@ -816,13 +790,14 @@ def test_resume_train_not_from_ds_checkpoint(self, stage_dtype): self.do_checks(output_dir, do_train=do_train, do_eval=do_eval) @require_torch_multi_gpu - @parameterized.expand(["fp16", "fp32"]) + @parameterized.expand(["bf16", "fp16", "fp32"]) def test_inference(self, dtype): # this is just inference, so no optimizer should be loaded # it only works for z3 (makes no sense with z1-z2) fp32 = True if dtype == "fp32" else False self.run_and_check( - stage_dtype=ZERO3_FP16, + stage=ZERO3, + dtype=FP16, model_name=T5_TINY, distributed=True, do_train=False, @@ -848,7 +823,8 @@ def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True # XXX: need to do better validation beyond just that the run was successful def run_and_check( self, - stage_dtype, + stage, + dtype, model_name: str = T5_SMALL, eval_steps: int = 10, distributed: bool = True, @@ -862,7 +838,8 @@ def run_and_check( # we are doing quality testing so using a small real model output_dir = self.run_trainer( - stage_dtype=stage_dtype, + stage=stage, + dtype=dtype, model_name=model_name, eval_steps=eval_steps, num_train_epochs=1, @@ -880,7 +857,8 @@ def run_and_check( def run_trainer( self, - stage_dtype: str, + stage: str, + dtype: str, model_name: str, eval_steps: int = 10, num_train_epochs: int = 1, @@ -916,7 +894,6 @@ def run_trainer( args.extend(["--source_prefix", '"translate English to Romanian: "']) if not fp32: - dtype = stage_dtype_to_dtype(stage_dtype) args.extend([f"--{dtype}"]) actions = 0 @@ -952,7 +929,7 @@ def run_trainer( remove_args = remove_args_str.split() args = [x for x in args if x not in remove_args] - ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage_dtype}.json".split() + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"] launcher = get_launcher(distributed) @@ -963,8 +940,8 @@ def run_trainer( return output_dir - @parameterized.expand(stages_all) - def test_clm(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_clm(self, stage, dtype): # this test exercises model.resize_token_embeddings() which requires param gathering outside # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py` @@ -985,54 +962,22 @@ def test_clm(self, stage_dtype): --num_train_epochs 1 --warmup_steps 8 --block_size 64 - --fp16 --report_to none """.split() - ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage_dtype}.json".split() - script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"] - launcher = get_launcher(distributed=True) - - cmd = launcher + script + args + ds_args - # keep for quick debug - # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die - execute_subprocess_async(cmd, env=self.get_env()) - - def test_clm_from_config_zero3_fp16(self): - # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called - - data_dir = self.tests_dir / "fixtures" - output_dir = self.get_auto_remove_tmp_dir() - args = f""" - --model_type gpt2 - --tokenizer_name {GPT2_TINY} - --train_file {data_dir}/sample_text.txt - --validation_file {data_dir}/sample_text.txt - --output_dir {output_dir} - --overwrite_output_dir - --do_train - --max_train_samples 4 - --per_device_train_batch_size 2 - --num_train_epochs 1 - --warmup_steps 8 - --block_size 8 - --fp16 - --report_to none - """.split() + 
args.extend([f"--{dtype}"]) - ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3_fp16.json".split() + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"] launcher = get_launcher(distributed=True) cmd = launcher + script + args + ds_args # keep for quick debug # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die - with CaptureStderr() as cs: - execute_subprocess_async(cmd, env=self.get_env()) - assert "Detected DeepSpeed ZeRO-3" in cs.err + execute_subprocess_async(cmd, env=self.get_env()) - @parameterized.expand(stages_all) - def test_load_best_model(self, stage_dtype): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_load_best_model(self, stage, dtype): # this test exercises --load_best_model_at_end - the key is being able to resume after some training data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" @@ -1060,12 +1005,13 @@ def test_load_best_model(self, stage_dtype): --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --num_train_epochs 1 - --fp16 --report_to none """.split() args.extend(["--source_prefix", "translate English to Romanian: "]) - ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage_dtype}.json".split() + args.extend([f"--{dtype}"]) + + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"] launcher = get_launcher(distributed=False) @@ -1076,3 +1022,36 @@ def test_load_best_model(self, stage_dtype): execute_subprocess_async(cmd, env=self.get_env()) # enough to test deespeed was invoked and it didn't fail assert "DeepSpeed info" in cs.out + + def test_clm_from_config_zero3_fp16(self): + # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called + + data_dir = self.tests_dir / "fixtures" + output_dir = self.get_auto_remove_tmp_dir() + args = f""" + --model_type gpt2 + --tokenizer_name {GPT2_TINY} + --train_file {data_dir}/sample_text.txt + --validation_file {data_dir}/sample_text.txt + --output_dir {output_dir} + --overwrite_output_dir + --do_train + --max_train_samples 4 + --per_device_train_batch_size 2 + --num_train_epochs 1 + --warmup_steps 8 + --block_size 8 + --fp16 + --report_to none + """.split() + + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3.json".split() + script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"] + launcher = get_launcher(distributed=True) + + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + with CaptureStderr() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + assert "Detected DeepSpeed ZeRO-3" in cs.err diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index 5cd4a3f131e3f5..de65ef8760dcfe 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -207,16 +207,13 @@ def make_task_cmds(): FP16 = "fp16" BF16 = "bf16" -ZERO2_FP16 = "zero2_fp16" -ZERO3_FP16 = "zero3_fp16" -ZERO2_BF16 = "zero2_bf16" -ZERO3_BF16 = "zero3_bf16" - -stages_fp16 = [ZERO2_FP16, ZERO3_FP16] -stages_bf16 = [ZERO2_BF16, ZERO3_BF16] - stages = [ZERO2, ZERO3] -dtypes = [FP16, BF16] + +# for now test just fp16, as these tests are quite slow +# dtypes = [FP16] +# so just hardcoding --fp16 for now +# if is_torch_bf16_available(): +# dtypes += [BF16] def 
parameterized_custom_name_func(func, param_num, param): @@ -227,7 +224,7 @@ def parameterized_custom_name_func(func, param_num, param): # Cartesian-product of zero stages with models to test -params = list(itertools.product(stages_fp16, task_cmds.keys())) +params = list(itertools.product(stages, task_cmds.keys())) @slow From 3dec8fdf0e188b0e1e966661cb3ef2a2825eff82 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 8 Dec 2021 21:22:26 -0800 Subject: [PATCH 11/25] fix doc --- docs/source/main_classes/deepspeed.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index 9430d8390c4fb7..f97f692a3583e2 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -1331,7 +1331,7 @@ If bf16 is desired instead of fp16 then the following configuration section is t { "bfloat16": { - "enabled": auto, + "enabled": auto } } @@ -1345,7 +1345,7 @@ You can also enable/disable this mode explicitly: { "bfloat16": { - "enabled": true, + "enabled": true } } From ac138b836eafcfffd33aa849b2a0fd704bf4b137 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 8 Dec 2021 21:24:39 -0800 Subject: [PATCH 12/25] cleanup --- tests/deepspeed/test_model_zoo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index de65ef8760dcfe..456ae143c6ae9c 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -204,12 +204,13 @@ def make_task_cmds(): ZERO2 = "zero2" ZERO3 = "zero3" -FP16 = "fp16" -BF16 = "bf16" - stages = [ZERO2, ZERO3] +# future preparation: # for now test just fp16, as these tests are quite slow +# FP16 = "fp16" +# BF16 = "bf16" +# # dtypes = [FP16] # so just hardcoding --fp16 for now # if is_torch_bf16_available(): From 8b516e3797ff06db3ffbae5a40017b1074b2640d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 8 Dec 2021 21:26:19 -0800 Subject: [PATCH 13/25] cleanup --- docs/source/main_classes/deepspeed.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index f97f692a3583e2..82060371874b1f 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -1278,11 +1278,10 @@ Automatic Mixed Precision You can use automatic mixed precision with either a pytorch-like AMP way or the apex-like way: -fp16 / float16 +fp16 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -To configure pytorch AMP-like mode set: +To configure pytorch AMP-like mode with fp16 (float16) set: .. code-block:: json @@ -1322,10 +1321,10 @@ configuration. Here is the `documentation `__. -bf16 / bfloat16 +bf16 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -If bf16 is desired instead of fp16 then the following configuration section is to be used: +If bf16 (bfloat16) is desired instead of fp16 then the following configuration section is to be used: .. 
code-block:: json

From d48f68ed38e40d11bd2d39a11170a335cb7cc999 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 18 Jan 2022 17:59:01 -0800
Subject: [PATCH 14/25] bfloat16 => bf16 to match the PR changes

---
 docs/source/main_classes/deepspeed.mdx | 4 ++--
 src/transformers/deepspeed.py          | 2 +-
 tests/deepspeed/ds_config_zero2.json   | 2 +-
 tests/deepspeed/ds_config_zero3.json   | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/source/main_classes/deepspeed.mdx b/docs/source/main_classes/deepspeed.mdx
index 9dd3dbab503e86..c59c32b80cd1ef 100644
--- a/docs/source/main_classes/deepspeed.mdx
+++ b/docs/source/main_classes/deepspeed.mdx
@@ -1279,7 +1279,7 @@ If bf16 (bfloat16) is desired instead of fp16 then the following configuration s

 ```json
 {
-    "bfloat16": {
+    "bf16": {
         "enabled": "auto"
     }
 }
@@ -1293,7 +1293,7 @@ You can also enable/disable this mode explicitly:

 ```json
 {
-    "bfloat16": {
+    "bf16": {
         "enabled": true
     }
 }
diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py
index 1c22caeb28b719..aab8e558b47d47 100644
--- a/src/transformers/deepspeed.py
+++ b/src/transformers/deepspeed.py
@@ -248,7 +248,7 @@ def trainer_config_process(self, args):
         self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
         self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")

-        self.fill_match("bfloat16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")
+        self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")

         # deepspeed's default mode is fp16 unless there is a config that says differently
         if self.is_true("bf16.enabled"):
diff --git a/tests/deepspeed/ds_config_zero2.json b/tests/deepspeed/ds_config_zero2.json
index b16ec70ca90d56..6f0a546e51614d 100644
--- a/tests/deepspeed/ds_config_zero2.json
+++ b/tests/deepspeed/ds_config_zero2.json
@@ -8,7 +8,7 @@
         "min_loss_scale": 1
     },

-    "bfloat16": {
+    "bf16": {
         "enabled": "auto"
     },

diff --git a/tests/deepspeed/ds_config_zero3.json b/tests/deepspeed/ds_config_zero3.json
index f3397da2b08387..93eda91966d777 100644
--- a/tests/deepspeed/ds_config_zero3.json
+++ b/tests/deepspeed/ds_config_zero3.json
@@ -8,7 +8,7 @@
         "min_loss_scale": 1
     },

-    "bfloat16": {
+    "bf16": {
         "enabled": "auto"
     },

From 0fca05723bdcdc6b512143df7e301cc38d558faa Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 18 Jan 2022 18:17:19 -0800
Subject: [PATCH 15/25] s/zero_gather_fp16_weights_on_model_save/zero_gather_16bit_weights_on_model_save/;
 s/save_fp16_model/save_16bit_model/

---
 docs/source/main_classes/deepspeed.mdx     | 16 ++++++++--------
 .../wav2vec2/ds_config_wav2vec2_zero3.json |  2 +-
 src/transformers/trainer.py                |  6 +++---
 tests/deepspeed/ds_config_zero3.json       |  2 +-
 tests/deepspeed/test_deepspeed.py          |  6 +++---
 5 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/docs/source/main_classes/deepspeed.mdx b/docs/source/main_classes/deepspeed.mdx
index c59c32b80cd1ef..bdc5c6a4a7d99f 100644
--- a/docs/source/main_classes/deepspeed.mdx
+++ b/docs/source/main_classes/deepspeed.mdx
@@ -367,7 +367,7 @@ cat <<'EOT' > ds_config_zero3.json
         "stage3_param_persistence_threshold": "auto",
         "stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true + "stage3_gather_16bit_weights_on_model_save": true } } ``` @@ -680,7 +680,7 @@ The following configuration values depend on the model's hidden size: therefore set these values to `auto` and the [`Trainer`] will automatically assign the recommended values. But, of course, feel free to set these explicitly as well. -`stage3_gather_fp16_weights_on_model_save` enables model fp16 weights consolidation when model gets saved. With large +`stage3_gather_16bit_weights_on_model_save` enables model fp16 weights consolidation when model gets saved. With large models and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if you plan to resume the training. Watch out for future updates that will remove this limitation and make things more flexible. @@ -749,7 +749,7 @@ The following configuration example enables NVMe to offload both optimizer state "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true + "stage3_gather_16bit_weights_on_model_save": true }, } ``` @@ -955,7 +955,7 @@ Here is a full ZeRO-3 auto-configuration file `ds_config_zero3.json`: "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true + "stage3_gather_16bit_weights_on_model_save": true }, "gradient_accumulation_steps": "auto", @@ -1018,7 +1018,7 @@ values look like, but we highly recommend using the one with multiple `auto` set "stage3_param_persistence_threshold": 1e4, "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true + "stage3_gather_16bit_weights_on_model_save": true }, "steps_per_print": 2000, @@ -1432,7 +1432,7 @@ When a model is saved under ZeRO-2, you end up having the normal `pytorch_model. they are only the fp16 version of the weights. Under ZeRO-3, things are much more complicated, since the model weights are partitioned out over multiple GPUs, -therefore `"stage3_gather_fp16_weights_on_model_save": true` is required to get the `Trainer` to save the fp16 +therefore `"stage3_gather_16bit_weights_on_model_save": true` is required to get the `Trainer` to save the fp16 version of the weights. If this setting is `False` ``pytorch_model.bin` won't be created. This is because by default DeepSpeed's `state_dict` contains a placeholder and not the real weights. If we were to save this `state_dict`` it won't be possible to load it back. @@ -1440,7 +1440,7 @@ won't be possible to load it back. 
```json { "zero_optimization": { - "stage3_gather_fp16_weights_on_model_save": true + "stage3_gather_16bit_weights_on_model_save": true } } ``` diff --git a/examples/research_projects/wav2vec2/ds_config_wav2vec2_zero3.json b/examples/research_projects/wav2vec2/ds_config_wav2vec2_zero3.json index a80a173b7a9704..1beb972ba89504 100644 --- a/examples/research_projects/wav2vec2/ds_config_wav2vec2_zero3.json +++ b/examples/research_projects/wav2vec2/ds_config_wav2vec2_zero3.json @@ -45,7 +45,7 @@ "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true + "stage3_gather_16bit_weights_on_model_save": true }, "gradient_accumulation_steps": "auto", diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4b97f1778b4659..05eda1ebf5237d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1637,7 +1637,7 @@ def _save_checkpoint(self, model, trial, metrics=None): self.save_model(output_dir) if self.deepspeed: # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed - # config `stage3_gather_fp16_weights_on_model_save` is True + # config `stage3_gather_16bit_weights_on_model_save` is True self.deepspeed.save_checkpoint(output_dir) # Save optimizer and scheduler @@ -2043,10 +2043,10 @@ def save_model(self, output_dir: Optional[str] = None): # logger.info(f"deepspeed zero3: removing {file}, see zero_to_fp32.py to recover weights") os.remove(file) - # now save the real model if stage3_gather_fp16_weights_on_model_save=True + # now save the real model if stage3_gather_16bit_weights_on_model_save=True # if false it will not be saved. # This must be called on all ranks - self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME) + self.deepspeed.save_16bit_model(output_dir, WEIGHTS_NAME) elif self.args.should_save: self._save(output_dir) diff --git a/tests/deepspeed/ds_config_zero3.json b/tests/deepspeed/ds_config_zero3.json index 93eda91966d777..4d7a154c9b0d6f 100644 --- a/tests/deepspeed/ds_config_zero3.json +++ b/tests/deepspeed/ds_config_zero3.json @@ -49,7 +49,7 @@ "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true + "stage3_gather_16bit_weights_on_model_save": true }, "gradient_accumulation_steps": "auto", diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 60fc817c2e6659..fa25b2f162a982 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -245,7 +245,7 @@ def setUp(self): # config_zero3["fp16"]["enabled"] = True # This setting slows things down, so don't enable it by default unless needed by a test. # It's in the file as a demo for users since we want everything to work out of the box even if slower. 
- config_zero3["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False + config_zero3["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False self.ds_config_dict = dict( zero2=config_zero2, @@ -541,7 +541,7 @@ def test_save_checkpoints(self, stage, dtype): ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step # XXX: if stage == ZERO3: # and dtype == FP16: - ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True + ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True # save checkpoints with mockenv_context(**self.dist_env_1_gpu): @@ -593,7 +593,7 @@ def test_can_resume_training_normal(self, stage, dtype): ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step # XXX: if stage == ZERO3: # and dtype == FP16: - ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True + ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict) kwargs[dtype] = True From 41176e4c0245baf3b9ccd9f43fb557c4341bd2db Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 18 Jan 2022 18:45:31 -0800 Subject: [PATCH 16/25] test fixes/skipping --- tests/deepspeed/test_deepspeed.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index fa25b2f162a982..19d24a1807c56a 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -427,13 +427,13 @@ def test_early_get_last_lr(self, stage, dtype): trainer.train() post_train_a = trainer.model.a.item() - # XXX: for some reason the following check fails with zero3/fp16 and zero2/bf16 - not a + # XXX: for some reason the following check fails with zero3/fp16 and any/bf16 - not a # broken but a different qualitative outcome - as if optimizer did run # oddly getting 1.0 for both a and b from 0.0 - there is a bug somewhere # print(trainer.model.a.item()) # print(trainer.model.b.item()) # need to investigate at some point - if (stage == ZERO3 and dtype == FP16) or (stage == ZERO2 and dtype == BF16): + if (stage == ZERO3 and dtype == FP16) or (dtype == BF16): return # it's enough that train didn't fail for this test, but we must check that @@ -792,6 +792,9 @@ def test_resume_train_not_from_ds_checkpoint(self, stage, dtype): @require_torch_multi_gpu @parameterized.expand(["bf16", "fp16", "fp32"]) def test_inference(self, dtype): + if dtype == "bf16" and not is_torch_bf16_available(): + self.skipTest("test requires bfloat16 hardware support") + # this is just inference, so no optimizer should be loaded # it only works for z3 (makes no sense with z1-z2) fp32 = True if dtype == "fp32" else False From 9c20180dfe61383649f80e3284ca8abd1e42dd03 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 10 Feb 2022 16:02:31 -0800 Subject: [PATCH 17/25] move --- tests/deepspeed/test_deepspeed.py | 68 ++++++++++++++++--------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 19d24a1807c56a..d4b8d6b35a999c 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -979,6 +979,41 @@ def test_clm(self, stage, dtype): # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die execute_subprocess_async(cmd, env=self.get_env()) + + 
+    def test_clm_from_config_zero3_fp16(self):
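        # [editor's note] a hedged sketch, not part of the patch, of the code path this
        # test exercises: building the model from a config object rather than from
        # pretrained weights, which under ZeRO-3 must happen inside zero.Init() so that
        # parameters are sharded at creation time:
        #
        #   from transformers import AutoConfig, AutoModel
        #   config = AutoConfig.from_pretrained("gpt2")
        #   model = AutoModel.from_config(config)  # with a ZeRO-3 deepspeed config active,
        #                                          # this is expected to run under zero.Init()
        #
        # run_clm.py takes this branch when --model_type is given without
        # --model_name_or_path, and the stderr assertion at the end of this test
        # confirms that ZeRO-3 was actually engaged.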
self.assertIn("Detected DeepSpeed ZeRO-3", cs.err) - @parameterized.expand(params, name_func=parameterized_custom_name_func) def test_load_best_model(self, stage, dtype): # this test exercises --load_best_model_at_end - the key is being able to resume after some training From 4ea8cab1c819e806243dbc6e298e1b0092288540 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 11 Feb 2022 10:49:10 -0800 Subject: [PATCH 19/25] Update docs/source/main_classes/deepspeed.mdx Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/main_classes/deepspeed.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/main_classes/deepspeed.mdx b/docs/source/main_classes/deepspeed.mdx index c92c06ad679563..1c683734e39d35 100644 --- a/docs/source/main_classes/deepspeed.mdx +++ b/docs/source/main_classes/deepspeed.mdx @@ -1251,7 +1251,7 @@ To configure pytorch AMP-like mode with fp16 (float16) set: and the [`Trainer`] will automatically enable or disable it based on the value of `args.fp16_backend`. The rest of config values are up to you. -This mode gets enabled when ``--fp16 --fp16_backend amp`` or ``--fp16_full_eval`` command line args are passed. +This mode gets enabled when `--fp16 --fp16_backend amp` or `--fp16_full_eval` command line args are passed. You can also enable/disable this mode explicitly: From b8df8475a6b236ef5bc3556c7660256e9b6ae813 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 11 Feb 2022 19:09:25 -0800 Subject: [PATCH 20/25] backticks --- docs/source/main_classes/deepspeed.mdx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/source/main_classes/deepspeed.mdx b/docs/source/main_classes/deepspeed.mdx index c92c06ad679563..60a25a4c5241b2 100644 --- a/docs/source/main_classes/deepspeed.mdx +++ b/docs/source/main_classes/deepspeed.mdx @@ -1251,7 +1251,7 @@ To configure pytorch AMP-like mode with fp16 (float16) set: and the [`Trainer`] will automatically enable or disable it based on the value of `args.fp16_backend`. The rest of config values are up to you. -This mode gets enabled when ``--fp16 --fp16_backend amp`` or ``--fp16_full_eval`` command line args are passed. +This mode gets enabled when `--fp16 --fp16_backend amp` or `--fp16_full_eval` command line args are passed. You can also enable/disable this mode explicitly: @@ -1287,7 +1287,7 @@ If bf16 (bfloat16) is desired instead of fp16 then the following configuration s bf16 has the same dynamic range as fp32 and thus doesn't require loss scaling. -This mode gets enabled when ``--bf16`` or ``--bf16_full_eval`` command line args are passed. +This mode gets enabled when `--bf16` or `--bf16_full_eval` command line args are passed. You can also enable/disable this mode explicitly: @@ -1433,8 +1433,7 @@ they are only the fp16 version of the weights. Under ZeRO-3, things are much more complicated, since the model weights are partitioned out over multiple GPUs, therefore `"stage3_gather_16bit_weights_on_model_save": true` is required to get the `Trainer` to save the fp16 -version of the weights. If this setting is `False` ``pytorch_model.bin` won't be created. This is because by default DeepSpeed's `state_dict` contains a placeholder and not the real weights. If we were to save this `state_dict`` it -won't be possible to load it back. +version of the weights. If this setting is `False` `pytorch_model.bin` won't be created. This is because by default DeepSpeed's `state_dict` contains a placeholder and not the real weights. 
+If we were to save this `state_dict` it wouldn't be possible to load it back.
config_zero3["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False @@ -540,7 +536,7 @@ def test_save_checkpoints(self, stage, dtype): if dtype == FP16: ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step # XXX: - if stage == ZERO3: # and dtype == FP16: + if stage == ZERO3: ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True # save checkpoints @@ -592,7 +588,7 @@ def test_can_resume_training_normal(self, stage, dtype): if dtype == FP16: ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step # XXX: - if stage == ZERO3: # and dtype == FP16: + if stage == ZERO3: ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict) From 8529f872a2746853f70c423897264d910610ef47 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 11 Feb 2022 19:17:04 -0800 Subject: [PATCH 23/25] cleanup --- src/transformers/deepspeed.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 37e71380ae1909..993cf5d3996a16 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -258,7 +258,6 @@ def trainer_config_process(self, args): else: self._dtype = torch.float16 - def trainer_config_finalize(self, args, model, num_training_steps): """ This stage is run after we have the model and know num_training_steps. From 7cb6455de9d267f4caa562b7a283d2080b2316a1 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 11 Mar 2022 16:12:11 -0800 Subject: [PATCH 24/25] new version --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9b36c771cd08d3..343bea3acfd733 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ "cookiecutter==1.7.2", "dataclasses", "datasets", - "deepspeed>=0.5.9", + "deepspeed>=0.6.0", "fairscale>0.3", "faiss-cpu", "fastapi", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 8131c6f5e99935..1ffaa15036452e 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -8,7 +8,7 @@ "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>=0.5.9", + "deepspeed": "deepspeed>=0.6.0", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", "fastapi": "fastapi", From 75fc6ceaa57dfe27af41a762472455addbad0b37 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 11 Mar 2022 17:16:49 -0800 Subject: [PATCH 25/25] add note about grad accum in bf16 --- docs/source/main_classes/deepspeed.mdx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/main_classes/deepspeed.mdx b/docs/source/main_classes/deepspeed.mdx index a6b6394f19c654..863cab408cc425 100644 --- a/docs/source/main_classes/deepspeed.mdx +++ b/docs/source/main_classes/deepspeed.mdx @@ -1310,6 +1310,14 @@ You can also enable/disable this mode explicitly: } ``` + + +As of `deepspeed==0.6.0` the bf16 support is new and experimental. + +If you use [gradient accumulation](#gradient-accumulation) with bf16-enabled, you need to be aware that it'll accumulate gradients in bf16, which may not be what you want due to this format's low precision, as it may lead to a lossy accumulation. + + + ### apex