
Commit

Fix DeepSpeed config validation error by changing `stage3_prefetch_bucket_size` value to an integer (#2814)
adk9 authored Jun 6, 2024
1 parent 30cb7ec commit bad2ce4
Showing 3 changed files with 4 additions and 4 deletions.
2 changes: 1 addition & 1 deletion docs/source/usage_guides/deepspeed.md
@@ -433,7 +433,7 @@ Only the `auto` fields specified in above examples are handled by `prepare` meth
The `auto` values are calculated as:

- `reduce_bucket_size`: `hidden_size * hidden_size`
- - `stage3_prefetch_bucket_size`: `0.9 * hidden_size * hidden_size`
+ - `stage3_prefetch_bucket_size`: `int(0.9 * hidden_size * hidden_size)`
- `stage3_param_persistence_threshold`: `10 * hidden_size`

For the `auto` feature to work for these 3 config entries - Accelerate will use `model.config.hidden_size` or `max(model.config.hidden_sizes)` as `hidden_size`. If neither of these is available, the launching will fail and you will have to set these 3 config entries manually. Remember the first 2 config entries are the communication buffers - the larger they are the more efficient the comms will be, and the larger they are the more GPU memory they will consume, so it's a tunable performance trade-off.
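For reference, the three `auto` formulas above are plain arithmetic on `hidden_size`. Below is a minimal Python sketch (the helper name is illustrative, not part of Accelerate) showing the values that would be filled in, including the `int()` cast on the prefetch bucket size that this commit introduces.

```python
def autofill_zero3_comm_buffers(hidden_size: int) -> dict:
    # Mirrors the documented `auto` formulas; note the int() cast on
    # stage3_prefetch_bucket_size, which is the point of this commit.
    return {
        "reduce_bucket_size": hidden_size * hidden_size,
        "stage3_prefetch_bucket_size": int(0.9 * hidden_size * hidden_size),
        "stage3_param_persistence_threshold": 10 * hidden_size,
    }

# Example: hidden_size = 768 yields 530841 for the prefetch bucket (an int),
# whereas the old formula produced the float 530841.6 -- the kind of
# non-integer value behind the DeepSpeed config validation error this commit fixes.
print(autofill_zero3_comm_buffers(768))
```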
2 changes: 1 addition & 1 deletion src/accelerate/accelerator.py
@@ -1706,7 +1706,7 @@ def _prepare_deepspeed(self, *args):
config_kwargs.update(
    {
        "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
-       "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
+       "zero_optimization.stage3_prefetch_bucket_size": int(0.9 * hidden_size * hidden_size),
        "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
    }
)
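The docs change above notes that Accelerate derives `hidden_size` from `model.config.hidden_size` or `max(model.config.hidden_sizes)` and fails the launch if neither is available. A minimal sketch of that lookup, assuming a Transformers-style `model.config`; the attribute checks and error message are illustrative, not the exact Accelerate source:

```python
def resolve_hidden_size(model) -> int:
    # Follow the documented lookup order: hidden_size, then max(hidden_sizes).
    config = model.config
    if hasattr(config, "hidden_size"):
        return config.hidden_size
    if hasattr(config, "hidden_sizes"):
        return max(config.hidden_sizes)
    # Neither attribute exists: per the docs, the three ZeRO-3 comm-buffer
    # entries must then be set manually instead of using "auto".
    raise ValueError(
        "Could not infer hidden_size from model.config; set reduce_bucket_size, "
        "stage3_prefetch_bucket_size and stage3_param_persistence_threshold manually."
    )
```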
4 changes: 2 additions & 2 deletions tests/deepspeed/test_deepspeed.py
@@ -638,7 +638,7 @@ def test_autofill_dsconfig(self):

assert config["gradient_clipping"] == 1.0
assert config["zero_optimization"]["reduce_bucket_size"] == (hidden_size * hidden_size)
assert config["zero_optimization"]["stage3_prefetch_bucket_size"] == ((0.9 * hidden_size) * hidden_size)
assert config["zero_optimization"]["stage3_prefetch_bucket_size"] == int((0.9 * hidden_size) * hidden_size)
assert config["zero_optimization"]["stage3_param_persistence_threshold"] == (10 * hidden_size)
assert not config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"]

@@ -695,7 +695,7 @@ def test_autofill_comm_buffers_dsconfig(self, model_type):
)
zero_opt = accelerator.deepspeed_config["zero_optimization"]
assert zero_opt["reduce_bucket_size"] == (hidden_size * hidden_size)
assert zero_opt["stage3_prefetch_bucket_size"] == (0.9 * hidden_size) * hidden_size
assert zero_opt["stage3_prefetch_bucket_size"] == int((0.9 * hidden_size) * hidden_size)
assert zero_opt["stage3_param_persistence_threshold"] == (10 * hidden_size)

@parameterized.expand([FP16, BF16], name_func=parameterized_custom_name_func)
