
Commit

Fix DeepSpeed config validation error by changing `stage3_prefetch_bucket_size` value to an integer (#2814)
adk9 authored Jun 6, 2024
1 parent 30cb7ec commit bad2ce4
Showing 3 changed files with 4 additions and 4 deletions.
2 changes: 1 addition & 1 deletion docs/source/usage_guides/deepspeed.md
@@ -433,7 +433,7 @@ Only the `auto` fields specified in above examples are handled by `prepare` meth
The `auto` values are calculated as:

- `reduce_bucket_size`: `hidden_size * hidden_size`
- - `stage3_prefetch_bucket_size`: `0.9 * hidden_size * hidden_size`
+ - `stage3_prefetch_bucket_size`: `int(0.9 * hidden_size * hidden_size)`
- `stage3_param_persistence_threshold`: `10 * hidden_size`

For the `auto` feature to work for these 3 config entries - Accelerate will use `model.config.hidden_size` or `max(model.config.hidden_sizes)` as `hidden_size`. If neither of these is available, the launching will fail and you will have to set these 3 config entries manually. Remember the first 2 config entries are the communication buffers - the larger they are the more efficient the comms will be, and the larger they are the more GPU memory they will consume, so it's a tunable performance trade-off.
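For reference, the three `auto` formulas above are plain arithmetic on `hidden_size`. Below is a minimal Python sketch (the helper name is illustrative, not part of Accelerate) showing the values that would be filled in, including the `int()` cast on the prefetch bucket size that this commit introduces.

```python
def autofill_zero3_comm_buffers(hidden_size: int) -> dict:
    # Mirrors the documented `auto` formulas; note the int() cast on
    # stage3_prefetch_bucket_size, which is the point of this commit.
    return {
        "reduce_bucket_size": hidden_size * hidden_size,
        "stage3_prefetch_bucket_size": int(0.9 * hidden_size * hidden_size),
        "stage3_param_persistence_threshold": 10 * hidden_size,
    }

# Example: hidden_size = 768 yields 530841 for the prefetch bucket (an int),
# whereas the old formula produced the float 530841.6 -- the kind of
# non-integer value behind the DeepSpeed config validation error this commit fixes.
print(autofill_zero3_comm_buffers(768))
```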
2 changes: 1 addition & 1 deletion src/accelerate/accelerator.py
@@ -1706,7 +1706,7 @@ def _prepare_deepspeed(self, *args):
config_kwargs.update(
    {
        "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
-       "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
+       "zero_optimization.stage3_prefetch_bucket_size": int(0.9 * hidden_size * hidden_size),
        "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
    }
)
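The docs change above notes that Accelerate derives `hidden_size` from `model.config.hidden_size` or `max(model.config.hidden_sizes)` and fails the launch if neither is available. A minimal sketch of that lookup, assuming a Transformers-style `model.config`; the attribute checks and error message are illustrative, not the exact Accelerate source:

```python
def resolve_hidden_size(model) -> int:
    # Follow the documented lookup order: hidden_size, then max(hidden_sizes).
    config = model.config
    if hasattr(config, "hidden_size"):
        return config.hidden_size
    if hasattr(config, "hidden_sizes"):
        return max(config.hidden_sizes)
    # Neither attribute exists: per the docs, the three ZeRO-3 comm-buffer
    # entries must then be set manually instead of using "auto".
    raise ValueError(
        "Could not infer hidden_size from model.config; set reduce_bucket_size, "
        "stage3_prefetch_bucket_size and stage3_param_persistence_threshold manually."
    )
```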
4 changes: 2 additions & 2 deletions tests/deepspeed/test_deepspeed.py
@@ -638,7 +638,7 @@ def test_autofill_dsconfig(self):

assert config["gradient_clipping"] == 1.0
assert config["zero_optimization"]["reduce_bucket_size"] == (hidden_size * hidden_size)
assert config["zero_optimization"]["stage3_prefetch_bucket_size"] == ((0.9 * hidden_size) * hidden_size)
assert config["zero_optimization"]["stage3_prefetch_bucket_size"] == int((0.9 * hidden_size) * hidden_size)
assert config["zero_optimization"]["stage3_param_persistence_threshold"] == (10 * hidden_size)
assert not config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"]

@@ -695,7 +695,7 @@ def test_autofill_comm_buffers_dsconfig(self, model_type):
)
zero_opt = accelerator.deepspeed_config["zero_optimization"]
assert zero_opt["reduce_bucket_size"] == (hidden_size * hidden_size)
assert zero_opt["stage3_prefetch_bucket_size"] == (0.9 * hidden_size) * hidden_size
assert zero_opt["stage3_prefetch_bucket_size"] == int((0.9 * hidden_size) * hidden_size)
assert zero_opt["stage3_param_persistence_threshold"] == (10 * hidden_size)

@parameterized.expand([FP16, BF16], name_func=parameterized_custom_name_func)
