Skip to content

Commit

Permalink
Fix race condition in GatheredParameters (#3819)
Browse files Browse the repository at this point in the history
* Fix race condition in GatheredParameters
  • Loading branch information
HeyangQin authored Jun 29, 2023
1 parent 78b7693 commit f8551b4
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
4 changes: 3 additions & 1 deletion deepspeed/runtime/zero/partition_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -1899,13 +1899,15 @@ def load(module: nn.Module, prefix=""):
else:
# single param
params = [params]

# enable if at least one is zero-param, otherwise a noop
if not any(is_zero_param(p) for p in params):
self.enabled = False
return

self.params = [p for p in params if hasattr(p, "ds_id")]
self.params = sorted(
set(self.params), key=lambda x: x.ds_id
) # remove the duplicates to prevent racing condition, we must also make sure the order is the same on all ranks otherwise we'll get deadlocks
self.src_rank = None
if modifier_rank is not None:
if self.params[0].ds_process_group == dist.get_world_group():
Expand Down
2 changes: 1 addition & 1 deletion deepspeed/runtime/zero/partitioned_param_coordinator.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def reset_step(self) -> None:
"""indicate that we have completed one fwd+bwd for the model"""
if self.__inflight_param_registry:
raise RuntimeError(f"still have inflight params "
f"{[p.ds_summary for p in self.__inflight_param_registry.keys()]}")
f"{[p.ds_summary() for p in self.__inflight_param_registry.keys()]}")

if not self.is_complete_trace(): # not self.trace_complete:
# Make sure that recorded submodule orders are identical across ranks
Expand Down

0 comments on commit f8551b4

Please sign in to comment.