diff --git a/DeepSpeedExamples b/DeepSpeedExamples --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 1fed12e8b375b0c54902827e7140d8266dfccd59 +Subproject commit 1fed12e8b375b0c54902827e7140d8266dfccd59-dirty diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index b995e4d..8df4997 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -1622,6 +1622,14 @@ class FP16_DeepSpeedZeroOptimizer(object): prev_scale = self.loss_scale self._update_scale(self.overflow) if self.overflow: + + if dist.get_rank() == 0: + logger.info( + "[deepscale] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, " + "reducing to {}".format(dist.get_rank(), + prev_scale, + self.loss_scale)) + see_memory_usage('After overflow before clearing gradients') self.zero_grad() if self.cpu_offload: