deepspeedai · desire2020 · Mar 11, 2024 · Apr 1, 2024 · Apr 2, 2024 · Apr 12, 2024
@@ -1307,7 +1307,8 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params):
         total_norm = total_norm_cuda[0].item()**(1. / norm_type)
 
         if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm:
-            total_norm = -1
+            logger.info(f"Warning: invalid gradient detected. Please check your model implementation/configuration to improve the numerical stability.")
+            total_norm = -1.
-            logger.info(f"Warning: invalid gradient detected. Please check your model implementation/configuration to improve the numerical stability.")
-            total_norm = -1.
+            total_norm = torch.tensor(-1.0, device=self.device, dtype=torch.float)
-            logger.info(f"Warning: invalid gradient detected. Please check your model implementation/configuration to improve the numerical stability.")
-            total_norm = -1.
+            total_norm = torch.tensor(-1.0, device=self.device, dtype=torch.float)
 
         return total_norm