Sharded Accelerator 1/n: Expose clip gradients to plugins via abstract class #4639

Merged · 10 commits · Nov 12, 2020
49 changes: 7 additions & 42 deletions pytorch_lightning/accelerators/accelerator.py
@@ -12,33 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
from enum import Enum
from typing import Any, Optional, Union

import torch

from pytorch_lightning.utilities import AMPType, rank_zero_warn
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.apply_func import move_data_to_device
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.parsing import AttributeDict
import torch.distributed as torch_distrib
from pytorch_lightning import _logger as log

try:
from apex import amp
except ImportError:
amp = None

if torch.distributed.is_available():
from torch.distributed import ReduceOp
else:
class ReduceOp:
SUM = None

EPSILON = 1e-6
EPSILON_FP16 = 1e-5


class Accelerator(object):

@@ -139,48 +130,22 @@ def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx):
model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)

def clip_gradients(self, optimizer, clip_val=None):
# TODO: separate TPU case from here
self._clip_gradients(optimizer, clip_val)

def _clip_gradients(self, optimizer, clip_val=None):
# use the trainer's clip val if none passed
grad_clip_val = self.trainer.gradient_clip_val
if clip_val is not None:
grad_clip_val = clip_val
grad_clip_val = float(grad_clip_val)

# this code is a modification of torch.nn.utils.clip_grad_norm_
# with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md
if grad_clip_val <= 0:
return
self._clip_gradients(optimizer, grad_clip_val)

model = self.trainer.get_model()
if self.trainer.amp_backend == AMPType.APEX:
parameters = amp.master_params(optimizer)
def _clip_gradients(self, optimizer, grad_clip_val):
if self.trainer.amp_backend:
self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer)
else:
parameters = model.parameters()

max_norm = grad_clip_val
norm_type = float(2.0)

if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = list(filter(lambda p: p.grad is not None, parameters))

if norm_type == math.inf:
total_norm = max(p.grad.data.abs().max() for p in parameters)
else:
device = parameters[0].device
out = torch.empty(len(parameters), device=device)
for i, p in enumerate(parameters):
torch.norm(p.grad.data.to(device), norm_type, out=out[i])
total_norm = torch.norm(out, norm_type)

eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON
clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps)
clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))
for p in parameters:
p.grad.data.mul_(clip_coef.to(p.grad.data.device))
model = self.trainer.get_model()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=2.0)
Contributor

I wonder if it would make sense to have a precision plugin for 32-bit for the default case.
Then we wouldn't need this if/else block and could just call
self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer)
just as a refactoring idea for the future.

Contributor @awaelchli · Nov 12, 2020

talking about this block (GitHub selected more than the relevant lines):

def _clip_gradients(self, optimizer, grad_clip_val):
        if self.trainer.amp_backend:
            self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer)
        else:
            model = self.trainer.get_model()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=2.0)

Contributor Author

I fully agree, a precision-32 plugin makes the most sense because the accelerators shouldn't need to care about these details (it fits into @justusschock's refactor proposal of making the accelerators hardware-only).

Out of scope for this PR, but definitely low-hanging fruit to reduce the likelihood of issues in the accelerators.

Member

I'm going to do this in my refactor proposal :)
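
For reference, a rough sketch of the 32-bit default plugin discussed above, assuming the PrecisionPlugin interface added in this PR; the Precision32Plugin name and its method bodies are illustrative only, not part of this change:

import torch

from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin


class Precision32Plugin(PrecisionPlugin):
    """Hypothetical default plugin for full-precision (32-bit) training."""

    def __init__(self, trainer=None):
        self.trainer = trainer

    def connect(self, model, optimizers):
        # nothing to wrap or patch for full precision
        return model, optimizers

    def training_step(self, fx, args):
        # run the training step without any autocast context
        return fx(*args)

    def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
        # plain backward pass, no loss scaling
        closure_loss.backward(*args, **kwargs)
        return closure_loss

    def clip_gradients(self, grad_clip_val, optimizer):
        # same behaviour as the else-branch in Accelerator._clip_gradients
        model = self.trainer.get_model()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=2.0)

With a default backend like this registered on the precision connector, Accelerator._clip_gradients could call self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer) unconditionally and drop the if/else.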


def on_train_epoch_end(self, outputs):
pass
32 changes: 28 additions & 4 deletions pytorch_lightning/accelerators/tpu_accelerator.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import math
import os
import re
from typing import Optional, Union, Any
@@ -35,6 +36,8 @@
import torch_xla.distributed.parallel_loader as xla_pl
import torch_xla.distributed.xla_multiprocessing as xmp

EPSILON = 1e-6


class TPUAccelerator(Accelerator):

@@ -261,10 +264,31 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure):
using_lbfgs=is_lbfgs
)

def clip_gradients(self, optimizer, clip_val=None):
# apply clip gradients
# TODO: separate TPU case from here
self._clip_gradients(optimizer, clip_val)
def _clip_gradients(self, optimizer, grad_clip_val):
# this code is a modification of torch.nn.utils.clip_grad_norm_
# with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md
model = self.trainer.get_model()
parameters = model.parameters()
max_norm = grad_clip_val
norm_type = 2.0

if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = list(filter(lambda p: p.grad is not None, parameters))

if norm_type == math.inf:
total_norm = max(p.grad.data.abs().max() for p in parameters)
else:
device = parameters[0].device
out = torch.empty(len(parameters), device=device)
for i, p in enumerate(parameters):
torch.norm(p.grad.data.to(device), norm_type, out=out[i])
total_norm = torch.norm(out, norm_type)

clip_coef = torch.tensor(max_norm, device=device) / (total_norm + EPSILON)
clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))
for p in parameters:
p.grad.data.mul_(clip_coef.to(p.grad.data.device))

def barrier(self, name: Optional[str] = None):
torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}")
38 changes: 37 additions & 1 deletion pytorch_lightning/plugins/apex.py
@@ -11,11 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List, Tuple

import torch
from torch.optim.optimizer import Optimizer

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin
from pytorch_lightning.utilities.distributed import rank_zero_warn
from pytorch_lightning.utilities import AMPType

@@ -24,8 +27,10 @@
except ImportError:
amp = None

FP16_EPSILON = 1e-5

class ApexPlugin:

class ApexPlugin(PrecisionPlugin):

def __init__(self, trainer=None):
self.trainer = trainer
@@ -98,3 +103,34 @@ def configure_apex(self, amp, model, optimizers, amp_level):
"""
model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level)
return model, optimizers

def clip_gradients(self, grad_clip_val, optimizer):
"""
This code is a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon for fp16 weights.
This is important when amp_level is set to O2 and the master weights are in fp16.
Args:
grad_clip_val: Maximum norm of gradients.
optimizer: Optimizer with gradients that will be clipped.
"""
model = self.trainer.get_model()
parameters = model.parameters()
max_norm = grad_clip_val
norm_type = 2.0

if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = [p for p in parameters if p.grad is not None]
max_norm = float(max_norm)
norm_type = float(norm_type)
if len(parameters) == 0:
return torch.tensor(0.)
device = parameters[0].grad.device
if norm_type == math.inf:
total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
else:
total_norm = torch.norm(
torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
clip_coef = max_norm / (total_norm + FP16_EPSILON)
if clip_coef < 1:
for p in parameters:
p.grad.detach().mul_(clip_coef.to(p.grad.device))
8 changes: 7 additions & 1 deletion pytorch_lightning/plugins/native_amp.py
@@ -14,8 +14,10 @@

import torch

from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin

class NativeAMPPlugin:

class NativeAMPPlugin(PrecisionPlugin):

def __init__(self, trainer=None):
"""
@@ -51,3 +53,7 @@ def training_step(self, fx, args):
with torch.cuda.amp.autocast():
output = fx(*args)
return output

def clip_gradients(self, grad_clip_val, optimizer):
model = self.trainer.get_model()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=2.0)
35 changes: 35 additions & 0 deletions pytorch_lightning/plugins/precision_plugin.py
@@ -0,0 +1,35 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc


class PrecisionPlugin(abc.ABC):
"""
Abstract class to extend for precision support (32/16 etc).

This is extended to cover any specific logic required for precision support such as AMP/APEX or sharded
training.
"""

def connect(self, model, optimizers):
raise NotImplementedError

def training_step(self, fx, args):
raise NotImplementedError

def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
raise NotImplementedError

def clip_gradients(self, grad_clip_val, optimizer):
raise NotImplementedError
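
Since these hooks raise NotImplementedError rather than being marked @abc.abstractmethod, a subclass only needs to override the hooks whose behaviour it changes. As a rough illustration of extending the interface, a hypothetical plugin that reuses NativeAMPPlugin but clips gradients element-wise by value instead of by norm (the ValueClipAMPPlugin name and the by-value strategy are assumptions for this example, not part of the PR):

import torch

from pytorch_lightning.plugins.native_amp import NativeAMPPlugin


class ValueClipAMPPlugin(NativeAMPPlugin):
    """Hypothetical plugin: native AMP training, but clip gradients by value."""

    def clip_gradients(self, grad_clip_val, optimizer):
        # clamp each gradient element to [-grad_clip_val, grad_clip_val]
        model = self.trainer.get_model()
        torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val)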