Merge pull request #6 from microsoft/dev/mallamanis/amp
Support AMP training in GNNs and other neural models.
Miltos authored Nov 10, 2020
2 parents e1c507f + 3bfe1f8 commit 0ba8308
Showing 6 changed files with 34 additions and 15 deletions.
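For context, the commit threads a single new `enable_amp` flag from the CLI scripts into the trainer. A minimal usage sketch of what this enables (assuming `ModelTrainer` is imported from `ptgnn.baseneuralmodel`, as the bundled implementation scripts do; the model and data names below are placeholders, and the exact `train()` signature should be checked against `trainer.py`):

```python
# Hypothetical usage sketch: turning on mixed-precision training via the new flag.
from ptgnn.baseneuralmodel import ModelTrainer

trainer = ModelTrainer(
    model,                 # an AbstractNeuralModel implementation (placeholder)
    model_path,            # where the trained model checkpoint is written
    max_num_epochs=100,
    minibatch_size=300,
    enable_amp=True,       # new in this commit; the CLIs pass arguments["--amp"] here
)
trainer.train(training_data, validation_data)  # data iterables are placeholders
```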
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -13,7 +13,7 @@ jobs:
max-parallel: 2
matrix:
python-version: [3.6, 3.7, 3.8]
- torch-version: [1.4.0, 1.5.0]
+ torch-version: [1.6.0, 1.7.0]

steps:
- uses: actions/checkout@v1
32 changes: 20 additions & 12 deletions ptgnn/baseneuralmodel/trainer.py
@@ -51,6 +51,7 @@ def __init__(
clip_gradient_norm: Optional[float] = None,
target_validation_metric: Optional[str] = None,
target_validation_metric_higher_is_better: bool = False,
enable_amp: bool = False,
):
"""
:param model: The Component to be built and trained
@@ -84,6 +85,7 @@ def __init__(
self.__train_epoch_end_hooks: List[EndOfEpochHook] = []
self.__validation_epoch_end_hooks: List[EndOfEpochHook] = []
self.__clip_gradient_norm = clip_gradient_norm
self.__enable_amp = enable_amp

self.__target_metric = target_validation_metric
if target_validation_metric is not None:
@@ -182,6 +184,8 @@ def _run_training(
sum_epoch_loss, running_avg_loss, num_minibatches, num_samples = 0.0, 0.0, 0, 0
start_time = time.time()
self.neural_module.train()

scaler = torch.cuda.amp.GradScaler(enabled=self.__enable_amp)
with tqdm(desc="Training", disable=not show_progress_bar, leave=False) as progress_bar:
for step_idx, (mb_data, raw_samples) in enumerate(
self.__model.minibatch_iterator(
@@ -194,20 +198,23 @@
)
):
optimizer.zero_grad()
- mb_loss = self.neural_module(**mb_data)
- mb_loss.backward()
+ with torch.cuda.amp.autocast(enabled=self.__enable_amp):
+ mb_loss = self.neural_module(**mb_data)
+ if torch.isnan(mb_loss):
+ raise Exception("Loss has a NaN value.")

- if self.__clip_gradient_norm is not None:
- torch.nn.utils.clip_grad_norm_(
- self.neural_module.parameters(recurse=True), self.__clip_gradient_norm
- )
+ scaler.scale(mb_loss).backward()

- if torch.isnan(mb_loss):
- raise Exception("Loss has a NaN value.")
+ if self.__clip_gradient_norm is not None:
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(
+ self.neural_module.parameters(recurse=True), self.__clip_gradient_norm
+ )

- optimizer.step()
- if scheduler is not None:
- scheduler.step(epoch_idx=epoch, epoch_step=step_idx)
+ scaler.step(optimizer)
+ scaler.update()
+ if scheduler is not None:
+ scheduler.step(epoch_idx=epoch, epoch_step=step_idx)

num_minibatches += 1
num_samples += len(raw_samples)
@@ -258,7 +265,8 @@ def _run_validation(
shuffle_input=False,
parallelize=parallelize,
):
- mb_loss = self.neural_module(**mb_data)
+ with torch.cuda.amp.autocast(enabled=self.__enable_amp):
+ mb_loss = self.neural_module(**mb_data)
num_minibatches += 1
num_samples += len(raw_samples)
sum_epoch_loss += float(mb_loss.cpu())
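Taken together, the trainer changes follow PyTorch's standard `torch.cuda.amp` recipe: run the forward pass under `autocast`, scale the loss before `backward()`, unscale before gradient clipping, and let the `GradScaler` drive the optimizer step (skipping it when inf/NaN gradients are detected). A self-contained sketch of that recipe, using generic model and optimizer names rather than ptgnn's own classes:

```python
import torch

def train_step(model, optimizer, scaler, batch, clip_norm=1.0, enable_amp=True):
    """One optimization step using PyTorch automatic mixed precision (AMP)."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=enable_amp):
        loss = model(**batch)           # forward pass runs in float16/float32 mixed precision
    scaler.scale(loss).backward()       # scale the loss so small fp16 gradients do not underflow
    if clip_norm is not None:
        scaler.unscale_(optimizer)      # unscale first so clipping sees the true gradient norms
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
    scaler.step(optimizer)              # skips the update if inf/NaN gradients were detected
    scaler.update()                     # adapts the loss-scale factor for the next step
    return float(loss.detach())

# The scaler is created once per training run and reused across steps:
# scaler = torch.cuda.amp.GradScaler(enabled=True)
```

With `enable_amp=False` the scaler and autocast context become no-ops, so the same code path serves both full-precision and mixed-precision training, which is exactly how the trainer uses the flag.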
2 changes: 2 additions & 0 deletions ptgnn/implementations/graph2seq/train.py
@@ -5,6 +5,7 @@
Options:
--aml Run this in Azure ML
--amp Enable automatic mixed precision.
--azure-info=<path> Azure authentication information file (JSON). Used to load data from Azure storage.
--max-num-epochs=<epochs> The maximum number of epochs to run training for. [default: 100]
--minibatch-size=<size> The minibatch size. [default: 300]
@@ -109,6 +110,7 @@ def create_mp_layers(num_edges: int):
model_path,
max_num_epochs=int(arguments["--max-num-epochs"]),
minibatch_size=int(arguments["--minibatch-size"]),
enable_amp=arguments["--amp"],
)
if nn is not None:
trainer.neural_module = nn
1 change: 1 addition & 0 deletions ptgnn/implementations/graph2seq/trainandtest.py
@@ -5,6 +5,7 @@
Options:
--aml Run this in Azure ML
--amp Enable automatic mixed precision.
--azure-info=<path> Azure authentication information file (JSON). Used to load data from Azure storage.
--max-num-epochs=<epochs> The maximum number of epochs to run training for. [default: 100]
--minibatch-size=<size> The minibatch size. [default: 300]
2 changes: 2 additions & 0 deletions ptgnn/implementations/typilus/train.py
@@ -5,6 +5,7 @@
Options:
--aml Run this in Azure ML
--amp Enable automatic mixed precision.
--azure-info=<path> Azure authentication information file (JSON). Used to load data from Azure storage.
--max-num-epochs=<epochs> The maximum number of epochs to run training for. [default: 100]
--minibatch-size=<size> The minibatch size. [default: 300]
@@ -167,6 +168,7 @@ def create_optimizer(parameters):
clip_gradient_norm=1,
target_validation_metric="Accuracy",
target_validation_metric_higher_is_better=True,
enable_amp=arguments["--amp"],
)
if nn is not None:
trainer.neural_module = nn
10 changes: 8 additions & 2 deletions ptgnn/neuralmodels/gnn/messagepassing/abstractmessagepassing.py
@@ -30,9 +30,15 @@ def _aggregate_messages(
self, messages: torch.Tensor, message_targets: torch.Tensor, num_nodes, aggregation_fn: str
):
"""Utility function to be used by concrete implementors."""
+ # Support AMP
+ msg_dtype = messages.dtype
return scatter(
- messages, index=message_targets, dim=0, dim_size=num_nodes, reduce=aggregation_fn
- )
+ messages.to(torch.float32),
+ index=message_targets,
+ dim=0,
+ dim_size=num_nodes,
+ reduce=aggregation_fn,
+ ).to(msg_dtype)

@property
@abstractmethod
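On this last hunk: the `scatter` aggregation from `torch_scatter` is forced to float32 under AMP, presumably because its custom kernels are not covered by autocast's casting rules and fp16 accumulation over many messages can lose precision or hit dtype mismatches. A rough illustration of the same dtype round-trip (tensor shapes and the `aggregate` helper are made up for the example):

```python
import torch
from torch_scatter import scatter

def aggregate(messages: torch.Tensor, targets: torch.Tensor, num_nodes: int) -> torch.Tensor:
    # Under autocast, messages may arrive as float16; do the reduction in float32.
    msg_dtype = messages.dtype
    out = scatter(messages.to(torch.float32), index=targets, dim=0,
                  dim_size=num_nodes, reduce="sum")
    return out.to(msg_dtype)  # cast back so downstream layers see the original dtype

messages = torch.randn(6, 4).half()          # 6 messages of width 4, in fp16
targets = torch.tensor([0, 0, 1, 2, 2, 2])   # destination node per message
print(aggregate(messages, targets, num_nodes=3).dtype)  # torch.float16
```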
