Update mlp_cuda test (#1425)
to use `torch.testing.assert_close` instead of
`numpy.testing.assert_allclose`. The former uses slightly looser,
dtype-aware default threshold values.
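
For example, `numpy.testing.assert_allclose` defaults to `rtol=1e-7, atol=0`,
while `torch.testing.assert_close` uses roughly `rtol=1.3e-6, atol=1e-5` for
float32 tensors and also checks dtype and device. A minimal sketch of the swap
(illustrative tensors, not the MLP outputs from the test):

    import numpy as np
    import torch

    a = torch.randn(4, 8)
    b = a + 1e-8 * torch.randn_like(a)

    # Old style: copy to host and compare numpy arrays with hand-picked tolerances.
    np.testing.assert_allclose(a.cpu().numpy(), b.cpu().numpy(), atol=1e-7, rtol=1e-5)

    # New style: compare tensors directly with dtype-aware default tolerances.
    torch.testing.assert_close(a, b)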

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>
crcrpar authored Aug 9, 2022
1 parent 26ff4e4 commit 71e5871
Showing 2 changed files with 104 additions and 111 deletions.
1 change: 1 addition & 0 deletions apex/mlp/mlp.py
@@ -21,6 +21,7 @@ def backward(ctx, grad_o):
del ctx.outputs
return (None, None, *grads)

# TODO(crcrpar): Should make this compatible with torch.cuda.amp
mlp_function = amp.half_function(MlpFunction.apply)

class MLP(torch.nn.Module):
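The TODO added in this hunk refers to the fact that `amp.half_function` only hooks into the legacy `apex.amp` frontend. One hypothetical way to also support native `torch.cuda.amp` autocast is to decorate the custom autograd function with `torch.cuda.amp.custom_fwd`/`custom_bwd`. The sketch below uses a placeholder `MyMlpFunction` with a plain linear forward, not apex's actual fused kernel:

    import torch

    class MyMlpFunction(torch.autograd.Function):
        # Illustrative stand-in for apex's MlpFunction: a single linear layer.

        @staticmethod
        @torch.cuda.amp.custom_fwd(cast_inputs=torch.float16)
        def forward(ctx, input, weight):
            # Under autocast, floating-point CUDA inputs are cast to fp16 before this runs.
            ctx.save_for_backward(input, weight)
            return input @ weight.t()

        @staticmethod
        @torch.cuda.amp.custom_bwd
        def backward(ctx, grad_out):
            input, weight = ctx.saved_tensors
            # Gradients w.r.t. input and weight, respectively.
            return grad_out @ weight, grad_out.t() @ input

    mlp_function = MyMlpFunction.apply

Inside a `with torch.cuda.amp.autocast():` region this would mirror what `amp.half_function` does for the apex frontend: inputs are cast to half precision before the custom kernel executes.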
214 changes: 103 additions & 111 deletions tests/L0/run_mlp/test_mlp.py
@@ -1,19 +1,20 @@
"""Tests for c++ MLP"""
import unittest
from time import time
import numpy as np

import torch
from torch import nn

from apex.mlp import MLP


batch_size = 1024
mlp_sizes = [480, 1024, 1024, 512, 256, 1]
num_iters = 10

class TestMLP(unittest.TestCase):

# note(crcrpar): On Ampere, this test should be run without TF32 enabled.
class TestMLP(unittest.TestCase):
def test_creation(self):
MLP(mlp_sizes)

@@ -30,105 +31,89 @@ def test_numeric(self):

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda")
.uniform_(-1.0, 1.0)
.requires_grad_()
)
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
test_input.grad.detach().cpu().numpy(),
ref_input.grad.detach().cpu().numpy(),
atol=0, rtol=1e-5)
np.testing.assert_allclose(
mlp.biases[0].grad.detach().cpu().numpy(),
ref_mlp[0].bias.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(test_input.grad, ref_input.grad)
torch.testing.assert_close(mlp.biases[0].grad, ref_mlp[0].bias.grad)

def test_no_bias(self):
for use_activation in ['none', 'relu', 'sigmoid']:
mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
mlp.weights[i].data.copy_(linear.weight)
mlp_layers.append(linear)
if use_activation == 'relu':
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == 'sigmoid':
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
test_input.grad.detach().cpu().numpy(),
ref_input.grad.detach().cpu().numpy(),
atol=0, rtol=100)
np.testing.assert_allclose(
mlp.weights[0].grad.detach().cpu().numpy(),
ref_mlp[0].weight.grad.detach().cpu().numpy(),
atol=1e-7, rtol=100)
for use_activation in ["none", "relu", "sigmoid"]:
with self.subTest(use_activation=use_activation):
mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
mlp.weights[i].data.copy_(linear.weight)
mlp_layers.append(linear)
if use_activation == "relu":
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == "sigmoid":
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda")
.uniform_(-1.0, 1.0)
.requires_grad_()
)
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(test_input.grad, ref_input.grad)
torch.testing.assert_close(mlp.weights[0].grad, ref_mlp[0].weight.grad)

def test_with_bias(self):
for use_activation in ['none', 'relu', 'sigmoid']:
mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=True)
mlp.weights[i].data.copy_(linear.weight)
mlp.biases[i].data.copy_(linear.bias)
mlp_layers.append(linear)
if use_activation == 'relu':
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == 'sigmoid':
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
test_input.grad.detach().cpu().numpy(),
ref_input.grad.detach().cpu().numpy(),
atol=0, rtol=1)
np.testing.assert_allclose(
mlp.weights[0].grad.detach().cpu().numpy(),
ref_mlp[0].weight.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1)
np.testing.assert_allclose(
mlp.biases[0].grad.detach().cpu().numpy(),
ref_mlp[0].bias.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
for use_activation in ["none", "relu", "sigmoid"]:
with self.subTest(use_activation=use_activation):
mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=True)
mlp.weights[i].data.copy_(linear.weight)
mlp.biases[i].data.copy_(linear.bias)
mlp_layers.append(linear)
if use_activation == "relu":
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == "sigmoid":
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda")
.uniform_(-1.0, 1.0)
.requires_grad_()
)
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(test_input.grad, ref_input.grad)
torch.testing.assert_close(mlp.weights[0].grad, ref_mlp[0].weight.grad)
torch.testing.assert_close(mlp.biases[0].grad, ref_mlp[0].bias.grad)

def test_no_grad(self):
mlp = MLP(mlp_sizes).cuda()
@@ -143,23 +128,16 @@ def test_no_grad(self):

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.)
test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1.0, 1.0)
ref_input = test_input.clone().detach()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
mlp.weights[0].grad.detach().cpu().numpy(),
ref_mlp[0].weight.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)

mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(mlp.weights[0].grad, ref_mlp[0].weight.grad)

def test_performance_half(self):
mlp = MLP(mlp_sizes).cuda().half()
@@ -174,10 +152,16 @@ def test_performance_half(self):

ref_mlp = nn.Sequential(*mlp_layers).cuda().half()

test_input = torch.empty(
batch_size, mlp_sizes[0], device="cuda", dtype=torch.half).fill_(10.).requires_grad_()
ref_input = torch.empty(
batch_size, mlp_sizes[0], device="cuda", dtype=torch.half).fill_(10.).requires_grad_()
test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda", dtype=torch.half)
.fill_(10.0)
.requires_grad_()
)
ref_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda", dtype=torch.half)
.fill_(10.0)
.requires_grad_()
)

# Warm up GPU
for _ in range(100):
@@ -200,7 +184,8 @@ def test_performance_half(self):
ref_loss.backward()
torch.cuda.synchronize()
stop_time = time()
print(F"\nPytorch MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
ref_time = (stop_time - start_time) * 1000.0 / num_iters
print(f"\nPytorch MLP time {ref_time:.4f} ms")

torch.cuda.synchronize()
start_time = time()
@@ -211,8 +196,15 @@ def test_performance_half(self):
test_loss.backward()
torch.cuda.synchronize()
stop_time = time()
print(F"C++ MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
actual_time = (stop_time - start_time) * 1000.0 / num_iters
print(f"C++ MLP time {actual_time:.4f} ms")
torch.cuda.profiler.stop()
self.assertLessEqual(
actual_time,
ref_time,
msg=f"Custom extension took {actual_time:.4f} while PyTorch took {ref_time:.4f}",
)


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
