Update mlp_cuda test (#1425)
to use `torch.testing.assert_close` instead of
`numpy.testing.assert_allclose`. The former uses slightly looser,
dtype-aware default threshold values.
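
For example, `numpy.testing.assert_allclose` defaults to `rtol=1e-7, atol=0`,
while `torch.testing.assert_close` uses roughly `rtol=1.3e-6, atol=1e-5` for
float32 tensors and also checks dtype and device. A minimal sketch of the swap
(illustrative tensors, not the MLP outputs from the test):

    import numpy as np
    import torch

    a = torch.randn(4, 8)
    b = a + 1e-8 * torch.randn_like(a)

    # Old style: copy to host and compare numpy arrays with hand-picked tolerances.
    np.testing.assert_allclose(a.cpu().numpy(), b.cpu().numpy(), atol=1e-7, rtol=1e-5)

    # New style: compare tensors directly with dtype-aware default tolerances.
    torch.testing.assert_close(a, b)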

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>
crcrpar authored Aug 9, 2022
1 parent 26ff4e4 commit 71e5871
Showing 2 changed files with 104 additions and 111 deletions.
1 change: 1 addition & 0 deletions apex/mlp/mlp.py
@@ -21,6 +21,7 @@ def backward(ctx, grad_o):
del ctx.outputs
return (None, None, *grads)

# TODO(crcrpar): Should make this compatible with torch.cuda.amp
mlp_function = amp.half_function(MlpFunction.apply)

class MLP(torch.nn.Module):
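The TODO added in this hunk refers to the fact that `amp.half_function` only hooks into the legacy `apex.amp` frontend. One hypothetical way to also support native `torch.cuda.amp` autocast is to decorate the custom autograd function with `torch.cuda.amp.custom_fwd`/`custom_bwd`. The sketch below uses a placeholder `MyMlpFunction` with a plain linear forward, not apex's actual fused kernel:

    import torch

    class MyMlpFunction(torch.autograd.Function):
        # Illustrative stand-in for apex's MlpFunction: a single linear layer.

        @staticmethod
        @torch.cuda.amp.custom_fwd(cast_inputs=torch.float16)
        def forward(ctx, input, weight):
            # Under autocast, floating-point CUDA inputs are cast to fp16 before this runs.
            ctx.save_for_backward(input, weight)
            return input @ weight.t()

        @staticmethod
        @torch.cuda.amp.custom_bwd
        def backward(ctx, grad_out):
            input, weight = ctx.saved_tensors
            # Gradients w.r.t. input and weight, respectively.
            return grad_out @ weight, grad_out.t() @ input

    mlp_function = MyMlpFunction.apply

Inside a `with torch.cuda.amp.autocast():` region this would mirror what `amp.half_function` does for the apex frontend: inputs are cast to half precision before the custom kernel executes.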
214 changes: 103 additions & 111 deletions tests/L0/run_mlp/test_mlp.py
@@ -1,19 +1,20 @@
"""Tests for c++ MLP"""
import unittest
from time import time
import numpy as np

import torch
from torch import nn

from apex.mlp import MLP


batch_size = 1024
mlp_sizes = [480, 1024, 1024, 512, 256, 1]
num_iters = 10

class TestMLP(unittest.TestCase):

# note(crcrpar): On Ampere, this test should be run without TF32 enabled.
class TestMLP(unittest.TestCase):
def test_creation(self):
MLP(mlp_sizes)

@@ -30,105 +31,89 @@ def test_numeric(self):

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda")
.uniform_(-1.0, 1.0)
.requires_grad_()
)
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
test_input.grad.detach().cpu().numpy(),
ref_input.grad.detach().cpu().numpy(),
atol=0, rtol=1e-5)
np.testing.assert_allclose(
mlp.biases[0].grad.detach().cpu().numpy(),
ref_mlp[0].bias.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(test_input.grad, ref_input.grad)
torch.testing.assert_close(mlp.biases[0].grad, ref_mlp[0].bias.grad)

def test_no_bias(self):
for use_activation in ['none', 'relu', 'sigmoid']:
mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
mlp.weights[i].data.copy_(linear.weight)
mlp_layers.append(linear)
if use_activation == 'relu':
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == 'sigmoid':
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
test_input.grad.detach().cpu().numpy(),
ref_input.grad.detach().cpu().numpy(),
atol=0, rtol=100)
np.testing.assert_allclose(
mlp.weights[0].grad.detach().cpu().numpy(),
ref_mlp[0].weight.grad.detach().cpu().numpy(),
atol=1e-7, rtol=100)
for use_activation in ["none", "relu", "sigmoid"]:
with self.subTest(use_activation=use_activation):
mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
mlp.weights[i].data.copy_(linear.weight)
mlp_layers.append(linear)
if use_activation == "relu":
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == "sigmoid":
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda")
.uniform_(-1.0, 1.0)
.requires_grad_()
)
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(test_input.grad, ref_input.grad)
torch.testing.assert_close(mlp.weights[0].grad, ref_mlp[0].weight.grad)

def test_with_bias(self):
for use_activation in ['none', 'relu', 'sigmoid']:
mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=True)
mlp.weights[i].data.copy_(linear.weight)
mlp.biases[i].data.copy_(linear.bias)
mlp_layers.append(linear)
if use_activation == 'relu':
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == 'sigmoid':
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
test_input.grad.detach().cpu().numpy(),
ref_input.grad.detach().cpu().numpy(),
atol=0, rtol=1)
np.testing.assert_allclose(
mlp.weights[0].grad.detach().cpu().numpy(),
ref_mlp[0].weight.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1)
np.testing.assert_allclose(
mlp.biases[0].grad.detach().cpu().numpy(),
ref_mlp[0].bias.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
for use_activation in ["none", "relu", "sigmoid"]:
with self.subTest(use_activation=use_activation):
mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=True)
mlp.weights[i].data.copy_(linear.weight)
mlp.biases[i].data.copy_(linear.bias)
mlp_layers.append(linear)
if use_activation == "relu":
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == "sigmoid":
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda")
.uniform_(-1.0, 1.0)
.requires_grad_()
)
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(test_input.grad, ref_input.grad)
torch.testing.assert_close(mlp.weights[0].grad, ref_mlp[0].weight.grad)
torch.testing.assert_close(mlp.biases[0].grad, ref_mlp[0].bias.grad)

def test_no_grad(self):
mlp = MLP(mlp_sizes).cuda()
@@ -143,23 +128,16 @@ def test_no_grad(self):

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.)
test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1.0, 1.0)
ref_input = test_input.clone().detach()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
mlp.weights[0].grad.detach().cpu().numpy(),
ref_mlp[0].weight.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)

mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(mlp.weights[0].grad, ref_mlp[0].weight.grad)

def test_performance_half(self):
mlp = MLP(mlp_sizes).cuda().half()
@@ -174,10 +152,16 @@ def test_performance_half(self):

ref_mlp = nn.Sequential(*mlp_layers).cuda().half()

test_input = torch.empty(
batch_size, mlp_sizes[0], device="cuda", dtype=torch.half).fill_(10.).requires_grad_()
ref_input = torch.empty(
batch_size, mlp_sizes[0], device="cuda", dtype=torch.half).fill_(10.).requires_grad_()
test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda", dtype=torch.half)
.fill_(10.0)
.requires_grad_()
)
ref_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda", dtype=torch.half)
.fill_(10.0)
.requires_grad_()
)

# Warm up GPU
for _ in range(100):
@@ -200,7 +184,8 @@ def test_performance_half(self):
ref_loss.backward()
torch.cuda.synchronize()
stop_time = time()
print(F"\nPytorch MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
ref_time = (stop_time - start_time) * 1000.0 / num_iters
print(f"\nPytorch MLP time {ref_time:.4f} ms")

torch.cuda.synchronize()
start_time = time()
@@ -211,8 +196,15 @@ def test_performance_half(self):
test_loss.backward()
torch.cuda.synchronize()
stop_time = time()
print(F"C++ MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
actual_time = (stop_time - start_time) * 1000.0 / num_iters
print(f"C++ MLP time {actual_time:.4f} ms")
torch.cuda.profiler.stop()
self.assertLessEqual(
actual_time,
ref_time,
msg=f"Custom extension took {actual_time:.4f} while PyTorch took {ref_time:.4f}",
)


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
