Commit

Use xmlrunner.XMLTestRunner accordingly in tests/L0/run_test.py (NVIDIA#1451)

* Use xmlrunner.XMLTestRunner accordingly

TODO:
- [x] Remove `subTest` because it's not compatible with the current way
of running L0 tests

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* use `torch.testing` more to enable xmlrunner

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* Remove `subTest` for xmlrunner

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* remove subTest

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* do not depend on an env var

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* fix syntax errors

* open with `"wb"`

* xml file per dir

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* remove commented-out code

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* Refactor `TestTransformer`: define member methods (NVIDIA#5)

* use setUpClass to define `test_` methods

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* manually define the `test_` methods

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* add a missing test

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* remove print

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* remove file extension

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>
crcrpar authored and yuanzhedong committed Jul 14, 2023
1 parent 4c17895 commit ad47763
Showing 13 changed files with 941 additions and 950 deletions.
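
The recurring change across the diffs below is the removal of `unittest`'s `subTest` context manager, which the commit message describes as incompatible with how the L0 suite is now run. A minimal sketch of the before/after shape, with made-up names (`ExampleTest`, `_check`) rather than code from this commit:

    import unittest

    import torch


    class ExampleTest(unittest.TestCase):
        def _check(self, contiguous: bool) -> None:
            x = torch.arange(6.0).reshape(2, 3)
            if not contiguous:
                x = x.t()  # non-contiguous view of the same data
            torch.testing.assert_close(x + 0, x)

        # Before: each configuration ran inside subTest, so one test could
        # report several sub-results.
        def test_old_style(self):
            for contiguous in (True, False):
                with self.subTest(contiguous=contiguous):
                    self._check(contiguous)

        # After: the helper is called directly; the first failing
        # configuration fails the whole test, keeping plain unittest
        # semantics that the XML runner can report.
        def test_new_style(self):
            for contiguous in (True, False):
                self._check(contiguous)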
43 changes: 23 additions & 20 deletions tests/L0/run_fused_layer_norm/test_fused_layer_norm.py
@@ -3,7 +3,10 @@

import torch

import apex
from apex.normalization import FusedLayerNorm
from apex.normalization import FusedRMSNorm
from apex.normalization import MixedFusedLayerNorm
from apex.normalization import MixedFusedRMSNorm


class TestFusedLayerNorm(unittest.TestCase):
@@ -18,15 +21,15 @@ class TestFusedLayerNorm(unittest.TestCase):
def setUp(self):
# bias and weight are set to 0 and 1 respectively, so no need to copy parameters from cpu module to the gpu one
if not self.mixed_fused:
self.module_cpu_ = apex.normalization.FusedLayerNorm(
self.module_cpu_ = FusedLayerNorm(
normalized_shape=self.normalized_shape, elementwise_affine=self.elementwise_affine).cpu()
self.module_cuda_ = apex.normalization.FusedLayerNorm(
self.module_cuda_ = FusedLayerNorm(
normalized_shape=self.normalized_shape, elementwise_affine=self.elementwise_affine).to(device="cuda", dtype=self.dtype)
else:
assert self.elementwise_affine
self.module_cpu_ = apex.normalization.MixedFusedLayerNorm(
self.module_cpu_ = MixedFusedLayerNorm(
normalized_shape=self.normalized_shape).cpu()
self.module_cuda_ = apex.normalization.MixedFusedLayerNorm(
self.module_cuda_ = MixedFusedLayerNorm(
normalized_shape=self.normalized_shape).to(device="cuda", dtype=self.dtype)


@@ -65,8 +68,7 @@ def _check_same_output(self, batch_size, contiguous):

def _test_same_output(self, batch_size):
for contiguous in (True, False):
with self.subTest(contiguous=contiguous):
self._check_same_output(batch_size, contiguous)
self._check_same_output(batch_size, contiguous)

def test_layer_norm(self):
self._test_same_output(16)
@@ -87,15 +89,15 @@ class TestFusedRMSNorm(unittest.TestCase):
def setUp(self):
# bias and weight are set to 0 and 1 respectively, so no need to copy parameters from cpu module to the gpu one
if not self.mixed_fused:
self.module_cpu_ = apex.normalization.FusedRMSNorm(
self.module_cpu_ = FusedRMSNorm(
normalized_shape=self.normalized_shape, elementwise_affine=self.elementwise_affine).cpu()
self.module_cuda_ = apex.normalization.FusedRMSNorm(
self.module_cuda_ = FusedRMSNorm(
normalized_shape=self.normalized_shape, elementwise_affine=self.elementwise_affine).to(device="cuda", dtype=self.dtype)
else:
assert self.elementwise_affine
self.module_cpu_ = apex.normalization.MixedFusedRMSNorm(
self.module_cpu_ = MixedFusedRMSNorm(
normalized_shape=self.normalized_shape).cpu()
self.module_cuda_ = apex.normalization.MixedFusedRMSNorm(
self.module_cuda_ = MixedFusedRMSNorm(
normalized_shape=self.normalized_shape).to(device="cuda", dtype=self.dtype)

def _check_same_output(self, batch_size, contiguous):
@@ -136,8 +138,7 @@ def _check_same_output(self, batch_size, contiguous):

def _test_same_output(self, batch_size):
for contiguous in (True, False):
with self.subTest(contiguous=contiguous):
self._check_same_output(batch_size, contiguous)
self._check_same_output(batch_size, contiguous)

def test_layer_norm(self):
self._test_same_output(16)
@@ -204,17 +205,17 @@ def _prep_layers(normalized_shape, elementwise_affine, dtype):
native = torch.nn.LayerNorm(
normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
).to(device="cuda", dtype=dtype)
fused = apex.normalization.FusedLayerNorm(
fused = FusedLayerNorm(
normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
).cuda()
return native, fused


def _prep_rms_layers(normalized_shape, elementwise_affine, dtype):
native = apex.normalization.FusedRMSNorm(
native = FusedRMSNorm(
normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
)
fused = apex.normalization.FusedRMSNorm(
fused = FusedRMSNorm(
normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
).cuda()
return native, fused
@@ -259,8 +260,7 @@ def _run_test(self, dtype, elementwise_affine):

def test_autocast(self):
for (dtype, elementwise_affine) in itertools.product(autocast_dtypes, (True, False)):
with self.subTest(f"{dtype}-{elementwise_affine}"):
self._run_test(dtype, elementwise_affine)
self._run_test(dtype, elementwise_affine)

class TestAutocastFusedRMSNorm(unittest.TestCase):
bf16_fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
@@ -291,5 +291,8 @@ def _run_test(self, dtype, elementwise_affine):

def test_autocast(self):
for (dtype, elementwise_affine) in itertools.product(autocast_dtypes, (True, False)):
with self.subTest(f"{dtype}-{elementwise_affine}"):
self._run_test(dtype, elementwise_affine)
self._run_test(dtype, elementwise_affine)


if __name__ == "__main__":
unittest.main()
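
The autocast tests above keep per-dtype tolerance dictionaries such as `bf16_fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)`, in line with the commit's move toward `torch.testing`. A small sketch of how such a dictionary is typically unpacked into `torch.testing.assert_close`; the tensors here are illustrative, not taken from the suite:

    import torch

    # Illustrative threshold dict mirroring the bf16 forward tolerances above.
    bf16_fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)

    expected = torch.randn(8, 16)
    actual = expected.to(torch.bfloat16).float()  # emulate a lower-precision path

    # assert_close takes rtol/atol keyword arguments, so the dict can be
    # unpacked straight into the call.
    torch.testing.assert_close(actual, expected, **bf16_fwd_thresholds)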
142 changes: 61 additions & 81 deletions tests/L0/run_mlp/test_mlp.py
@@ -1,9 +1,12 @@
"""Tests for c++ MLP"""
import unittest
from itertools import product
from time import time

import torch
from torch import nn
from torch.testing._internal import common_utils
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_device_type import onlyCUDA

from apex.mlp import MLP

@@ -14,7 +17,7 @@


# note(crcrpar): On Ampere, this test should be run without TF32 enabled.
class TestMLP(unittest.TestCase):
class TestMLP(common_utils.TestCase):
def test_creation(self):
MLP(mlp_sizes)

@@ -24,10 +27,51 @@ def test_numeric(self):
mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
mlp.weights[i].data.copy_(linear.weight)
mlp.biases[i].data.copy_(linear.bias)
with torch.no_grad():
mlp.weights[i].copy_(linear.weight)
mlp.biases[i].copy_(linear.bias)
mlp_layers.append(linear)
mlp_layers.append(nn.ReLU(inplace=True))
mlp_layers.append(nn.ReLU())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda")
.uniform_(-1.0, 1.0)
.requires_grad_()
)
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
self.assertEqual(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
self.assertEqual(test_input.grad, ref_input.grad)
self.assertEqual(mlp.biases[0].grad, ref_mlp[0].bias.grad)

@common_utils.parametrize(
"use_activation,bias",
list(product(("none", "relu", "sigmoid"), (True, False))),
)
def test_mlp(self, use_activation: str, bias: bool):
# for use_activation in ["none", "relu", "sigmoid"]:
msg = f"activation: {use_activation}, bias: {bias}"
mlp = MLP(mlp_sizes, bias=bias, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=bias)
with torch.no_grad():
mlp.weights[i].copy_(linear.weight)
if bias:
mlp.biases[i].copy_(linear.bias)
mlp_layers.append(linear)
if use_activation == "relu":
mlp_layers.append(nn.ReLU())
if use_activation == "sigmoid":
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

Expand All @@ -39,90 +83,23 @@ def test_numeric(self):
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
torch.testing.assert_close(mlp_out, ref_out)
self.assertEqual(mlp_out, ref_out, msg=msg)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(test_input.grad, ref_input.grad)
torch.testing.assert_close(mlp.biases[0].grad, ref_mlp[0].bias.grad)

def test_no_bias(self):
for use_activation in ["none", "relu", "sigmoid"]:
with self.subTest(use_activation=use_activation):
mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
mlp.weights[i].data.copy_(linear.weight)
mlp_layers.append(linear)
if use_activation == "relu":
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == "sigmoid":
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda")
.uniform_(-1.0, 1.0)
.requires_grad_()
)
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(test_input.grad, ref_input.grad)
torch.testing.assert_close(mlp.weights[0].grad, ref_mlp[0].weight.grad)

def test_with_bias(self):
for use_activation in ["none", "relu", "sigmoid"]:
with self.subTest(use_activation=use_activation):
mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=True)
mlp.weights[i].data.copy_(linear.weight)
mlp.biases[i].data.copy_(linear.bias)
mlp_layers.append(linear)
if use_activation == "relu":
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == "sigmoid":
mlp_layers.append(nn.Sigmoid())

ref_mlp = nn.Sequential(*mlp_layers).cuda()

test_input = (
torch.empty(batch_size, mlp_sizes[0], device="cuda")
.uniform_(-1.0, 1.0)
.requires_grad_()
)
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
torch.testing.assert_close(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(test_input.grad, ref_input.grad)
torch.testing.assert_close(mlp.weights[0].grad, ref_mlp[0].weight.grad)
torch.testing.assert_close(mlp.biases[0].grad, ref_mlp[0].bias.grad)
self.assertEqual(test_input.grad, ref_input.grad, msg=msg)
self.assertEqual(mlp.weights[0].grad, ref_mlp[0].weight.grad, msg=msg)

def test_no_grad(self):
mlp = MLP(mlp_sizes).cuda()

mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
mlp.weights[i].data.copy_(linear.weight)
mlp.biases[i].data.copy_(linear.bias)
with torch.no_grad():
mlp.weights[i].copy_(linear.weight)
mlp.biases[i].copy_(linear.bias)
mlp_layers.append(linear)
mlp_layers.append(nn.ReLU(inplace=True))

Expand All @@ -132,12 +109,12 @@ def test_no_grad(self):
ref_input = test_input.clone().detach()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
torch.testing.assert_close(mlp_out, ref_out)
self.assertEqual(mlp_out, ref_out)

# Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
mlp_out.mean().mul(10.0).backward()
ref_out.mean().mul(10.0).backward()
torch.testing.assert_close(mlp.weights[0].grad, ref_mlp[0].weight.grad)
self.assertEqual(mlp.weights[0].grad, ref_mlp[0].weight.grad)

def test_performance_half(self):
mlp = MLP(mlp_sizes).cuda().half()
@@ -206,5 +183,8 @@ def test_performance_half(self):
)


instantiate_device_type_tests(TestMLP, globals(), only_for=("cuda",))


if __name__ == "__main__":
unittest.main()
common_utils.run_tests()
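
test_mlp.py now builds on PyTorch's internal test utilities: `common_utils.parametrize` expands each `(use_activation, bias)` pair into its own named test, `instantiate_device_type_tests(..., only_for=("cuda",))` generates the CUDA-bound class, and `common_utils.run_tests()` replaces `unittest.main()`. A stripped-down sketch of that skeleton with a placeholder test body (the `TestExample` class and its contents are invented for illustration):

    from itertools import product

    import torch
    from torch.testing._internal import common_utils
    from torch.testing._internal.common_device_type import instantiate_device_type_tests


    class TestExample(common_utils.TestCase):
        # Each (use_activation, bias) pair becomes a separately named test,
        # so an XML report lists every configuration on its own line instead
        # of hiding it inside subTest.
        @common_utils.parametrize(
            "use_activation,bias",
            list(product(("none", "relu", "sigmoid"), (True, False))),
        )
        def test_combo(self, device, use_activation: str, bias: bool):
            layer = torch.nn.Linear(3, 3, bias=bias).to(device)
            out = layer(torch.ones(2, 3, device=device))
            self.assertEqual(out.shape, torch.Size([2, 3]))


    # only_for restricts instantiation to CUDA, mirroring the call above; the
    # generated class (e.g. TestExampleCUDA) is placed into globals() so the
    # loader can discover it.
    instantiate_device_type_tests(TestExample, globals(), only_for=("cuda",))


    if __name__ == "__main__":
        common_utils.run_tests()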
29 changes: 25 additions & 4 deletions tests/L0/run_test.py
@@ -2,7 +2,9 @@
How to run this script?
1. Run all the tests: `python /path/to/apex/tests/L0/run_test.py`
1. Run all the tests: `python /path/to/apex/tests/L0/run_test.py` If you want an xml report,
pass `--xml-report`, i.e. `python /path/to/apex/tests/L0/run_test.py --xml-report` and
the file is created in `/path/to/apex/tests/L0`.
2. Run one of the tests (e.g. fused layer norm):
`python /path/to/apex/tests/L0/run_test.py --include run_fused_layer_norm`
3. Run two or more of the tests (e.g. optimizers and fused layer norm):
@@ -43,16 +45,35 @@ def parse_args():
default=DEFAULT_TEST_DIRS,
help="select a set of tests to run (defaults to ALL tests).",
)
parser.add_argument(
"--xml-report",
action="store_true",
help="pass this argument to get a junit xml report. (requires `xmlrunner`)",
)
args, _ = parser.parse_known_args()
return args


def main(args):
runner = unittest.TextTestRunner(verbosity=2)
def main(args: argparse.Namespace) -> None:
test_runner_kwargs = {"verbosity": 2}
Runner = unittest.TextTestRunner
if args.xml_report:
import xmlrunner
from datetime import date # NOQA
Runner = xmlrunner.XMLTestRunner

errcode = 0
for test_dir in args.include:
if args.xml_report:
this_dir = os.path.abspath(os.path.dirname(__file__))
xml_output = os.path.join(
this_dir,
f"""TEST_{test_dir}_{date.today().strftime("%y%m%d")}""",
)
test_runner_kwargs["output"] = xml_output

runner = Runner(**test_runner_kwargs)
test_dir = os.path.join(TEST_ROOT, test_dir)
print(test_dir)
suite = unittest.TestLoader().discover(test_dir)

print("\nExecuting tests from " + test_dir)
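
The runner swap above boils down to the pattern below, sketched here with a hard-coded test directory and report name. The `xmlrunner` module comes from the unittest-xml-reporting package; `output` may be a directory path (as used here) into which JUnit-style XML files are written, or a file object opened in binary mode, which is what the "open with `"wb"`" bullet in the commit message refers to.

    import os
    import unittest

    import xmlrunner  # provided by the unittest-xml-reporting package

    # Discover one L0 test directory and hand the suite to XMLTestRunner
    # instead of unittest.TextTestRunner; the report directory name mirrors
    # the TEST_<dir>_<yymmdd> scheme used above (e.g. for Jul 14, 2023).
    test_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run_fused_layer_norm")
    suite = unittest.TestLoader().discover(test_dir)

    runner = xmlrunner.XMLTestRunner(output="TEST_run_fused_layer_norm_230714", verbosity=2)
    runner.run(suite)

With the new flag, the equivalent invocation is `python tests/L0/run_test.py --xml-report --include run_fused_layer_norm`, and the report lands next to run_test.py as described in the updated docstring.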