From 88413806fd9f7f27d65f55d4a961954a8aa47690 Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo
Date: Thu, 7 Jul 2022 16:45:17 -0700
Subject: [PATCH] [Pylint] Pylint integration_tests folder (#11672)

* add folder to pylint
* add init py
* lint test_arm_mprofile_dsp.py
* one more change to tests/python/integration/test_arm_mprofile_dsp.py
* add test_dot
* test_ewise_fpga.py
* test_ewise.py
* test gemm
* test_lower.py
* test_meta_schedule_auto_tensorize.py
* test_reduce.py pt1
* test_reduce.py pt2
* test_scan.py
* test_tuning.py
* test_winograd_nnpack.py
* final test pass
* comments
* clean up test_lower more
---
 tests/lint/pylint.sh                          |   1 +
 tests/python/integration/__init__.py          |  17 +
 .../integration/test_arm_mprofile_dsp.py      |  10 +-
 tests/python/integration/test_dot.py          |  43 +-
 tests/python/integration/test_ewise.py        | 278 +++++----
 tests/python/integration/test_ewise_fpga.py   |  75 ++-
 tests/python/integration/test_gemm.py         | 115 ++--
 tests/python/integration/test_lower.py        | 360 ++++++-----
 .../test_meta_schedule_auto_tensorize.py      |  61 +-
 tests/python/integration/test_reduce.py       | 585 ++++++++++--------
 tests/python/integration/test_scan.py         |  59 +-
 tests/python/integration/test_tuning.py       | 188 +++---
 .../integration/test_winograd_nnpack.py       |  67 +-
 13 files changed, 1089 insertions(+), 770 deletions(-)
 create mode 100644 tests/python/integration/__init__.py

diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh
index 39568fd3417e..61ffb0fd9254 100755
--- a/tests/lint/pylint.sh
+++ b/tests/lint/pylint.sh
@@ -23,3 +23,4 @@ python3 -m pylint tests/python/unittest/test_tvmscript_type.py --rcfile="$(dirna
 python3 -m pylint tests/python/contrib/test_cmsisnn --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/relay/aot/*.py --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc
+python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc
diff --git a/tests/python/integration/__init__.py b/tests/python/integration/__init__.py
new file mode 100644
index 000000000000..56984ac61535
--- /dev/null
+++ b/tests/python/integration/__init__.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Infrastructure and tests for e2e integration tests."""
diff --git a/tests/python/integration/test_arm_mprofile_dsp.py b/tests/python/integration/test_arm_mprofile_dsp.py
index 2bcf284f3d77..22b4ebaab832 100644
--- a/tests/python/integration/test_arm_mprofile_dsp.py
+++ b/tests/python/integration/test_arm_mprofile_dsp.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import sys +"""Test arm mprofile dsp.""" import numpy as np import pytest import tvm @@ -173,16 +173,16 @@ def test_conv1d(data_shape_nwc, kernel_size, num_filter, strides, padding, dtype @tvm.testing.requires_corstone300 @pytest.mark.parametrize( - "M, K, N", + "dim_m, dim_k, dim_n", [ (1, 32, 64), (3, 12, 10), ], ) -def test_dense(M, K, N): +def test_dense(dim_m, dim_k, dim_n): """Test a subgraph with a single dense operator.""" - ishape = (M, K) - wshape = (N, K) + ishape = (dim_m, dim_k) + wshape = (dim_n, dim_k) input0 = relay.var("input", relay.TensorType(ishape, "int8")) dense_f = relay.op.nn.batch_flatten(input0) diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py index 41abb51a2e99..20e628c8c14b 100644 --- a/tests/python/integration/test_dot.py +++ b/tests/python/integration/test_dot.py @@ -14,31 +14,46 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Test scheduling and running a dot product.""" +import numpy as np + import tvm import tvm.testing from tvm import te -import numpy as np @tvm.testing.requires_llvm def test_dot(): - nn = 12 - n = tvm.runtime.convert(nn) - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - k = te.reduce_axis((0, n), "k") - C = te.compute((), lambda: te.sum(A[k] * B[k], axis=k), name="C") - s = te.create_schedule(C.op) + """Test dot product.""" + arr_length = 12 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a = te.placeholder((arr_length_tvm,), name="A") + placeholder_b = te.placeholder((arr_length_tvm,), name="B") + reduce_axis_k = te.reduce_axis((0, arr_length_tvm), "k") + result_c = te.compute( + (), + lambda: te.sum( + placeholder_a[reduce_axis_k] * placeholder_b[reduce_axis_k], axis=reduce_axis_k + ), + name="C", + ) + schedule = te.create_schedule(result_c.op) def verify(target): - f = tvm.driver.build(s, [A, B, C], target) + f = tvm.driver.build(schedule, [placeholder_a, placeholder_b, result_c], target) # verify dev = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((), dtype=C.dtype), dev) - f(a, b, c) - tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(arr_length,)).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array( + np.random.uniform(size=(arr_length,)).astype(placeholder_b.dtype), dev + ) + buff_c = tvm.nd.array(np.zeros((), dtype=result_c.dtype), dev) + f(buff_a, buff_b, buff_c) + tvm.testing.assert_allclose( + buff_c.numpy(), np.dot(buff_a.numpy(), buff_b.numpy()), rtol=1e-4 + ) verify("llvm") diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index 3250efc3f71e..8bfa6b17175d 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -14,26 +14,29 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Test elementwise integration.""" +import numpy as np + import tvm +import tvm.testing from tvm import te from tvm.contrib import nvcc -import numpy as np -import time -import tvm.testing @tvm.testing.requires_gpu def test_exp(): + """Test scheduling and running exponent.""" # graph - n = tvm.runtime.convert(1024) - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name="B") - s = te.create_schedule(B.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a = te.placeholder((arr_length_tvm,), name="A") + placeholder_b = te.compute(placeholder_a.shape, lambda *i: te.exp(placeholder_a(*i)), name="B") + schedule = te.create_schedule(placeholder_b.op) # create iter var and assign them tags. num_thread = 8 - bx, tx = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) + axis1, axis2 = schedule[placeholder_b].split(placeholder_b.op.axis[0], factor=num_thread) + schedule[placeholder_b].bind(axis1, te.thread_axis("blockIdx.x")) + schedule[placeholder_b].bind(axis2, te.thread_axis("threadIdx.x")) # one line to build the function. def check_device(device, host="stackvm"): @@ -43,14 +46,13 @@ def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fexp = tvm.build(s, [A, B], device, host, name="myexp") + fexp = tvm.build(schedule, [placeholder_a, placeholder_b], device, host, name="myexp") dev = tvm.device(device, 0) # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) - fexp(a, b) - tvm.testing.assert_allclose(b.numpy(), np.exp(a.numpy()), rtol=1e-5) + buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros(arr_length, dtype=placeholder_b.dtype), dev) + fexp(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), np.exp(buff_a.numpy()), rtol=1e-5) check_device("opencl -device=intel_graphics") check_device("cuda", "llvm") @@ -59,16 +61,19 @@ def check_device(device, host="stackvm"): @tvm.testing.requires_gpu def test_fmod(): + """Test scheduling and running fmod.""" # graph def run(dtype): - n = te.size_var("n") - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.placeholder((n,), name="B", dtype=dtype) - C = te.compute(A.shape, lambda *i: te.fmod(A(*i), B(*i)), name="C") - s = te.create_schedule(C.op) + size_var_n = te.size_var("n") + placeholder_a = te.placeholder((size_var_n,), name="A", dtype=dtype) + placeholder_b = te.placeholder((size_var_n,), name="B", dtype=dtype) + result_c = te.compute( + placeholder_a.shape, lambda *i: te.fmod(placeholder_a(*i), placeholder_b(*i)), name="C" + ) + schedule = te.create_schedule(result_c.op) # create iter var and assign them tags. 
num_thread = 8 - bx, tx = s[C].split(C.op.axis[0], factor=num_thread) + axis0, axis1 = schedule[result_c].split(result_c.op.axis[0], factor=num_thread) def check_device(device): dev = tvm.device(device, 0) @@ -77,26 +82,29 @@ def check_device(device): return target = tvm.target.Target(device) if "cpu" not in target.keys: - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - fmod = tvm.build(s, [A, B, C], device, name="myfmod") + schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x")) + fmod = tvm.build( + schedule, [placeholder_a, placeholder_b, result_c], device, name="myfmod" + ) # launch the kernel. - n = 1024 - a_np = (np.random.uniform(size=n) * 256).astype(A.dtype) - b_np = (np.random.uniform(size=n) * 256).astype(B.dtype) + value_n = 1024 + a_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_a.dtype) + b_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_b.dtype) # "fix" the values in a and b to avoid the result being too small b_np += (b_np < 2.0) * 2 a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1 - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros(value_n, dtype=result_c.dtype), dev) ftimer = fmod.time_evaluator(fmod.entry_name, dev, number=1) - tcost = ftimer(a, b, c).mean - # fmod(a, b, c) - np.testing.assert_allclose(c.numpy(), np.mod(a.numpy(), b.numpy()), rtol=1e-5) + _ = ftimer(buff_a, buff_b, buff_c).mean + np.testing.assert_allclose( + buff_c.numpy(), np.mod(buff_a.numpy(), buff_b.numpy()), rtol=1e-5 + ) check_device("cuda") check_device("opencl -device=intel_graphics") @@ -107,21 +115,30 @@ def check_device(device): @tvm.testing.requires_gpu def test_multiple_cache_write(): + """Test multiple cache writes.""" # graph - n = tvm.runtime.convert(1024) - A0 = te.placeholder((n,), name="A0", dtype="float32") - A1 = te.placeholder((n,), name="A1", dtype="float32") - B0, B1 = te.compute((n,), lambda *i: (A0(*i) + A1(*i), A0(*i) * A1(*i)), name="B") - C = te.compute((n,), lambda *i: B0(*i) + B1(*i), name="C") - s = te.create_schedule(C.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a0 = te.placeholder((arr_length_tvm,), name="A0", dtype="float32") + placeholder_a1 = te.placeholder((arr_length_tvm,), name="A1", dtype="float32") + result_b0, result_b1 = te.compute( + (arr_length_tvm,), + lambda *i: ( + placeholder_a0(*i) + placeholder_a1(*i), + placeholder_a0(*i) * placeholder_a1(*i), + ), + name="B", + ) + result_c = te.compute((arr_length_tvm,), lambda *i: result_b0(*i) + result_b1(*i), name="C") + schedule = te.create_schedule(result_c.op) # create iter var and assign them tags. 
num_thread = 8 - B0_cache, B1_cache = s.cache_write([B0, B1], "local") - bx, tx = s[C].split(C.op.axis[0], factor=num_thread) - s[B0].compute_at(s[C], bx) - s[B0_cache].compute_at(s[C], bx) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) + cache_b0, _ = schedule.cache_write([result_b0, result_b1], "local") + axis0, axis1 = schedule[result_c].split(result_c.op.axis[0], factor=num_thread) + schedule[result_b0].compute_at(schedule[result_c], axis0) + schedule[cache_b0].compute_at(schedule[result_c], axis0) + schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x")) # one line to build the function. def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(host): @@ -129,16 +146,23 @@ def check_device(device, host="stackvm"): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): return - func = tvm.build(s, [A0, A1, C], device, host, name="multiple_cache_write") + func = tvm.build( + schedule, + [placeholder_a0, placeholder_a1, result_c], + device, + host, + name="multiple_cache_write", + ) dev = tvm.device(device, 0) # launch the kernel. - n = 1024 - a0 = tvm.nd.array(np.random.uniform(size=n).astype(A0.dtype), dev) - a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - func(a0, a1, c) + buff_a0 = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a0.dtype), dev) + buff_a1 = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a1.dtype), dev) + buff_c = tvm.nd.array(np.zeros(arr_length, dtype=result_c.dtype), dev) + func(buff_a0, buff_a1, buff_c) tvm.testing.assert_allclose( - c.numpy(), a0.numpy() + a1.numpy() + (a0.numpy() * a1.numpy()), rtol=1e-5 + buff_c.numpy(), + buff_a0.numpy() + buff_a1.numpy() + (buff_a0.numpy() * buff_a1.numpy()), + rtol=1e-5, ) check_device("cuda", "llvm") @@ -147,41 +171,49 @@ def check_device(device, host="stackvm"): def test_log_pow_llvm(): + """Test log pow using llvm to lower.""" # graph - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda *i: te.power(te.log(A(*i)), 2.0), name="B") - s = te.create_schedule(B.op) + size_var_n = te.size_var("n") + placeholder_a = te.placeholder((size_var_n,), name="A") + result_b = te.compute( + placeholder_a.shape, lambda *i: te.power(te.log(placeholder_a(*i)), 2.0), name="B" + ) + schedule = te.create_schedule(result_b.op) # create iter var and assign them tags. - bx, tx = s[B].split(B.op.axis[0], factor=32) + schedule[result_b].split(result_b.op.axis[0], factor=32) # one line to build the function. if not tvm.testing.device_enabled("llvm"): return - flog = tvm.build(s, [A, B], "llvm", name="mylog") + flog = tvm.build(schedule, [placeholder_a, result_b], "llvm", name="mylog") dev = tvm.cpu(0) # launch the kernel. 
- n = 1028 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) + size_var_n = 1028 + buff_a = tvm.nd.array(np.random.uniform(size=size_var_n).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros(size_var_n, dtype=result_b.dtype), dev) repeat = 10 ftimer = flog.time_evaluator(flog.entry_name, dev, number=1, repeat=repeat) - res = ftimer(a, b) + res = ftimer(buff_a, buff_b) assert len(res.results) == repeat - tvm.testing.assert_allclose(b.numpy(), np.power(np.log(a.numpy()), 2.0), rtol=1e-5) + tvm.testing.assert_allclose(buff_b.numpy(), np.power(np.log(buff_a.numpy()), 2.0), rtol=1e-5) @tvm.testing.uses_gpu def test_popcount(): + """Test popcount.""" + def run(dtype): # graph - n = tvm.runtime.convert(1024) - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.compute(A.shape, lambda *i: tvm.tir.popcount(A(*i)), name="B") - s = te.create_schedule(B.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(1024) + placeholder_a = te.placeholder((arr_length_tvm,), name="A", dtype=dtype) + placeholder_b = te.compute( + placeholder_a.shape, lambda *i: tvm.tir.popcount(placeholder_a(*i)), name="B" + ) + schedule = te.create_schedule(placeholder_b.op) # simple schedule num_thread = 8 - bx, tx = s[B].split(B.op.axis[0], factor=num_thread) + axis1, axis2 = schedule[placeholder_b].split(placeholder_b.op.axis[0], factor=num_thread) def check_device(device): dev = tvm.device(device, 0) @@ -190,16 +222,17 @@ def check_device(device): return target = tvm.target.Target(device) if "cpu" not in target.keys: - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) - func = tvm.build(s, [A, B], device) + schedule[placeholder_b].bind(axis1, te.thread_axis("blockIdx.x")) + schedule[placeholder_b].bind(axis2, te.thread_axis("threadIdx.x")) + func = tvm.build(schedule, [placeholder_a, placeholder_b], device) # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), dev) - b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), dev) - func(a, b) + buff_a = tvm.nd.array( + np.random.randint(low=0, high=1000, size=arr_length, dtype=placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(shape=arr_length, dtype=placeholder_b.dtype), dev) + func(buff_a, buff_b) tvm.testing.assert_allclose( - b.numpy(), list(map(lambda x: bin(x).count("1"), a.numpy())), rtol=1e-5 + buff_b.numpy(), list(map(lambda x: bin(x).count("1"), buff_a.numpy())), rtol=1e-5 ) check_device("llvm") @@ -215,24 +248,26 @@ def check_device(device): @tvm.testing.requires_gpu def test_add(): + """Test addition.""" + def run(dtype): # graph - n = te.size_var("n") - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.placeholder((n,), name="B", dtype=dtype) - bias = te.var("bias", dtype=dtype) - scale = te.var("scale", dtype=dtype) - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") + size_var_n = te.size_var("n") + placeholder_a = te.placeholder((size_var_n,), name="A", dtype=dtype) + placeholder_b = te.placeholder((size_var_n,), name="B", dtype=dtype) + result_c = te.compute( + placeholder_a.shape, lambda *i: placeholder_a(*i) + placeholder_b(*i), name="C" + ) # schedule - s = te.create_schedule(C.op) + schedule = te.create_schedule(result_c.op) # create iter var and assign them tags. 
num_thread = 16 - bx, x = s[C].split(C.op.axis[0], factor=num_thread * 4) - tx, x = s[C].split(x, nparts=num_thread) - _, x = s[C].split(x, factor=4) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - s[C].vectorize(x) + axis_bx, axis_x = schedule[result_c].split(result_c.op.axis[0], factor=num_thread * 4) + axis_tx, axis_x = schedule[result_c].split(axis_x, nparts=num_thread) + _, axis_x = schedule[result_c].split(axis_x, factor=4) + schedule[result_c].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis_tx, te.thread_axis("threadIdx.x")) + schedule[result_c].vectorize(axis_x) # one line to build the function. def check_device(device): @@ -240,16 +275,22 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fadd = tvm.build(s, [A, B, C], device, name="myadd") + fadd = tvm.build( + schedule, [placeholder_a, placeholder_b, result_c], device, name="myadd" + ) # launch the kernel. n = 1024 - a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), dev) - b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + buff_a = tvm.nd.array( + (np.random.uniform(size=n) * 256).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array( + (np.random.uniform(size=n) * 256).astype(placeholder_b.dtype), dev + ) + buff_c = tvm.nd.array(np.zeros(n, dtype=result_c.dtype), dev) ftimer = fadd.time_evaluator(fadd.entry_name, dev, number=1) - tcost = ftimer(a, b, c).mean - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy(), rtol=1e-6) + _ = ftimer(buff_a, buff_b, buff_c).mean + tvm.testing.assert_allclose(buff_c.numpy(), buff_a.numpy() + buff_b.numpy(), rtol=1e-6) check_device("opencl") check_device("cuda") @@ -265,25 +306,26 @@ def check_device(device): @tvm.testing.requires_gpu def try_warp_memory(): - """skip this in default test because it require higher arch""" - m = 128 - A = te.placeholder((m,), name="A") - B = te.compute((m,), lambda i: A[i] + 3, name="B") + """Test using warp memory + skip this in default test because it require higher arch""" + arr_size = 128 + placeholder_a = te.placeholder((arr_size,), name="A") + result_b = te.compute((arr_size,), lambda i: placeholder_a[i] + 3, name="B") warp_size = 32 - s = te.create_schedule(B.op) - AA = s.cache_read(A, "warp", [B]) - xo, xi = s[B].split(B.op.axis[0], warp_size * 2) - xi0, xi1 = s[B].split(xi, factor=warp_size) - tx = te.thread_axis("threadIdx.x") - s[B].bind(xi1, tx) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[AA].compute_at(s[B], xo) - xo, xi = s[AA].split(s[AA].op.axis[0], warp_size) - s[AA].bind(xi, tx) + schedule = te.create_schedule(result_b.op) + cache_read_aa = schedule.cache_read(placeholder_a, "warp", [result_b]) + axis_x0, axis_xi = schedule[result_b].split(result_b.op.axis[0], warp_size * 2) + _, axis_xi1 = schedule[result_b].split(axis_xi, factor=warp_size) + thread_axis_tx = te.thread_axis("threadIdx.x") + schedule[result_b].bind(axis_xi1, thread_axis_tx) + schedule[result_b].bind(axis_x0, te.thread_axis("blockIdx.x")) + schedule[cache_read_aa].compute_at(schedule[result_b], axis_x0) + axis_x0, axis_xi = schedule[cache_read_aa].split(schedule[cache_read_aa].op.axis[0], warp_size) + schedule[cache_read_aa].bind(axis_xi, thread_axis_tx) @tvm.register_func("tvm_callback_cuda_compile", override=True) - def tvm_callback_cuda_compile(code): - ptx = nvcc.compile_cuda(code, target_format="ptx") + def 
tvm_callback_cuda_compile(code): # pylint: disable=unused-variable + ptx = nvcc.compile_cuda(code) return ptx # one line to build the function. @@ -292,11 +334,13 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - f = tvm.build(s, [A, B], device) - a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) - f(a, b) - tvm.testing.assert_allclose(b.numpy(), a.numpy() + 3, rtol=1e-6) + myfunc = tvm.build(schedule, [placeholder_a, result_b], device) + buff_a = tvm.nd.array( + (np.random.uniform(size=arr_size) * 256).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(arr_size, dtype=result_b.dtype), dev) + myfunc(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), buff_a.numpy() + 3, rtol=1e-6) check_device("cuda") diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py index 6171c37b1672..7b247d7d527f 100644 --- a/tests/python/integration/test_ewise_fpga.py +++ b/tests/python/integration/test_ewise_fpga.py @@ -14,11 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Test elementwise ops on fpga.""" +import os + +import numpy as np + import tvm import tvm.testing from tvm import te -import numpy as np -import os os.environ["XCL_EMULATION_MODE"] = "1" os.environ["CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA"] = "1" @@ -32,28 +35,29 @@ def tvm_callback_vhls_postproc(code): def test_exp(): + """Test scheduling and running exp function.""" # graph - n = tvm.runtime.convert(1024) - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name="B") - s = te.create_schedule(B.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_b = te.placeholder((arr_length_tvm,), name="A") + result_b = te.compute(placeholder_b.shape, lambda *i: te.exp(placeholder_b(*i)), name="B") + schedule = te.create_schedule(result_b.op) # create iter var and assign them tags. - px, x = s[B].split(B.op.axis[0], nparts=1) - s[B].bind(px, te.thread_axis("pipeline")) + axis1, _ = schedule[result_b].split(result_b.op.axis[0], nparts=1) + schedule[result_b].bind(axis1, te.thread_axis("pipeline")) # one line to build the function. def check_device(device, host="llvm"): if not tvm.testing.device_enabled(device): return dev = tvm.device(device, 0) - fexp = tvm.build(s, [A, B], device, host, name="myexp") + fexp = tvm.build(schedule, [placeholder_b, result_b], device, host, name="myexp") dev = tvm.device(device, 0) # launch the kernel. 
- n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) - fexp(a, b) - tvm.testing.assert_allclose(b.numpy(), np.exp(a.numpy()), rtol=1e-5) + buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_b.dtype), dev) + buff_b = tvm.nd.array(np.zeros(arr_length, dtype=result_b.dtype), dev) + fexp(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), np.exp(buff_a.numpy()), rtol=1e-5) check_device("sdaccel") if "AWS_PLATFORM" in os.environ: @@ -63,34 +67,41 @@ def check_device(device, host="llvm"): def test_multi_kernel(): + """Test scheduling with multiple computes.""" # graph - n = tvm.runtime.convert(1024) - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - D = te.compute(A.shape, lambda *i: A(*i) + C(*i), name="D") - s = te.create_schedule(D.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a = te.placeholder((arr_length_tvm,), name="A") + placeholder_b = te.placeholder((arr_length_tvm,), name="B") + result_c = te.compute( + placeholder_a.shape, lambda *i: placeholder_a(*i) + placeholder_b(*i), name="C" + ) + result_d = te.compute( + placeholder_a.shape, lambda *i: placeholder_a(*i) + result_c(*i), name="D" + ) + schedule = te.create_schedule(result_d.op) # create iter var and assign them tags. - px, x = s[C].split(C.op.axis[0], nparts=1) - s[C].bind(px, te.thread_axis("pipeline")) - px, x = s[D].split(D.op.axis[0], nparts=1) - s[D].bind(px, te.thread_axis("pipeline")) + axis1, _ = schedule[result_c].split(result_c.op.axis[0], nparts=1) + schedule[result_c].bind(axis1, te.thread_axis("pipeline")) + axis1, _ = schedule[result_d].split(result_d.op.axis[0], nparts=1) + schedule[result_d].bind(axis1, te.thread_axis("pipeline")) # one line to build the function. def check_device(device, host="llvm"): if not tvm.testing.device_enabled(device): return dev = tvm.device(device, 0) - fadd = tvm.build(s, [A, B, C, D], device, host, name="myadd") + fadd = tvm.build( + schedule, [placeholder_a, placeholder_b, result_c, result_d], device, host, name="myadd" + ) dev = tvm.device(device, 0) # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), dev) - d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), dev) - fadd(a, b, c, d) - tvm.testing.assert_allclose(d.numpy(), a.numpy() * 2 + b.numpy(), rtol=1e-5) + buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_b.dtype), dev) + buff_c = tvm.nd.array(np.random.uniform(size=arr_length).astype(result_c.dtype), dev) + buff_d = tvm.nd.array(np.random.uniform(size=arr_length).astype(result_d.dtype), dev) + fadd(buff_a, buff_b, buff_c, buff_d) + tvm.testing.assert_allclose(buff_d.numpy(), buff_a.numpy() * 2 + buff_b.numpy(), rtol=1e-5) check_device("sdaccel") check_device("aocl_sw_emu") diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py index aa6c5a1e74e1..66d777989d8c 100644 --- a/tests/python/integration/test_gemm.py +++ b/tests/python/integration/test_gemm.py @@ -14,27 +14,32 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -import tvm -from tvm import te +"""Test scheduling and running a gemm!""" import numpy as np -import time + +import tvm import tvm.testing +from tvm import te @tvm.testing.requires_gpu def test_gemm(): + """Test the gemm!""" # graph - nn = 1024 - n = tvm.runtime.convert(nn) - m = n - l = n - A = te.placeholder((n, l), name="A") - B = te.placeholder((m, l), name="B") - k = te.reduce_axis((0, l), name="k") - C = te.compute((n, m), lambda ii, jj: te.sum(A[ii, k] * B[jj, k], axis=k), name="CC") + dim1_length = 1024 + dim_n = tvm.runtime.convert(dim1_length) + dim_m = dim_n + dim_l = dim_n + placeholder_a = te.placeholder((dim_n, dim_l), name="A") + placeholder_b = te.placeholder((dim_m, dim_l), name="B") + axis_k = te.reduce_axis((0, dim_l), name="k") + result_c = te.compute( + (dim_n, dim_m), + lambda ii, jj: te.sum(placeholder_a[ii, axis_k] * placeholder_b[jj, axis_k], axis=axis_k), + name="CC", + ) # schedule - s = te.create_schedule(C.op) - xtile, ytile = 32, 32 + schedule = te.create_schedule(result_c.op) scale = 8 num_thread = 8 block_factor = scale * num_thread @@ -43,39 +48,43 @@ def test_gemm(): block_y = te.thread_axis("blockIdx.y") thread_y = te.thread_axis("threadIdx.y") - CC = s.cache_write(C, "local") - AA = s.cache_read(A, "shared", [CC]) - BB = s.cache_read(B, "shared", [CC]) - by, yi = s[C].split(C.op.axis[0], factor=block_factor) - bx, xi = s[C].split(C.op.axis[1], factor=block_factor) - s[C].reorder(by, bx, yi, xi) - s[C].bind(by, block_y) - s[C].bind(bx, block_x) - ty, yi = s[C].split(yi, nparts=num_thread) - tx, xi = s[C].split(xi, nparts=num_thread) - s[C].reorder(ty, tx, yi, xi) - s[C].bind(ty, thread_y) - s[C].bind(tx, thread_x) - yo, xo = CC.op.axis - s[CC].reorder(k, yo, xo) + cache_write = schedule.cache_write(result_c, "local") + cache_read_a = schedule.cache_read(placeholder_a, "shared", [cache_write]) + cache_read_b = schedule.cache_read(placeholder_b, "shared", [cache_write]) + axis_by, axis_yi = schedule[result_c].split(result_c.op.axis[0], factor=block_factor) + axis_bx, axis_xi = schedule[result_c].split(result_c.op.axis[1], factor=block_factor) + schedule[result_c].reorder(axis_by, axis_bx, axis_yi, axis_xi) + schedule[result_c].bind(axis_by, block_y) + schedule[result_c].bind(axis_bx, block_x) + axis_ty, axis_yi = schedule[result_c].split(axis_yi, nparts=num_thread) + axis_tx, axis_xi = schedule[result_c].split(axis_xi, nparts=num_thread) + schedule[result_c].reorder(axis_ty, axis_tx, axis_yi, axis_xi) + schedule[result_c].bind(axis_ty, thread_y) + schedule[result_c].bind(axis_tx, thread_x) + axis_yo, axis_xo = cache_write.op.axis + schedule[cache_write].reorder(axis_k, axis_yo, axis_xo) - s[CC].compute_at(s[C], tx) - s[AA].compute_at(s[CC], k) - s[BB].compute_at(s[CC], k) - s[AA].double_buffer() - s[BB].double_buffer() - ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) - tx, xi = s[AA].split(xi, nparts=num_thread) - s[AA].bind(ty, thread_y) - s[AA].bind(tx, thread_x) + schedule[cache_write].compute_at(schedule[result_c], axis_tx) + schedule[cache_read_a].compute_at(schedule[cache_write], axis_k) + schedule[cache_read_b].compute_at(schedule[cache_write], axis_k) + schedule[cache_read_a].double_buffer() + schedule[cache_read_b].double_buffer() + axis_ty, axis_xi = schedule[cache_read_a].split( + schedule[cache_read_a].op.axis[0], nparts=num_thread + ) + axis_tx, axis_xi = schedule[cache_read_a].split(axis_xi, nparts=num_thread) + 
schedule[cache_read_a].bind(axis_ty, thread_y) + schedule[cache_read_a].bind(axis_tx, thread_x) - ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) - tx, xi = s[BB].split(xi, nparts=num_thread) - s[BB].bind(ty, thread_y) - s[BB].bind(tx, thread_x) + axis_ty, axis_xi = schedule[cache_read_b].split( + schedule[cache_read_b].op.axis[0], nparts=num_thread + ) + axis_tx, axis_xi = schedule[cache_read_b].split(axis_xi, nparts=num_thread) + schedule[cache_read_b].bind(axis_ty, thread_y) + schedule[cache_read_b].bind(axis_tx, thread_x) # lowering test - s = s.normalize() + schedule = schedule.normalize() # one line to build the function. def check_device(device): @@ -85,21 +94,21 @@ def check_device(device): return with tvm.target.Target(device): - f = tvm.build(s, [A, B, C]) + f = tvm.build(schedule, [placeholder_a, placeholder_b, result_c]) # launch the kernel. - n = nn - m = n - l = n - a_np = np.random.uniform(size=(n, l)).astype(A.dtype) - b_np = np.random.uniform(size=(m, l)).astype(B.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) + num_n = dim1_length + num_m = num_n + num_l = num_n + a_np = np.random.uniform(size=(num_n, num_l)).astype(placeholder_a.dtype) + b_np = np.random.uniform(size=(num_m, num_l)).astype(placeholder_b.dtype) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros((num_n, num_m), dtype=result_c.dtype), dev) ftimer = f.time_evaluator(f.entry_name, dev, number=1) - tcost = ftimer(a, b, c).mean + tcost = ftimer(buff_a, buff_b, buff_c).mean print("%s: exec=%g sec/op" % (dev, tcost)) - tvm.testing.assert_allclose(c.numpy(), np.dot(a_np, b_np.T), rtol=1e-5) + tvm.testing.assert_allclose(buff_c.numpy(), np.dot(a_np, b_np.T), rtol=1e-5) check_device("vulkan") check_device("nvptx -mcpu=sm_20") diff --git a/tests/python/integration/test_lower.py b/tests/python/integration/test_lower.py index 63733b05ab3f..1ccdde8b1337 100644 --- a/tests/python/integration/test_lower.py +++ b/tests/python/integration/test_lower.py @@ -14,42 +14,52 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument -"""Test workload for lowering and build""" +"""Test workload for lowering and build.""" +import numpy as np + import tvm -from tvm import tir -from tvm.script import tir as T import tvm.testing -import numpy as np +from tvm.script import tir as T @T.prim_func -def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: +def tensorcore_gemm(handle_a: T.handle, handle_b: T.handle, handle_c: T.handle) -> None: + # pylint: disable=missing-function-docstring # match buffer - A = T.match_buffer(a, [1024, 1024], "float16") - B = T.match_buffer(b, [1024, 1024], "float16") - C = T.match_buffer(c, [1024, 1024], "float32") + match_buffer_a = T.match_buffer(handle_a, [1024, 1024], "float16") + match_buffer_b = T.match_buffer(handle_b, [1024, 1024], "float16") + match_buffer_c = T.match_buffer(handle_c, [1024, 1024], "float32") # body - for blockIdx_x in T.thread_binding(0, 16, "blockIdx.x"): - for blockIdx_y in T.thread_binding(0, 8, "blockIdx.y"): + for block_idx_x in T.thread_binding(0, 16, "blockIdx.x"): + for block_idx_y in T.thread_binding(0, 8, "blockIdx.y"): with T.block(): - bx, by = T.axis.remap("SS", [blockIdx_x, blockIdx_y]) - shared_A = T.alloc_buffer([1024, 1024], "float16", scope="shared") - shared_B = T.alloc_buffer([1024, 1024], "float16", scope="shared") - wmma_A = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_a") - wmma_B = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_b") - wmma_C = T.alloc_buffer([1024, 1024], "float32", scope="wmma.accumulator") - for ty in T.thread_binding(0, 2, "threadIdx.y"): - for tz in T.thread_binding(0, 2, "threadIdx.z"): - for i, j in T.grid(2, 4): + axis_bx, axis_by = T.axis.remap("SS", [block_idx_x, block_idx_y]) + shared_a = T.alloc_buffer([1024, 1024], "float16", scope="shared") + shared_b = T.alloc_buffer([1024, 1024], "float16", scope="shared") + wmma_a = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_a") + wmma_b = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_b") + wmma_c = T.alloc_buffer([1024, 1024], "float32", scope="wmma.accumulator") + + # pylint: disable=too-many-nested-blocks + for thread_ty in T.thread_binding(0, 2, "threadIdx.y"): + for thread_tz in T.thread_binding(0, 2, "threadIdx.z"): + for index_i, index_jj in T.grid(2, 4): with T.block(): - vi = T.axis.S(64, bx * 4 + ty * 2 + i) - vj = T.axis.S(64, by * 8 + tz * 4 + j) + new_axis_vi = T.axis.S(64, axis_bx * 4 + thread_ty * 2 + index_i) + new_axis_vj = T.axis.S(64, axis_by * 8 + thread_tz * 4 + index_jj) T.reads([]) - T.writes(wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - C0 = T.match_buffer( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + T.writes( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] + ) + match_buffer_c0 = T.match_buffer( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", strides=[16 * 4, 1], @@ -58,62 +68,92 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_fill_fragment( - C0.data, + match_buffer_c0.data, 16, 16, 16, - i * 4 + j, - T.float32(0), + index_i * 4 + index_jj, + T.float32(0), # pylint: disable=not-callable dtype="handle", ) ) - for ko in range(0, 32): + for k_o in range(0, 32): # copy data from global to shared - for tx in T.thread_binding(0, 32, "threadIdx.x"): - for i0, j0 in T.grid(1, 4): - for j1 in T.vectorized(0, 4): + 
for thread_tx in T.thread_binding(0, 32, "threadIdx.x"): + for index_i0, index_j0 in T.grid(1, 4): + for index_j1 in T.vectorized(0, 4): with T.block(): - vi = T.axis.S(1024, bx * 64 + ty * 32 + tx + i0) - vj = T.axis.S(1024, ko * 32 + tz * 16 + j0 * 4 + j1) - shared_A[vi, vj + 8] = A[vi, vj] + new_axis_vi = T.axis.S( + 1024, + axis_bx * 64 + + thread_ty * 32 + + thread_tx + + index_i0, + ) + new_axis_vj = T.axis.S( + 1024, + k_o * 32 + thread_tz * 16 + index_j0 * 4 + index_j1, + ) + shared_a[new_axis_vi, new_axis_vj + 8] = match_buffer_a[ + new_axis_vi, new_axis_vj + ] - for i0, j0 in T.grid(2, 4): - for j1 in T.vectorized(0, 4): + for index_i0, index_j0 in T.grid(2, 4): + for index_j1 in T.vectorized(0, 4): with T.block(): - vi = T.axis.S(1024, by * 128 + ty * 64 + tx * 2 + i0) - vj = T.axis.S(1024, ko * 32 + tz * 16 + j0 * 4 + j1) - shared_B[vi, vj + 8] = B[vi, vj] + new_axis_vi = T.axis.S( + 1024, + axis_by * 128 + + thread_ty * 64 + + thread_tx * 2 + + index_i0, + ) + new_axis_vj = T.axis.S( + 1024, + k_o * 32 + thread_tz * 16 + index_j0 * 4 + index_j1, + ) + shared_b[new_axis_vi, new_axis_vj + 8] = match_buffer_b[ + new_axis_vi, new_axis_vj + ] - for ki in range(0, 2): - for i in range(0, 2): + for k_i in range(0, 2): + for index_i in range(0, 2): with T.block(): - vi = T.axis.S(64, bx * 4 + ty * 2 + i) - vk = T.axis.S(64, ko * 2 + ki) + new_axis_vi = T.axis.S( + 64, axis_bx * 4 + thread_ty * 2 + index_i + ) + axis_vk = T.axis.S(64, k_o * 2 + k_i) T.reads( - shared_A[ - vi * 16 : vi * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + shared_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ] ) T.writes( - wmma_A[vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16] + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ] ) - s0 = T.var("int32") - s1 = T.var("int32") - A0 = T.match_buffer( - shared_A[ - vi * 16 : vi * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + stride0 = T.var("int32") + stride1 = T.var("int32") + match_buffer_a0 = T.match_buffer( + shared_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ], (16, 16 + 8), "float16", - strides=[s0, s1], + strides=[stride0, stride1], scope="shared", offset_factor=1, ) - wmma_A0 = T.match_buffer( - wmma_A[vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_a0 = T.match_buffer( + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], @@ -122,52 +162,60 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_load_matrix_sync( - wmma_A0.data, + wmma_a0.data, 16, 16, 16, - i, + index_i, T.tvm_access_ptr( T.type_annotation(dtype="float16"), - A0.data, - A0.elem_offset + 8, - A0.strides[0], + match_buffer_a0.data, + match_buffer_a0.elem_offset + 8, + match_buffer_a0.strides[0], 1, dtype="handle", ), - A0.strides[0], + match_buffer_a0.strides[0], "row_major", dtype="handle", ) ) - for j in range(0, 4): + for index_jj in range(0, 4): with T.block(): - vj = T.axis.S(64, by * 8 + tz * 4 + j) - vk = T.axis.S(64, ko * 2 + ki) + new_axis_vj = T.axis.S( + 64, axis_by * 8 + thread_tz * 4 + index_jj + ) + axis_vk = T.axis.S(64, k_o * 2 + k_i) T.reads( - shared_B[ - vj * 16 : vj * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + shared_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ] ) T.writes( - wmma_B[vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16] + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 
: axis_vk * 16 + 16, + ] ) - s0 = T.var("int32") - s1 = T.var("int32") - B0 = T.match_buffer( - shared_B[ - vj * 16 : vj * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + stride0 = T.var("int32") + stride1 = T.var("int32") + match_buffer_b0 = T.match_buffer( + shared_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ], (16, 16 + 8), "float16", - strides=[s0, s1], + strides=[stride0, stride1], scope="shared", offset_factor=1, ) - wmma_B0 = T.match_buffer( - wmma_B[vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_b0 = T.match_buffer( + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], @@ -176,63 +224,82 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_load_matrix_sync( - wmma_B0.data, + wmma_b0.data, 16, 16, 16, - j, + index_jj, T.tvm_access_ptr( T.type_annotation(dtype="float16"), - B0.data, - B0.elem_offset + 8, - B0.strides[0], + match_buffer_b0.data, + match_buffer_b0.elem_offset + 8, + match_buffer_b0.strides[0], 1, dtype="handle", ), - B0.strides[0], + match_buffer_b0.strides[0], "col_major", dtype="handle", ) ) - for i, j in T.grid(2, 4): + for index_i, index_jj in T.grid(2, 4): with T.block(): - vi = T.axis.S(64, bx * 4 + ty * 2 + i) - vj = T.axis.S(64, by * 8 + tz * 4 + j) - vk = T.axis.R(64, ko * 2 + ki) + new_axis_vi = T.axis.S( + 64, axis_bx * 4 + thread_ty * 2 + index_i + ) + new_axis_vj = T.axis.S( + 64, axis_by * 8 + thread_tz * 4 + index_jj + ) + axis_vk = T.axis.R(64, k_o * 2 + k_i) T.reads( [ - wmma_A[ - vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16 + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, ], - wmma_B[ - vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16 + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, ], - wmma_C[ - vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16 + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, ], ] ) T.writes( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16] + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] ) - wmma_A1 = T.match_buffer( - wmma_A[vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_a1 = T.match_buffer( + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], scope="wmma.matrix_a", offset_factor=1, ) - wmma_B1 = T.match_buffer( - wmma_B[vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_b1 = T.match_buffer( + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], scope="wmma.matrix_b", offset_factor=1, ) - wmma_C1 = T.match_buffer( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + wmma_c1 = T.match_buffer( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", strides=[16 * 4, 1], @@ -241,56 +308,72 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_mma_sync( - wmma_C1.data, - i * 4 + j, - wmma_A1.data, - i, - wmma_B1.data, - j, - wmma_C1.data, - i * 4 + j, + wmma_c1.data, + index_i * 4 + index_jj, + wmma_a1.data, + index_i, + wmma_b1.data, + index_jj, + wmma_c1.data, + index_i * 4 + index_jj, dtype="handle", ) ) - for i, j in T.grid(2, 4): + for index_i, index_jj in T.grid(2, 4): with T.block(): - vi = T.axis.S(64, bx 
* 4 + ty * 2 + i) - vj = T.axis.S(64, by * 8 + tz * 4 + j) - T.reads(wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.writes(C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - s0 = T.var("int32") - s1 = T.var("int32") - wmma_C2 = T.match_buffer( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + new_axis_vi = T.axis.S(64, axis_bx * 4 + thread_ty * 2 + index_i) + new_axis_vj = T.axis.S(64, axis_by * 8 + thread_tz * 4 + index_jj) + T.reads( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] + ) + T.writes( + match_buffer_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] + ) + stride0 = T.var("int32") + stride1 = T.var("int32") + wmma_c2 = T.match_buffer( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", strides=[16 * 4, 1], scope="wmma.accumulator", offset_factor=1, ) - C1 = T.match_buffer( - C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + match_buffer_c1 = T.match_buffer( + match_buffer_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", - strides=[s0, s1], + strides=[stride0, stride1], offset_factor=1, ) T.evaluate( T.tvm_store_matrix_sync( - wmma_C2.data, + wmma_c2.data, 16, 16, 16, - i * 4 + j, + index_i * 4 + index_jj, T.tvm_access_ptr( T.type_annotation(dtype="float32"), - C1.data, - C1.elem_offset, - C1.strides[0], + match_buffer_c1.data, + match_buffer_c1.elem_offset, + match_buffer_c1.strides[0], 1, dtype="handle", ), - C1.strides[0], + match_buffer_c1.strides[0], "row_major", dtype="handle", ) @@ -299,22 +382,23 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: @tvm.testing.requires_cuda def test_gemm_tensorcore(): + """Test running gemm on tensorcore.""" dev = tvm.device("cuda", 0) a_np = np.random.uniform(size=(1024, 1024)).astype("float16") b_np = np.random.uniform(size=(1024, 1024)).astype("float16") c_np = np.dot(a_np.astype("float32"), b_np.T.astype("float32")) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((1024, 1024), dtype="float32"), dev) - f = tvm.build(tensorcore_gemm, target="cuda", name="dense") - f(a, b, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros((1024, 1024), dtype="float32"), dev) + myfunc = tvm.build(tensorcore_gemm, target="cuda", name="dense") + myfunc(buff_a, buff_b, buff_c) + tvm.testing.assert_allclose(buff_c.numpy(), c_np, rtol=1e-3) - evaluator = f.time_evaluator(f.entry_name, dev, number=100) - t = evaluator(a, b, c).mean + evaluator = myfunc.time_evaluator(myfunc.entry_name, dev, number=100) + time_elapsed = evaluator(buff_a, buff_b, buff_c).mean num_flops = 2 * 1024 * 1024 * 1024 - gflops = num_flops / (t * 1e3) / 1e6 - print("gemm with tensor core: %f ms" % (t * 1e3)) + gflops = num_flops / (time_elapsed * 1e3) / 1e6 + print("gemm with tensor core: %f ms" % (time_elapsed * 1e3)) print("GFLOPS: %f" % gflops) diff --git a/tests/python/integration/test_meta_schedule_auto_tensorize.py b/tests/python/integration/test_meta_schedule_auto_tensorize.py index 511e75723b03..b855dc6fa09e 100644 --- a/tests/python/integration/test_meta_schedule_auto_tensorize.py +++ b/tests/python/integration/test_meta_schedule_auto_tensorize.py @@ -14,34 +14,32 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. +"""Integration test for metascheduler's auto tensorization.""" +import tempfile + +import numpy as np import pytest + import tvm -from tvm import relay import tvm.testing -import numpy as np -from tvm.meta_schedule.tune import tune_extracted_tasks +import tvm.topi.testing +from tvm import meta_schedule as ms +from tvm import relay +from tvm.meta_schedule import ApplyHistoryBest, postproc, schedule_rule from tvm.meta_schedule.relay_integration import extract_task_from_relay -from tvm.meta_schedule import ApplyHistoryBest -from tvm.meta_schedule import schedule_rule, postproc from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base -from tvm import meta_schedule as ms -from tvm.tir.tensor_intrin import ( - VNNI_DOT_16x4_INTRIN as VNNI_INTRIN, - DP4A_INTRIN, - AMDGPU_SDOT4_INTRIN, -) -import tempfile -import tvm.topi.testing - +from tvm.meta_schedule.tune import tune_extracted_tasks +from tvm.tir.tensor_intrin import AMDGPU_SDOT4_INTRIN, DP4A_INTRIN +from tvm.tir.tensor_intrin import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN -config = ms.TuneConfig( +CONFIG = ms.TuneConfig( strategy="evolutionary", num_trials_per_iter=32, max_trials_per_task=32, max_trials_global=20000, ) -sch_rules_for_vnni = [ +SCH_RULES_FOR_VNNI = [ schedule_rule.AutoInline( into_producer=False, into_consumer=True, @@ -113,17 +111,17 @@ def get_sch_rules_for_dp4a(intrin): ] -sch_rules_for_dp4a = get_sch_rules_for_dp4a(DP4A_INTRIN) -sch_rules_for_sdot4 = get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN) +SCH_RULES_FOR_DP4A = get_sch_rules_for_dp4a(DP4A_INTRIN) +SCH_RULES_FOR_SDOT4 = get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN) -postprocs_for_vnni = [ +POSTPROCS_FOR_VNNI = [ postproc.DisallowDynamicLoop(), postproc.RewriteParallelVectorizeUnroll(), postproc.RewriteReductionBlock(), postproc.RewriteTensorize(vectorize_init_loop=True), ] -postprocs_for_dp4a = [ +POSTPROCS_FOR_DP4A = [ postproc.DisallowDynamicLoop(), postproc.RewriteCooperativeFetch(), postproc.RewriteUnboundBlock(), @@ -135,6 +133,7 @@ def get_sch_rules_for_dp4a(intrin): def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, postprocs): + """Test tuning.""" tgt = "cuda" if "nvidia" in target else target dev = tvm.device(tgt, 0) @@ -158,7 +157,7 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos with tempfile.TemporaryDirectory() as work_dir: database = tune_extracted_tasks( tune_tasks, - config, + CONFIG, work_dir=work_dir, sch_rules=lambda: sch_rules, postprocs=lambda: postprocs, @@ -186,9 +185,9 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos def _test_dense(data_dtype, sch_rules, postprocs, target): - M, N, K = 1024, 1024, 1024 - data_shape = (M, K) - weight_shape = (N, K) + dim_m, dim_n, dim_k = 1024, 1024, 1024 + data_shape = (dim_m, dim_k) + weight_shape = (dim_n, dim_k) weight_dtype = "int8" out_dtype = "int32" @@ -255,7 +254,7 @@ def _test_bert_int8(target, sch_rules, postprocs): with tempfile.TemporaryDirectory() as work_dir: database = tune_extracted_tasks( tune_tasks, - config, + CONFIG, work_dir=work_dir, sch_rules=lambda: sch_rules, postprocs=lambda: postprocs, @@ -284,14 +283,14 @@ def _test_bert_int8(target, sch_rules, postprocs): @pytest.mark.skip("Requires cascadelake") def test_vnni_dense(): _test_dense( - "uint8", sch_rules_for_vnni, postprocs_for_vnni, "llvm -mcpu=cascadelake -num-cores 4" + "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm 
-mcpu=cascadelake -num-cores 4" ) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @tvm.testing.requires_gpu def test_dp4a_dense(): - _test_dense("int8", sch_rules_for_dp4a, postprocs_for_dp4a, "nvidia/geforce-rtx-3070") + _test_dense("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070") # Uncomment to test on vulkan or rocm target # _test_dense( @@ -305,14 +304,14 @@ def test_dp4a_dense(): @pytest.mark.skip("Requires cascadelake") def test_vnni_conv2d(): _test_conv2d( - "uint8", sch_rules_for_vnni, postprocs_for_vnni, "llvm -mcpu=cascadelake -num-cores 4" + "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm -mcpu=cascadelake -num-cores 4" ) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @tvm.testing.requires_gpu def test_dp4a_conv2d(): - _test_conv2d("int8", sch_rules_for_dp4a, postprocs_for_dp4a, "nvidia/geforce-rtx-3070") + _test_conv2d("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070") # Uncomment to test on vulkan or rocm target # _test_conv2d( @@ -325,13 +324,13 @@ def test_dp4a_conv2d(): @pytest.mark.skip("Requires cascadelake") def test_vnni_bert_int8(): - _test_bert_int8("llvm -mcpu=cascadelake -num-cores 4", sch_rules_for_vnni, postprocs_for_vnni) + _test_bert_int8("llvm -mcpu=cascadelake -num-cores 4", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI) @tvm.testing.requires_gpu @pytest.mark.skip("Slow on CI") def test_dp4a_bert_int8(): - _test_bert_int8("nvidia/geforce-rtx-3070", sch_rules_for_dp4a, postprocs_for_dp4a) + _test_bert_int8("nvidia/geforce-rtx-3070", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A) # Uncomment to test on vulkan or rocm target # _test_bert_int8("vulkan -from_device=0", sch_rules_for_dp4a, postprocs_for_dp4a) diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index f3886374ccb6..eaac8ed26684 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Test scheduling of reduction operations.""" import pytest import numpy as np @@ -26,22 +27,28 @@ @tvm.testing.requires_gpu def test_reduce_prims(): + """Test reduction operations.""" + def test_prim(reducer, np_reducer): # graph - n = tvm.te.size_var("n") - m = tvm.te.size_var("m") - A = te.placeholder((n, m), name="A") - R = te.compute((n,), lambda i: tvm.tir.Select((i > 1), 1, 0), name="R") - k = te.reduce_axis((0, m)) - B = te.compute((n,), lambda i: reducer(A[i, k], axis=k, where=(R[i] == 1)), name="B") + size_var_n = tvm.te.size_var("n") + size_var_m = tvm.te.size_var("m") + placeholder_a = te.placeholder((size_var_n, size_var_m), name="A") + result_r = te.compute((size_var_n,), lambda i: tvm.tir.Select((i > 1), 1, 0), name="R") + axis_k = te.reduce_axis((0, size_var_m)) + result_b = te.compute( + (size_var_n,), + lambda i: reducer(placeholder_a[i, axis_k], axis=axis_k, where=(result_r[i] == 1)), + name="B", + ) # schedule - s = te.create_schedule(B.op) + schedule = te.create_schedule(result_b.op) # create iter var and assign them tags. 
num_thread = 1 - xo, xi = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - s[R].compute_inline() + axis_x0, axis_x1 = schedule[result_b].split(result_b.op.axis[0], factor=num_thread) + schedule[result_b].bind(axis_x0, te.thread_axis("blockIdx.x")) + schedule[result_b].bind(axis_x1, te.thread_axis("threadIdx.x")) + schedule[result_r].compute_inline() # one line to build the function. def check_device(device, host="llvm"): @@ -50,17 +57,22 @@ def check_device(device, host="llvm"): print("skip because %s is not enabled.." % device) return freduce = tvm.build( - s, args=[A, B], target=tvm.target.Target(device, host), name="myreduce" + schedule, + args=[placeholder_a, result_b], + target=tvm.target.Target(device, host), + name="myreduce", ) # launch the kernel. - n = 1028 - m = 129 - x = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev) - y = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) - freduce(x, y) - npy = y.numpy() + num_n = 1028 + num_m = 129 + buff_x = tvm.nd.array( + np.random.uniform(size=(num_n, num_m)).astype(placeholder_a.dtype), dev + ) + buff_y = tvm.nd.array(np.zeros(num_n, dtype=result_b.dtype), dev) + freduce(buff_x, buff_y) + npy = buff_y.numpy() npy[:2] = 0 - res = np_reducer(x.numpy(), axis=1) + res = np_reducer(buff_x.numpy(), axis=1) res[:2] = 0 tvm.testing.assert_allclose(npy, res, rtol=1e-4) @@ -76,192 +88,228 @@ def check_device(device, host="llvm"): def test_init_imm(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n,), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((), lambda: te.sum(A[k], axis=k, init=10.0), name="B") + """Test initial values which are immutable in reduction ops.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length,), name="A") + axis_k = te.reduce_axis((0, arr_length)) + result_b = te.compute( + (), lambda: te.sum(placeholder_a[axis_k], axis=axis_k, init=10.0), name="B" + ) # schedule - s = te.create_schedule(B.op) + schedule_s = te.create_schedule(result_b.op) # one line to build the function. def check_target(target="llvm"): if not tvm.runtime.enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule_s, args=[placeholder_a, result_b]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) - fsum(a, b) - res = 10.0 + np.sum(a.numpy(), axis=0) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros((), dtype=result_b.dtype), dev) + fsum(buff_a, buff_b) + res = 10.0 + np.sum(buff_a.numpy(), axis=0) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() def test_init(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n, n), name="A") - C = te.placeholder((n, n), name="C") - I = te.placeholder((n, n), name="I") - k = te.reduce_axis((0, n)) - B = te.compute((n, n), lambda i, j: te.sum(A[i, k] * C[k, j], axis=k, init=I[i, j]), name="B") + """Test initializer which is non-const.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length, arr_length), name="A") + placeholder_c = te.placeholder((arr_length, arr_length), name="C") + placeholder_i = te.placeholder((arr_length, arr_length), name="I") + axis_k = te.reduce_axis((0, arr_length)) + result_b = te.compute( + (arr_length, arr_length), + lambda i, j: te.sum( + placeholder_a[i, axis_k] * placeholder_c[axis_k, j], + axis=axis_k, + init=placeholder_i[i, j], + ), + name="B", + ) # schedule - s = te.create_schedule(B.op) + schedule = te.create_schedule(result_b.op) # one line to build the function. def check_target(target="llvm"): if not tvm.runtime.enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, C, I, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_c, placeholder_i, result_b]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) - mmult(a, c, ii, b) - res = ii.numpy() + np.matmul(a.numpy(), c.numpy()) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_a.dtype), dev + ) + buff_c = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_c.dtype), dev + ) + buff_i = tvm.nd.array(np.random.uniform(size=(num_n, num_n)).astype(result_b.dtype), dev) + buf_b = tvm.nd.array(np.zeros((num_n, num_n), dtype=result_b.dtype), dev) + mmult(buff_a, buff_c, buff_i, buf_b) + res = buff_i.numpy() + np.matmul(buff_a.numpy(), buff_c.numpy()) + tvm.testing.assert_allclose(buf_b.numpy(), res, rtol=1e-4) check_target() def test_rfactor(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n,), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((), lambda: te.sum(A[k], axis=k), name="B") + """Test rfactors.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length,), name="A") + axis_k = te.reduce_axis((0, arr_length)) + placeholder_b = te.compute((), lambda: te.sum(placeholder_a[axis_k], axis=axis_k), name="B") # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf) - s[BF].parallel(BF.op.axis[0]) + schedule = te.create_schedule(placeholder_b.op) + axis_kf, _ = schedule[placeholder_b].split(axis_k, nparts=4) + rfactor_bf = schedule.rfactor(placeholder_b, axis_kf) + schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_b]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=0) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros((), dtype=placeholder_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=0) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() def test_rfactor_init(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n, n), name="A") - C = te.placeholder((n, n), name="C") - I = te.placeholder((n, n), name="I") - k = te.reduce_axis((0, n)) - B = te.compute((n, n), lambda i, j: te.sum(A[i, k] * C[k, j], axis=k, init=I[i, j]), name="B") + """Test rfactors with constant inits.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length, arr_length), name="A") + placeholder_c = te.placeholder((arr_length, arr_length), name="C") + placeholder_i = te.placeholder((arr_length, arr_length), name="I") + axis_k = te.reduce_axis((0, arr_length)) + result_b = te.compute( + (arr_length, arr_length), + lambda i, j: te.sum( + placeholder_a[i, axis_k] * placeholder_c[axis_k, j], + axis=axis_k, + init=placeholder_i[i, j], + ), + name="B", + ) # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf, 1) - s[BF].parallel(BF.op.axis[0]) + schedule = te.create_schedule(result_b.op) + axis_kf, _ = schedule[result_b].split(axis_k, nparts=4) + rfactor_bf = schedule.rfactor(result_b, axis_kf, 1) + schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.runtime.enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, C, I, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_c, placeholder_i, result_b]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) - mmult(a, c, ii, b) - res = ii.numpy() + np.matmul(a.numpy(), c.numpy()) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_a.dtype), dev + ) + buff_c = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_c.dtype), dev + ) + buff_i = tvm.nd.array(np.random.uniform(size=(num_n, num_n)).astype(result_b.dtype), dev) + buff_b = tvm.nd.array(np.zeros((num_n, num_n), dtype=result_b.dtype), dev) + mmult(buff_a, buff_c, buff_i, buff_b) + res = buff_i.numpy() + np.matmul(buff_a.numpy(), buff_c.numpy()) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() def test_rfactor_factor_axis(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n,), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((), lambda: te.sum(A[k], axis=k), name="B") + """Test rfactors across axis.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length,), name="A") + axis_k = te.reduce_axis((0, arr_length)) + placeholder_b = te.compute((), lambda: te.sum(placeholder_a[axis_k], axis=axis_k), name="B") # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf, 0) - s[BF].parallel(BF.op.axis[0]) + schedule = te.create_schedule(placeholder_b.op) + axis_kf, _ = schedule[placeholder_b].split(axis_k, nparts=4) + rfactor_bf = schedule.rfactor(placeholder_b, axis_kf, 0) + schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_b]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=0) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros((), dtype=placeholder_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=0) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() @tvm.testing.requires_gpu def test_rfactor_threads(): - nn = 1027 - mm = 10 - n = tvm.runtime.convert(nn) - m = tvm.runtime.convert(mm) - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n)) + """Test rfactors across threads.""" + num_n = 1027 + num_m = 10 + length_n = tvm.runtime.convert(num_n) + length_m = tvm.runtime.convert(num_m) + placeholder_a = te.placeholder((length_m, length_n), name="A") + axis_k = te.reduce_axis((0, length_n)) nthread = 16 - B = te.compute((m,), lambda i: te.sum(A[i, k], axis=k, where=(i > 1)), name="B") + result_b = te.compute( + (length_m,), + lambda i: te.sum(placeholder_a[i, axis_k], axis=axis_k, where=(i > 1)), + name="B", + ) # schedule - s = te.create_schedule(B.op) - ko, kf = s[B].split(k, factor=nthread) - BF = s.rfactor(B, kf) - bx, ty = s[B].split(s[B].op.axis[0], factor=nthread) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(ty, te.thread_axis("threadIdx.y")) - tx = s[B].op.reduce_axis[0] + schedule = te.create_schedule(result_b.op) + _, axis_kf = schedule[result_b].split(axis_k, factor=nthread) + rfactor_bf = schedule.rfactor(result_b, axis_kf) + axis_bx, axis_ty = schedule[result_b].split(schedule[result_b].op.axis[0], factor=nthread) + schedule[result_b].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_b].bind(axis_ty, te.thread_axis("threadIdx.y")) + axis_tx = schedule[result_b].op.reduce_axis[0] thread_x = te.thread_axis("threadIdx.x") - s[B].bind(tx, thread_x) - s[BF].compute_at(s[B], tx) - s[B].set_store_predicate(thread_x.var.equal(0)) + schedule[result_b].bind(axis_tx, thread_x) + schedule[rfactor_bf].compute_at(schedule[result_b], axis_tx) + schedule[result_b].set_store_predicate(thread_x.var.equal(0)) # one line to build the function. - def check_target(device, host="stackvm"): + def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, result_b]) fsum = tvm.build(fapi, target=device, name="mysum") # launch the kernel. 
- n = nn - m = mm - a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=1) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_m, num_n)).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(num_m, dtype=result_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=1) res[:2] = 0 - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target("vulkan") check_target("cuda") @@ -272,46 +320,51 @@ def check_target(device, host="stackvm"): @tvm.testing.requires_gpu def test_rfactor_elemwise_threads(): - n = 1025 - m = 10 - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n)) + """Test rfactor elemwise threads.""" + num_n = 1025 + num_m = 10 + placeholder_a = te.placeholder((num_m, num_n), name="A") + axis_k = te.reduce_axis((0, num_n)) nthread = 16 - B = te.compute((m,), lambda i: te.sum(A[i, k], axis=k), name="B") - BB = te.compute((m,), lambda i: B[i] + 1, name="BB") - C = te.compute((m,), lambda i: BB[i] + 1, name="C") + result_b = te.compute( + (num_m,), lambda i: te.sum(placeholder_a[i, axis_k], axis=axis_k), name="B" + ) + result_bb = te.compute((num_m,), lambda i: result_b[i] + 1, name="BB") + result_c = te.compute((num_m,), lambda i: result_bb[i] + 1, name="C") # schedule - s = te.create_schedule(C.op) - s[BB].compute_inline() - bx, ty = s[C].split(s[C].op.axis[0], factor=nthread) - ko, kf = s[B].split(k, factor=nthread) - BF = s.rfactor(B, kf) - s[B].compute_at(s[C], ty) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(ty, te.thread_axis("threadIdx.y")) - tx = s[B].op.reduce_axis[0] + schedule = te.create_schedule(result_c.op) + schedule[result_bb].compute_inline() + axis_bx, axis_ty = schedule[result_c].split(schedule[result_c].op.axis[0], factor=nthread) + _, axis_kf = schedule[result_b].split(axis_k, factor=nthread) + rfactor_bf = schedule.rfactor(result_b, axis_kf) + schedule[result_b].compute_at(schedule[result_c], axis_ty) + schedule[result_c].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis_ty, te.thread_axis("threadIdx.y")) + axis_tx = schedule[result_b].op.reduce_axis[0] thread_x = te.thread_axis("threadIdx.x") - s[B].bind(tx, thread_x) - s[BF].compute_at(s[B], tx) + schedule[result_b].bind(axis_tx, thread_x) + schedule[rfactor_bf].compute_at(schedule[result_b], axis_tx) # Since thread_x is shared across reductions # only one of them need to do write back - s[B].set_store_predicate(thread_x.var.equal(0)) - s[C].set_store_predicate(thread_x.var.equal(0)) + schedule[result_b].set_store_predicate(thread_x.var.equal(0)) + schedule[result_c].set_store_predicate(thread_x.var.equal(0)) # one line to build the function. - def check_target(device, host="stackvm"): + def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fapi = tvm.lower(s, args=[A, C]) + fapi = tvm.lower(schedule, args=[placeholder_a, result_c]) fsum = tvm.build(fapi, target=device, name="mysum") # launch the kernel. 
- a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=1) + 2 - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_m, num_n)).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(num_m, dtype=result_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=1) + 2 + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target("vulkan") check_target("cuda") @@ -321,22 +374,26 @@ def check_target(device, host="stackvm"): def test_argmax(): - def fcombine(x, y): - lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0]) - rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1]) + """Test argmax.""" + + def fcombine(tensor_x, tensor_y): + lhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[0], tensor_y[0]) + rhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[1], tensor_y[1]) return lhs, rhs - def fidentity(t0, t1): - return tvm.tir.const(-1, t0), tvm.te.min_value(t1) + def fidentity(tensor1, tensor2): + return tvm.tir.const(-1, tensor1), tvm.te.min_value(tensor2) argmax = te.comm_reducer(fcombine, fidentity, name="argmax") - m = te.size_var("m") - n = te.size_var("n") - idx = te.placeholder((m, n), name="idx", dtype="int32") - val = te.placeholder((m, n), name="val", dtype="float32") - k = te.reduce_axis((0, n), "k") - T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T") - s = te.create_schedule(T0.op) + size_var_m = te.size_var("m") + size_var_n = te.size_var("n") + idx = te.placeholder((size_var_m, size_var_n), name="idx", dtype="int32") + val = te.placeholder((size_var_m, size_var_n), name="val", dtype="float32") + axis_k = te.reduce_axis((0, size_var_n), "k") + result_t0, result_t1 = te.compute( + (size_var_m,), lambda i: argmax((idx[i, axis_k], val[i, axis_k]), axis=axis_k), name="T" + ) + schedule = te.create_schedule(result_t0.op) def check_target(): device = "cpu" @@ -344,19 +401,19 @@ def check_target(): print("skip because %s is not enabled.." 
% device) return dev = tvm.device(device, 0) - fapi = tvm.lower(s, args=[idx, val, T0, T1]) + fapi = tvm.lower(schedule, args=[idx, val, result_t0, result_t1]) fargmax = tvm.build(fapi, target="llvm", name="argmax") - mm = 12 - nn = 16 - np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn), mm, axis=0) - np_val = np.random.uniform(size=(mm, nn)).astype("float32") + height = 12 + width = 16 + np_idx = np.repeat(np.arange(width, dtype="int32").reshape(1, width), height, axis=0) + np_val = np.random.uniform(size=(height, width)).astype("float32") np_res = np.argmax(np_val, axis=1) nd_idx = tvm.nd.array(np_idx, dev) nd_val = tvm.nd.array(np_val, dev) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) + nd_res0 = tvm.nd.array(np.zeros(height, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(height, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.numpy()) @@ -365,55 +422,63 @@ def check_target(): @tvm.testing.requires_gpu def test_rfactor_argmax(): - def fcombine(x, y): - lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0]) - rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1]) + """Test rfactor argmax""" + + def fcombine(tensor0, tensor1): + lhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[0], tensor1[0]) + rhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[1], tensor1[1]) return lhs, rhs - def fidentity(t0, t1): - return tvm.tir.const(-1, t0), tvm.te.min_value(t1) + def fidentity(tensor0, tensor1): + return tvm.tir.const(-1, tensor0), tvm.te.min_value(tensor1) argmax = te.comm_reducer(fcombine, fidentity, name="argmax") - nn = 1027 - mm = 10 - n = tvm.runtime.convert(nn) - m = tvm.runtime.convert(mm) - A0 = te.placeholder((m, n), name="A0", dtype="int32") - A1 = te.placeholder((m, n), name="A1", dtype="float32") - k = te.reduce_axis((0, n)) - B0, B1 = te.compute((m,), lambda i: argmax((A0[i, k], A1[i, k]), axis=k), name="B") + num_width = 1027 + num_height = 10 + width = tvm.runtime.convert(num_width) + height = tvm.runtime.convert(num_height) + placeholder_a0 = te.placeholder((height, width), name="A0", dtype="int32") + placeholder_a1 = te.placeholder((height, width), name="A1", dtype="float32") + axis_k = te.reduce_axis((0, width)) + result_b0, result_b1 = te.compute( + (height,), + lambda i: argmax((placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k), + name="B", + ) # schedule - s = te.create_schedule(B0.op) + schedule = te.create_schedule(result_b0.op) nthread = 16 - ko, kf = s[B0].split(k, factor=nthread) - BF0, BF1 = s.rfactor(B0, kf) - bx, ty = s[B0].split(s[B0].op.axis[0], factor=nthread) - s[B0].bind(bx, te.thread_axis("blockIdx.x")) - s[B0].bind(ty, te.thread_axis("threadIdx.y")) - tx = s[B0].op.reduce_axis[0] + _, axis_kf = schedule[result_b0].split(axis_k, factor=nthread) + rfactor_bf0, _ = schedule.rfactor(result_b0, axis_kf) + axis_bx, axis_ty = schedule[result_b0].split(schedule[result_b0].op.axis[0], factor=nthread) + schedule[result_b0].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_b0].bind(axis_ty, te.thread_axis("threadIdx.y")) + axis_tx = schedule[result_b0].op.reduce_axis[0] thread_x = te.thread_axis("threadIdx.x") - s[B0].bind(tx, thread_x) - s[BF0.op].compute_at(s[B0], tx) - s[B0].set_store_predicate(thread_x.var.equal(0)) + schedule[result_b0].bind(axis_tx, thread_x) + schedule[rfactor_bf0.op].compute_at(schedule[result_b0], axis_tx) + 
schedule[result_b0].set_store_predicate(thread_x.var.equal(0)) def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fapi = tvm.lower(s, args=[A0, A1, B0, B1]) + fapi = tvm.lower(schedule, args=[placeholder_a0, placeholder_a1, result_b0, result_b1]) fargmax = tvm.build(fapi, target=device, name="argmax") - np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn), mm, axis=0) - np_val = np.random.uniform(size=(mm, nn)).astype("float32") + np_idx = np.repeat( + np.arange(num_width, dtype="int32").reshape(1, num_width), num_height, axis=0 + ) + np_val = np.random.uniform(size=(num_height, num_width)).astype("float32") np_res = np.argmax(np_val, axis=1) nd_idx = tvm.nd.array(np_idx, dev) nd_val = tvm.nd.array(np_val, dev) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) + nd_res0 = tvm.nd.array(np.zeros(num_height, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(num_height, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.numpy()) @@ -424,6 +489,7 @@ def check_target(device): @tvm.testing.requires_gpu def test_warp_reduction1(): + """Test warp reductions.""" nthx = 32 nthy = 4 block_x = te.thread_axis("blockIdx.x") @@ -437,30 +503,34 @@ def check_target(device, m, n): return # compute - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((m,), lambda i: te.max(A[i][k], axis=k), name="B") - s = te.create_schedule(B.op) + placeholder_a = te.placeholder((m, n), name="A") + axis_k = te.reduce_axis((0, n)) + placeholder_b = te.compute( + (m,), lambda i: te.max(placeholder_a[i][axis_k], axis=axis_k), name="B" + ) + schedule = te.create_schedule(placeholder_b.op) # schedule - k = s[B].op.reduce_axis[0] - ko, _ = s[B].split(k, nparts=nthx) - s[B].bind(ko, thread_x) - xo, xi = s[B].split(s[B].op.axis[0], factor=nthy) - s[B].bind(xi, thread_y) - s[B].bind(xo, block_x) + axis_k = schedule[placeholder_b].op.reduce_axis[0] + axis_ko, _ = schedule[placeholder_b].split(axis_k, nparts=nthx) + schedule[placeholder_b].bind(axis_ko, thread_x) + axis_xo, axis_xi = schedule[placeholder_b].split( + schedule[placeholder_b].op.axis[0], factor=nthy + ) + schedule[placeholder_b].bind(axis_xi, thread_y) + schedule[placeholder_b].bind(axis_xo, block_x) - tvm.lower(s, [A, B], simple_mode=True) + tvm.lower(schedule, [placeholder_a, placeholder_b], simple_mode=True) # validation - func = tvm.build(s, [A, B], device, name="warp_reduction") - a_np = np.random.uniform(size=(m, n)).astype(A.dtype) - b_np = np.zeros((m,), dtype=A.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) + func = tvm.build(schedule, [placeholder_a, placeholder_b], device, name="warp_reduction") + a_np = np.random.uniform(size=(m, n)).astype(placeholder_a.dtype) + b_np = np.zeros((m,), dtype=placeholder_a.dtype) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) b_np = np.max(a_np, axis=1) - func(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3) + func(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), b_np, rtol=1e-3, atol=1e-3) check_target("cuda", m=32, n=256) check_target("cuda", m=10, n=20) @@ -472,21 +542,29 @@ def check_target(device, m, n): @tvm.testing.requires_gpu def test_warp_reduction2(): - def fcombine(x, y): - return x[0] + y[0], x[1] * y[1] + """Test warp reductions.""" + + def fcombine(tensor1, 
tensor2): + return tensor1[0] + tensor2[0], tensor1[1] * tensor2[1] - def fidentity(t0, t1): - return tvm.tir.const(0, t0), tvm.tir.const(1, t1) + def fidentity(tensor1, tensor2): + return tvm.tir.const(0, tensor1), tvm.tir.const(1, tensor2) add_mul_reducer = te.comm_reducer(fcombine, fidentity, name="add_mul_reducer") # compute - m = 16 - n = 256 - A0 = te.placeholder((m, n), name="A0", dtype="float32") - A1 = te.placeholder((m, n), name="Al", dtype="float32") - k = te.reduce_axis((0, n), "k") - T0, T1 = te.compute((m,), lambda i: add_mul_reducer((A0[i, k], A1[i, k]), axis=k), name="T") + num_m = 16 + num_n = 256 + placeholder_a0 = te.placeholder((num_m, num_n), name="A0", dtype="float32") + placeholder_a1 = te.placeholder((num_m, num_n), name="Al", dtype="float32") + axis_k = te.reduce_axis((0, num_n), "k") + result0, result1 = te.compute( + (num_m,), + lambda i: add_mul_reducer( + (placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k + ), + name="T", + ) nthdx, nthdy = 32, 2 block_x = te.thread_axis("blockIdx.x") @@ -500,29 +578,31 @@ def check_target(device): return # schedule - s = te.create_schedule(T0.op) - ko, _ = s[T0].split(k, nparts=nthdx) - xo, xi = s[T0].split(s[T0].op.axis[0], factor=nthdy) - s[T0].bind(ko, thread_x) - s[T0].bind(xi, thread_y) - s[T0].bind(xo, block_x) + schedule = te.create_schedule(result0.op) + axis_ko, _ = schedule[result0].split(axis_k, nparts=nthdx) + axis_xo, axis_xi = schedule[result0].split(schedule[result0].op.axis[0], factor=nthdy) + schedule[result0].bind(axis_ko, thread_x) + schedule[result0].bind(axis_xi, thread_y) + schedule[result0].bind(axis_xo, block_x) # validation dev = tvm.device(device, 0) - a0_np = np.random.uniform(size=(m, n)).astype(A0.dtype) - a1_np = np.random.uniform(size=(m, n)).astype(A1.dtype) - t0_np = np.zeros((m,), dtype=A0.dtype) - t1_np = np.zeros((m,), dtype=A1.dtype) - a0 = tvm.nd.array(a0_np, dev) - a1 = tvm.nd.array(a1_np, dev) - t0 = tvm.nd.array(t0_np, dev) - t1 = tvm.nd.array(t1_np, dev) - func = tvm.build(s, [A0, A1, T0, T1], device, name="reduction") - func(a0, a1, t0, t1) + a0_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a0.dtype) + a1_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a1.dtype) + t0_np = np.zeros((num_m,), dtype=placeholder_a0.dtype) + t1_np = np.zeros((num_m,), dtype=placeholder_a1.dtype) + buff_a0 = tvm.nd.array(a0_np, dev) + buff_a1 = tvm.nd.array(a1_np, dev) + buff_t0 = tvm.nd.array(t0_np, dev) + buff_t1 = tvm.nd.array(t1_np, dev) + func = tvm.build( + schedule, [placeholder_a0, placeholder_a1, result0, result1], device, name="reduction" + ) + func(buff_a0, buff_a1, buff_t0, buff_t1) t0_np = np.sum(a0_np, axis=1) t1_np = np.product(a1_np, axis=1) - tvm.testing.assert_allclose(t0.numpy(), t0_np, rtol=1e-3, atol=1e-3) - tvm.testing.assert_allclose(t1.numpy(), t1_np, rtol=1e-3, atol=1e-3) + tvm.testing.assert_allclose(buff_t0.numpy(), t0_np, rtol=1e-3, atol=1e-3) + tvm.testing.assert_allclose(buff_t1.numpy(), t1_np, rtol=1e-3, atol=1e-3) check_target("cuda") check_target("rocm") @@ -530,6 +610,7 @@ def check_target(device): @tvm.testing.requires_cuda def test_reduce_storage_reuse(): + """Test reduction reuses storage.""" target = tvm.target.Target("cuda") def run_passes(sch, args): @@ -547,13 +628,13 @@ def run_passes(sch, args): dev = tvm.device(target.kind.name, 0) shape = (16, 16) - A = te.placeholder(shape, dtype="float32", name="A") - B = topi.nn.softmax(A, axis=1) + 1.0 + placeholder_a = te.placeholder(shape, dtype="float32", name="A") + 
placeholder_b = topi.nn.softmax(placeholder_a, axis=1) + 1.0
 
     with tvm.target.Target(target):
-        s = topi.cuda.schedule_softmax(B)
+        schedule = topi.cuda.schedule_softmax(placeholder_b)
 
-    mod = run_passes(s, [A, B])
+    mod = run_passes(schedule, [placeholder_a, placeholder_b])
 
     # Due to the storage rewrite pass, the reduction output storage reduce_temp0 can be reused as
     # the storage of the next compute.
@@ -586,12 +667,12 @@ def check_store_dst_remapped(op):
     inp = np.random.uniform(size=shape).astype("float32")
     ref = tvm.topi.testing.softmax_python(inp) + 1.0
 
-    f = tvm.build(s, [A, B], target)
-    a = tvm.nd.array(inp, dev)
-    b = tvm.nd.array(np.zeros(shape, dtype=B.dtype), dev)
-    f(a, b)
-    tvm.testing.assert_allclose(b.numpy(), ref, rtol=1e-5)
+    func = tvm.build(schedule, [placeholder_a, placeholder_b], target)
+    buff_a = tvm.nd.array(inp, dev)
+    buff_b = tvm.nd.array(np.zeros(shape, dtype=placeholder_b.dtype), dev)
+    func(buff_a, buff_b)
+    tvm.testing.assert_allclose(buff_b.numpy(), ref, rtol=1e-5)
 
 
 if __name__ == "__main__":
-    pytest.main([__pfile__])
+    pytest.main([__file__])
diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py
index edeb862cd5fc..fa920e513502 100644
--- a/tests/python/integration/test_scan.py
+++ b/tests/python/integration/test_scan.py
@@ -14,38 +14,43 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import tvm
-from tvm import te
+"""Test scheduling and running scan operators."""
 import numpy as np
+
+import tvm
 import tvm.testing
+from tvm import te
 
 
 @tvm.testing.requires_gpu
 def test_scan():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    X = te.placeholder((m, n), name="X")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: X[0, i])
-    s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
+    """Test scan operators."""
+    size_var_m = te.size_var("m")
+    size_var_n = te.size_var("n")
+    placeholder_x = te.placeholder((size_var_m, size_var_n), name="X")
+    s_state = te.placeholder((size_var_m, size_var_n))
+    s_init = te.compute((1, size_var_n), lambda _, i: placeholder_x[0, i])
+    s_update = te.compute(
+        (size_var_m, size_var_n), lambda t, i: s_state[t - 1, i] + placeholder_x[t, i]
+    )
     scan = tvm.te.scan(s_init, s_update, s_state)
     # test scan + compute case
-    res = te.compute((m, n), lambda i, j: scan[i, j])
+    res = te.compute((size_var_m, size_var_n), lambda i, j: scan[i, j])
 
     # schedule
-    s = te.create_schedule(res.op)
+    schedule = te.create_schedule(res.op)
     num_thread = 256
     block_x = te.thread_axis(None, "blockIdx.x")
     thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
-    xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
-    s[s_init].bind(xo, block_x)
-    s[s_init].bind(xi, thread_x)
-    xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
-    s[s_update].bind(xo, block_x)
-    s[s_update].bind(xi, thread_x)
-    xo, xi = s[res].split(res.op.axis[1], factor=num_thread)
-    s[res].bind(xo, block_x)
-    s[res].bind(xi, thread_x)
+    axis_xo, axis_xi = schedule[s_init].split(s_init.op.axis[1], factor=num_thread)
+    schedule[s_init].bind(axis_xo, block_x)
+    schedule[s_init].bind(axis_xi, thread_x)
+    axis_xo, axis_xi = schedule[s_update].split(s_update.op.axis[1], factor=num_thread)
+    schedule[s_update].bind(axis_xo, block_x)
+    schedule[s_update].bind(axis_xi, thread_x)
+    axis_xo, axis_xi = schedule[res].split(res.op.axis[1], factor=num_thread)
+    schedule[res].bind(axis_xo, block_x)
+    
schedule[res].bind(axis_xi, thread_x) # one line to build the function. def check_device(device): @@ -53,15 +58,15 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fscan = tvm.build(s, [X, res], device, name="myscan") + fscan = tvm.build(schedule, [placeholder_x, res], device, name="myscan") # launch the kernel. - n = 1024 - m = 10 - a_np = np.random.uniform(size=(m, n)).astype(res.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros((m, n), dtype=res.dtype), dev) - fscan(a, b) - tvm.testing.assert_allclose(b.numpy(), np.cumsum(a_np, axis=0)) + num_n = 1024 + num_m = 10 + a_np = np.random.uniform(size=(num_m, num_n)).astype(res.dtype) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(np.zeros((num_m, num_n), dtype=res.dtype), dev) + fscan(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), np.cumsum(a_np, axis=0)) check_device("vulkan") check_device("cuda") diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py index 963609ea5901..04c5f85ce5d4 100644 --- a/tests/python/integration/test_tuning.py +++ b/tests/python/integration/test_tuning.py @@ -19,11 +19,8 @@ """ import logging import multiprocessing as mp -import sys import textwrap -import time -import pytest import tvm import tvm.relay import tvm.testing @@ -34,100 +31,138 @@ from tvm.ir.instrument import pass_instrument from tvm.ir.transform import PassContext from tvm.target import Target +from tvm.tir.analysis import _ffi_api as _analysis_ffi_api def setup_module(): + """Setup the module used for testing.""" + @autotvm.template("testing/conv2d_no_batching") - def conv2d_no_batching(N, H, W, CI, CO, KH, KW): + def conv2d_no_batching( # pylint: disable=unused-variable + batch_size, input_h, input_w, channels_in, channels_out, kernel_h, kernel_w + ): """An example template for testing""" - assert N == 1, "Only consider batch_size = 1 in this template" + assert batch_size == 1, "Only consider batch_size = 1 in this template" - data = te.placeholder((N, CI, H, W), name="data") - kernel = te.placeholder((CO, CI, KH, KW), name="kernel") + data = te.placeholder((batch_size, channels_in, input_h, input_w), name="data") + kernel = te.placeholder((channels_out, channels_in, kernel_h, kernel_w), name="kernel") - rc = te.reduce_axis((0, CI), name="rc") - ry = te.reduce_axis((0, KH), name="ry") - rx = te.reduce_axis((0, KW), name="rx") + axis_rc = te.reduce_axis((0, channels_in), name="rc") + axis_ry = te.reduce_axis((0, kernel_h), name="ry") + axis_rx = te.reduce_axis((0, kernel_w), name="rx") conv = te.compute( - (N, CO, H - KH + 1, W - KW + 1), + (batch_size, channels_out, input_h - kernel_h + 1, input_w - kernel_w + 1), lambda nn, ff, yy, xx: te.sum( - data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx], axis=[rc, ry, rx] + data[nn, axis_rc, yy + axis_ry, xx + axis_rx] + * kernel[ff, axis_rc, axis_ry, axis_rx], + axis=[axis_rc, axis_ry, axis_rx], ), tag="conv2d_nchw", ) - s = te.create_schedule([conv.op]) + schedule = te.create_schedule([conv.op]) output = conv - OL = s.cache_write(conv, "local") + cache_write_ol = schedule.cache_write(conv, "local") # create cache stage - AA = s.cache_read(data, "shared", [OL]) - WW = s.cache_read(kernel, "shared", [OL]) - AL = s.cache_read(AA, "local", [OL]) - WL = s.cache_read(WW, "local", [OL]) + cache_read_aa = schedule.cache_read(data, "shared", [cache_write_ol]) + cache_read_ww = schedule.cache_read(kernel, "shared", [cache_write_ol]) + 
cache_read_al = schedule.cache_read(cache_read_aa, "local", [cache_write_ol]) + cache_read_wl = schedule.cache_read(cache_read_ww, "local", [cache_write_ol]) # tile and bind spatial axes - n, f, y, x = s[output].op.axis + axis_n, axis_f, axis_y, axis_x = schedule[output].op.axis cfg = autotvm.get_config() - cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) - cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) - cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - kernel_scope = n # this is the scope to attach global config inside this kernel - - s[output].bind(bf, te.thread_axis("blockIdx.z")) - s[output].bind(by, te.thread_axis("blockIdx.y")) - s[output].bind(bx, te.thread_axis("blockIdx.x")) - s[output].bind(vf, te.thread_axis("vthread")) - s[output].bind(vy, te.thread_axis("vthread")) - s[output].bind(vx, te.thread_axis("vthread")) - s[output].bind(tf, te.thread_axis("threadIdx.z")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) - s[OL].compute_at(s[output], tx) + cfg.define_split("tile_f", cfg.axis(axis_f), num_outputs=4) + cfg.define_split("tile_y", cfg.axis(axis_y), num_outputs=4) + cfg.define_split("tile_x", cfg.axis(axis_x), num_outputs=4) + axis_bf, axis_vf, axis_tf, axis_fi = cfg["tile_f"].apply(schedule, output, axis_f) + axis_by, axis_vy, axis_ty, axis_yi = cfg["tile_y"].apply(schedule, output, axis_y) + axis_bx, axis_vx, axis_tx, axis_xi = cfg["tile_x"].apply(schedule, output, axis_x) + kernel_scope = axis_n # this is the scope to attach global config inside this kernel + + schedule[output].bind(axis_bf, te.thread_axis("blockIdx.z")) + schedule[output].bind(axis_by, te.thread_axis("blockIdx.y")) + schedule[output].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[output].bind(axis_vf, te.thread_axis("vthread")) + schedule[output].bind(axis_vy, te.thread_axis("vthread")) + schedule[output].bind(axis_vx, te.thread_axis("vthread")) + schedule[output].bind(axis_tf, te.thread_axis("threadIdx.z")) + schedule[output].bind(axis_ty, te.thread_axis("threadIdx.y")) + schedule[output].bind(axis_tx, te.thread_axis("threadIdx.x")) + schedule[output].reorder( + axis_n, + axis_bf, + axis_by, + axis_bx, + axis_vf, + axis_vy, + axis_vx, + axis_tf, + axis_ty, + axis_tx, + axis_fi, + axis_yi, + axis_xi, + ) + schedule[cache_write_ol].compute_at(schedule[output], axis_tx) # tile and bind reduction axes - n, f, y, x = s[OL].op.axis - rc, ry, rx = s[OL].op.reduce_axis - cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3) - cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3) - cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3) - rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc) - ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry) - rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx) - s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) - - s[AA].compute_at(s[OL], rxo) - s[WW].compute_at(s[OL], rxo) - s[AL].compute_at(s[OL], rxm) - s[WL].compute_at(s[OL], rxm) + axis_n, axis_f, axis_y, axis_x = schedule[cache_write_ol].op.axis + axis_rc, axis_ry, axis_rx = schedule[cache_write_ol].op.reduce_axis + cfg.define_split("tile_rc", cfg.axis(axis_rc), num_outputs=3) + cfg.define_split("tile_ry", cfg.axis(axis_ry), num_outputs=3) + cfg.define_split("tile_rx", cfg.axis(axis_rx), 
num_outputs=3) + axis_rco, axis_rcm, axis_rci = cfg["tile_rc"].apply(schedule, cache_write_ol, axis_rc) + axis_ryo, axis_rym, axis_ryi = cfg["tile_rx"].apply(schedule, cache_write_ol, axis_ry) + axis_rxo, axis_rxm, axis_rxi = cfg["tile_ry"].apply(schedule, cache_write_ol, axis_rx) + schedule[cache_write_ol].reorder( + axis_rco, + axis_ryo, + axis_rxo, + axis_rcm, + axis_rym, + axis_rxm, + axis_rci, + axis_ryi, + axis_rxi, + axis_n, + axis_f, + axis_y, + axis_x, + ) + + schedule[cache_read_aa].compute_at(schedule[cache_write_ol], axis_rxo) + schedule[cache_read_ww].compute_at(schedule[cache_write_ol], axis_rxo) + schedule[cache_read_al].compute_at(schedule[cache_write_ol], axis_rxm) + schedule[cache_read_wl].compute_at(schedule[cache_write_ol], axis_rxm) # cooperative fetching - for load in [AA, WW]: - n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) - tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) - ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) - tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, te.thread_axis("threadIdx.z")) - s[load].bind(ty, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) + for load in [cache_read_aa, cache_read_ww]: + axis_n, axis_f, axis_y, axis_x = schedule[load].op.axis + fused = schedule[load].fuse(axis_n, axis_f, axis_y, axis_x) + axis_tz, fused = schedule[load].split(fused, nparts=cfg["tile_f"].size[2]) + axis_ty, fused = schedule[load].split(fused, nparts=cfg["tile_y"].size[2]) + axis_tx, fused = schedule[load].split(fused, nparts=cfg["tile_x"].size[2]) + schedule[load].bind(axis_tz, te.thread_axis("threadIdx.z")) + schedule[load].bind(axis_ty, te.thread_axis("threadIdx.y")) + schedule[load].bind(axis_tx, te.thread_axis("threadIdx.x")) # tune unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) - s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + schedule[output].pragma( + kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val + ) + schedule[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) - return s, [data, kernel, conv] + return schedule, [data, kernel, conv] def teardown_module(): + """Remove the module from the autotvm task tables.""" # TODO(areusch): Tasks should not be registered into a global. del autotvm.task.task.TASK_TABLE["testing/conv2d_no_batching"] @@ -158,8 +193,10 @@ def run_test_with_all_multiprocessing(func, *args, **kwargs): @tvm.testing.parametrize_targets("cuda", "opencl") -def test_tuning_gpu(target, dev): - def runner(target, dev): +def test_tuning_gpu(target): + """Test gpu tuning.""" + + def runner(target): # init task task, target = get_sample_task(target, None) logging.info("task config space: %s", task.config_space) @@ -181,22 +218,21 @@ def runner(target, dev): r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR - # Autotvm can filter some records before building if we know they won't work ahead of time. - # We can't guarantee we sample at least one good record so we count these as success too + # We filter records before building if we know they won't work ahead of time. 
+ # We can't guarantee we get one good record so we count these as success too or r.error_no == autotvm.MeasureErrorNo.INSTANTIATION_ERROR ] assert len(successful_results) > 0, f"No successful tuning runs: {results!r}" - run_test_with_all_multiprocessing(runner, target, dev) + run_test_with_all_multiprocessing(runner, target) @tvm.testing.parametrize_targets("cuda", "opencl") -def test_tuning_gpu_inherits_pass_context(target, dev): +def test_tuning_gpu_inherits_pass_context(target): """Autotvm tuner inherits PassContexts but also adds a gpu verification pass by default. Test that using PassContext inherits passes properly but also runs gpu verification pass. """ - from tvm.tir.analysis import _ffi_api as _analysis_ffi_api @pass_instrument class PassInstrumentChecker: @@ -205,7 +241,7 @@ class PassInstrumentChecker: def __init__(self): self.has_been_run = False - def run_after_pass(self, mod, info): + def run_after_pass(self, *_): self.has_been_run = True class GPUVerifyPassMocked: @@ -274,10 +310,12 @@ def __init__( do_fork=False, runtime=None, ): + # pylint: disable=too-many-function-args super().__init__(timeout, n_parallel, build_kwargs, build_func, do_fork, runtime) + self.build_func = OverwrittenBuildFunc(tar.tar, runtime) - def runner(target, dev): + def runner(target): task, target = get_sample_task(target, None) logging.info("task config space: %s", task.config_space) @@ -295,10 +333,12 @@ def runner(target, dev): assert len(results) == 1 - run_test_with_all_multiprocessing(runner, target, dev) + run_test_with_all_multiprocessing(runner, target) def test_tuning_cpu(): + """Test tuning on cpu.""" + def runner(): ir_mod = tvm.parser.fromtext( textwrap.dedent( diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py index 71091f69d964..b088b350c9f0 100644 --- a/tests/python/integration/test_winograd_nnpack.py +++ b/tests/python/integration/test_winograd_nnpack.py @@ -14,18 +14,18 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Test winograd convolution using nnpack impl.""" import numpy as np +from pytest import skip + import tvm -from tvm import te -from tvm import autotvm +import tvm.testing +import tvm.topi.testing +from tvm import autotvm, te, topi from tvm.autotvm.task.space import FallbackConfigEntity from tvm.contrib import nnpack from tvm.contrib.pickle_memoize import memoize -from tvm import topi -import tvm.topi.testing from tvm.topi.utils import get_const_tuple -from pytest import skip -import tvm.testing def verify_conv2d_nchw( @@ -36,11 +36,12 @@ def verify_conv2d_nchw( kernel, stride, padding, + devices, dilation=1, add_bias=False, add_relu=False, - devices=["cuda", "llvm -device=arm_cpu", "opencl -device=mali"], ): + """Verify conv2d nchw workload.""" print( "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation) @@ -48,14 +49,14 @@ def verify_conv2d_nchw( in_height = in_width = in_size - A = te.placeholder((batch, in_channel, in_height, in_width), name="A") - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") + placholder_a = te.placeholder((batch, in_channel, in_height, in_width), name="A") + placeholder_w = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") bias = te.placeholder((num_filter, 1, 1), name="bias") - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) + a_shape = get_const_tuple(placholder_a.shape) + w_shape = get_const_tuple(placeholder_w.shape) bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype + dtype = placholder_a.dtype @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") def get_ref_data(): @@ -79,42 +80,52 @@ def check_device(device): print("Skipping %s becuase it is not enabled" % device) print("Running on target: %s" % device) with tvm.target.Target(device): - C = topi.nn.conv2d(A, W, stride, padding, dilation, layout="NCHW", out_dtype=dtype) + result_c = topi.nn.conv2d( + placholder_a, + placeholder_w, + stride, + padding, + dilation, + layout="NCHW", + out_dtype=dtype, + ) if add_bias: - C = topi.add(C, bias) + result_c = topi.add(result_c, bias) if add_relu: - C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_nchw([C]) + result_c = topi.nn.relu(result_c) + schedule = topi.generic.schedule_conv2d_nchw([result_c]) - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + buff_a = tvm.nd.array(a_np, dev) + buff_w = tvm.nd.array(w_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros(get_const_tuple(result_c.shape), dtype=result_c.dtype), dev) if add_bias: func = tvm.build( - s, - [A, W, bias, C], + schedule, + [placholder_a, placeholder_w, bias, result_c], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation), ) - func(a, w, b, c) + func(buff_a, buff_w, buff_b, buff_c) else: func = tvm.build( - s, - [A, W, C], + schedule, + [placholder_a, placeholder_w, result_c], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation), ) - func(a, w, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-4) + func(buff_a, buff_w, buff_c) + tvm.testing.assert_allclose(buff_c.numpy(), c_np, rtol=1e-4) for device in devices: check_device(device) class WinogradFallback(autotvm.FallbackContext): + """Winograd fallbacks.""" + def _query_inside(self, target, workload): key = (target, 
workload) if key in self.memory: @@ -126,6 +137,8 @@ def _query_inside(self, target, workload): def test_conv2d_nchw(): + """Verify conv2d nchw winograd works.""" + if not tvm.get_global_func( "tvm.contrib.nnpack.convolution_inference_without_weight_transform", True ):