int8 4k tune working
masahi committed May 17, 2022
1 parent 3ca8ca0 commit 94d9d96
Showing 1 changed file with 15 additions and 16 deletions.
31 changes: 15 additions & 16 deletions tests/python/unittest/test_mma_16x8x32_4k_tune.py
@@ -53,8 +53,8 @@ def ldmatrix_a_impl(a: T.handle, c: T.handle) -> None:
     4,
     ".b16",
     A_warp.data,
-    16 * tx,
-    A_shared.data,
+    A_warp.elem_offset + 16 * tx,
+    A_shared.access_ptr("r"),
     s1 * (tx % 16) + 16 * (tx // 16),
     dtype="int8",
 )
@@ -104,8 +104,8 @@ def ldmatrix_b_impl(a: T.handle, c: T.handle) -> None:
     4,
     ".b16",
     B_warp.data,
-    16 * tx,
-    B_shared.data,
+    B_warp.elem_offset + 16 * tx,
+    B_shared.access_ptr("r"),
     s1,
     dtype="int8",
 )
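Both ldmatrix hunks make the same substitution: the warp-local offset now adds the buffer's elem_offset, and the shared-memory source is passed as access_ptr("r") rather than the raw data handle. A minimal standalone sketch of that difference, using a hypothetical buffer rather than the test's A_shared/B_shared:

import tvm

# A buffer declared as a view into a larger allocation, i.e. elem_offset != 0.
off = tvm.tir.Var("off", "int32")
buf = tvm.tir.decl_buffer((16, 32), "int8", name="A_shared", elem_offset=off)

print(buf.data)             # raw storage Var; the element offset is not part of it
print(buf.access_ptr("r"))  # a tvm_access_ptr(...) call carrying `off` and a read access mask

The access_ptr form keeps the pointer correct when the buffer the intrinsic sees is a view with a nonzero element offset; the bare data Var drops that information.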
@@ -359,7 +359,7 @@ def schedule(sch: tir.Schedule):
     sch.bind(block_idy, "blockIdx.y")
     sch.bind(thread_idy, "threadIdx.y")

-    def fetch_to_shared(block, idx, ndim):
+    def fetch_to_shared(block, idx, ndim, vec=False):
         block_read = sch.cache_read(block, idx, "shared")
         sch.compute_at(block_read, k0)
         vector_size = 16
@@ -368,13 +368,15 @@ def fetch_to_shared(block, idx, ndim):
         f_0, f_1, f_2, f_3 = sch.split(fused, factors=[None, num_ty, warp_size, vector_size])
         sch.bind(f_2, "threadIdx.x")
         sch.bind(f_1, "threadIdx.y")
-        sch.vectorize(f_3)
-        sch.storage_align(block_read, 0, axis=-2, factor=32, offset=16)
+
+        if vec:
+            sch.vectorize(f_3)
+            sch.storage_align(block_read, 0, axis=-2, factor=32, offset=16)

         return block_read

-    A_sh = fetch_to_shared(block_outer, 0, 2)
-    B_sh = fetch_to_shared(block_outer, 1, 2)
+    A_sh = fetch_to_shared(block_outer, 0, 2, True)
+    B_sh = fetch_to_shared(block_outer, 1, 2, True)

     loop = sch.get_loops(block_outer)[-1]
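The new vec flag gates sch.vectorize and sch.storage_align. The sketch below is a standalone toy copy kernel, not the test's GEMM, using only a trivial te workload plus the schedule primitives visible in the hunk; it shows what those two calls do to a shared-memory cache read:

import tvm
from tvm import te, tir

A = te.placeholder((64, 64), "int8", name="A")
B = te.compute((64, 64), lambda i, j: A[i, j], name="B")
sch = tir.Schedule(te.create_prim_func([A, B]))

block = sch.get_block("B")
cache = sch.cache_read(block, 0, "shared")   # A -> A_shared
i, _j = sch.get_loops(block)
sch.compute_at(cache, i)

fused = sch.get_loops(cache)[-1]
_, vec_inner = sch.split(fused, factors=[None, 16])
sch.vectorize(vec_inner)                                     # what the vec=True path enables
sch.storage_align(cache, 0, axis=-2, factor=32, offset=16)   # pad rows, as in the vec=True path

print(sch.mod.script())

Both call sites in the test pass True, so A_sh and B_sh keep the previous behavior; the False default simply lets other cache reads opt out of vectorization and padding.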

@@ -488,14 +490,11 @@ def shared_32x16_to_ldmatrix_32x16_layout(i, j):
     else:
         print(sch.mod.script())
         print(sch.trace)
-else:
-    target = "cuda"
-    f = tvm.build(sch.mod["main"], target=target, name="dense")

 dev = tvm.device("cuda", 0)
-a_np = np.random.uniform(size=(N, K)).astype("int8")
-b_np = np.random.uniform(size=(K, M)).astype("int8")
-c_np = np.dot(a_np.astype("int32"), b_np.astype("int32"))
+a_np = np.random.randint(-128, 128, (M, K)).astype("int8")
+b_np = np.random.randint(-128, 128, (K, N)).astype("int8")
+c_np = np.dot(a_np.astype("float32"), b_np.astype("float32")).astype("int32")
 a = tvm.nd.array(a_np, dev)
 b = tvm.nd.array(b_np, dev)
 c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
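The reference computation now draws full-range int8 operands and accumulates in float32 before casting to int32. A quick numpy-only sanity check (small hypothetical shapes, not the test's 4k problem) that the float32 path matches an exact int32 accumulation while the partial sums stay inside float32's 24-bit integer range:

import numpy as np

M, N, K = 64, 64, 128  # small enough that every partial sum is below 2**24, so float32 is exact
a_np = np.random.randint(-128, 128, (M, K)).astype("int8")
b_np = np.random.randint(-128, 128, (K, N)).astype("int8")

ref_f32 = np.dot(a_np.astype("float32"), b_np.astype("float32")).astype("int32")
ref_i32 = np.dot(a_np.astype("int32"), b_np.astype("int32"))
assert (ref_f32 == ref_i32).all()

At K = 4096 the partial sums can exceed 2**24, so the float32 reference may round slightly; that is what the rtol=1e-3 in the assert_allclose of the next hunk absorbs.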
@@ -506,7 +505,7 @@ def shared_32x16_to_ldmatrix_32x16_layout(i, j):
 tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3)
 print("ok")

-evaluator = f.time_evaluator(f.entry_name, dev, number=1000)
+evaluator = f.time_evaluator(f.entry_name, dev, number=500)
 gflops = (N * M * K) * 2 / 1e9
 time_ms = evaluator(a, b, c).mean * 1e3
 print("matmul with tensor core: %f ms, %f GFLOPS" % (time_ms, gflops / (time_ms / 1e3)))
