diff --git a/python/tvm/topi/cuda/tensor_intrin.py b/python/tvm/topi/cuda/tensor_intrin.py index c0596fc432623..6bb143140a416 100644 --- a/python/tvm/topi/cuda/tensor_intrin.py +++ b/python/tvm/topi/cuda/tensor_intrin.py @@ -71,7 +71,11 @@ def _instr(index): vec_y = yy.vload(0, dtype=vec_y_dtype) prev_z = 0 if index == 0 else zz.vload(0) - new_z = tvm.tir.call_pure_extern(zz_dtype, "__dp4a", vec_x, vec_y, prev_z) + # new_z = tvm.tir.call_pure_extern(zz_dtype, "__dp4a", vec_x, vec_y, prev_z) + new_z = tvm.tir.call_llvm_pure_intrin(zz_dtype, "llvm.amdgcn.sdot4", tvm.tir.const(4, "uint32"), + tvm.tir.call_intrin("int32", "tir.reinterpret", vec_x), + tvm.tir.call_intrin("int32", "tir.reinterpret", vec_y), + prev_z, True) ib.emit(zz.vstore(0, new_z)) return ib.get()