From 9a137ea7574e8385cd3f453b10cb4ff65b2fa3b9 Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Wed, 9 Jun 2021 21:33:15 -0700
Subject: [PATCH] [CuBLAS] Support implicit broadcast in batch_matmul (#8229)

---
 src/runtime/contrib/cblas/gemm_common.h | 38 +++++++++++++----
 src/runtime/contrib/cublas/cublas.cc    | 30 ++++++++++----
 tests/python/contrib/test_cublas.py     | 55 +++++++++++++------------
 3 files changed, 79 insertions(+), 44 deletions(-)

diff --git a/src/runtime/contrib/cblas/gemm_common.h b/src/runtime/contrib/cblas/gemm_common.h
index 9ccfa5183cd6..4724b14bffa1 100644
--- a/src/runtime/contrib/cblas/gemm_common.h
+++ b/src/runtime/contrib/cblas/gemm_common.h
@@ -181,28 +181,48 @@ inline void CallBatchGemm(TVMArgs args, TVMRetValue* ret, TBatchGemmOp op) {
   DLTensor* C = args[2];
   bool transa = args[3];
   bool transb = args[4];
+
   int bit_depth = sizeof(DType) * 8;
+
   ICHECK_EQ(A->ndim, 3);
   ICHECK_EQ(B->ndim, 3);
   ICHECK_EQ(C->ndim, 3);
-  int batch_size = BatchCount3D(A);
-  ICHECK_EQ(BatchCount3D(B), batch_size);
-  ICHECK_EQ(BatchCount3D(C), batch_size);
+
+  int batch_size = BatchCount3D(C);
   ICHECK_EQ(ElementStride(A), 1);
   ICHECK_EQ(ElementStride(B), 1);
   ICHECK_EQ(ElementStride(C), 1);
+
   // C can never be transposed.
   ICHECK(!IsInPlaceTransposed3D(C));
+
   // Reversed strides indicates an in-place transpose operation.
   transa = IsInPlaceTransposed3D(A) ? !transa : transa;
   transb = IsInPlaceTransposed3D(B) ? !transb : transb;
+
   ICHECK(TypeMatch(B->dtype, kDLFloat, bit_depth));
   ICHECK(TypeMatch(C->dtype, kDLFloat, bit_depth));
+
   double alpha = args.size() > 5 ? args[5] : 1.0;
   double beta = args.size() > 6 ? args[6] : 0.0;
-  const int A_size = A->shape[1] * A->shape[2];
-  const int B_size = B->shape[1] * B->shape[2];
-  const int C_size = C->shape[1] * C->shape[2];
+
+  int A_stride = A->shape[1] * A->shape[2];
+  int B_stride = B->shape[1] * B->shape[2];
+  int C_stride = C->shape[1] * C->shape[2];
+
+  // Broadcast A or B by changing its stride.
+  int batch_size_a = BatchCount3D(A);
+  int batch_size_b = BatchCount3D(B);
+  if (batch_size_a != batch_size_b) {
+    if (batch_size_a == 1) {
+      A_stride = 0;
+    } else if (batch_size_b == 1) {
+      B_stride = 0;
+    }
+  } else {
+    ICHECK_EQ(batch_size_a, batch_size);
+    ICHECK_EQ(batch_size_b, batch_size);
+  }
+
   DType* A_data = reinterpret_cast<DType*>(static_cast<char*>(A->data) +
                                            A->byte_offset);
   DType* B_data = reinterpret_cast<DType*>(static_cast<char*>(B->data) +
@@ -210,9 +230,9 @@ inline void CallBatchGemm(TVMArgs args, TVMRetValue* ret, TBatchGemmOp op) {
   DType* C_data = reinterpret_cast<DType*>(static_cast<char*>(C->data) +
                                            C->byte_offset);
   op(batch_size, transb, transa, ColumnCount3D(B, transb), RowCount3D(A, transa),
-     ColumnCount3D(A, transa), static_cast<typename TBatchGemmOp::TDatatype>(alpha), B_data, B_size,
-     ColumnStride3D(B), A_data, A_size, ColumnStride3D(A),
-     static_cast<typename TBatchGemmOp::TDatatype>(beta), C_data, C_size, ColumnStride3D(C));
+     ColumnCount3D(A, transa), static_cast<typename TBatchGemmOp::TDatatype>(alpha), B_data,
+     B_stride, ColumnStride3D(B), A_data, A_stride, ColumnStride3D(A),
+     static_cast<typename TBatchGemmOp::TDatatype>(beta), C_data, C_stride, ColumnStride3D(C));
 }
 
 }  // namespace contrib
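Illustration (not part of the patch): the hunk above takes batch_size from the
output C and, when exactly one input has a batch count of 1, zeroes that
input's batch stride so every batch step re-reads the same matrix. A minimal
NumPy sketch of why a zero batch stride is equivalent to broadcasting; the
shapes and names (j, n, k, m, B_bcast) are hypothetical, chosen for the demo:

import numpy as np
from numpy.lib.stride_tricks import as_strided

j, n, k, m = 4, 8, 5, 6
A = np.random.rand(j, n, k).astype("float32")
B = np.random.rand(1, k, m).astype("float32")  # batch count 1 -> broadcast

# View B as if it had batch size j, but with a batch stride of 0 bytes, so
# every "copy" along the batch axis aliases the same underlying k x m matrix.
B_bcast = as_strided(B, shape=(j, k, m), strides=(0,) + B.strides[1:])

ref = np.matmul(A, B[0])     # reference: multiply every batch of A by B[0]
out = np.matmul(A, B_bcast)  # the zero-stride view produces the same result
np.testing.assert_allclose(out, ref, rtol=1e-5)

The C++ above plays the same trick, except its A_stride/B_stride count whole
matrices (shape[1] * shape[2] elements) rather than bytes.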
diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc
index 1216a63703bb..015d68aec819 100644
--- a/src/runtime/contrib/cublas/cublas.cc
+++ b/src/runtime/contrib/cublas/cublas.cc
@@ -275,9 +275,8 @@ inline void CallBatchGemmEx(TVMArgs args, TVMRetValue* ret, cublasHandle_t hdl)
   ICHECK_EQ(A->ndim, 3);
   ICHECK_EQ(B->ndim, 3);
   ICHECK_EQ(C->ndim, 3);
-  int batch_size = BatchCount3D(A);
-  ICHECK_EQ(BatchCount3D(B), batch_size);
-  ICHECK_EQ(BatchCount3D(C), batch_size);
+
+  int batch_size = BatchCount3D(C);
   ICHECK_EQ(ElementStride(A), 1);
   ICHECK_EQ(ElementStride(B), 1);
   ICHECK_EQ(ElementStride(C), 1);
@@ -299,9 +298,23 @@ inline void CallBatchGemmEx(TVMArgs args, TVMRetValue* ret, cublasHandle_t hdl)
   double alpha = args.size() > 5 ? args[5] : 1.0;
   double beta = args.size() > 6 ? args[6] : 0.0;
-  const int A_size = A->shape[1] * A->shape[2];
-  const int B_size = B->shape[1] * B->shape[2];
-  const int C_size = C->shape[1] * C->shape[2];
+  int A_stride = A->shape[1] * A->shape[2];
+  int B_stride = B->shape[1] * B->shape[2];
+  int C_stride = C->shape[1] * C->shape[2];
+
+  // Broadcast A or B by changing its stride.
+  int batch_size_a = BatchCount3D(A);
+  int batch_size_b = BatchCount3D(B);
+  if (batch_size_a != batch_size_b) {
+    if (batch_size_a == 1) {
+      A_stride = 0;
+    } else if (batch_size_b == 1) {
+      B_stride = 0;
+    }
+  } else {
+    ICHECK_EQ(batch_size_a, batch_size);
+    ICHECK_EQ(batch_size_b, batch_size);
+  }
 
   cudaDataType_t cuda_in_type = GetCudaDataType(A->dtype);
   cudaDataType_t cuda_out_type = GetCudaDataType(C->dtype);
@@ -325,8 +338,9 @@ inline void CallBatchGemmEx(TVMArgs args, TVMRetValue* ret, cublasHandle_t hdl)
   CHECK_CUBLAS_ERROR(cublasGemmStridedBatchedEx(
       hdl, CUBLASBooleanToTranspose(transb), CUBLASBooleanToTranspose(transa),
       ColumnCount3D(B, transb), RowCount3D(A, transa), ColumnCount3D(A, transa), alpha_ptr, B_data,
-      cuda_in_type, ColumnStride3D(B), B_size, A_data, cuda_in_type, ColumnStride3D(A), A_size,
-      beta_ptr, C_data, cuda_out_type, ColumnStride3D(C), C_size, batch_size, cuda_out_type, algo));
+      cuda_in_type, ColumnStride3D(B), B_stride, A_data, cuda_in_type, ColumnStride3D(A), A_stride,
+      beta_ptr, C_data, cuda_out_type, ColumnStride3D(C), C_stride, batch_size, cuda_out_type,
+      algo));
 }
 
 // matrix multiplication for row major
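Illustration (not part of the patch): cublasGemmStridedBatchedEx reads the
matrices for batch i at fixed offsets i * strideA and i * strideB from the
base pointers, so a stride of 0 for the batch-1 operand makes every batch
read the same matrix. A pure-Python model of that addressing, simplified so
strides count whole matrices rather than elements (the function name and
shapes below are hypothetical):

import numpy as np

def strided_batched_matmul(A, B, batch, stride_a, stride_b):
    """Reference semantics: for batch i, multiply A[i*stride_a] by B[i*stride_b]."""
    return np.stack(
        [np.matmul(A[i * stride_a], B[i * stride_b]) for i in range(batch)]
    )

A = np.random.rand(16, 32, 8)  # full batch of 16 matrices
B = np.random.rand(1, 8, 24)   # single matrix to be broadcast

out = strided_batched_matmul(A, B, batch=16, stride_a=1, stride_b=0)
ref = np.matmul(A, B)          # NumPy broadcasts the batch-1 operand itself
np.testing.assert_allclose(out, ref, rtol=1e-5)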
diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py
index a0f51ca7c9fc..648100a569d7 100644
--- a/tests/python/contrib/test_cublas.py
+++ b/tests/python/contrib/test_cublas.py
@@ -112,33 +112,23 @@ def verify(target="cuda"):
     verify()
 
 
-def verify_batch_matmul(in_dtype, out_dtype, rtol=1e-5):
-    j = 16
-    n = 1024
-    l = 128
-    m = 236
-    A = te.placeholder((j, n, l), name="A", dtype=in_dtype)
-    B = te.placeholder((j, l, m), name="B", dtype=in_dtype)
+def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5):
+    A = te.placeholder(Ashape, name="A", dtype=in_dtype)
+    B = te.placeholder(Bshape, name="B", dtype=in_dtype)
     C = cublas.batch_matmul(A, B, dtype=out_dtype)
     s = te.create_schedule(C.op)
 
-    def verify(target="cuda"):
-        if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
-            print("skip because extern function is not available")
-            return
-        dev = tvm.cuda(0)
-        f = tvm.build(s, [A, B, C], target)
-        a = tvm.nd.array(np.random.uniform(size=(j, n, l)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(j, l, m)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((j, n, m), dtype=C.dtype), dev)
-        f(a, b, c)
-        tvm.testing.assert_allclose(
-            c.numpy(),
-            np.matmul(a.numpy().astype(C.dtype), b.numpy().astype(C.dtype)).astype(C.dtype),
-            rtol=rtol,
-        )
-
-    verify()
+    dev = tvm.cuda(0)
+    f = tvm.build(s, [A, B, C], "cuda")
+    a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev)
+    b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev)
+    c = tvm.nd.array(np.zeros(Cshape, dtype=C.dtype), dev)
+    f(a, b, c)
+    tvm.testing.assert_allclose(
+        c.numpy(),
+        np.matmul(a.numpy().astype(C.dtype), b.numpy().astype(C.dtype)).astype(C.dtype),
+        rtol=rtol,
+    )
 
 
 @tvm.testing.requires_cuda
@@ -156,9 +146,20 @@ def test_matmul_add_igemm():
 
 
 @tvm.testing.requires_cuda
 def test_batch_matmul():
-    verify_batch_matmul("float", "float")
-    verify_batch_matmul("float16", "float")
-    verify_batch_matmul("float16", "float16", rtol=1e-2)
+    if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
+        print("skip because extern function is not available")
+        return
+
+    verify_batch_matmul((16, 1024, 128), (16, 128, 236), (16, 1024, 236), "float", "float")
+    verify_batch_matmul((16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float", "float")
+    verify_batch_matmul((16, 1024, 128), (16, 128, 236), (16, 1024, 236), "float16", "float")
+    verify_batch_matmul((16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float16", "float")
+    verify_batch_matmul(
+        (16, 1024, 128), (16, 128, 236), (16, 1024, 236), "float16", "float16", rtol=1e-2
+    )
+    verify_batch_matmul(
+        (16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float16", "float16", rtol=1e-2
+    )
 
 
 if __name__ == "__main__":
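Illustration (not part of the patch): end to end, the behavior the new tests
exercise looks like this from the TVM side; a batch-1 operand is broadcast
implicitly and the output carries the full batch. A sketch assuming a
CUDA-enabled TVM build with cuBLAS, mirroring verify_batch_matmul above
rather than introducing any new API:

import numpy as np
import tvm
import tvm.testing
from tvm import te
from tvm.contrib import cublas

A = te.placeholder((16, 1024, 128), name="A", dtype="float32")
B = te.placeholder((1, 128, 236), name="B", dtype="float32")  # broadcast batch
C = cublas.batch_matmul(A, B, dtype="float32")                # (16, 1024, 236)
s = te.create_schedule(C.op)

dev = tvm.cuda(0)
f = tvm.build(s, [A, B, C], "cuda")
a = tvm.nd.array(np.random.uniform(size=(16, 1024, 128)).astype("float32"), dev)
b = tvm.nd.array(np.random.uniform(size=(1, 128, 236)).astype("float32"), dev)
c = tvm.nd.array(np.zeros((16, 1024, 236), dtype="float32"), dev)
f(a, b, c)
tvm.testing.assert_allclose(c.numpy(), np.matmul(a.numpy(), b.numpy()), rtol=1e-5)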