Skip to content

Commit

Permalink
[SYCL][Joint Matrix][E2E] Add Joint Matrix tests for matrix dimension…
Browse files Browse the repository at this point in the history
… as function argument and runtime input (#15429)

Description:
1. The test joint_matrix_bf16_fill_k_cache_arg_dim.cpp has been added in
sycl/test-e2e/Matrix to validate passing matrix dimensions as function
arguments, instead of templated parameters, to the joint_matmul
function.
2. The test joint_matrix_bf16_fill_k_cache_runtime_dim.cpp has been
added in sycl/test-e2e/Matrix to validate that the program works when
matrix dimensions are read at runtime instead of at compile time.
3. The joint_matrix_bf16_fill_k_cache_impl.hpp file has been modified to
support the new tests.
  • Loading branch information
YixingZhang007 authored Oct 4, 2024
1 parent fda7dc7 commit d29060e
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 35 deletions.
17 changes: 17 additions & 0 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//==--- joint_matrix_bf16_fill_k_cache_arg_dim.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix
// XFAIL: arch-intel_gpu_pvc

// RUN: %{build} -o %t_arg_dim_vnni.out %fp-model-precise -DARG_DIM -DVNNI
// RUN: %{run} %t_arg_dim_vnni.out

// Builds the shared joint_matrix_bf16_fill_k_cache test with -DARG_DIM so the
// matrix dimensions are passed to joint_matmul as function arguments instead
// of template (compile-time) parameters. -DVNNI selects the VNNI-packed
// layout for the B matrix.
//
// -ffp-model=precise is added to not depend on compiler defaults.

#include "common.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
107 changes: 72 additions & 35 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,19 @@ static constexpr void manually_unroll_loop(F &&f) {

template <size_t TM, size_t TN, size_t TK> class MatMul;

template <size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
template <
#if !defined(ARG_DIM) && !defined(RUNTIME_DIM)
size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
#endif // ARG_DIM, RUNTIME_DIM
size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
#if defined(ARG_DIM) || defined(RUNTIME_DIM)
, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB
#endif // ARG_DIM, RUNTIME_DIM
) {

size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
Expand Down Expand Up @@ -287,8 +295,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
#ifdef PREFETCH
auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA +
(k2 + prefDistance) * prefCol;
if ((prefetch_offsetA + (prefRow * MATRIX_SIZE) + prefCol) <
(MATRIX_SIZE * MATRIX_SIZE))
if ((prefetch_offsetA + (prefRow * colsA) + prefCol) <
(rowsA * colsA))
joint_matrix_prefetch<prefRow, prefCol>(
sg, A + prefetch_offsetA, colsA, layout::row_major,
syclex::properties{syclex::prefetch_hint_L1});
Expand All @@ -298,8 +306,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
pm1B * prefRow) *
(colsB)*vnniFactor +
(n2 * NCache2 * vnniFactor + pn1B * prefCol);
if ((prefetch_offsetB + (prefRow * MATRIX_SIZE * vnniFactor) +
prefCol) < (MATRIX_SIZE * MATRIX_SIZE))
if ((prefetch_offsetB + (prefRow * colsB * vnniFactor) +
prefCol) < (rowsB * colsB))
joint_matrix_prefetch<prefRow, prefCol>(
sg, B + prefetch_offsetB, colsB * vnniFactor,
layout::row_major,
Expand Down Expand Up @@ -349,31 +357,37 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
void test() {
assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN &&
void test(size_t matrix_size_input) {
#ifdef RUNTIME_DIM
size_t matrix_size = matrix_size_input;
#else
constexpr size_t matrix_size = MATRIX_SIZE;
#endif // RUNTIME_DIM

assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
"invalid matrix size");
assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 &&
(MATRIX_SIZE % TK) == 0 &&
assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
(matrix_size % TK) == 0 &&
"invalid matrix size detected: not a multiple of <TM,TN,TK>");

std::cout << "Testing: " << TM << " x " << TN << " x " << TK
<< " [TM x TN x TK]" << std::endl;

queue q;
T *A = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
T *B = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
TResult *C = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
TResult *refC = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
T *A = malloc_shared<T>(matrix_size * matrix_size, q);
T *B = malloc_shared<T>(matrix_size * matrix_size, q);
TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);

matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, A, T(1));
matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, B, T(1));
matrix_rand<T>(matrix_size, matrix_size, A, T(1));
matrix_rand<T>(matrix_size, matrix_size, B, T(1));

matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, MATRIX_SIZE, MATRIX_SIZE,
MATRIX_SIZE);
matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
matrix_size);

#ifdef VNNI
T *vnniB = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
matrix_vnni<T>(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, vnniFactor);
T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
free(B, q);
B = vnniB;
#endif
Expand All @@ -382,22 +396,31 @@ void test() {
double totalDuration = 0;
for (unsigned int i = 0; i < testIterations; i++) {
double duration =
joint_matmul<MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i);
joint_matmul<
#if !defined(ARG_DIM) && !defined(RUNTIME_DIM)
matrix_size, matrix_size, matrix_size, matrix_size,
#endif // ARG_DIM, RUNTIME_DIM
vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>
(A, B, C, q, i
#if defined(ARG_DIM) || defined(RUNTIME_DIM)
, matrix_size, matrix_size, matrix_size, matrix_size
#endif // ARG_DIM, RUNTIME_DIM
);

if (i >= recordThresh) {
totalDuration += duration;
}
}

assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC));
assert(matrix_compare(matrix_size, matrix_size, C, refC));

double msecPerMatrixMul =
totalDuration / static_cast<double>(testIterations - recordThresh);
double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) /
double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
(msecPerMatrixMul / 1000.f);

std::cout << "DONE for size " << MATRIX_SIZE << std::endl;
std::cout << "DONE for size " << matrix_size << std::endl;
std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;

free(A, q);
Expand All @@ -406,7 +429,22 @@ void test() {
free(refC, q);
}

int main() {
int main(
#ifdef RUNTIME_DIM
int argc, char *argv[]
#endif //RUNTIME_DIM
) {

size_t matrix_size = -1;
#ifdef RUNTIME_DIM
if (argc == 2) {
matrix_size = std::stoul(argv[1]);
} else {
std::cerr << "Usage: ./program matrix_size\n";
return 1; // Error if no argument
}
#endif //RUNTIME_DIM

queue q;
std::vector<combination> combinations =
q.get_device()
Expand All @@ -429,22 +467,22 @@ int main() {
constexpr size_t NCache1 = 32;
constexpr size_t KCache1 = 32;
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
break;
}

if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
constexpr size_t NCache1 = 4 * /*TN*/ 16;
constexpr size_t KCache1 = 16;
test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#if (!defined(SG_SZ) || SG_SZ != 32)
// These combination are not currently supported for subgroup size = 32 in
// IGC
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#endif
break;
}
Expand All @@ -454,10 +492,9 @@ int main() {
constexpr size_t KCache1 = 16;

test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>();
// test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
// MCache1,
// NCache1, KCache1, MCache2, NCache2, KCache2>();
NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
// test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16, MCache1,
// NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
break;
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix
// XFAIL: arch-intel_gpu_pvc

// RUN: %{build} -o %t_runtime_dim_vnni.out %fp-model-precise -DRUNTIME_DIM -DVNNI
// RUN: %{run} %t_runtime_dim_vnni.out 256

// Builds the shared joint_matrix_bf16_fill_k_cache test with -DRUNTIME_DIM so
// the matrix size is read from the command line at runtime (256 above) rather
// than fixed at compile time. -DVNNI selects the VNNI-packed layout for the
// B matrix.
//
// -ffp-model=precise is added to not depend on compiler defaults.

#include "common.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"

0 comments on commit d29060e

Please sign in to comment.