diff --git a/.gitignore b/.gitignore index 604b0a64..de7becef 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ build stash -.* \ No newline at end of file +.* +*.pyc \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 01e43360..f523e20b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,5 +21,9 @@ before_script: script: - make - make install +branches: + only: + - master + - development notifications: email: false diff --git a/CHANGELOG b/CHANGELOG index db8bf1a8..29d666bb 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,24 @@ +Version 0.5.0 +- Improved structure and performance of level-2 routines (xSYMV/xHEMV) +- Reduced compilation time of level-3 OpenCL kernels +- Added level-1 routines: + * SSWAP/DSWAP/CSWAP/ZSWAP + * SSCAL/DSCAL/CSCAL/ZSCAL + * SCOPY/DCOPY/CCOPY/ZCOPY + * SDOT/DDOT + * CDOTU/ZDOTU + * CDOTC/ZDOTC +- Added level-2 routines: + * SGBMV/DGBMV/CGBMV/ZGBMV + * CHBMV/ZHBMV + * CHPMV/ZHPMV + * SSBMV/DSBMV + * SSPMV/DSPMV + * STRMV/DTRMV/CTRMV/ZTRMV + * STBMV/DTBMV/CTBMV/ZTBMV + * STPMV/DTPMV/CTPMV/ZTPMV + Version 0.4.0 - Now using the Claduc C++11 interface to OpenCL - Added plain C API for increased compatibility (clblast_c.h) diff --git a/CMakeLists.txt b/CMakeLists.txt index c9a398a7..40119c4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ cmake_minimum_required(VERSION 2.8.10) project("clblast" C CXX) set(clblast_VERSION_MAJOR 0) -set(clblast_VERSION_MINOR 4) +set(clblast_VERSION_MINOR 5) set(clblast_VERSION_PATCH 0) # Options and their default values @@ -102,11 +102,11 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) # ================================================================================================== # Sets the supported routines and the used kernels. New routines and kernels should be added here. 
-set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm) +set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sgemm) -set(LEVEL1_ROUTINES xaxpy) -set(LEVEL2_ROUTINES xgemv xhemv xsymv) +set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc) +set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES}) set(PRECISIONS 32 3232 64 6464) diff --git a/README.md b/README.md index 106368be..8c7870a2 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. -__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support all routines yet: others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details. +__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details. Why CLBlast and not clBLAS or cuBLAS? @@ -130,7 +130,7 @@ These graphs can be generated automatically on your own device. 
First, compile C Supported routines ------------- -CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables: +CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables: | Level-1 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| @@ -138,14 +138,13 @@ CLBlast is in active development and currently does not support the full set of | xROTMG | | | - | - | | | xROT | | | - | - | | | xROTM | | | - | - | | -| xSWAP | | | | | | -| xSCAL | | | | | +CS +ZD | -| xCOPY | | | | | | +| xSWAP | ✔ | ✔ | ✔ | ✔ | | +| xSCAL | ✔ | ✔ | ✔ | ✔ | +CS +ZD | +| xCOPY | ✔ | ✔ | ✔ | ✔ | | | xAXPY | ✔ | ✔ | ✔ | ✔ | | -| xDOT | | | - | - | +DS | -| xDOTU | - | - | | | | -| xDOTC | - | - | | | | -| xxxDOT | - | - | - | - | +SDS | +| xDOT | ✔ | ✔ | - | - | | +| xDOTU | - | - | ✔ | ✔ | | +| xDOTC | - | - | ✔ | ✔ | | | xNRM2 | | | - | - | +SC +DZ | | xASUM | | | - | - | +SC +DZ | | IxAMAX | | | | | | @@ -154,16 +153,16 @@ CLBlast is in active development and currently does not support the full set of | Level-2 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| | xGEMV | ✔ | ✔ | ✔ | ✔ | | -| xGBMV | | | | | | +| xGBMV | ✔ | ✔ | ✔ | ✔ | | | xHEMV | - | - | ✔ | ✔ | | -| xHBMV | - | - | | | | -| xHPMV | - | - | | | | +| xHBMV | - | - | ✔ | ✔ | | +| xHPMV | - | - | ✔ | ✔ | | | xSYMV | ✔ | ✔ | - | - | | -| xSBMV | | | - | - | | -| xSPMV | | | - | - | | -| xTRMV | | | | | | -| xTBMV | | | | | | -| xTPMV | | | | | | +| xSBMV | ✔ | ✔ | - | - | | +| xSPMV | ✔ | ✔ | - | - | | +| xTRMV | ✔ | ✔ | ✔ | ✔ | | +| xTBMV | ✔ | ✔ | ✔ | ✔ | | +| xTPMV | ✔ | ✔ | ✔ | ✔ | | | xTRSV | | | | | | | xTBSV | | | | | | | xTPSV | | | | | | diff --git a/include/clblast.h b/include/clblast.h index bd0f161c..70a3b5bc 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -68,6 +68,8 
@@ enum class StatusCode { kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device + kInvalidVectorDot = -2043, // Vector dot is not a valid OpenCL buffer + kInsufficientMemoryDot = -2042, // Vector dot's OpenCL buffer is too small }; // Matrix layout and transpose types @@ -83,18 +85,66 @@ enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, // ================================================================================================= // BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +template <typename T> +StatusCode Swap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +template <typename T> +StatusCode Scal(const size_t n, + const T alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Templated-precision vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY template <typename T> -StatusCode Axpy(const size_t n, const T alpha, +StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY +template <typename T> +StatusCode Axpy(const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Dot product of two vectors: SDOT/DDOT
+template <typename T> +StatusCode Dot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template <typename T> +StatusCode Dotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template <typename T> +StatusCode Dotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines +// ================================================================================================= -// Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV template <typename T> StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -105,7 +155,18 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Templated-precision hermitian matrix-vector multiplication: SHEMV/DHEMV +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +template <typename T> +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const
size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV template <typename T> StatusCode Hemv(const Layout layout, const Triangle triangle, const size_t n, @@ -116,7 +177,29 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Templated-precision symmetric matrix-vector multiplication: SSYMV/DSYMV +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template <typename T> +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template <typename T> +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV template <typename T> StatusCode Symv(const Layout layout, const Triangle triangle, const size_t n, @@ -127,10 +210,187 @@ StatusCode Symv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +template <typename T> +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const
cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +template <typename T> +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +template <typename T> +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +template <typename T> +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +template <typename T> +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template <typename T> +StatusCode Trsv(const Layout layout, const Triangle triangle, const
Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template <typename T> +StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template <typename T> +StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// General rank-1 matrix update: SGER/DGER +template <typename T> +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex matrix update: CGERU/ZGERU +template <typename T> +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template <typename T> +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem
x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-1 matrix update: CHER/ZHER +template <typename T> +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template <typename T> +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template <typename T> +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template <typename T> +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-1 matrix update: SSYR/DSYR +template <typename T> +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +//
Symmetric packed rank-1 matrix update: SSPR/DSPR +template <typename T> +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2 +template <typename T> +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +template <typename T> +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines +// ================================================================================================= -// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM template <typename T> StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -141,7 +401,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM +// Symmetric matrix-matrix multiplication:
SSYMM/DSYMM/CSYMM/ZSYMM template <typename T> StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -152,7 +412,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision hermitian matrix-matrix multiplication: CHEMM/ZHEMM +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template <typename T> StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -163,7 +423,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK template <typename T> StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -173,7 +433,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision rank-K update of a hermitian matrix: CHERK/ZHERK +// Rank-K update of a hermitian matrix: CHERK/ZHERK template <typename T> StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -183,7 +443,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K template <typename T> StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose
ab_transpose, const size_t n, const size_t k, @@ -194,7 +454,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision rank-2K update of a hermitian matrix: CHER2K/ZHER2K +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -205,27 +465,23 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM template -StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision matrix equation solver: STRSM/DTRSM/CTRSM/ZTRSM -/* +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template -StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, 
cl_event* event); -*/ // ================================================================================================= } // namespace clblast diff --git a/include/clblast_c.h b/include/clblast_c.h index c25e5880..fac39a58 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -64,6 +64,8 @@ typedef enum StatusCode_ { kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device + kInvalidVectorDot = -2043, // Vector dot is not a valid OpenCL buffer + kInsufficientMemoryDot = -2042, // Vector dot's OpenCL buffer is too small } StatusCode; // Matrix layout and transpose types @@ -81,6 +83,60 @@ typedef enum Precision_ { kHalf = 16, kSingle = 32, kDouble = 64, // BLAS level-1 (vector-vector) routines // ================================================================================================= +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +StatusCode CLBlastSswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +StatusCode CLBlastSscal(const size_t n, + const float 
alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCscal(const size_t n, + const cl_float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZscal(const size_t n, + const cl_double2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY +StatusCode CLBlastScopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY StatusCode CLBlastSaxpy(const size_t n, const float alpha, @@ -103,11 +159,47 @@ StatusCode CLBlastZaxpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Dot product of two vectors: SDOT/DDOT +StatusCode CLBlastSdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Dot product of two complex vectors: CDOTU/ZDOTU +StatusCode CLBlastCdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +StatusCode CLBlastCdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// Generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV StatusCode 
CLBlastSgemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const float alpha, @@ -141,6 +233,40 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 
beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Hermitian matrix-vector multiplication: CHEMV/ZHEMV StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, const size_t n, @@ -159,6 +285,42 @@ StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + 
cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Symmetric matrix-vector multiplication: SSYMV/DSYMV StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, const size_t n, @@ -177,11 +339,347 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, 
const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, 
const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const 
size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, 
const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// General rank-1 matrix update: SGER/DGER +StatusCode CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex matrix update: CGERU/ZGERU +StatusCode CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +StatusCode CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + 
cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-1 matrix update: CHER/ZHER +StatusCode CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); 
+StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-1 matrix update: SSYR/DSYR +StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR +StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + 
cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2 +StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // 
================================================================================================= -// Generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, @@ -391,6 +889,32 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + 
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // CLBLAST_CLBLAST_C_H_ diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index 2c2cc797..df7a0d82 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -493,11 +493,11 @@ class Buffer { } // Copies the contents of this buffer into another device buffer - void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) { + void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const { CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0, nullptr, nullptr)); } - void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) { + void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const { CopyToAsync(queue, size, destination); queue.Finish(); } diff --git a/include/internal/database.h b/include/internal/database.h index 8c937e34..1ac0e646 100644 --- a/include/internal/database.h +++ b/include/internal/database.h @@ -67,6 +67,7 @@ class Database { // The database consists of separate database entries, stored together in a vector static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; + static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; diff --git a/include/internal/database/xdot.h b/include/internal/database/xdot.h new file mode 100644 index 00000000..05841eb7 --- /dev/null +++ 
b/include/internal/database/xdot.h @@ -0,0 +1,113 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file populates the database with best-found tuning parameters for the Xdot kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XdotSingle = { + "Xdot", Precision::kSingle, { + { // NVIDIA GPUs + kDeviceTypeGPU, kDeviceVendorNVIDIA, { + } + }, + { // AMD GPUs + kDeviceTypeGPU, kDeviceVendorAMD, { + } + }, + { // Intel GPUs + kDeviceTypeGPU, kDeviceVendorIntel, { + { "Iris", { {"WGS1",512}, {"WGS2",512} } }, + } + }, + { // Default + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XdotDouble = { + "Xdot", Precision::kDouble, { + { // NVIDIA GPUs + kDeviceTypeGPU, kDeviceVendorNVIDIA, { + } + }, + { // AMD GPUs + kDeviceTypeGPU, kDeviceVendorAMD, { + } + }, + { // Intel GPUs + kDeviceTypeGPU, kDeviceVendorIntel, { + } + }, + { // Default + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } }, + } + }, + } +}; +// ================================================================================================= + +const Database::DatabaseEntry Database::XdotComplexSingle = { + "Xdot", Precision::kComplexSingle, { + { // NVIDIA GPUs + kDeviceTypeGPU, kDeviceVendorNVIDIA, { 
+ } + }, + { // AMD GPUs + kDeviceTypeGPU, kDeviceVendorAMD, { + } + }, + { // Intel GPUs + kDeviceTypeGPU, kDeviceVendorIntel, { + { "Iris", { {"WGS1",512}, {"WGS2",512} } }, + } + }, + { // Default + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XdotComplexDouble = { + "Xdot", Precision::kComplexDouble, { + { // NVIDIA GPUs + kDeviceTypeGPU, kDeviceVendorNVIDIA, { + } + }, + { // AMD GPUs + kDeviceTypeGPU, kDeviceVendorAMD, { + } + }, + { // Intel GPUs + kDeviceTypeGPU, kDeviceVendorIntel, { + } + }, + { // Default + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/include/internal/routine.h b/include/internal/routine.h index 367917fd..b7c06a97 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -72,12 +72,16 @@ class Routine { const size_t offset, const size_t ld, const size_t data_size); StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, const size_t offset, const size_t ld, const size_t data_size); + StatusCode TestMatrixAP(const size_t n, const Buffer &buffer, + const size_t offset, const size_t data_size); // Tests for valid inputs of vectors X and Y StatusCode TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, const size_t inc, const size_t data_size); StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, const size_t inc, const size_t data_size); + StatusCode TestVectorDot(const size_t n, const Buffer &buffer, const size_t offset, + const size_t data_size); // Copies/transposes a matrix and padds/unpads it with zeroes. 
This method is also able to write // to symmetric and triangular matrices through optional arguments. diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h index 4b9da890..689cf169 100644 --- a/include/internal/routines/level1/xaxpy.h +++ b/include/internal/routines/level1/xaxpy.h @@ -35,7 +35,7 @@ class Xaxpy: public Routine { using Routine::ErrorIn; // Constructor - Xaxpy(Queue &queue, Event &event); + Xaxpy(Queue &queue, Event &event, const std::string &name = "AXPY"); // Templated-precision implementation of the routine StatusCode DoAxpy(const size_t n, const T alpha, diff --git a/include/internal/routines/level1/xcopy.h b/include/internal/routines/level1/xcopy.h new file mode 100644 index 00000000..15f339aa --- /dev/null +++ b/include/internal/routines/level1/xcopy.h @@ -0,0 +1,54 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xcopy routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XCOPY_H_ +#define CLBLAST_ROUTINES_XCOPY_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xcopy: public Routine { + public: + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorY; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xcopy(Queue &queue, Event &event, const std::string &name = "COPY"); + + // Templated-precision implementation of the routine + StatusCode DoCopy(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XCOPY_H_ +#endif diff --git a/include/internal/routines/level1/xdot.h b/include/internal/routines/level1/xdot.h new file mode 100644 index 00000000..64b62945 --- /dev/null +++ b/include/internal/routines/level1/xdot.h @@ -0,0 +1,58 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdot routine. 
The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XDOT_H_ +#define CLBLAST_ROUTINES_XDOT_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xdot: public Routine { + public: + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorY; + using Routine::TestVectorDot; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xdot(Queue &queue, Event &event, const std::string &name = "DOT"); + + // Templated-precision implementation of the routine + StatusCode DoDot(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate = false); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XDOT_H_ +#endif diff --git a/include/internal/routines/level1/xdotc.h b/include/internal/routines/level1/xdotc.h new file mode 100644 index 00000000..726cec7c --- /dev/null +++ b/include/internal/routines/level1/xdotc.h @@ -0,0 +1,44 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdotc routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XDOTC_H_ +#define CLBLAST_ROUTINES_XDOTC_H_ + +#include "internal/routines/level1/xdot.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xdotc: public Xdot { + public: + + // Uses the regular Xdot routine + using Xdot::DoDot; + + // Constructor + Xdotc(Queue &queue, Event &event, const std::string &name = "DOTC"); + + // Templated-precision implementation of the routine + StatusCode DoDotc(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XDOTC_H_ +#endif diff --git a/include/internal/routines/level1/xdotu.h b/include/internal/routines/level1/xdotu.h new file mode 100644 index 00000000..825ebb78 --- /dev/null +++ b/include/internal/routines/level1/xdotu.h @@ -0,0 +1,44 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdotu routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XDOTU_H_ +#define CLBLAST_ROUTINES_XDOTU_H_ + +#include "internal/routines/level1/xdot.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xdotu: public Xdot { + public: + + // Uses the regular Xdot routine + using Xdot::DoDot; + + // Constructor + Xdotu(Queue &queue, Event &event, const std::string &name = "DOTU"); + + // Templated-precision implementation of the routine + StatusCode DoDotu(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XDOTU_H_ +#endif diff --git a/include/internal/routines/level1/xscal.h b/include/internal/routines/level1/xscal.h new file mode 100644 index 00000000..d97b5a07 --- /dev/null +++ b/include/internal/routines/level1/xscal.h @@ -0,0 +1,52 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xscal routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSCAL_H_ +#define CLBLAST_ROUTINES_XSCAL_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xscal: public Routine { + public: + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xscal(Queue &queue, Event &event, const std::string &name = "SCAL"); + + // Templated-precision implementation of the routine + StatusCode DoScal(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSCAL_H_ +#endif diff --git a/include/internal/routines/level1/xswap.h b/include/internal/routines/level1/xswap.h new file mode 100644 index 00000000..fe79882b --- /dev/null +++ b/include/internal/routines/level1/xswap.h @@ -0,0 +1,54 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xswap routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSWAP_H_ +#define CLBLAST_ROUTINES_XSWAP_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xswap: public Routine { + public: + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorY; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xswap(Queue &queue, Event &event, const std::string &name = "SWAP"); + + // Templated-precision implementation of the routine + StatusCode DoSwap(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSWAP_H_ +#endif diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h new file mode 100644 index 00000000..27b033e9 --- /dev/null +++ b/include/internal/routines/level2/xgbmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgbmv routine. 
It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGBMV_H_ +#define CLBLAST_ROUTINES_XGBMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xgbmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV"); + + // Templated-precision implementation of the routine + StatusCode DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGBMV_H_ +#endif diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h index 1e120a5e..b31565ec 100644 --- a/include/internal/routines/level2/xgemv.h +++ b/include/internal/routines/level2/xgemv.h @@ -32,6 +32,7 @@ class Xgemv: public Routine { using Routine::TestVectorX; using Routine::TestVectorY; using Routine::TestMatrixA; + using Routine::TestMatrixAP; using Routine::RunKernel; using Routine::ErrorIn; @@ -47,6 +48,18 @@ class Xgemv: public Routine { const T beta, const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); + // Generic 
version used also for other matrix-vector multiplications + StatusCode MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku); + private: // Static variable to get the precision const static Precision precision_; diff --git a/include/internal/routines/level2/xhbmv.h b/include/internal/routines/level2/xhbmv.h new file mode 100644 index 00000000..65138424 --- /dev/null +++ b/include/internal/routines/level2/xhbmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHBMV_H_ +#define CLBLAST_ROUTINES_XHBMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xhbmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV"); + + // Templated-precision implementation of the routine + StatusCode DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHBMV_H_ +#endif diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h index 801b2fc3..b74db760 100644 --- a/include/internal/routines/level2/xhemv.h +++ b/include/internal/routines/level2/xhemv.h @@ -7,8 +7,9 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xhemv routine. It is based on the generalized matrix multiplication -// routine (Xgemv). The implementation is very similar to the Xsymv routine. +// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. 
// // ================================================================================================= @@ -25,16 +26,8 @@ template class Xhemv: public Xgemv { public: - // Members and methods from the base class - using Routine::db_; - using Routine::context_; - using Routine::GetProgramFromCache; - using Routine::TestMatrixA; - using Routine::RunKernel; - using Routine::ErrorIn; - - // Uses the regular Xgemv routine - using Xgemv::DoGemv; + // Uses the generic matrix-vector routine + using Xgemv::MatVec; // Constructor Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV"); diff --git a/include/internal/routines/level2/xhpmv.h b/include/internal/routines/level2/xhpmv.h new file mode 100644 index 00000000..48f1ed3f --- /dev/null +++ b/include/internal/routines/level2/xhpmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHPMV_H_ +#define CLBLAST_ROUTINES_XHPMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xhpmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV"); + + // Templated-precision implementation of the routine + StatusCode DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHPMV_H_ +#endif diff --git a/include/internal/routines/level2/xsbmv.h b/include/internal/routines/level2/xsbmv.h new file mode 100644 index 00000000..bb24d8f4 --- /dev/null +++ b/include/internal/routines/level2/xsbmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSBMV_H_ +#define CLBLAST_ROUTINES_XSBMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsbmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV"); + + // Templated-precision implementation of the routine + StatusCode DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSBMV_H_ +#endif diff --git a/include/internal/routines/level2/xspmv.h b/include/internal/routines/level2/xspmv.h new file mode 100644 index 00000000..88f02a2f --- /dev/null +++ b/include/internal/routines/level2/xspmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). 
The Xspmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSPMV_H_ +#define CLBLAST_ROUTINES_XSPMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xspmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV"); + + // Templated-precision implementation of the routine + StatusCode DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSPMV_H_ +#endif diff --git a/include/internal/routines/level2/xsymv.h b/include/internal/routines/level2/xsymv.h index ab6da6d1..c7b92702 100644 --- a/include/internal/routines/level2/xsymv.h +++ b/include/internal/routines/level2/xsymv.h @@ -9,8 +9,7 @@ // // This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication // routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the -// "DoGemm" function directly. The "DoSymv" function first preprocesses the symmetric matrix by -// transforming it into a general matrix, and then calls the regular GEMV code. +// "MatVec" function directly. 
// // ================================================================================================= @@ -27,16 +26,8 @@ template class Xsymv: public Xgemv { public: - // Members and methods from the base class - using Routine::db_; - using Routine::context_; - using Routine::GetProgramFromCache; - using Routine::TestMatrixA; - using Routine::RunKernel; - using Routine::ErrorIn; - - // Uses the regular Xgemv routine - using Xgemv::DoGemv; + // Uses the generic matrix-vector routine + using Xgemv::MatVec; // Constructor Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV"); diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h new file mode 100644 index 00000000..89c90193 --- /dev/null +++ b/include/internal/routines/level2/xtbmv.h @@ -0,0 +1,51 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTBMV_H_ +#define CLBLAST_ROUTINES_XTBMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtbmv: public Xgemv { + public: + + // Members from the base class + using Routine::queue_; + using Routine::context_; + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV"); + + // Templated-precision implementation of the routine + StatusCode DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTBMV_H_ +#endif diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h new file mode 100644 index 00000000..183d3505 --- /dev/null +++ b/include/internal/routines/level2/xtpmv.h @@ -0,0 +1,51 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). 
The Xtpmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTPMV_H_ +#define CLBLAST_ROUTINES_XTPMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtpmv: public Xgemv { + public: + + // Members from the base class + using Routine::queue_; + using Routine::context_; + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV"); + + // Templated-precision implementation of the routine + StatusCode DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTPMV_H_ +#endif diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h new file mode 100644 index 00000000..dadfbc98 --- /dev/null +++ b/include/internal/routines/level2/xtrmv.h @@ -0,0 +1,51 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmv routine. 
It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRMV_H_ +#define CLBLAST_ROUTINES_XTRMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtrmv: public Xgemv { + public: + + // Members from the base class + using Routine::queue_; + using Routine::context_; + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV"); + + // Templated-precision implementation of the routine + StatusCode DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRMV_H_ +#endif diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h index a0c8b595..9b40a7fc 100644 --- a/include/internal/routines/level3/xgemm.h +++ b/include/internal/routines/level3/xgemm.h @@ -38,7 +38,7 @@ class Xgemm: public Routine { using Routine::ErrorIn; // Constructor - Xgemm(Queue &queue, Event &event); + Xgemm(Queue &queue, Event &event, const std::string &name = "GEMM"); // Templated-precision implementation of the routine StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, diff --git 
a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h index 5f1e8723..ca38ca08 100644 --- a/include/internal/routines/level3/xhemm.h +++ b/include/internal/routines/level3/xhemm.h @@ -37,7 +37,7 @@ class Xhemm: public Xgemm { using Xgemm::DoGemm; // Constructor - Xhemm(Queue &queue, Event &event); + Xhemm(Queue &queue, Event &event, const std::string &name = "HEMM"); // Templated-precision implementation of the routine StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h index 9e961d23..7113a172 100644 --- a/include/internal/routines/level3/xher2k.h +++ b/include/internal/routines/level3/xher2k.h @@ -40,7 +40,7 @@ class Xher2k: public Routine { using Routine::ErrorIn; // Constructor - Xher2k(Queue &queue, Event &event); + Xher2k(Queue &queue, Event &event, const std::string &name = "HER2K"); // Templated-precision implementation of the routine StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h index f285a71c..47112c2c 100644 --- a/include/internal/routines/level3/xherk.h +++ b/include/internal/routines/level3/xherk.h @@ -39,7 +39,7 @@ class Xherk: public Routine { using Routine::ErrorIn; // Constructor - Xherk(Queue &queue, Event &event); + Xherk(Queue &queue, Event &event, const std::string &name = "HERK"); // Templated-precision implementation of the routine StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h index 9ed3c722..9fc80eb4 100644 --- a/include/internal/routines/level3/xsymm.h +++ b/include/internal/routines/level3/xsymm.h @@ -39,7 +39,7 @@ class Xsymm: public Xgemm { using Xgemm::DoGemm; // Constructor - Xsymm(Queue 
&queue, Event &event); + Xsymm(Queue &queue, Event &event, const std::string &name = "SYMM"); // Templated-precision implementation of the routine StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h index 85936658..c4679028 100644 --- a/include/internal/routines/level3/xsyr2k.h +++ b/include/internal/routines/level3/xsyr2k.h @@ -40,7 +40,7 @@ class Xsyr2k: public Routine { using Routine::ErrorIn; // Constructor - Xsyr2k(Queue &queue, Event &event); + Xsyr2k(Queue &queue, Event &event, const std::string &name = "SYR2K"); // Templated-precision implementation of the routine StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h index 14d51a58..abf6b681 100644 --- a/include/internal/routines/level3/xsyrk.h +++ b/include/internal/routines/level3/xsyrk.h @@ -41,7 +41,7 @@ class Xsyrk: public Routine { using Routine::ErrorIn; // Constructor - Xsyrk(Queue &queue, Event &event); + Xsyrk(Queue &queue, Event &event, const std::string &name = "SYRK"); // Templated-precision implementation of the routine StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h index d8ac60fd..a1f4d15c 100644 --- a/include/internal/routines/level3/xtrmm.h +++ b/include/internal/routines/level3/xtrmm.h @@ -38,7 +38,7 @@ class Xtrmm: public Xgemm { using Xgemm::DoGemm; // Constructor - Xtrmm(Queue &queue, Event &event); + Xtrmm(Queue &queue, Event &event, const std::string &name = "TRMM"); // Templated-precision implementation of the routine StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, diff --git a/include/internal/tuning.h b/include/internal/tuning.h index 
f029c704..6ea530ba 100644 --- a/include/internal/tuning.h +++ b/include/internal/tuning.h @@ -64,11 +64,13 @@ void Tuner(int argc, char* argv[]) { auto a_mat = std::vector(C::GetSizeA(args)); auto b_mat = std::vector(C::GetSizeB(args)); auto c_mat = std::vector(C::GetSizeC(args)); + auto temp = std::vector(C::GetSizeTemp(args)); PopulateVector(x_vec); PopulateVector(y_vec); PopulateVector(a_mat); PopulateVector(b_mat); PopulateVector(c_mat); + PopulateVector(temp); // Initializes the tuner for the chosen device cltune::Tuner tuner(args.platform_id, args.device_id); @@ -85,7 +87,7 @@ void Tuner(int argc, char* argv[]) { // Loads the kernel sources and defines the kernel to tune auto sources = C::GetSources(); auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); - tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSizeRef()); + tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); // Sets the tunable parameters and their possible values C::SetParameters(tuner, id); @@ -103,7 +105,7 @@ void Tuner(int argc, char* argv[]) { for (auto &parameters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); } // Sets the function's arguments - C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat); + C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); // Starts the tuning process tuner.Tune(); diff --git a/include/internal/utilities.h b/include/internal/utilities.h index d9fdb9ab..bd174ccb 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -41,6 +41,8 @@ const std::string kKhronosDoublePrecision = "cl_khr_fp64"; constexpr auto kArgM = "m"; constexpr auto kArgN = "n"; constexpr auto kArgK = "k"; +constexpr auto kArgKL = "kl"; +constexpr auto kArgKU = "ku"; constexpr auto kArgLayout = "layout"; constexpr auto kArgATransp = "transA"; constexpr auto kArgBTransp = "transB"; @@ -57,6 +59,8 @@ constexpr auto
kArgCLeadDim = "ldc"; constexpr auto kArgAOffset = "offa"; constexpr auto kArgBOffset = "offb"; constexpr auto kArgCOffset = "offc"; +constexpr auto kArgAPOffset = "offap"; +constexpr auto kArgDotOffset = "offdot"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; @@ -86,9 +90,11 @@ constexpr auto kArgNoAbbreviations = "no_abbrv"; template struct Arguments { // Routine-specific arguments - size_t m = 0; - size_t n = 0; - size_t k = 0; + size_t m = 1; + size_t n = 1; + size_t k = 1; + size_t ku = 1; + size_t kl = 1; Layout layout = Layout::kRowMajor; Transpose a_transpose = Transpose::kNo; Transpose b_transpose = Transpose::kNo; @@ -99,12 +105,14 @@ struct Arguments { size_t y_inc = 1; size_t x_offset = 0; size_t y_offset = 0; - size_t a_ld = 0; - size_t b_ld = 0; - size_t c_ld = 0; + size_t a_ld = 1; + size_t b_ld = 1; + size_t c_ld = 1; size_t a_offset = 0; size_t b_offset = 0; size_t c_offset = 0; + size_t ap_offset = 0; + size_t dot_offset = 0; T alpha = T{1.0}; T beta = T{1.0}; size_t x_size = 1; @@ -112,6 +120,8 @@ struct Arguments { size_t a_size = 1; size_t b_size = 1; size_t c_size = 1; + size_t ap_size = 1; + size_t dot_size = 1; // Tuner-specific arguments double fraction = 1.0; // Client-specific arguments @@ -138,6 +148,8 @@ struct Buffers { Buffer a_mat; Buffer b_mat; Buffer c_mat; + Buffer ap_mat; + Buffer dot; }; // ================================================================================================= diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py new file mode 100644 index 00000000..0aa27197 --- /dev/null +++ b/scripts/generator/datatype.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. 
+# +# Author(s): +# Cedric Nugteren +# +# This file contains the 'DataType' class, used in the generator script to generate the CLBlast API +# interface and implementation. +# +# ================================================================================================== + +# Short-hands for data-types +FLT = "float" +DBL = "double" +FLT2 = "float2" +DBL2 = "double2" +F2CL = "cl_float2" +D2CL = "cl_double2" + +# Structure holding data-type and precision information +class DataType(): + def __init__(self, name, template, scalars, buffertype): + self.name = name + self.template = template + self.alpha_cpp = scalars[0] + self.beta_cpp = scalars[1] + self.alpha_cl = scalars[2] + self.beta_cl = scalars[3] + self.buffertype = buffertype + + # Outputs the name of the data-type (alpha/beta), possibly transforming into the right type + def UseAlpha(self): + if self.alpha_cpp in [FLT2, DBL2]: + return self.alpha_cpp+"{alpha.s[0], alpha.s[1]}" + return "alpha" + def UseBeta(self): + if self.beta_cpp in [FLT2, DBL2]: + return self.beta_cpp+"{beta.s[0], beta.s[1]}" + return "beta" + + # As above, but the transformation is in the opposite direction + def UseAlphaCL(self): + if self.alpha_cpp in [FLT2, DBL2]: + return self.alpha_cl+"{{alpha.real(), alpha.imag()}}" + return "alpha" + def UseBetaCL(self): + if self.beta_cpp in [FLT2, DBL2]: + return self.beta_cl+"{{beta.real(), beta.imag()}}" + return "beta" + + # Returns the template as used in the correctness/performance tests + def TestTemplate(self): + if self.buffertype != self.beta_cpp: + return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp + return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp + + +# ================================================================================================== diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py new file mode 100644 index 00000000..25f02861 --- /dev/null +++ b/scripts/generator/generator.py @@ 
#!/usr/bin/env python

# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line.
#
# Author(s):
#   Cedric Nugteren
#
# This script automatically generates the bodies of the following files, creating the full CLBlast
# API interface and implementation (C, C++, and clBLAS wrapper):
#    clblast.h
#    clblast.cc
#    clblast_c.h
#    clblast_c.cc
#    wrapper_clblas.h
# It also generates the main functions for the correctness and performance tests as found in
#    test/correctness/routines/levelX/xYYYY.cc
#    test/performance/routines/levelX/xYYYY.cc
#
# ==================================================================================================

# System modules
import sys
import os.path

# Local files
from routine import Routine
from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL

# ==================================================================================================

# Each entry is DataType(name, template, [alpha C++, beta C++, alpha OpenCL, beta OpenCL],
# buffertype)

# Regular data-types
S = DataType("S", FLT, [FLT, FLT, FLT, FLT], FLT)           # single (32)
D = DataType("D", DBL, [DBL, DBL, DBL, DBL], DBL)           # double (64)
C = DataType("C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2)     # single-complex (3232)
Z = DataType("Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2)     # double-complex (6464)

# Special cases
Css = DataType("C", FLT, [FLT, FLT, FLT, FLT], FLT2)              # As C, but with constants from S
Zdd = DataType("Z", DBL, [DBL, DBL, DBL, DBL], DBL2)              # As Z, but with constants from D
Ccs = DataType("C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2)   # As C, but with one constant from S
Zzd = DataType("Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2)   # As Z, but with one constant from D

# C++ template data-types
T = DataType("typename T", "T", ["T", "T", "T", "T"], "T")  # regular routine
# For herk: the template and buffer type must name a complete type. A bare 'std::complex' would
# be emitted verbatim as 'Xherk<std::complex,T>' and 'Buffer<std::complex>', which is not valid.
Tc = DataType("typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>")
TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T")  # for her2k
["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"), + Routine(True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"), + Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"), + Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"), + Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"), + Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"), + Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"), + Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"), + Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"), + Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"), + Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"), + # Level 2: matrix update + Routine(False, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"), + Routine(False, "2b", "geru", T, [C,Z], ["m","n"], 
["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"), + Routine(False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"), + Routine(False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"), + Routine(False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"), + Routine(False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"), + Routine(False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"), + Routine(False, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"), + Routine(False, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"), + Routine(False, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"), + Routine(False, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"), +], +[ # Level 3: matrix-matrix + Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"), + Routine(True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"), + Routine(True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"), + Routine(True, "3", "syrk", T, [S,D,C,Z], 
["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"), + Routine(True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"), + Routine(True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"), + Routine(True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a hermitian matrix"), + Routine(True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"), + Routine(False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Solves a triangular system of equations"), +]] + +# ================================================================================================== + +# Separators for the BLAS levels +separators = [""" +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// =================================================================================================""", +""" +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// =================================================================================================""", +""" +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================="""] + +# Main header/footer for source files +header = 
# The C++ API header (.h)
def clblast_h(routines):
    """Returns the C++ declarations (description comment + templated header) of all routines."""
    result = ""
    for routine in routines:
        result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
        result += routine.RoutineHeaderCPP(12)+";\n"
    return result


# The C++ API implementation (.cc)
def clblast_cc(routines):
    """Returns the C++ definitions of all routines plus one explicit instantiation per flavour.

    Implemented routines forward to the 'X<name>' class; all others return kNotImplemented.
    """
    result = ""
    for routine in routines:
        # Continuation-line indent: lines up with the arguments of '  return routine.Do<Name>('
        indent1 = " "*(20 + routine.Length())
        result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
        if routine.implemented:
            result += routine.RoutineHeaderCPP(12)+" {\n"
            result += "  auto queue_cpp = Queue(*queue);\n"
            result += "  auto event_cpp = Event(*event);\n"
            result += "  auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event_cpp);\n"
            result += "  auto status = routine.SetUp();\n"
            result += "  if (status != StatusCode::kSuccess) { return status; }\n"
            result += "  return routine.Do"+routine.name.capitalize()+"("
            result += (",\n"+indent1).join(routine.ArgumentsCladuc(routine.template, indent1))
            result += ");\n"
        else:
            result += routine.RoutineHeaderTypeCPP(12)+" {\n"
            result += "  return StatusCode::kNotImplemented;\n"
        result += "}\n"
        for flavour in routine.flavours:
            # Lines up with the arguments of 'template StatusCode <Name><<template>>('
            indent2 = " "*(23 + routine.Length() + len(flavour.template))
            result += "template StatusCode "+routine.name.capitalize()+"<"+flavour.template+">("
            result += (",\n"+indent2).join(routine.ArgumentsType(flavour))
            result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n"
    return result

# ==================================================================================================

# The C API header (.h)
def clblast_c_h(routines):
    """Returns the C declarations of all routines: one function per flavour."""
    result = ""
    for routine in routines:
        result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
        for flavour in routine.flavours:
            result += routine.RoutineHeaderC(flavour, 20)+";\n"
    return result


# The C API implementation (.cc)
def clblast_c_cc(routines):
    """Returns the C definitions of all routines: each forwards to the C++ API of the same name."""
    result = ""
    for routine in routines:
        result += "\n// "+routine.name.upper()+"\n"
        for flavour in routine.flavours:
            # Without scalar arguments the template type cannot be deduced: pass it explicitly
            template = "<"+flavour.template+">" if routine.NoScalars() else ""
            # Lines up with the arguments of '  auto status = clblast::<Name><template>('
            indent = " "*(26 + routine.Length() + len(template))
            result += routine.RoutineHeaderC(flavour, 20)+" {\n"
            result += "  auto status = clblast::"+routine.name.capitalize()+template+"("
            result += (",\n"+indent).join(routine.ArgumentsCast(flavour, indent))
            result += ",\n"+indent+"queue, event);"
            # The C function returns the C 'StatusCode' (see RoutineHeaderC): a static_cast needs
            # this target type to be spelled out
            result += "\n  return static_cast<StatusCode>(status);\n}\n"
    return result
# Checks for the number of command-line arguments. The single-argument call form of 'print' is
# valid in both Python 2 and Python 3; a non-zero exit status signals the failure to the caller.
if len(sys.argv) != 2:
    print("[ERROR] Usage: generator.py <root of the CLBlast library>")
    sys.exit(1)

# Parses the command-line arguments
path_clblast = sys.argv[1]
files = [
    path_clblast+"/include/clblast.h",
    path_clblast+"/src/clblast.cc",
    path_clblast+"/include/clblast_c.h",
    path_clblast+"/src/clblast_c.cc",
    path_clblast+"/test/wrapper_clblas.h",
]
# Number of lines at the start/end of each of the above files which are kept as-is
header_lines = [84, 52, 80, 24, 22]
footer_lines = [6, 3, 5, 2, 6]
# The function generating the body of each of the above files (same order as 'files')
generators = [clblast_h, clblast_cc, clblast_c_h, clblast_c_cc, wrapper_clblas]

# Checks whether the command-line arguments are valid; exits otherwise
for f in files:
    if not os.path.isfile(f):
        print("[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library")
        sys.exit(1)

# ==================================================================================================

# Iterates over all files to output
for i in range(len(files)):

    # Stores the header and the footer of the original file
    with open(files[i]) as f:
        original = f.readlines()
    file_header = original[:header_lines[i]]
    file_footer = original[-footer_lines[i]:]

    # Generates the complete new body before the file is opened for writing: opening with "w"
    # truncates the file, so a failure in a generator must not happen in between
    body = ""
    for level in [1,2,3]:
        body += separators[level-1]+"\n"
        body += generators[i](routines[level-1])

    # Re-writes the file: original header, new body, original footer
    with open(files[i], "w") as f:
        f.write("".join(file_header))
        f.write(body)
        f.write("".join(file_footer))
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
class Routine(object):
    """Description of a single BLAS routine, from which all API variants are generated.

    Most methods return a list with zero or one string, so that the pieces of an argument list
    can be concatenated and then joined with a separator by the caller.
    """

    def __init__(self, implemented, level, name, template, flavours, sizes, options,
                 inputs, outputs, scalars, scratch, description):
        self.implemented = implemented
        self.level = level
        self.name = name
        self.template = template
        self.flavours = flavours
        self.sizes = sizes
        self.options = options
        self.inputs = inputs
        self.outputs = outputs
        self.scalars = scalars
        self.scratch = scratch  # Scratch buffer (e.g. for xDOT)
        self.description = description

    # Retrieves the number of characters in the routine's name
    def Length(self):
        return len(self.name)

    # Retrieves the postfix for a buffer
    def Postfix(self, name):
        return "inc" if (name in ["x","y"]) else "ld"

    # Determines whether or not this routine has scalar arguments (alpha/beta)
    def NoScalars(self):
        return self.scalars == []

    # Returns the upper-case names of these routines (all flavours)
    def ShortNames(self):
        return "/".join([f.name+self.name.upper() for f in self.flavours])

    # Determines which buffers go first (between alpha and beta) and which ones go after
    def BuffersFirst(self):
        if self.level == "2b":
            return ["x","y"]
        return ["ap","a","b","x"]

    def BuffersSecond(self):
        if self.level == "2b":
            return ["ap","a","b","c"]
        return ["y","c"]

    # ==============================================================================================

    # Determines whether this routine has an input or output buffer with the given name
    def _HasBuffer(self, name):
        return (name in self.inputs) or (name in self.outputs)

    # Joins the three parts of a buffer argument; the 'ld'/'inc' part is left out for the buffers
    # listed in NO_LD_INC
    def _JoinBuffer(self, name, buffer, offset, stride):
        parts = [buffer, offset]
        if name not in NO_LD_INC:
            parts.append(stride)
        return [", ".join(parts)]

    # Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')
    def Buffer(self, name):
        if not self._HasBuffer(name):
            return []
        return self._JoinBuffer(name, name+"_buffer", name+"_offset",
                                name+"_"+self.Postfix(name))

    # As above but with data-types
    def BufferDef(self, name):
        if not self._HasBuffer(name):
            return []
        prefix = "const " if (name in self.inputs) else ""
        return self._JoinBuffer(name, prefix+"cl_mem "+name+"_buffer",
                                "const size_t "+name+"_offset",
                                "const size_t "+name+"_"+self.Postfix(name))

    # As above but with Claduc buffers
    def BufferCladuc(self, name):
        if not self._HasBuffer(name):
            return []
        return self._JoinBuffer(name, "Buffer<"+self.template.buffertype+">("+name+"_buffer)",
                                name+"_offset", name+"_"+self.Postfix(name))

    # As above but with a static cast for clBLAS wrapper
    def BufferWrapper(self, name):
        if not self._HasBuffer(name):
            return []
        parts = [name+"_buffer", name+"_offset"]
        if name in ["x","y"]:
            # The increment is a 'size_t' here; NOTE(review): the target type 'int' follows the
            # clBLAS prototypes ('int incx') - confirm against the clBLAS headers
            parts.append("static_cast<int>("+name+"_"+self.Postfix(name)+")")
        elif name in ["a","b","c"]:
            parts.append(name+"_"+self.Postfix(name))
        return [", ".join(parts)]

    # As above, but only data-types
    def BufferType(self, name):
        if not self._HasBuffer(name):
            return []
        prefix = "const " if (name in self.inputs) else ""
        return self._JoinBuffer(name, prefix+"cl_mem", "const size_t", "const size_t")

    # ==============================================================================================

    # Retrieves the name of a scalar (alpha/beta)
    def Scalar(self, name):
        if name in self.scalars:
            return [name]
        return []

    # Retrieves the use of a scalar (alpha/beta)
    def ScalarUse(self, name, flavour):
        if name not in self.scalars:
            return []
        if name == "alpha":
            return [flavour.UseAlpha()]
        if name == "beta":
            return [flavour.UseBeta()]
        return [name]

    # As above, but for the clBLAS wrapper (conversion in the opposite direction)
    def ScalarUseWrapper(self, name, flavour):
        if name not in self.scalars:
            return []
        if name == "alpha":
            return [flavour.UseAlphaCL()]
        if name == "beta":
            return [flavour.UseBetaCL()]
        return [name]

    # Retrieves the definition of a scalar (alpha/beta). Every scalar other than 'alpha' gets
    # the type of 'beta'.
    def ScalarDef(self, name, flavour):
        if name not in self.scalars:
            return []
        if name == "alpha":
            return ["const "+flavour.alpha_cl+" "+name]
        return ["const "+flavour.beta_cl+" "+name]

    # As above, but without 'cl_' prefix
    def ScalarDefPlain(self, name, flavour):
        if name not in self.scalars:
            return []
        if name == "alpha":
            return ["const "+flavour.alpha_cpp+" "+name]
        return ["const "+flavour.beta_cpp+" "+name]

    # Retrieves the type of a scalar (alpha/beta)
    def ScalarType(self, name, flavour):
        if name not in self.scalars:
            return []
        if name == "alpha":
            return ["const "+flavour.alpha_cpp]
        return ["const "+flavour.beta_cpp]

    # ==============================================================================================

    # Retrieves a list of comma-separated sizes (m, n, k)
    def Sizes(self):
        if self.sizes:
            return [", ".join(self.sizes)]
        return []

    # Retrieves the definition of the sizes (m,n,k)
    def SizesDef(self):
        if self.sizes:
            return [", ".join(["const size_t "+s for s in self.sizes])]
        return []

    # Retrieves the types of the sizes (m,n,k)
    def SizesType(self):
        if self.sizes:
            return [", ".join(["const size_t" for s in self.sizes])]
        return []

    # ==============================================================================================

    # Retrieves a list of options
    def Options(self):
        if self.options:
            return [", ".join(self.options)]
        return []

    # As above, but now casted to CLBlast data-types. The result is used in calls which are made
    # from outside the 'clblast' namespace, hence the fully qualified target type.
    def OptionsCast(self, indent):
        if self.options:
            options = ["static_cast<clblast::"+OptionToCLBlast(o)+">("+o+")" for o in self.options]
            return [(",\n"+indent).join(options)]
        return []

    # Retrieves the definitions of the options (layout, transpose, side, etc.)
    def OptionsDef(self):
        if self.options:
            definitions = ["const "+OptionToCLBlast(o)+" "+o for o in self.options]
            return [", ".join(definitions)]
        return []

    # As above, but now using clBLAS data-types
    def OptionsDefWrapper(self):
        if self.options:
            definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options]
            return [", ".join(definitions)]
        return []

    # Retrieves the types of the options (layout, transpose, side, etc.)
    def OptionsType(self):
        if self.options:
            definitions = ["const "+OptionToCLBlast(o) for o in self.options]
            return [", ".join(definitions)]
        return []

    # ==============================================================================================

    # Combines the pieces of an argument list in the fixed order shared by all variants below:
    # options, sizes, 'dot' buffer, alpha, first buffers, beta, second buffers, other scalars.
    # 'buffer' and 'scalar' are callables taking a name and returning a list of strings.
    def _Arguments(self, options, sizes, buffer, scalar):
        return (options + sizes + buffer("dot") +
                scalar("alpha") +
                list(chain(*[buffer(b) for b in self.BuffersFirst()])) +
                scalar("beta") +
                list(chain(*[buffer(b) for b in self.BuffersSecond()])) +
                list(chain(*[scalar(s) for s in ["d1","d2","a","b","c","s"]])))

    # Retrieves a combination of all the argument names, with Claduc casts
    def ArgumentsCladuc(self, flavour, indent):
        return self._Arguments(self.Options(), self.Sizes(), self.BufferCladuc, self.Scalar)

    # Retrieves a combination of all the argument names, with CLBlast casts
    def ArgumentsCast(self, flavour, indent):
        return self._Arguments(self.OptionsCast(indent), self.Sizes(), self.Buffer,
                               lambda s: self.ScalarUse(s, flavour))

    # As above, but for the clBLAS wrapper
    def ArgumentsWrapper(self, flavour):
        return self._Arguments(self.Options(), self.Sizes(), self.BufferWrapper,
                               lambda s: self.ScalarUseWrapper(s, flavour))

    # Retrieves a combination of all the argument definitions
    def ArgumentsDef(self, flavour):
        return self._Arguments(self.OptionsDef(), self.SizesDef(), self.BufferDef,
                               lambda s: self.ScalarDef(s, flavour))

    # As above, but clBLAS wrapper plain datatypes
    def ArgumentsDefWrapper(self, flavour):
        return self._Arguments(self.OptionsDefWrapper(), self.SizesDef(), self.BufferDef,
                               lambda s: self.ScalarDefPlain(s, flavour))

    # Retrieves a combination of all the argument types
    def ArgumentsType(self, flavour):
        return self._Arguments(self.OptionsType(), self.SizesType(), self.BufferType,
                               lambda s: self.ScalarType(s, flavour))

    # ==============================================================================================

    # Retrieves the C++ templated definition for a routine. Continuation lines are indented by
    # 'spaces' plus the length of the routine's name.
    def RoutineHeaderCPP(self, spaces):
        indent = " "*(spaces + self.Length())
        result = "template <"+self.template.name+">\n"
        result += "StatusCode "+self.name.capitalize()+"("
        result += (",\n"+indent).join(self.ArgumentsDef(self.template))
        result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)"
        return result

    # As above, but now without variable names
    def RoutineHeaderTypeCPP(self, spaces):
        indent = " "*(spaces + self.Length())
        result = "template <"+self.template.name+">\n"
        result += "StatusCode "+self.name.capitalize()+"("
        result += (",\n"+indent).join(self.ArgumentsType(self.template))
        result += ",\n"+indent+"cl_command_queue*, cl_event*)"
        return result

    # As above, but now for C
    def RoutineHeaderC(self, flavour, spaces):
        indent = " "*(spaces + self.Length())
        result = "StatusCode CLBlast"+flavour.name+self.name+"("
        result += (",\n"+indent).join(self.ArgumentsDef(flavour))
        result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)"
        return result

    # As above, but now for the clBLAS wrapper. Routines without scalars are emitted as a
    # template ('def_only') followed by one 'template <>' specialisation per flavour.
    def RoutineHeaderWrapper(self, flavour, def_only, spaces):
        template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
        indent = " "*(spaces + self.Length() + len(template))
        result = ""
        if self.NoScalars():
            result += "template <"
            if def_only:
                result += flavour.name
            result += ">\n"
        result += "clblasStatus clblasX"+self.name+template+"("
        result += (",\n"+indent).join(self.ArgumentsDefWrapper(flavour))
        result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
        result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
        return result
"internal/routines/level1/xscal.h" +#include "internal/routines/level1/xcopy.h" #include "internal/routines/level1/xaxpy.h" +#include "internal/routines/level1/xdot.h" +#include "internal/routines/level1/xdotu.h" +#include "internal/routines/level1/xdotc.h" // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" +#include "internal/routines/level2/xgbmv.h" #include "internal/routines/level2/xhemv.h" +#include "internal/routines/level2/xhbmv.h" +#include "internal/routines/level2/xhpmv.h" #include "internal/routines/level2/xsymv.h" +#include "internal/routines/level2/xsbmv.h" +#include "internal/routines/level2/xspmv.h" +#include "internal/routines/level2/xtrmv.h" +#include "internal/routines/level2/xtbmv.h" +#include "internal/routines/level2/xtpmv.h" // BLAS level-3 includes #include "internal/routines/level3/xgemm.h" @@ -36,479 +50,1404 @@ #include "internal/routines/level3/xtrmm.h" namespace clblast { + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= -// AXPY +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +template +StatusCode Swap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xswap(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSwap(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const 
size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +template +StatusCode Scal(const size_t n, + const T alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xscal(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoScal(n, + alpha, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode Scal(const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Scal(const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Scal(const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Scal(const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY +template +StatusCode Copy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xcopy(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoCopy(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Copy(const size_t, + const cl_mem, const size_t, 
const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY +template +StatusCode Axpy(const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xaxpy(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoAxpy(n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Axpy(const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Axpy(const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Axpy(const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Axpy(const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Dot product of two vectors: SDOT/DDOT +template +StatusCode Dot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xdot(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoDot(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Dot(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Dot(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template +StatusCode Dotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xdotu(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoDotu(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Dotu(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Dotu(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template 
+StatusCode Dotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xdotc(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoDotc(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Dotc(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Dotc(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +template +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xgemv(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGemv(layout, a_transpose, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), 
x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gemv(const Layout, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gemv(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +template +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xgbmv(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer(a_buffer), a_offset, a_ld, + 
Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xhemv(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHemv(layout, triangle, + n, + alpha, + 
Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Hemv(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hemv(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xhbmv(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHbmv(layout, triangle, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const 
size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xhpmv(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHpmv(layout, triangle, + n, + alpha, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Hpmv(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hpmv(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV +template +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xsymv(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSymv(layout, 
triangle, + n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Symv(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Symv(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xsbmv(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSbmv(layout, triangle, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const 
size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xspmv(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSpmv(layout, triangle, + n, + alpha, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Spmv(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Spmv(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV template -StatusCode Axpy(const size_t n, const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); - auto routine = 
Xaxpy(queue_cpp, event_cpp); - - // Compiles the routine's device kernels + auto routine = Xtrmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoAxpy(n, alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc); + return routine.DoTrmv(layout, triangle, a_transpose, diagonal, + n, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); } -template StatusCode Axpy(const size_t, const float, +template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Axpy(const size_t, const double, +template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Axpy(const size_t, const float2, +template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Axpy(const size_t, const double2, +template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// ================================================================================================= - -// GEMV +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV template -StatusCode Gemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, 
const T alpha, +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); - auto routine = Xgemv(queue_cpp, event_cpp); - - // Compiles the routine's device kernels + auto routine = Xtbmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoGemv(layout, a_transpose, m, n, alpha, + return routine.DoTbmv(layout, triangle, a_transpose, diagonal, + n, k, Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, beta, - Buffer(y_buffer), y_offset, y_inc); + Buffer(x_buffer), x_offset, x_inc); } -template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, const float, +template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, const double, +template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, const float2, +template StatusCode Tbmv(const Layout, const 
Triangle, const Transpose, const Diagonal, + const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, const double2, +template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// HEMV +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV template -StatusCode Hemv(const Layout layout, const Triangle triangle, - const size_t n, const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); - auto routine = Xhemv(queue_cpp, event_cpp); - - // Compiles the routine's device kernels + auto routine = Xtpmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } + return routine.DoTpmv(layout, triangle, a_transpose, diagonal, + n, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + 
cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); - // Runs the routine - return routine.DoHemv(layout, triangle, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, beta, - Buffer(y_buffer), y_offset, y_inc); +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; } -template StatusCode Hemv(const Layout, const Triangle, - const size_t, const float2, +template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Hemv(const Layout, const Triangle, - const size_t, const double2, +template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, 
const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// SYMV +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template -StatusCode Symv(const Layout layout, const Triangle triangle, - const size_t n, const T alpha, - const cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); - auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); - auto routine = Xsymv(queue_cpp, event_cpp); +// General rank-1 matrix update: SGER/DGER +template +StatusCode Ger(const Layout, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Ger(const Layout, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Ger(const 
Layout, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); - // Compiles the routine's device kernels - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Geru(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Geru(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); - // Runs the routine - return routine.DoSymv(layout, triangle, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, beta, - Buffer(y_buffer), y_offset, y_inc); +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; } -template StatusCode Symv(const Layout, const Triangle, - const size_t, const float, +template StatusCode Gerc(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, 
const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gerc(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Her(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Her(const Layout, const Triangle, + const size_t, + const double, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Symv(const Layout, const Triangle, - const size_t, const double, + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hpr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hpr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, 
+ cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Her2(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Her2(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hpr2(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hpr2(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric rank-1 matrix update: SSYR/DSYR +template +StatusCode Syr(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Syr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, 
const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR +template +StatusCode Spr(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Spr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Spr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2 +template +StatusCode Syr2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Syr2(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr2(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +template +StatusCode Spr2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Spr2(const 
Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Spr2(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// GEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, const T alpha, + const size_t m, const size_t n, const size_t k, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xgemm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha, + return routine.DoGemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, 
const size_t, const float, + const size_t, const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, const double, + const size_t, const size_t, const size_t, + const double, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, const float2, + const size_t, const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, const double2, + const size_t, const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// SYMM +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, const T alpha, + const size_t m, const size_t n, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t 
a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsymm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoSymm(layout, side, triangle, m, n, alpha, + return routine.DoSymm(layout, side, triangle, + m, n, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, const float, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, const double, + const size_t, const size_t, + const double, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Symm(const 
Layout, const Side, const Triangle, - const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// HEMM +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, const T alpha, + const size_t m, const size_t n, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xhemm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoHemm(layout, side, triangle, m, n, alpha, + return routine.DoHemm(layout, side, triangle, + m, n, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Hemm(const Layout, const Side, const Triangle, - const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode 
Hemm(const Layout, const Side, const Triangle, - const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// SYRK +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK template StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsyrk(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, beta, + return routine.DoSyrk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float, - const cl_mem, const size_t, const size_t, const float, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double, - 
const cl_mem, const size_t, const size_t, const double, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float2, - const cl_mem, const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double2, - const cl_mem, const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// HERK +// Rank-K update of a hermitian matrix: CHERK/ZHERK template StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xherk,T>(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoHerk(layout, triangle, a_transpose, n, k, alpha, - Buffer>(a_buffer), a_offset, a_ld, beta, + return routine.DoHerk(layout, triangle, a_transpose, + n, k, 
+ alpha, + Buffer>(a_buffer), a_offset, a_ld, + beta, Buffer>(c_buffer), c_offset, c_ld); } template StatusCode Herk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float, - const cl_mem, const size_t, const size_t, const float, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Herk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double, - const cl_mem, const size_t, const size_t, const double, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// SYR2K +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K template StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, const T alpha, + const size_t n, const size_t k, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsyr2k(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha, + return routine.DoSyr2k(layout, triangle, ab_transpose, + n, k, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + 
Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float, + const size_t, const size_t, + const float, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// SYR2K +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, const T alpha, + const size_t n, const size_t k, + 
const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xher2k(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha, + return routine.DoHer2k(layout, triangle, ab_transpose, + n, k, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Her2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Her2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// TRMM +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM template -StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const 
Diagonal diagonal, +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, @@ -517,90 +1456,73 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xtrmm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, + return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld); } -template StatusCode Trmm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const float, +template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Trmm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const double, +template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Trmm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const float2, +template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, const cl_mem, const size_t, 
const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Trmm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const double2, +template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// TRSM -/* +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template -StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); - auto routine = Xtrsm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld); +StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; } -template StatusCode Trsm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const float, +template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const 
Diagonal, + const size_t, const size_t, + const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Trsm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const double, +template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Trsm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const float2, +template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Trsm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const double2, +template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -*/ + // ================================================================================================= } // namespace clblast diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 3b437aff..fcec0951 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -19,218 +19,1410 @@ extern "C" { #include "clblast.h" #include "internal/utilities.h" +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + // ================================================================================================= // BLAS level-1 (vector-vector) routines // 
================================================================================================= -// AXPY -StatusCode CLBlastSaxpy(const size_t n, - const float alpha, +// SWAP +StatusCode CLBlastSswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SCAL +StatusCode CLBlastSscal(const size_t n, + const float alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const 
size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCscal(const size_t n, + const cl_float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZscal(const size_t n, + const cl_double2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// COPY +StatusCode CLBlastScopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// AXPY +StatusCode CLBlastSaxpy(const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDaxpy(const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCaxpy(const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZaxpy(const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// DOT +StatusCode CLBlastSdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dot(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dot(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// DOTU +StatusCode CLBlastCdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dotu(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dotu(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// DOTC +StatusCode CLBlastCdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dotc(n, + dot_buffer, dot_offset, + 
x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dotc(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// GEMV +StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + 
y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// GBMV +StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + 
alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + 
auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// HEMV +StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// HBMV +StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t 
y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// HPMV +StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 
+ cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SYMV +StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SBMV +StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = 
clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SPMV +StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, 
event); + return static_cast(status); +} + +// TRMV +StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t 
x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TBMV +StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, 
event); + return static_cast(status); +} +StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TPMV +StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* 
event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TRSV +StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose 
a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TBSV +StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = 
clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TPSV +StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtpsv(const Layout layout, const Triangle 
triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// GER +StatusCode CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast(layout), + m, n, + alpha, + x_buffer, x_offset, 
x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// GERU +StatusCode CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - alpha, + auto status = clblast::Geru(static_cast(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, queue, event); return static_cast(status); } -StatusCode CLBlastDaxpy(const size_t n, - const double alpha, +StatusCode CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - alpha, + auto status = clblast::Geru(static_cast(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, 
+ a_buffer, a_offset, a_ld, queue, event); return static_cast(status); } -StatusCode CLBlastCaxpy(const size_t n, + +// GERC +StatusCode CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - clblast::float2{alpha.s[0], alpha.s[1]}, + auto status = clblast::Gerc(static_cast(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, queue, event); return static_cast(status); } -StatusCode CLBlastZaxpy(const size_t n, +StatusCode CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - clblast::double2{alpha.s[0], alpha.s[1]}, + auto status = clblast::Gerc(static_cast(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, queue, event); return static_cast(status); } -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// ================================================================================================= +// HER +StatusCode CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} -// GEMV -StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, +// HPR +StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} + +// HER2 +StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, 
+ const size_t n, + const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, + auto status = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, - beta, y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, queue, event); return static_cast(status); } -StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, +StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, + auto status = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, - beta, y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, queue, event); return static_cast(status); } -StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, + +// HPR2 +StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, const cl_float2 alpha, - const cl_mem 
a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - clblast::float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, + auto status = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, - clblast::float2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, queue, event); return static_cast(status); } -StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, +StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - clblast::double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, + auto status = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, - clblast::double2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, queue, event); return static_cast(status); } -// HEMV -StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, +// SYR +StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, + 
const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// SPR +StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} + +// SYR2 +StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const 
size_t a_ld, + const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hemv(static_cast(layout), + auto status = clblast::Syr2(static_cast(layout), static_cast(triangle), n, - clblast::float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, + alpha, x_buffer, x_offset, x_inc, - clblast::float2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, queue, event); return static_cast(status); } -StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, +StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hemv(static_cast(layout), + auto status = clblast::Syr2(static_cast(layout), static_cast(triangle), n, - clblast::double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, + alpha, x_buffer, x_offset, x_inc, - clblast::double2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, queue, event); return static_cast(status); } -// SYMV -StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, +// SPR2 +StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, 
const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symv(static_cast(layout), + auto status = clblast::Spr2(static_cast(layout), static_cast(triangle), n, alpha, - a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, - beta, y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, queue, event); return static_cast(status); } -StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, +StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symv(static_cast(layout), + auto status = clblast::Spr2(static_cast(layout), static_cast(triangle), n, alpha, - a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, - beta, y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, queue, event); return static_cast(status); } @@ -292,10 +1484,10 @@ StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const static_cast(a_transpose), static_cast(b_transpose), m, n, k, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -312,10 +1504,10 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const static_cast(a_transpose), 
static_cast(b_transpose), m, n, k, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -374,10 +1566,10 @@ StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle tri static_cast(side), static_cast(triangle), m, n, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -394,10 +1586,10 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri static_cast(side), static_cast(triangle), m, n, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -416,10 +1608,10 @@ StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle tri static_cast(side), static_cast(triangle), m, n, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -436,10 +1628,10 @@ StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle tri static_cast(side), static_cast(triangle), m, n, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return 
static_cast(status); @@ -493,9 +1685,9 @@ StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Tran static_cast(triangle), static_cast(a_transpose), n, k, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -511,9 +1703,9 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran static_cast(triangle), static_cast(a_transpose), n, k, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -610,10 +1802,10 @@ StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Tra static_cast(triangle), static_cast(ab_transpose), n, k, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -630,10 +1822,10 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra static_cast(triangle), static_cast(ab_transpose), n, k, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -652,7 +1844,7 @@ StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Tra static_cast(triangle), static_cast(ab_transpose), n, k, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, @@ -672,7 
+1864,7 @@ StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Tra static_cast(triangle), static_cast(ab_transpose), n, k, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, @@ -730,7 +1922,7 @@ StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle tri static_cast(a_transpose), static_cast(diagonal), m, n, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event); @@ -748,7 +1940,81 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri static_cast(a_transpose), static_cast(diagonal), m, n, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} + +// TRSM +StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = 
clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event); diff --git a/src/database.cc b/src/database.cc index 258d861e..b7275dad 100644 --- a/src/database.cc +++ b/src/database.cc @@ -13,6 +13,7 @@ #include "internal/database.h" #include "internal/database/xaxpy.h" +#include "internal/database/xdot.h" #include "internal/database/xgemv.h" #include "internal/database/xgemm.h" #include "internal/database/copy.h" @@ -28,6 +29,7 @@ namespace clblast { // Initializes the database const 
std::vector Database::database = { XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, + XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 12d63b99..f2a2e7a7 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -109,12 +109,26 @@ R"( #define SetToOne(a) a = ONE #endif -// Multiply two complex variables (used in the define below) +// Adds two complex variables +#if PRECISION == 3232 || PRECISION == 6464 + #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y +#else + #define Add(c, a, b) c = a + b +#endif + +// Multiply two complex variables (used in the defines below) #if PRECISION == 3232 || PRECISION == 6464 #define MulReal(a, b) a.x*b.x - a.y*b.y #define MulImag(a, b) a.x*b.y + a.y*b.x #endif +// The scalar multiply function +#if PRECISION == 3232 || PRECISION == 6464 + #define Multiply(c, a, b) c.x = MulReal(a,b); c.y = MulImag(a,b) +#else + #define Multiply(c, a, b) c = a * b +#endif + // The scalar multiply-add function #if PRECISION == 3232 || PRECISION == 6464 #define MultiplyAdd(c, a, b) c.x += MulReal(a,b); c.y += MulImag(a,b) diff --git a/src/kernels/xaxpy.opencl b/src/kernels/level1/level1.opencl similarity index 68% rename from src/kernels/xaxpy.opencl rename to src/kernels/level1/level1.opencl index b7ffe9ff..7e10426b 100644 --- a/src/kernels/xaxpy.opencl +++ b/src/kernels/level1/level1.opencl @@ -7,9 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit -// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't -// support vector data-types. 
+// This file contains the common functions and parameters specific for level 1 BLAS kernels. // // ================================================================================================= @@ -48,6 +46,48 @@ R"( // ================================================================================================= +// The vectorized multiply function +inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) { + #if VW == 1 + Multiply(cvec, aval, bvec); + #elif VW == 2 + Multiply(cvec.x, aval, bvec.x); + Multiply(cvec.y, aval, bvec.y); + #elif VW == 4 + Multiply(cvec.x, aval, bvec.x); + Multiply(cvec.y, aval, bvec.y); + Multiply(cvec.z, aval, bvec.z); + Multiply(cvec.w, aval, bvec.w); + #elif VW == 8 + Multiply(cvec.s0, aval, bvec.s0); + Multiply(cvec.s1, aval, bvec.s1); + Multiply(cvec.s2, aval, bvec.s2); + Multiply(cvec.s3, aval, bvec.s3); + Multiply(cvec.s4, aval, bvec.s4); + Multiply(cvec.s5, aval, bvec.s5); + Multiply(cvec.s6, aval, bvec.s6); + Multiply(cvec.s7, aval, bvec.s7); + #elif VW == 16 + Multiply(cvec.s0, aval, bvec.s0); + Multiply(cvec.s1, aval, bvec.s1); + Multiply(cvec.s2, aval, bvec.s2); + Multiply(cvec.s3, aval, bvec.s3); + Multiply(cvec.s4, aval, bvec.s4); + Multiply(cvec.s5, aval, bvec.s5); + Multiply(cvec.s6, aval, bvec.s6); + Multiply(cvec.s7, aval, bvec.s7); + Multiply(cvec.s8, aval, bvec.s8); + Multiply(cvec.s9, aval, bvec.s9); + Multiply(cvec.sA, aval, bvec.sA); + Multiply(cvec.sB, aval, bvec.sB); + Multiply(cvec.sC, aval, bvec.sC); + Multiply(cvec.sD, aval, bvec.sD); + Multiply(cvec.sE, aval, bvec.sE); + Multiply(cvec.sF, aval, bvec.sF); + #endif + return cvec; +} + // The vectorized multiply-add function inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { #if VW == 1 @@ -92,36 +132,6 @@ inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { // ================================================================================================= -// Full version of the 
kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xaxpy(const int n, const real alpha, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc) { - - // Loops over the work that needs to be done (allows for an arbitrary number of threads) - #pragma unroll - for (int id = get_global_id(0); id +// +// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit +// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't +// support vector data-types. +// +// This kernel uses the level-1 BLAS common tuning parameters. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xaxpy(const int n, const real alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id +// +// This file contains the Xcopy kernel. It contains one fast vectorized version in case of unit +// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't +// support vector data-types. +// +// This kernel uses the level-1 BLAS common tuning parameters. 
+// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xcopy(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id +// +// This file contains the Xdot kernel. It implements a dot-product computation using reduction +// kernels. Reduction is split in two parts. In the first (main) kernel the X and Y vectors are +// multiplied, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel +// is executed with a single workgroup only, computing the final result. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. 
+#ifndef WGS1 + #define WGS1 64 // The local work-group size of the main kernel +#endif +#ifndef WGS2 + #define WGS2 64 // The local work-group size of the epilogue kernel +#endif + +// ================================================================================================= + +// The main reduction kernel, performing the multiplication and the majority of the sum operation +__attribute__((reqd_work_group_size(WGS1, 1, 1))) +__kernel void Xdot(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + const __global real* restrict ygm, const int y_offset, const int y_inc, + __global real* output, const int do_conjugate) { + __local real lm[WGS1]; + const int lid = get_local_id(0); + const int wgid = get_group_id(0); + const int num_groups = get_num_groups(0); + + // Performs multiplication and the first steps of the reduction + real acc; + SetToZero(acc); + int id = wgid*WGS1 + lid; + while (id < n) { + real x = xgm[id*x_inc + x_offset]; + real y = ygm[id*y_inc + y_offset]; + if (do_conjugate) { COMPLEX_CONJUGATE(x); } + MultiplyAdd(acc, x, y); + id += WGS1*num_groups; + } + lm[lid] = acc; + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS1/2; s>0; s=s>>1) { + if (lid < s) { + Add(lm[lid], lm[lid], lm[lid + s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the per-workgroup result + if (lid == 0) { + output[wgid] = lm[0]; + } +} + +// ================================================================================================= + +// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to +// be launched with a single workgroup only. 
+__attribute__((reqd_work_group_size(WGS2, 1, 1))) +__kernel void XdotEpilogue(const __global real* restrict input, + __global real* dot, const int dot_offset) { + __local real lm[WGS2]; + const int lid = get_local_id(0); + + // Performs the first step of the reduction while loading the data + Add(lm[lid], input[lid], input[lid + WGS2]); + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS2/2; s>0; s=s>>1) { + if (lid < s) { + Add(lm[lid], lm[lid], lm[lid + s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the final result + if (lid == 0) { + dot[dot_offset] = lm[0]; + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl new file mode 100644 index 00000000..956de3c0 --- /dev/null +++ b/src/kernels/level1/xscal.opencl @@ -0,0 +1,59 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Xscal kernel. It contains one fast vectorized version in case of unit +// strides (incx=1) and no offsets (offx=0). Another version is more general, but doesn't support +// vector data-types. +// +// This kernel uses the level-1 BLAS common tuning parameters. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). 
Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xscal(const int n, const real alpha, + __global real* xgm, const int x_offset, const int x_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id +// +// This file contains the Xswap kernel. It contains one fast vectorized version in case of unit +// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't +// support vector data-types. +// +// This kernel uses the level-1 BLAS common tuning parameters. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. 
+R"( + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xswap(const int n, + __global real* xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id= y-ku && x < y+kl+1) { result = agm[a_ld*y + k + x + a_offset]; } + else { SetToZero(result); } + + // For symmetric/hermitian matrices + #elif defined(ROUTINE_HEMV) || defined(ROUTINE_SYMV) + if ((parameter == 0 && y <= x) || (parameter == 1 && x <= y)) { + result = agm[a_ld*y + x + a_offset]; + #if defined(ROUTINE_HEMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + result = agm[a_ld*x + y + a_offset]; + #if defined(ROUTINE_HEMV) + COMPLEX_CONJUGATE(result); + #endif + } + + // For triangular matrices + #elif defined(ROUTINE_TRMV) + if (((parameter == 0 || parameter == 2) && y <= x) || + ((parameter == 1 || parameter == 3) && x <= y)) { + result = agm[a_ld*y + x + a_offset]; + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + + // For symmetric/hermitian banded matrices + #elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV) + if (parameter == 1) { + if (x <= y) { + const int m = kl - y; + if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + const int m = kl - x; + if (y >= x-kl && y <= x) { result = agm[a_ld*x + m + y + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + else { + if (x >= y) { + const int m = -y; + if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; } + else 
{ SetToZero(result); } + #if defined(ROUTINE_HBMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + const int m = -x; + if (y >= x && y < x+kl+1) { result = agm[a_ld*x + m + y + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + + // For triangular banded matrices + #elif defined(ROUTINE_TBMV) + if (parameter == 1 || parameter == 3) { + if (x <= y) { + const int m = kl - y; + if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + else { + if (x >= y) { + const int m = -y; + if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + + // For symmetric/hermitian packed matrices + #elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV) + if (parameter == 1) { + if (x <= y) { + result = agm[((y+1)*y)/2 + x + a_offset]; + #if defined(ROUTINE_HPMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + result = agm[((x+1)*x)/2 + y + a_offset]; + #if defined(ROUTINE_HPMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + else { + if (x >= y) { + result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset]; + #if defined(ROUTINE_HPMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + result = agm[((2*a_ld-(x+1))*x)/2 + y + a_offset]; + #if defined(ROUTINE_HPMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + + // For triangular packed matrices + #elif defined(ROUTINE_TPMV) + if (parameter == 1 || parameter == 3) { + if (x <= y) { + result = agm[((y+1)*y)/2 + x + a_offset]; + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + else { + if (x >= y) { + result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset]; + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + 
else { + SetToZero(result); + } + } + + // For general matrices + #else + result = agm[a_ld*y + x + a_offset]; + #endif + + return result; } + // Loads a vector input value (1/2) inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y, const int a_ld) { - return agm[x + a_ld*y]; + return agm[a_ld*y + x]; } + // Loads a vector input value (2/2): as before, but different data-type inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y, const int a_ld) { - return agm[x + a_ld*y]; + return agm[a_ld*y + x]; } // ================================================================================================= @@ -106,7 +273,8 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, const __global real* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate) { + const int do_conjugate, const int parameter, + const int kl, const int ku) { // Local memory for the vector X __local real xlm[WGS1]; @@ -141,20 +309,20 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, // The multiply-add function for the main part (divisable by WGS1) if (a_rotated == 0) { // Not rotated #pragma unroll - for (int kl=0; kl::TestMatrixC(const size_t one, const size_t two, const Buf return StatusCode::kSuccess; } +// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size +template +StatusCode Routine::TestMatrixAP(const size_t n, const Buffer &buffer, + const size_t offset, const size_t data_size) { + try { + auto required_size = (((n*(n+1))/2) + offset)*data_size; + auto buffer_size = buffer.GetSize(); + if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; } + } catch (...) 
{ return StatusCode::kInvalidMatrixA; } + return StatusCode::kSuccess; +} + // ================================================================================================= // Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a @@ -223,6 +235,21 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, cons // ================================================================================================= +// Tests vector dot for validity: checks for a valid OpenCL buffer and for a sufficient buffer +// size (this test takes no increment). +template +StatusCode Routine::TestVectorDot(const size_t n, const Buffer &buffer, const size_t offset, + const size_t data_size) { + try { + auto required_size = (n + offset)*data_size; + auto buffer_size = buffer.GetSize(); + if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; } + } catch (...) { return StatusCode::kInvalidVectorDot; } + return StatusCode::kSuccess; +} + +// ================================================================================================= + // Copies or transposes a matrix and pads/unpads it with zeros template StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index 7646b0e4..f37a0724 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -29,10 +29,11 @@ template <> const Precision Xaxpy::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xaxpy::Xaxpy(Queue &queue, Event &event): - Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) { +Xaxpy::Xaxpy(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = - #include "../../kernels/xaxpy.opencl" + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xaxpy.opencl" ; } diff --git 
a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc new file mode 100644 index 00000000..2b00d43f --- /dev/null +++ b/src/routines/level1/xcopy.cc @@ -0,0 +1,117 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xcopy class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level1/xcopy.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xcopy::precision_ = Precision::kSingle; +template <> const Precision Xcopy::precision_ = Precision::kDouble; +template <> const Precision Xcopy::precision_ = Precision::kComplexSingle; +template <> const Precision Xcopy::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xcopy::Xcopy(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, precision_) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xcopy.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xcopy::DoCopy(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer 
&y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // The fast version requires zero offsets, unit increments and n a multiple of WGS*WPT*VW + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy"; + + // Retrieves the Xcopy kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast(x_offset)); + kernel.SetArgument(3, static_cast(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast(y_offset)); + kernel.SetArgument(6, static_cast(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xcopy; +template class Xcopy; +template class Xcopy; +template class Xcopy; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc new file mode 100644 index 00000000..a0c1e756 --- /dev/null +++ b/src/routines/level1/xdot.cc @@ -0,0 +1,115 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdot class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level1/xdot.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xdot::precision_ = Precision::kSingle; +template <> const Precision Xdot::precision_ = Precision::kDouble; +template <> const Precision Xdot::precision_ = Precision::kComplexSingle; +template <> const Precision Xdot::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xdot::Xdot(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, precision_) { + source_string_ = + #include "../../kernels/level1/xdot.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xdot::DoDot(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorDot(1, dot_buffer, dot_offset, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xdot kernels from the compiled binary + try { + auto& program = 
GetProgramFromCache(); + auto kernel1 = Kernel(program, "Xdot"); + auto kernel2 = Kernel(program, "XdotEpilogue"); + + // Creates the buffer for intermediate values: one partial sum per main-kernel work-group + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, y_buffer()); + kernel1.SetArgument(5, static_cast(y_offset)); + kernel1.SetArgument(6, static_cast(y_inc)); + kernel1.SetArgument(7, temp_buffer()); + kernel1.SetArgument(8, static_cast(do_conjugate)); + + // Launches the main kernel: 'temp_size' work-groups of WGS1 work-items each + auto global1 = std::vector{db_["WGS1"]*temp_size}; + auto local1 = std::vector{db_["WGS1"]}; + status = RunKernel(kernel1, global1, local1); + if (ErrorIn(status)) { return status; } + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, dot_buffer()); + kernel2.SetArgument(2, static_cast(dot_offset)); + + // Launches the epilogue kernel: a single work-group of WGS2 work-items + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, global2, local2); + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xdot; +template class Xdot; +template class Xdot; +template class Xdot; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xdotc.cc b/src/routines/level1/xdotc.cc new file mode 100644 index 00000000..f414f556 --- /dev/null +++ b/src/routines/level1/xdotc.cc @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdotc class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level1/xdotc.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xdotc::Xdotc(Queue &queue, Event &event, const std::string &name): + Xdot(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xdotc::DoDotc(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + return DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + true); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xdotc; +template class Xdotc; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xdotu.cc b/src/routines/level1/xdotu.cc new file mode 100644 index 00000000..0b1bd2a8 --- /dev/null +++ b/src/routines/level1/xdotu.cc @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdotu class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level1/xdotu.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xdotu::Xdotu(Queue &queue, Event &event, const std::string &name): + Xdot(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xdotu::DoDotu(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + return DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + false); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xdotu; +template class Xdotu; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc new file mode 100644 index 00000000..3fc36b3d --- /dev/null +++ b/src/routines/level1/xscal.cc @@ -0,0 +1,111 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xscal class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level1/xscal.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xscal::precision_ = Precision::kSingle; +template <> const Precision Xscal::precision_ = Precision::kDouble; +template <> const Precision Xscal::precision_ = Precision::kComplexSingle; +template <> const Precision Xscal::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xscal::Xscal(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, precision_) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xscal.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xscal::DoScal(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vector for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? 
"XscalFast" : "Xscal"; + + // Retrieves the Xscal kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast(x_offset)); + kernel.SetArgument(4, static_cast(x_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xscal; +template class Xscal; +template class Xscal; +template class Xscal; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc new file mode 100644 index 00000000..123977d3 --- /dev/null +++ b/src/routines/level1/xswap.cc @@ -0,0 +1,117 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xswap class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level1/xswap.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xswap::precision_ = Precision::kSingle; +template <> const Precision Xswap::precision_ = Precision::kDouble; +template <> const Precision Xswap::precision_ = Precision::kComplexSingle; +template <> const Precision Xswap::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xswap::Xswap(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, precision_) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xswap.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xswap::DoSwap(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc, 
sizeof(T)); + if (ErrorIn(status)) { return status; } + + // The fast version requires zero offsets, unit increments and n a multiple of WGS*WPT*VW + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap"; + + // Retrieves the Xswap kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast(x_offset)); + kernel.SetArgument(3, static_cast(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast(y_offset)); + kernel.SetArgument(6, static_cast(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xswap; +template class Xswap; +template class Xswap; +template class Xswap; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc new file mode 100644 index 00000000..14d391ca --- /dev/null +++ b/src/routines/level2/xgbmv.cc @@ -0,0 +1,67 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgbmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level2/xgbmv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgbmv::Xgbmv(Queue &queue, Event &event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgbmv::DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Swaps the lower and upper band counts when the layout is row-major + auto rotated = (layout == Layout::kRowMajor); + auto kl_real = (rotated) ? ku : kl; + auto ku_real = (rotated) ? kl : ku; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_GBMV define. 
+ bool fast_kernels = false; + return MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + 0, false, kl_real, ku_real); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgbmv; +template class Xgbmv; +template class Xgbmv; +template class Xgbmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index f95a9957..1b768dcd 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -32,8 +32,7 @@ template Xgemv::Xgemv(Queue &queue, Event &event, const std::string &name): Routine(queue, event, name, {"Pad", "Xgemv"}, precision_) { source_string_ = - #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV) - #include "../../kernels/xgemv.opencl" + #include "../../kernels/level2/xgemv.opencl" ; } @@ -49,6 +48,31 @@ StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, const T beta, const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + // Performs the matrix-vector multiplication + return MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + true, true, + 0, false, 0, 0); // N/A for this routine +} + +// ================================================================================================= + +// The generic implementation, also suited for other (non general) matrix-vector multiplications +template +StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const 
size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku) { + // Makes sure all dimensions are larger than zero if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } @@ -62,6 +86,11 @@ StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, auto m_real = (a_transposed) ? n : m; auto n_real = (a_transposed) ? m : n; + // Special adjustments for banded matrices + if (kl != 0 || ku != 0) { + a_one = kl+ku+1; + } + // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) auto a_rotated = a_transposed ^ a_altlayout; @@ -69,7 +98,9 @@ StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, auto a_conjugate = (a_transpose == Transpose::kConjugate); // Tests the matrix and the vectors for validity - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + auto status = StatusCode::kSuccess; + if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); } + else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); } if (ErrorIn(status)) { return status; } status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T)); if (ErrorIn(status)) { return status; } @@ -77,26 +108,26 @@ StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, if (ErrorIn(status)) { return status; } // Determines whether or not the fast-version can be used - bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && - IsMultiple(n, db_["WGS2"]) && - IsMultiple(a_ld, db_["VW2"]); - bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && - IsMultiple(n, db_["WGS3"]) && - IsMultiple(a_ld, db_["VW3"]); + fast_kernel = 
fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && + IsMultiple(n, db_["WGS2"]) && + IsMultiple(a_ld, db_["VW2"]); + fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && + IsMultiple(n, db_["WGS3"]) && + IsMultiple(a_ld, db_["VW3"]); // If possible, run the fast-version (rotated or non-rotated) of the kernel auto kernel_name = "Xgemv"; auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); auto global_size = m_ceiled / db_["WPT1"]; auto local_size = db_["WGS1"]; - if (use_fast_kernel) { + if (fast_kernel) { kernel_name = "XgemvFast"; global_size = m_real / db_["WPT2"]; local_size = db_["WGS2"]; } - if (use_fast_kernel_rot) { + if (fast_kernel_rot) { kernel_name = "XgemvFastRot"; global_size = m_real / db_["WPT3"]; local_size = db_["WGS3"]; @@ -123,6 +154,9 @@ StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, kernel.SetArgument(12, static_cast(y_offset)); kernel.SetArgument(13, static_cast(y_inc)); kernel.SetArgument(14, static_cast(a_conjugate)); + kernel.SetArgument(15, static_cast(parameter)); // extra parameter used for symm/herm + kernel.SetArgument(16, static_cast(kl)); // only used for banded matrices + kernel.SetArgument(17, static_cast(ku)); // only used for banded matrices // Launches the kernel auto global = std::vector{global_size}; diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc new file mode 100644 index 00000000..f59a7cb3 --- /dev/null +++ b/src/routines/level2/xhbmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xhbmv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhbmv::Xhbmv(Queue &queue, Event &event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhbmv::DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HBMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhbmv; +template class Xhbmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc index 2d92e45f..5a58b28b 100644 --- a/src/routines/level2/xhemv.cc +++ b/src/routines/level2/xhemv.cc @@ -37,57 +37,21 @@ StatusCode Xhemv::DoHemv(const Layout layout, const Triangle triangle, const T beta, const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? 
"HermUpperToSquared" : "HermLowerToSquared"; - - // Temporary buffer for a copy of the hermitian matrix - try { - auto temp_herm = Buffer(context_, n*n); - - // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemv - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the hermitian-to-squared kernel - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(n)); - kernel.SetArgument(5, static_cast(n)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_herm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // hermitian-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemv code - status = DoGemv(layout, Transpose::kNo, n, n, alpha, - temp_herm, 0, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc); - - // Return the status of the Xgemv routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HEMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc new file mode 100644 index 00000000..2269255d --- /dev/null +++ b/src/routines/level2/xhpmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhpmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xhpmv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhpmv::Xhpmv(Queue &queue, Event &event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhpmv::DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle 
== Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HPMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhpmv; +template class Xhpmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc new file mode 100644 index 00000000..457bd762 --- /dev/null +++ b/src/routines/level2/xsbmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsbmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level2/xsbmv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsbmv::Xsbmv(Queue &queue, Event &event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsbmv::DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SBMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsbmv; +template class Xsbmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc new file mode 100644 index 00000000..4f1a9c61 --- /dev/null +++ b/src/routines/level2/xspmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xspmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level2/xspmv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xspmv::Xspmv(Queue &queue, Event &event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xspmv::DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SPMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xspmv; +template class Xspmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc index 2ccb51f6..ec12324b 100644 --- a/src/routines/level2/xsymv.cc +++ b/src/routines/level2/xsymv.cc @@ -37,57 +37,21 @@ StatusCode Xsymv::DoSymv(const Layout layout, const Triangle triangle, const T beta, const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? 
"SymmUpperToSquared" : "SymmLowerToSquared"; - - // Temporary buffer for a copy of the symmetric matrix - try { - auto temp_symm = Buffer(context_, n*n); - - // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemv - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the symmetric-to-squared kernel - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(n)); - kernel.SetArgument(5, static_cast(n)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_symm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // symmetric-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemv code - status = DoGemv(layout, Transpose::kNo, n, n, alpha, - temp_symm, 0, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc); - - // Return the status of the Xgemv routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SYMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc new file mode 100644 index 00000000..2e1aebff --- /dev/null +++ b/src/routines/level2/xtbmv.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xtbmv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtbmv::Xtbmv(Queue &queue, Event &event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xtbmv::DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer(context_, 
n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TBMV define. + auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, k, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtbmv; +template class Xtbmv; +template class Xtbmv; +template class Xtbmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc new file mode 100644 index 00000000..aa0e099b --- /dev/null +++ b/src/routines/level2/xtpmv.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is 
part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtpmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xtpmv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtpmv::Xtpmv(Queue &queue, Event &event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xtpmv::DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. 
+ // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TPMV define. + auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast(1), + ap_buffer, ap_offset, n, + scratch_buffer, x_offset, x_inc, static_cast(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, true, 0, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtpmv; +template class Xtpmv; +template class Xtpmv; +template class Xtpmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc new file mode 100644 index 00000000..94424743 --- /dev/null +++ b/src/routines/level2/xtrmv.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level2/xtrmv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtrmv::Xtrmv(Queue &queue, Event &event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xtrmv::DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TRMV define. 
+ auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, 0, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrmv; +template class Xtrmv; +template class Xtrmv; +template class Xtrmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 525a82e6..94aadcad 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -29,14 +29,14 @@ template <> const Precision Xgemm::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xgemm::Xgemm(Queue &queue, Event &event): - Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xgemm::Xgemm(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include 
"../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index a1c0c7c1..bcc60dee 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -21,8 +21,8 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xhemm::Xhemm(Queue &queue, Event &event): - Xgemm(queue, event) { +Xhemm::Xhemm(Queue &queue, Event &event, const std::string &name): + Xgemm(queue, event, name) { } // ================================================================================================= diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 29b2f733..5b84decb 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -27,14 +27,14 @@ template <> const Precision Xher2k::precision_ = Precision::kCom // Constructor: forwards to base class constructor template -Xher2k::Xher2k(Queue &queue, Event &event): - Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xher2k::Xher2k(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 5174e9ab..6a915c0b 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -27,14 +27,14 @@ template <> const Precision Xherk::precision_ = Precision::kComp 
// Constructor: forwards to base class constructor template -Xherk::Xherk(Queue &queue, Event &event): - Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xherk::Xherk(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 37c08d3b..583d5c7d 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -21,8 +21,8 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xsymm::Xsymm(Queue &queue, Event &event): - Xgemm(queue, event) { +Xsymm::Xsymm(Queue &queue, Event &event, const std::string &name): + Xgemm(queue, event, name) { } // ================================================================================================= diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index b36e7c5e..de5f1afc 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -29,14 +29,14 @@ template <> const Precision Xsyr2k::precision_ = Precision::kComplexDou // Constructor: forwards to base class constructor template -Xsyr2k::Xsyr2k(Queue &queue, Event &event): - Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xsyr2k::Xsyr2k(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, 
{"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index e4668216..d8fc6357 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -29,14 +29,14 @@ template <> const Precision Xsyrk::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xsyrk::Xsyrk(Queue &queue, Event &event): - Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xsyrk::Xsyrk(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 8be7d950..1180c026 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -21,8 +21,8 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xtrmm::Xtrmm(Queue &queue, Event &event): - Xgemm(queue, event) { +Xtrmm::Xtrmm(Queue 
&queue, Event &event, const std::string &name): + Xgemm(queue, event, name) { } // ================================================================================================= diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc index f38a28f3..e2837e60 100644 --- a/src/tuning/copy.cc +++ b/src/tuning/copy.cc @@ -31,7 +31,7 @@ class TuneCopy { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/copy.opencl" + #include "../src/kernels/level3/copy.opencl" ; } @@ -53,6 +53,7 @@ class TuneCopy { static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { @@ -68,6 +69,7 @@ class TuneCopy { // Sets the base thread configuration static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } static std::vector LocalSize() { return {1, 1}; } static std::vector LocalSizeRef() { return {8, 8}; } @@ -81,7 +83,8 @@ class TuneCopy { // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &) { + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc index 2ce566fb..72729422 100644 --- a/src/tuning/pad.cc +++ b/src/tuning/pad.cc @@ -31,7 +31,7 @@ class TunePad { static std::string GetSources() { return #include "../src/kernels/common.opencl" - 
#include "../src/kernels/pad.opencl" + #include "../src/kernels/level3/pad.opencl" ; } @@ -53,6 +53,7 @@ class TunePad { static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { @@ -68,6 +69,7 @@ class TunePad { // Sets the base thread configuration static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } static std::vector LocalSize() { return {1, 1}; } static std::vector LocalSizeRef() { return {8, 8}; } @@ -81,7 +83,8 @@ class TunePad { // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &) { + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.m)); diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc index 8d494745..5edd89e0 100644 --- a/src/tuning/padtranspose.cc +++ b/src/tuning/padtranspose.cc @@ -31,7 +31,7 @@ class TunePadTranspose { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/padtranspose.opencl" + #include "../src/kernels/level3/padtranspose.opencl" ; } @@ -53,6 +53,7 @@ class TunePadTranspose { static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this 
kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { @@ -72,6 +73,7 @@ class TunePadTranspose { // Sets the base thread configuration static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } static std::vector LocalSize() { return {1, 1}; } static std::vector LocalSizeRef() { return {8, 8}; } @@ -85,7 +87,8 @@ class TunePadTranspose { // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &) { + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.m)); diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc index 2ffdb7aa..113e0a81 100644 --- a/src/tuning/transpose.cc +++ b/src/tuning/transpose.cc @@ -31,7 +31,7 @@ class TuneTranspose { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/transpose.opencl" + #include "../src/kernels/level3/transpose.opencl" ; } @@ -53,6 +53,7 @@ class TuneTranspose { static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { @@ -73,6 +74,7 @@ class TuneTranspose { // Sets the base thread configuration static std::vector GlobalSize(const Arguments &args) { 
return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } static std::vector LocalSize() { return {1, 1}; } static std::vector LocalSizeRef() { return {8, 8}; } @@ -86,7 +88,8 @@ class TuneTranspose { // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &) { + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); diff --git a/src/tuning/xaxpy.cc b/src/tuning/xaxpy.cc index cc9e81d3..31aa6a8e 100644 --- a/src/tuning/xaxpy.cc +++ b/src/tuning/xaxpy.cc @@ -31,7 +31,8 @@ class TuneXaxpy { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/xaxpy.opencl" + #include "../src/kernels/level1/level1.opencl" + #include "../src/kernels/level1/xaxpy.opencl" ; } @@ -52,11 +53,12 @@ class TuneXaxpy { static double DefaultFraction() { return 1.0; } // N/A for this kernel // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } // N/A for this kernel - static size_t GetSizeY(const Arguments &args) { return args.n; } // N/A for this kernel + static size_t GetSizeX(const Arguments &args) { return args.n; } + static size_t GetSizeY(const Arguments &args) { return args.n; } static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { @@ -71,6 +73,7 @@ class TuneXaxpy { // Sets the 
base thread configuration static std::vector GlobalSize(const Arguments &args) { return {args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } static std::vector LocalSize() { return {1}; } static std::vector LocalSizeRef() { return {64}; } @@ -84,7 +87,8 @@ class TuneXaxpy { // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &x_vec, std::vector &y_vec, - std::vector &, std::vector &, std::vector &) { + std::vector &, std::vector &, std::vector &, + std::vector &) { tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(args.alpha); tuner.AddArgumentInput(x_vec); diff --git a/src/tuning/xdot.cc b/src/tuning/xdot.cc new file mode 100644 index 00000000..ff6bee16 --- /dev/null +++ b/src/tuning/xdot.cc @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are +// not verified, since the result is not final and depends on the WGS2 parameter. 
+// +// ================================================================================================= + +#include +#include + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXdot { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xdot"; } + static std::string KernelName() { return "Xdot"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level1/xdot.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgN}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1; } // N/A for this kernel + static size_t DefaultN() { return 4096*1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { return args.n; } + static size_t GetSizeY(const Arguments &args) { return args.n; } + static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &args) { return args.n; } // Worst case + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "WGS1", {32, 64, 128, 256, 512, 1024}); + tuner.AddParameter(id, "WGS2", {32, 64, 128, 256, 512, 1024}); + 
tuner.AddParameter(id, "VW", {1}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &) { return {2}; } + static std::vector GlobalSizeRef(const Arguments &) { return {2*64*64}; } + static std::vector LocalSize() { return {1}; } + static std::vector LocalSizeRef() { return {64}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"WGS1"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {{"WGS1"},{"WGS2"}}; } + static TransformVector DivGlobal() { return {}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &x_vec, std::vector &y_vec, + std::vector &, std::vector &, std::vector &, + std::vector &temp) { + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(y_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(temp); // No output checking for the result - size varies + tuner.AddArgumentScalar(static_cast(false)); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return (2*args.n + 1) * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char 
*argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc index 302f2bd5..c06e3e72 100644 --- a/src/tuning/xgemm.cc +++ b/src/tuning/xgemm.cc @@ -31,7 +31,7 @@ class TuneXgemm { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/xgemm.opencl" + #include "../src/kernels/level3/xgemm.opencl" ; } @@ -55,6 +55,7 @@ class TuneXgemm { static size_t GetSizeA(const Arguments &args) { return args.m * args.k; } static size_t GetSizeB(const Arguments &args) { return args.n * args.k; } static size_t GetSizeC(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { @@ -103,6 +104,7 @@ class TuneXgemm { // Sets the base thread configuration static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } static std::vector LocalSize() { return {1, 1}; } static std::vector LocalSizeRef() { return {8, 8}; } @@ -116,7 +118,8 @@ class TuneXgemm { // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &c_mat) { 
+ std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, + std::vector &) { tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.k)); diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc index e22b5103..6a066518 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/xgemv.cc @@ -34,7 +34,7 @@ class TuneXgemv { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/xgemv.opencl" + #include "../src/kernels/level2/xgemv.opencl" ; } @@ -56,6 +56,7 @@ class TuneXgemv { static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { @@ -75,6 +76,7 @@ class TuneXgemv { // Sets the base thread configuration static std::vector GlobalSize(const Arguments &args) { return {args.m}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } static std::vector LocalSize() { return {1}; } static std::vector LocalSizeRef() { return {64}; } @@ -88,7 +90,8 @@ class TuneXgemv { // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &x_vec, std::vector &y_vec, - std::vector &a_mat, std::vector &, std::vector &) { + std::vector &a_mat, std::vector &, std::vector &, + std::vector &) { auto a_rotated = (V==3) ? 
1 : 0; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); diff --git a/test/correctness/routines/level1/xaxpy.cc b/test/correctness/routines/level1/xaxpy.cc index 1bb10f7b..746e0001 100644 --- a/test/correctness/routines/level1/xaxpy.cc +++ b/test/correctness/routines/level1/xaxpy.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xaxpy routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level1/xaxpy.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level1/xcopy.cc b/test/correctness/routines/level1/xcopy.cc new file mode 100644 index 00000000..3e16ffc6 --- /dev/null +++ b/test/correctness/routines/level1/xcopy.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xcopy.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SCOPY"); + clblast::RunTests, double, double>(argc, argv, true, "DCOPY"); + clblast::RunTests, float2, float2>(argc, argv, true, "CCOPY"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZCOPY"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xdot.cc b/test/correctness/routines/level1/xdot.cc new file mode 100644 index 00000000..5ea105e0 --- /dev/null +++ b/test/correctness/routines/level1/xdot.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xdot.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SDOT"); + clblast::RunTests, double, double>(argc, argv, true, "DDOT"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xdotc.cc b/test/correctness/routines/level1/xdotc.cc new file mode 100644 index 00000000..76aaa0ec --- /dev/null +++ b/test/correctness/routines/level1/xdotc.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xdotc.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CDOTC"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTC"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xdotu.cc b/test/correctness/routines/level1/xdotu.cc new file mode 100644 index 00000000..aecde4f5 --- /dev/null +++ b/test/correctness/routines/level1/xdotu.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xdotu.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CDOTU"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTU"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xscal.cc b/test/correctness/routines/level1/xscal.cc new file mode 100644 index 00000000..4d138fad --- /dev/null +++ b/test/correctness/routines/level1/xscal.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xscal.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSCAL"); + clblast::RunTests, double, double>(argc, argv, true, "DSCAL"); + clblast::RunTests, float2, float2>(argc, argv, true, "CSCAL"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZSCAL"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xswap.cc b/test/correctness/routines/level1/xswap.cc new file mode 100644 index 00000000..38f110f7 --- /dev/null +++ b/test/correctness/routines/level1/xswap.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xswap.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSWAP"); + clblast::RunTests, double, double>(argc, argv, true, "DSWAP"); + clblast::RunTests, float2, float2>(argc, argv, true, "CSWAP"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZSWAP"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgbmv.cc b/test/correctness/routines/level2/xgbmv.cc new file mode 100644 index 00000000..b28c5978 --- /dev/null +++ b/test/correctness/routines/level2/xgbmv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xgbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SGBMV"); + clblast::RunTests, double, double>(argc, argv, true, "DGBMV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CGBMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZGBMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgemv.cc b/test/correctness/routines/level2/xgemv.cc index f7229735..14eb74d1 100644 --- a/test/correctness/routines/level2/xgemv.cc +++ b/test/correctness/routines/level2/xgemv.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xgemv routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level2/xgemv.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level2/xger.cc b/test/correctness/routines/level2/xger.cc new file mode 100644 index 00000000..c37a5c41 --- /dev/null +++ b/test/correctness/routines/level2/xger.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xger.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SGER"); + clblast::RunTests, double, double>(argc, argv, true, "DGER"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgerc.cc b/test/correctness/routines/level2/xgerc.cc new file mode 100644 index 00000000..8fd31142 --- /dev/null +++ b/test/correctness/routines/level2/xgerc.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xgerc.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CGERC"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZGERC"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgeru.cc b/test/correctness/routines/level2/xgeru.cc new file mode 100644 index 00000000..ee92416b --- /dev/null +++ b/test/correctness/routines/level2/xgeru.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xgeru.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CGERU"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZGERU"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhbmv.cc b/test/correctness/routines/level2/xhbmv.cc new file mode 100644 index 00000000..4cd137a7 --- /dev/null +++ b/test/correctness/routines/level2/xhbmv.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xhbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CHBMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZHBMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhemv.cc b/test/correctness/routines/level2/xhemv.cc index 183aebc2..20c5370c 100644 --- a/test/correctness/routines/level2/xhemv.cc +++ b/test/correctness/routines/level2/xhemv.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xhemv routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level2/xhemv.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level2/xher.cc b/test/correctness/routines/level2/xher.cc new file mode 100644 index 00000000..5b9b48be --- /dev/null +++ b/test/correctness/routines/level2/xher.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xher.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float>(argc, argv, false, "CHER"); + clblast::RunTests, double2, double>(argc, argv, true, "ZHER"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xher2.cc b/test/correctness/routines/level2/xher2.cc new file mode 100644 index 00000000..093b3959 --- /dev/null +++ b/test/correctness/routines/level2/xher2.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xher2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CHER2"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZHER2"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpmv.cc b/test/correctness/routines/level2/xhpmv.cc new file mode 100644 index 00000000..cbf41443 --- /dev/null +++ b/test/correctness/routines/level2/xhpmv.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xhpmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CHPMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZHPMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpr.cc b/test/correctness/routines/level2/xhpr.cc new file mode 100644 index 00000000..a720aaef --- /dev/null +++ b/test/correctness/routines/level2/xhpr.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xhpr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float>(argc, argv, false, "CHPR"); + clblast::RunTests, double2, double>(argc, argv, true, "ZHPR"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpr2.cc b/test/correctness/routines/level2/xhpr2.cc new file mode 100644 index 00000000..0fed97e1 --- /dev/null +++ b/test/correctness/routines/level2/xhpr2.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xhpr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CHPR2"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZHPR2"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsbmv.cc b/test/correctness/routines/level2/xsbmv.cc new file mode 100644 index 00000000..212e2c3a --- /dev/null +++ b/test/correctness/routines/level2/xsbmv.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xsbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSBMV"); + clblast::RunTests, double, double>(argc, argv, true, "DSBMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xspmv.cc b/test/correctness/routines/level2/xspmv.cc new file mode 100644 index 00000000..dc833024 --- /dev/null +++ b/test/correctness/routines/level2/xspmv.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xspmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSPMV"); + clblast::RunTests, double, double>(argc, argv, true, "DSPMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xspr.cc b/test/correctness/routines/level2/xspr.cc new file mode 100644 index 00000000..a0104dd4 --- /dev/null +++ b/test/correctness/routines/level2/xspr.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xspr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSPR"); + clblast::RunTests, double, double>(argc, argv, true, "DSPR"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xspr2.cc b/test/correctness/routines/level2/xspr2.cc new file mode 100644 index 00000000..5fe5827f --- /dev/null +++ b/test/correctness/routines/level2/xspr2.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xspr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSPR2"); + clblast::RunTests, double, double>(argc, argv, true, "DSPR2"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsymv.cc b/test/correctness/routines/level2/xsymv.cc index a479b999..6224739f 100644 --- a/test/correctness/routines/level2/xsymv.cc +++ b/test/correctness/routines/level2/xsymv.cc @@ -7,14 +7,14 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xsymv routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level2/xsymv.h" -// ================================================================================================= +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { diff --git a/test/correctness/routines/level2/xsyr.cc b/test/correctness/routines/level2/xsyr.cc new file mode 100644 index 00000000..a47b918f --- /dev/null +++ b/test/correctness/routines/level2/xsyr.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xsyr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSYR"); + clblast::RunTests, double, double>(argc, argv, true, "DSYR"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsyr2.cc b/test/correctness/routines/level2/xsyr2.cc new file mode 100644 index 00000000..1743632c --- /dev/null +++ b/test/correctness/routines/level2/xsyr2.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xsyr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSYR2"); + clblast::RunTests, double, double>(argc, argv, true, "DSYR2"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtbmv.cc b/test/correctness/routines/level2/xtbmv.cc new file mode 100644 index 00000000..d3bbbade --- /dev/null +++ b/test/correctness/routines/level2/xtbmv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STBMV"); + clblast::RunTests, double, double>(argc, argv, true, "DTBMV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTBMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTBMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtbsv.cc b/test/correctness/routines/level2/xtbsv.cc new file mode 100644 index 00000000..c8a8a583 --- /dev/null +++ b/test/correctness/routines/level2/xtbsv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtbsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STBSV"); + clblast::RunTests, double, double>(argc, argv, true, "DTBSV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTBSV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTBSV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtpmv.cc b/test/correctness/routines/level2/xtpmv.cc new file mode 100644 index 00000000..95489a65 --- /dev/null +++ b/test/correctness/routines/level2/xtpmv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtpmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STPMV"); + clblast::RunTests, double, double>(argc, argv, true, "DTPMV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTPMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTPMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtpsv.cc b/test/correctness/routines/level2/xtpsv.cc new file mode 100644 index 00000000..97d27271 --- /dev/null +++ b/test/correctness/routines/level2/xtpsv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtpsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STPSV"); + clblast::RunTests, double, double>(argc, argv, true, "DTPSV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTPSV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTPSV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtrmv.cc b/test/correctness/routines/level2/xtrmv.cc new file mode 100644 index 00000000..ca50af88 --- /dev/null +++ b/test/correctness/routines/level2/xtrmv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtrmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STRMV"); + clblast::RunTests, double, double>(argc, argv, true, "DTRMV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTRMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTRMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtrsv.cc b/test/correctness/routines/level2/xtrsv.cc new file mode 100644 index 00000000..bfca0f20 --- /dev/null +++ b/test/correctness/routines/level2/xtrsv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtrsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STRSV"); + clblast::RunTests, double, double>(argc, argv, true, "DTRSV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTRSV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xgemm.cc b/test/correctness/routines/level3/xgemm.cc index 90302095..632724ed 100644 --- a/test/correctness/routines/level3/xgemm.cc +++ b/test/correctness/routines/level3/xgemm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xgemm routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xgemm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xhemm.cc b/test/correctness/routines/level3/xhemm.cc index 60555604..74e22080 100644 --- a/test/correctness/routines/level3/xhemm.cc +++ b/test/correctness/routines/level3/xhemm.cc @@ -7,22 +7,18 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xhemm routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xhemm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { - clblast::RunTests, float2, float2>(argc, argv, true, "CHEMM"); + clblast::RunTests, float2, float2>(argc, argv, false, "CHEMM"); clblast::RunTests, double2, double2>(argc, argv, true, "ZHEMM"); return 0; } diff --git a/test/correctness/routines/level3/xher2k.cc b/test/correctness/routines/level3/xher2k.cc index dd03fcd7..6377572a 100644 --- a/test/correctness/routines/level3/xher2k.cc +++ b/test/correctness/routines/level3/xher2k.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xher2k routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xher2k.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xherk.cc b/test/correctness/routines/level3/xherk.cc index 32b8aa2c..32a39bfc 100644 --- a/test/correctness/routines/level3/xherk.cc +++ b/test/correctness/routines/level3/xherk.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xherk routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xherk.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xsymm.cc b/test/correctness/routines/level3/xsymm.cc index 94968e31..046fca16 100644 --- a/test/correctness/routines/level3/xsymm.cc +++ b/test/correctness/routines/level3/xsymm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xsymm routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xsymm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xsyr2k.cc b/test/correctness/routines/level3/xsyr2k.cc index 3b8e601b..db2b83d9 100644 --- a/test/correctness/routines/level3/xsyr2k.cc +++ b/test/correctness/routines/level3/xsyr2k.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xsyr2k routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xsyr2k.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xsyrk.cc b/test/correctness/routines/level3/xsyrk.cc index f299342a..3dad3535 100644 --- a/test/correctness/routines/level3/xsyrk.cc +++ b/test/correctness/routines/level3/xsyrk.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xsyrk routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xsyrk.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xtrmm.cc b/test/correctness/routines/level3/xtrmm.cc index 6efde5f8..2d843e3e 100644 --- a/test/correctness/routines/level3/xtrmm.cc +++ b/test/correctness/routines/level3/xtrmm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xtrmm routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xtrmm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xtrsm.cc b/test/correctness/routines/level3/xtrsm.cc new file mode 100644 index 00000000..b5f5045e --- /dev/null +++ b/test/correctness/routines/level3/xtrsm.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xtrsm.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STRSM"); + clblast::RunTests, double, double>(argc, argv, true, "DTRSM"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTRSM"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSM"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index ff81f4c3..85e18381 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -57,11 +57,15 @@ TestBlas::TestBlas(int argc, char *argv[], const bool 
silent, a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); + dot_source_.resize(std::max(max_mat, max_matvec) + max_offset); PopulateVector(x_source_); PopulateVector(y_source_); PopulateVector(a_source_); PopulateVector(b_source_); PopulateVector(c_source_); + PopulateVector(ap_source_); + PopulateVector(dot_source_); } // =============================================================================================== @@ -81,12 +85,16 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto a_mat1 = Buffer(context_, args.a_size); auto b_mat1 = Buffer(context_, args.b_size); auto c_mat1 = Buffer(context_, args.c_size); + auto ap_mat1 = Buffer(context_, args.ap_size); + auto dot1 = Buffer(context_, args.dot_size); x_vec1.Write(queue_, args.x_size, x_source_); y_vec1.Write(queue_, args.y_size, y_source_); a_mat1.Write(queue_, args.a_size, a_source_); b_mat1.Write(queue_, args.b_size, b_source_); c_mat1.Write(queue_, args.c_size, c_source_); - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}; + ap_mat1.Write(queue_, args.ap_size, ap_source_); + dot1.Write(queue_, args.dot_size, dot_source_); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1}; auto status1 = run_reference_(args, buffers1, queue_); // Runs the CLBlast code @@ -95,12 +103,16 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto a_mat2 = Buffer(context_, args.a_size); auto b_mat2 = Buffer(context_, args.b_size); auto c_mat2 = Buffer(context_, args.c_size); + auto ap_mat2 = Buffer(context_, args.ap_size); + auto dot2 = Buffer(context_, args.dot_size); x_vec2.Write(queue_, args.x_size, x_source_); y_vec2.Write(queue_, 
args.y_size, y_source_); a_mat2.Write(queue_, args.a_size, a_source_); b_mat2.Write(queue_, args.b_size, b_source_); c_mat2.Write(queue_, args.c_size, c_source_); - auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}; + ap_mat2.Write(queue_, args.ap_size, ap_source_); + dot2.Write(queue_, args.dot_size, dot_source_); + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2}; auto status2 = run_routine_(args, buffers2, queue_); // Tests for equality of the two status codes @@ -149,25 +161,35 @@ void TestBlas::TestInvalid(std::vector> &test_vector, const st auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); + auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr); auto x_vec1 = Buffer(x1); auto y_vec1 = Buffer(y1); auto a_mat1 = Buffer(a1); auto b_mat1 = Buffer(b1); auto c_mat1 = Buffer(c1); + auto ap_mat1 = Buffer(ap1); + auto dot1 = Buffer(d1); auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); + auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr); auto x_vec2 = Buffer(x2); auto 
y_vec2 = Buffer(y2); auto a_mat2 = Buffer(a2); auto b_mat2 = Buffer(b2); auto c_mat2 = Buffer(c2); + auto ap_mat2 = Buffer(ap2); + auto dot2 = Buffer(d2); // Runs the two routines - auto status1 = run_reference_(args, Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}, queue_); - auto status2 = run_routine_(args, Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}, queue_); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1}; + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2}; + auto status1 = run_reference_(args, buffers1, queue_); + auto status2 = run_routine_(args, buffers2, queue_); // Tests for equality of the two status codes TestErrorCodes(status1, status2, args); diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 8a86c65e..bfd1763c 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -49,6 +49,7 @@ class TestBlas: public Tester { const std::vector kIncrements = { 1, 2, 7 }; const std::vector kMatrixDims = { 7, 64 }; const std::vector kMatrixVectorDims = { 61, 512 }; + const std::vector kBandSizes = { 4, 19 }; const std::vector kOffsets = GetOffsets(); const std::vector kAlphaValues = GetExampleScalars(full_test_); const std::vector kBetaValues = GetExampleScalars(full_test_); @@ -90,6 +91,8 @@ class TestBlas: public Tester { std::vector a_source_; std::vector b_source_; std::vector c_source_; + std::vector ap_source_; + std::vector dot_source_; // The routine-specific functions passed to the tester Routine run_routine_; @@ -120,6 +123,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto ms = std::vector{args.m}; auto ns = std::vector{args.n}; auto ks = std::vector{args.k}; + auto kus = std::vector{args.ku}; + auto kls = std::vector{args.kl}; auto layouts = std::vector{args.layout}; auto a_transposes = std::vector{args.a_transpose}; auto b_transposes = std::vector{args.b_transpose}; @@ -136,6 +141,8 @@ void RunTests(int 
argc, char *argv[], const bool silent, const std::string &name auto a_offsets = std::vector{args.a_offset}; auto b_offsets = std::vector{args.b_offset}; auto c_offsets = std::vector{args.c_offset}; + auto ap_offsets = std::vector{args.ap_offset}; + auto dot_offsets = std::vector{args.dot_offset}; auto alphas = std::vector{args.alpha}; auto betas = std::vector{args.beta}; auto x_sizes = std::vector{args.x_size}; @@ -143,6 +150,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto a_sizes = std::vector{args.a_size}; auto b_sizes = std::vector{args.b_size}; auto c_sizes = std::vector{args.c_size}; + auto ap_sizes = std::vector{args.ap_size}; // Sets the dimensions of the matrices or vectors depending on the BLAS level auto dimensions = (C::BLASLevel() == 3) ? tester.kMatrixDims : @@ -154,6 +162,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgM) { ms = dimensions; } if (option == kArgN) { ns = dimensions; } if (option == kArgK) { ks = dimensions; } + if (option == kArgKU) { kus = tester.kBandSizes; } + if (option == kArgKL) { kls = tester.kBandSizes; } if (option == kArgLayout) { layouts = tester.kLayouts; } if (option == kArgATransp) { a_transposes = C::GetATransposes(tester.kTransposes); } if (option == kArgBTransp) { b_transposes = C::GetBTransposes(tester.kTransposes); } @@ -170,6 +180,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgAOffset) { a_offsets = tester.kOffsets; } if (option == kArgBOffset) { b_offsets = tester.kOffsets; } if (option == kArgCOffset) { c_offsets = tester.kOffsets; } + if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; } + if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } @@ -178,6 +190,7 @@ void RunTests(int argc, char *argv[], const bool silent, const 
std::string &name if (option == kArgAOffset) { a_sizes = tester.kMatSizes; } if (option == kArgBOffset) { b_sizes = tester.kMatSizes; } if (option == kArgCOffset) { c_sizes = tester.kMatSizes; } + if (option == kArgAPOffset) { ap_sizes = tester.kMatSizes; } } // Loops over the test-cases from a data-layout point of view @@ -194,20 +207,28 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name for (auto &m: ms) { r_args.m = m; for (auto &n: ns) { r_args.n = n; for (auto &k: ks) { r_args.k = k; - for (auto &x_inc: x_incs) { r_args.x_inc = x_inc; - for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset; - for (auto &y_inc: y_incs) { r_args.y_inc = y_inc; - for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset; - for (auto &a_ld: a_lds) { r_args.a_ld = a_ld; - for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset; - for (auto &b_ld: b_lds) { r_args.b_ld = b_ld; - for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset; - for (auto &c_ld: c_lds) { r_args.c_ld = c_ld; - for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset; - for (auto &alpha: alphas) { r_args.alpha = alpha; - for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &ku: kus) { r_args.ku = ku; + for (auto &kl: kls) { r_args.kl = kl; + for (auto &x_inc: x_incs) { r_args.x_inc = x_inc; + for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset; + for (auto &y_inc: y_incs) { r_args.y_inc = y_inc; + for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset; + for (auto &a_ld: a_lds) { r_args.a_ld = a_ld; + for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset; + for (auto &b_ld: b_lds) { r_args.b_ld = b_ld; + for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset; + for (auto &c_ld: c_lds) { r_args.c_ld = c_ld; + for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset; + for (auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset; + for (auto &dot_offset: 
dot_offsets) { r_args.dot_offset = dot_offset; + for (auto &alpha: alphas) { r_args.alpha = alpha; + for (auto &beta: betas) { r_args.beta = beta; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } + } + } + } } } } @@ -227,14 +248,16 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name // Creates the arguments vector for the invalid-buffer tests auto invalid_test_vector = std::vector>{}; auto i_args = args; - i_args.m = i_args.n = i_args.k = tester.kBufferSize; + i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; for (auto &x_size: x_sizes) { i_args.x_size = x_size; for (auto &y_size: y_sizes) { i_args.y_size = y_size; for (auto &a_size: a_sizes) { i_args.a_size = a_size; for (auto &b_size: b_sizes) { i_args.b_size = b_size; for (auto &c_size: c_sizes) { i_args.c_size = c_size; - invalid_test_vector.push_back(i_args); + for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size; + invalid_test_vector.push_back(i_args); + } } } } diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index a52142c4..350865f0 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -132,6 +132,8 @@ void Tester::TestEnd() { if (o == kArgM) { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); } if (o == kArgN) { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); } if (o == kArgK) { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); } + if (o == kArgKU) { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); } + if (o == kArgKL) { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); } if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);} if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);} if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);} @@ -148,6 +150,8 @@ void Tester::TestEnd() { if (o == kArgAOffset) { fprintf(stdout, "%s=%lu 
", kArgAOffset, entry.args.a_offset);} if (o == kArgBOffset) { fprintf(stdout, "%s=%lu ", kArgBOffset, entry.args.b_offset);} if (o == kArgCOffset) { fprintf(stdout, "%s=%lu ", kArgCOffset, entry.args.c_offset);} + if (o == kArgAPOffset) { fprintf(stdout, "%s=%lu ", kArgAPOffset, entry.args.ap_offset);} + if (o == kArgDotOffset){ fprintf(stdout, "%s=%lu ", kArgDotOffset, entry.args.dot_offset);} } fprintf(stdout, "\n"); } diff --git a/test/performance/client.cc b/test/performance/client.cc index 893bb55d..fb248854 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -42,15 +42,17 @@ template Arguments Client::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld, const GetMetric default_b_ld, const GetMetric default_c_ld) { auto args = Arguments{}; - auto help = std::string{"Options given/available:\n"}; + auto help = std::string{"\n* Options given/available:\n"}; // These are the options which are not for every client: they are optional for (auto &o: options_) { // Data-sizes - if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); } - if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); } - if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); } + if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); } + if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); } + if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); } + if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, 128UL); } + if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, 128UL); } // Data-layouts if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); } @@ -73,6 +75,10 @@ Arguments Client::ParseArguments(int argc, char *argv[], const GetMetric if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); } if (o == kArgBOffset) { args.b_offset = 
GetArgument(argc, argv, help, kArgBOffset, size_t{0}); } if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); } + if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); } + + // Dot arguments + if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); } // Scalar values if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar()); } @@ -128,11 +134,15 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) std::vector a_source(args.a_size); std::vector b_source(args.b_size); std::vector c_source(args.c_size); + std::vector ap_source(args.ap_size); + std::vector dot_source(args.dot_size); PopulateVector(x_source); PopulateVector(y_source); PopulateVector(a_source); PopulateVector(b_source); PopulateVector(c_source); + PopulateVector(ap_source); + PopulateVector(dot_source); // Creates the matrices on the device auto x_vec = Buffer(context, args.x_size); @@ -140,12 +150,16 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) auto a_mat = Buffer(context, args.a_size); auto b_mat = Buffer(context, args.b_size); auto c_mat = Buffer(context, args.c_size); + auto ap_mat = Buffer(context, args.ap_size); + auto dot = Buffer(context, args.dot_size); x_vec.Write(queue, args.x_size, x_source); y_vec.Write(queue, args.y_size, y_source); a_mat.Write(queue, args.a_size, a_source); b_mat.Write(queue, args.b_size, b_source); c_mat.Write(queue, args.c_size, c_source); - auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat}; + ap_mat.Write(queue, args.ap_size, ap_source); + dot.Write(queue, args.dot_size, dot_source); + auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot}; // Runs the routines and collects the timings auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast"); @@ -218,8 +232,10 @@ void Client::PrintTableRow(const 
Arguments& args, const double ms_clblas auto integers = std::vector{}; for (auto &o: options_) { if (o == kArgM) { integers.push_back(args.m); } - if (o == kArgN) { integers.push_back(args.n); } + else if (o == kArgN) { integers.push_back(args.n); } else if (o == kArgK) { integers.push_back(args.k); } + else if (o == kArgKU) { integers.push_back(args.ku); } + else if (o == kArgKL) { integers.push_back(args.kl); } else if (o == kArgLayout) { integers.push_back(static_cast(args.layout)); } else if (o == kArgSide) { integers.push_back(static_cast(args.side)); } else if (o == kArgTriangle) { integers.push_back(static_cast(args.triangle)); } @@ -236,6 +252,8 @@ void Client::PrintTableRow(const Arguments& args, const double ms_clblas else if (o == kArgAOffset) { integers.push_back(args.a_offset); } else if (o == kArgBOffset) { integers.push_back(args.b_offset); } else if (o == kArgCOffset) { integers.push_back(args.c_offset); } + else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); } + else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); } } auto strings = std::vector{}; for (auto &o: options_) { diff --git a/test/performance/routines/level1/xaxpy.cc b/test/performance/routines/level1/xaxpy.cc index fe90c697..7ab15f28 100644 --- a/test/performance/routines/level1/xaxpy.cc +++ b/test/performance/routines/level1/xaxpy.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xaxpy command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level1/xaxpy.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xcopy.cc b/test/performance/routines/level1/xcopy.cc new file mode 100644 index 00000000..6277e8fb --- /dev/null +++ b/test/performance/routines/level1/xcopy.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xcopy.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xdot.cc b/test/performance/routines/level1/xdot.cc new file mode 100644 index 00000000..5aa76762 --- /dev/null +++ b/test/performance/routines/level1/xdot.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xdot.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xdotc.cc b/test/performance/routines/level1/xdotc.cc new file mode 100644 index 00000000..81511085 --- /dev/null +++ b/test/performance/routines/level1/xdotc.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xdotc.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xdotu.cc b/test/performance/routines/level1/xdotu.cc new file mode 100644 index 00000000..888eede3 --- /dev/null +++ b/test/performance/routines/level1/xdotu.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xdotu.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xscal.cc b/test/performance/routines/level1/xscal.cc new file mode 100644 index 00000000..be49c066 --- /dev/null +++ b/test/performance/routines/level1/xscal.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xscal.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xswap.cc b/test/performance/routines/level1/xswap.cc new file mode 100644 index 00000000..52fdc580 --- /dev/null +++ b/test/performance/routines/level1/xswap.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xswap.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgbmv.cc b/test/performance/routines/level2/xgbmv.cc new file mode 100644 index 00000000..629e2182 --- /dev/null +++ b/test/performance/routines/level2/xgbmv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xgbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgemv.cc b/test/performance/routines/level2/xgemv.cc index 376c6c33..2a1983de 100644 --- a/test/performance/routines/level2/xgemv.cc +++ b/test/performance/routines/level2/xgemv.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xgemv command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level2/xgemv.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xger.cc b/test/performance/routines/level2/xger.cc new file mode 100644 index 00000000..5fb0d91d --- /dev/null +++ b/test/performance/routines/level2/xger.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xger.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgerc.cc b/test/performance/routines/level2/xgerc.cc new file mode 100644 index 00000000..fd511e42 --- /dev/null +++ b/test/performance/routines/level2/xgerc.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xgerc.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgeru.cc b/test/performance/routines/level2/xgeru.cc new file mode 100644 index 00000000..689ab2b1 --- /dev/null +++ b/test/performance/routines/level2/xgeru.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xgeru.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhbmv.cc b/test/performance/routines/level2/xhbmv.cc new file mode 100644 index 00000000..dabe6ec8 --- /dev/null +++ b/test/performance/routines/level2/xhbmv.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xhbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhemv.cc b/test/performance/routines/level2/xhemv.cc index dd70528e..77447d76 100644 --- a/test/performance/routines/level2/xhemv.cc +++ b/test/performance/routines/level2/xhemv.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xhemv command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level2/xhemv.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float2>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level2/xher.cc b/test/performance/routines/level2/xher.cc new file mode 100644 index 00000000..4ef87e45 --- /dev/null +++ b/test/performance/routines/level2/xher.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xher.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xher2.cc b/test/performance/routines/level2/xher2.cc new file mode 100644 index 00000000..2d7e17ab --- /dev/null +++ b/test/performance/routines/level2/xher2.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xher2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhpmv.cc b/test/performance/routines/level2/xhpmv.cc new file mode 100644 index 00000000..b9dd3f82 --- /dev/null +++ b/test/performance/routines/level2/xhpmv.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xhpmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhpr.cc b/test/performance/routines/level2/xhpr.cc new file mode 100644 index 00000000..f596682c --- /dev/null +++ b/test/performance/routines/level2/xhpr.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xhpr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhpr2.cc b/test/performance/routines/level2/xhpr2.cc new file mode 100644 index 00000000..1c493226 --- /dev/null +++ b/test/performance/routines/level2/xhpr2.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xhpr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsbmv.cc b/test/performance/routines/level2/xsbmv.cc new file mode 100644 index 00000000..febc6bfd --- /dev/null +++ b/test/performance/routines/level2/xsbmv.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xsbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xspmv.cc b/test/performance/routines/level2/xspmv.cc new file mode 100644 index 00000000..97c6b032 --- /dev/null +++ b/test/performance/routines/level2/xspmv.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xspmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xspr.cc b/test/performance/routines/level2/xspr.cc new file mode 100644 index 00000000..cc18d9b6 --- /dev/null +++ b/test/performance/routines/level2/xspr.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xspr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xspr2.cc b/test/performance/routines/level2/xspr2.cc new file mode 100644 index 00000000..768452be --- /dev/null +++ b/test/performance/routines/level2/xspr2.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xspr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsymv.cc b/test/performance/routines/level2/xsymv.cc index 30e953a5..6748026f 100644 --- a/test/performance/routines/level2/xsymv.cc +++ b/test/performance/routines/level2/xsymv.cc @@ -7,28 +7,25 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xsymv command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level2/xsymv.h" -// ================================================================================================= +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); } return 0; } diff --git a/test/performance/routines/level2/xsyr.cc b/test/performance/routines/level2/xsyr.cc new file mode 100644 index 00000000..84510e5d --- /dev/null +++ b/test/performance/routines/level2/xsyr.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xsyr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsyr2.cc b/test/performance/routines/level2/xsyr2.cc new file mode 100644 index 00000000..b8c177d8 --- /dev/null +++ b/test/performance/routines/level2/xsyr2.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xsyr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtbmv.cc b/test/performance/routines/level2/xtbmv.cc new file mode 100644 index 00000000..1663dca0 --- /dev/null +++ b/test/performance/routines/level2/xtbmv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtbsv.cc b/test/performance/routines/level2/xtbsv.cc new file mode 100644 index 00000000..e0cb9f2e --- /dev/null +++ b/test/performance/routines/level2/xtbsv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtbsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtpmv.cc b/test/performance/routines/level2/xtpmv.cc new file mode 100644 index 00000000..407fdc8c --- /dev/null +++ b/test/performance/routines/level2/xtpmv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtpmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtpsv.cc b/test/performance/routines/level2/xtpsv.cc new file mode 100644 index 00000000..e402dc60 --- /dev/null +++ b/test/performance/routines/level2/xtpsv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtpsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtrmv.cc b/test/performance/routines/level2/xtrmv.cc new file mode 100644 index 00000000..c5563240 --- /dev/null +++ b/test/performance/routines/level2/xtrmv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtrmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtrsv.cc b/test/performance/routines/level2/xtrsv.cc new file mode 100644 index 00000000..136e2108 --- /dev/null +++ b/test/performance/routines/level2/xtrsv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtrsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xgemm.cc b/test/performance/routines/level3/xgemm.cc index c45c238f..2082ceac 100644 --- a/test/performance/routines/level3/xgemm.cc +++ b/test/performance/routines/level3/xgemm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xgemm command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xgemm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xhemm.cc b/test/performance/routines/level3/xhemm.cc index d215653b..cc68e937 100644 --- a/test/performance/routines/level3/xhemm.cc +++ b/test/performance/routines/level3/xhemm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xhemm command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xhemm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float2>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level3/xher2k.cc b/test/performance/routines/level3/xher2k.cc index 2e1f248a..70d76bed 100644 --- a/test/performance/routines/level3/xher2k.cc +++ b/test/performance/routines/level3/xher2k.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xher2k command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xher2k.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level3/xherk.cc b/test/performance/routines/level3/xherk.cc index 4386f78c..b3b5dddf 100644 --- a/test/performance/routines/level3/xherk.cc +++ b/test/performance/routines/level3/xherk.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xherk command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xherk.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level3/xsymm.cc b/test/performance/routines/level3/xsymm.cc index bd014cee..f2292273 100644 --- a/test/performance/routines/level3/xsymm.cc +++ b/test/performance/routines/level3/xsymm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xsymm command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xsymm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsyr2k.cc b/test/performance/routines/level3/xsyr2k.cc index 1261be88..0c8f8f7c 100644 --- a/test/performance/routines/level3/xsyr2k.cc +++ b/test/performance/routines/level3/xsyr2k.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xsyr2k command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xsyr2k.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsyrk.cc b/test/performance/routines/level3/xsyrk.cc index 5799130f..ccd4511a 100644 --- a/test/performance/routines/level3/xsyrk.cc +++ b/test/performance/routines/level3/xsyrk.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xsyrk command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xsyrk.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xtrmm.cc b/test/performance/routines/level3/xtrmm.cc index c30866e9..8278d077 100644 --- a/test/performance/routines/level3/xtrmm.cc +++ b/test/performance/routines/level3/xtrmm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xtrmm command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xtrmm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xtrsm.cc b/test/performance/routines/level3/xtrsm.cc new file mode 100644 index 00000000..45f71c5e --- /dev/null +++ b/test/performance/routines/level3/xtrsm.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level3/xtrsm.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.h new file mode 100644 index 00000000..8d324d88 --- /dev/null +++ b/test/routines/level1/xcopy.h @@ -0,0 +1,117 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xcopy routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XCOPY_H_ +#define CLBLAST_TEST_ROUTINES_XCOPY_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXcopy { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, kArgYInc, + kArgXOffset, kArgYOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Copy(args.n, + 
buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXcopy(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.y_size, static_cast(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.n; } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 1 * args.n; + } + static size_t GetBytes(const Arguments &args) { + return (2 * args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XCOPY_H_ +#endif diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h new file mode 100644 index 00000000..bfcfdaff --- /dev/null +++ b/test/routines/level1/xdot.h @@ -0,0 +1,123 @@ + +// ================================================================================================= +// This file is 
part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xdot routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XDOT_H_ +#define CLBLAST_TEST_ROUTINES_XDOT_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXdot { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, kArgYInc, + kArgXOffset, kArgYOffset, kArgDotOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeDot(const Arguments &args) { + return 1 + args.dot_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + args.dot_size = GetSizeDot(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t 
DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Dot(args.n, + buffers.dot(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdot(args.n, + buffers.dot(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.dot_size, static_cast(0)); + buffers.dot.Read(queue, args.dot_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t, const 
size_t) { + return args.dot_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((2 * args.n) + 1) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XDOT_H_ +#endif diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h new file mode 100644 index 00000000..e403ba4c --- /dev/null +++ b/test/routines/level1/xdotc.h @@ -0,0 +1,123 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xdotc routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XDOTC_H_ +#define CLBLAST_TEST_ROUTINES_XDOTC_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXdotc { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, kArgYInc, + kArgXOffset, kArgYOffset, kArgDotOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeDot(const Arguments &args) { + return 1 + args.dot_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + args.dot_size = GetSizeDot(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments 
&args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Dotc(args.n, + buffers.dot(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdotc(args.n, + buffers.dot(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.dot_size, static_cast(0)); + buffers.dot.Read(queue, args.dot_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { + return args.dot_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((2 * args.n) + 1) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XDOTC_H_ +#endif diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h new file mode 100644 index 
00000000..8b2c65a8 --- /dev/null +++ b/test/routines/level1/xdotu.h @@ -0,0 +1,123 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xdotu routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XDOTU_H_ +#define CLBLAST_TEST_ROUTINES_XDOTU_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXdotu { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, kArgYInc, + kArgXOffset, kArgYOffset, kArgDotOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeDot(const Arguments &args) { + return 1 + args.dot_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + args.dot_size = 
GetSizeDot(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Dotu(args.n, + buffers.dot(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdotu(args.n, + buffers.dot(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.dot_size, static_cast(0)); + buffers.dot.Read(queue, args.dot_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t 
ResultID1(const Arguments &) { return 1; } // N/A for this routine + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { + return args.dot_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((2 * args.n) + 1) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XDOTU_H_ +#endif diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.h new file mode 100644 index 00000000..d990afcc --- /dev/null +++ b/test/routines/level1/xscal.h @@ -0,0 +1,112 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xscal routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XSCAL_H_ +#define CLBLAST_TEST_ROUTINES_XSCAL_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXscal { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, + kArgXOffset, + kArgAlpha}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Scal(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes 
how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXscal(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.x_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.n; } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.x_inc + args.x_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.n; + } + static size_t GetBytes(const Arguments &args) { + return (2 * args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XSCAL_H_ +#endif diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.h new file mode 100644 index 00000000..2096a2c3 --- /dev/null +++ b/test/routines/level1/xswap.h @@ -0,0 +1,118 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xswap routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XSWAP_H_ +#define CLBLAST_TEST_ROUTINES_XSWAP_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXswap { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, kArgYInc, + kArgXOffset, kArgYOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static 
Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Swap(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXswap(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.x_size + args.y_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, &result[0]); + buffers.y_vec.Read(queue, args.y_size, &result[args.x_size]); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.n; } + static size_t ResultID2(const Arguments &) { return 2; } // x_vec and y_vec + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return (id2 == 0) ? 
id1*args.x_inc + args.x_offset : id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.n; + } + static size_t GetBytes(const Arguments &args) { + return (2 * args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XSWAP_H_ +#endif diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h new file mode 100644 index 00000000..0e238804 --- /dev/null +++ b/test/routines/level2/xgbmv.h @@ -0,0 +1,140 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xgbmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_ +#define CLBLAST_TEST_ROUTINES_XGBMV_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXgbmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, kArgKL, kArgKU, + kArgLayout, kArgATransp, + kArgALeadDim, kArgXInc, kArgYInc, + kArgAOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + auto n_real = (a_transposed) ? args.m : args.n; + return n_real * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + auto m_real = (a_transposed) ? args.n : args.m; + return m_real * args.y_inc + args.y_offset; + } + static size_t GetSizeA(const Arguments &args) { + auto a_rotated = (args.layout == Layout::kRowMajor); + auto a_two = (a_rotated) ? 
args.m : args.n; + return a_two * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gbmv(args.layout, args.a_transpose, + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgbmv(static_cast(args.layout), + static_cast(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return 
static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.y_size, static_cast(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + return (a_transposed) ? args.n : args.m; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.m * args.n; + } + static size_t GetBytes(const Arguments &args) { + auto a_rotated = (args.layout == Layout::kRowMajor); + auto a_one = (a_rotated) ? args.n : args.m; + auto a_two = (a_rotated) ? args.m : args.n; + return ((args.kl+args.ku+1)*a_two + 2*a_one + a_two) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XGBMV_H_ +#endif diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h new file mode 100644 index 00000000..34e1502f --- /dev/null +++ b/test/routines/level2/xhbmv.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xhbmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XHBMV_H_ +#define CLBLAST_TEST_ROUTINES_XHBMV_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXhbmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, kArgKL, + kArgLayout, kArgTriangle, + kArgALeadDim, kArgXInc, kArgYInc, + kArgAOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeA(const Arguments &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // 
Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhbmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.y_size, static_cast(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + return args.n; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t 
id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XHBMV_H_ +#endif diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h new file mode 100644 index 00000000..8fd85b62 --- /dev/null +++ b/test/routines/level2/xhpmv.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xhpmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XHPMV_H_ +#define CLBLAST_TEST_ROUTINES_XHPMV_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXhpmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, + kArgXInc, kArgYInc, + kArgAPOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeAP(const Arguments &args) { + return ((args.n*(args.n+1)) / 2) + args.ap_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.ap_size = GetSizeAP(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run 
the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hpmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.y_size, static_cast(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + return args.n; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments &args) { + return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T); + } +}; + +// 
================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XHPMV_H_ +#endif diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h new file mode 100644 index 00000000..5bc17e49 --- /dev/null +++ b/test/routines/level2/xsbmv.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xsbmv routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XSBMV_H_ +#define CLBLAST_TEST_ROUTINES_XSBMV_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// Static-only description of the Xsbmv test (see the comment at the top of this file) +template +class TestXsbmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, kArgKL, + kArgLayout, kArgTriangle, + kArgALeadDim, kArgXInc, kArgYInc, + kArgAOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers (each size includes the buffer's offset) + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeA(const Arguments &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the 
CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Sbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Runs the clBLAS reference and waits for completion; NOTE(review): 'event' is not released + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsbmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (here: the y vector buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.y_size, static_cast(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + return args.n; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics; NOTE(review): GetFlops does not depend on kl + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * 
sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XSBMV_H_ +#endif diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h new file mode 100644 index 00000000..e335da42 --- /dev/null +++ b/test/routines/level2/xspmv.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xspmv routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XSPMV_H_ +#define CLBLAST_TEST_ROUTINES_XSPMV_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// Static-only description of the Xspmv test (see the comment at the top of this file) +template +class TestXspmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, + kArgXInc, kArgYInc, + kArgAPOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers (each size includes the buffer's offset) + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeAP(const Arguments &args) { + return ((args.n*(args.n+1)) / 2) + args.ap_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.ap_size = GetSizeAP(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run 
the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Spmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Runs the clBLAS reference and waits for completion; NOTE(review): 'event' is not released + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (here: the y vector buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.y_size, static_cast(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + return args.n; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments &args) { + return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T); + } +}; + +// 
================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XSPMV_H_ +#endif diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h new file mode 100644 index 00000000..dbdddb65 --- /dev/null +++ b/test/routines/level2/xtbmv.h @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xtbmv routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTBMV_H_ +#define CLBLAST_TEST_ROUTINES_XTBMV_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// Static-only description of the Xtbmv test (see the comment at the top of this file) +template +class TestXtbmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, kArgKL, + kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, + kArgALeadDim, kArgXInc, + kArgAOffset, kArgXOffset}; + } + + // Describes how to obtain the sizes of the buffers (each size includes the buffer's offset) + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeA(const Arguments &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto 
status = Tbmv(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Runs the clBLAS reference and waits for completion; NOTE(review): 'event' is not released + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtbmv(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (here: the x vector buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.x_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + return args.n; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.x_inc + args.x_offset; + } + + // Describes how to compute performance metrics; NOTE(review): GetFlops does not depend on kl + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTBMV_H_ +#endif diff --git 
a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h new file mode 100644 index 00000000..4425765e --- /dev/null +++ b/test/routines/level2/xtpmv.h @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xtpmv routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTPMV_H_ +#define CLBLAST_TEST_ROUTINES_XTPMV_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// Static-only description of the Xtpmv test (see the comment at the top of this file) +template +class TestXtpmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, + kArgXInc, + kArgAPOffset, kArgXOffset}; + } + + // Describes how to obtain the sizes of the buffers (each size includes the buffer's offset) + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeAP(const Arguments &args) { + return ((args.n*(args.n+1)) / 2) + args.ap_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.ap_size = GetSizeAP(args); + 
args.x_size = GetSizeX(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } // Tpmv takes no a_ld + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Tpmv(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Runs the clBLAS reference and waits for completion; NOTE(review): 'event' is not released + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtpmv(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (here: the x vector buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.x_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, result); + return result; + } + + // Describes how to 
compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + return args.n; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.x_inc + args.x_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments &args) { + return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTPMV_H_ +#endif diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h new file mode 100644 index 00000000..1c0c6fd8 --- /dev/null +++ b/test/routines/level2/xtrmv.h @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xtrmv routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTRMV_H_ +#define CLBLAST_TEST_ROUTINES_XTRMV_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// Static-only description of the Xtrmv test (see the comment at the top of this file) +template +class TestXtrmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, + kArgALeadDim, kArgXInc, + kArgAOffset, kArgXOffset}; + } + + // Describes how to obtain the sizes of the buffers (each size includes the buffer's offset) + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeA(const Arguments &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status 
= Trmv(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Runs the clBLAS reference and waits for completion; NOTE(review): 'event' is not released + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrmv(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (here: the x vector buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.x_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + return args.n; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.x_inc + args.x_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments &args) { + return (args.n*args.n + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTRMV_H_ +#endif diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 
86810fa2..23a02a45 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -20,653 +20,2004 @@ #include "internal/utilities.h" namespace clblast { + // ================================================================================================= // BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Forwards the clBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP +template +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSswap(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDswap(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, 
cl_event *events) { + return clblasCswap(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZswap(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL +clblasStatus clblasXscal(const size_t n, + const float alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSscal(n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDscal(n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXscal(const size_t n, + const float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCscal(n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXscal(const size_t n, + const double2 alpha, + 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZscal(n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY +template +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasScopy(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDcopy(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + 
return clblasCcopy(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZcopy(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY +clblasStatus clblasXaxpy(const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSaxpy(n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXaxpy(const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDaxpy(n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXaxpy(const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event 
*wait_events, cl_event *events) { + return clblasCaxpy(n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXaxpy(const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZaxpy(n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SDOT/DDOT +template +clblasStatus clblasXdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasSdot(n, + dot_buffer, dot_offset, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + 
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasDdot(n, + dot_buffer, dot_offset, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} -// Calls {clblasSaxpy, clblasDaxpy, clblasCaxpy, clblasZaxpy} with the arguments forwarded. -clblasStatus clblasXaxpy( - size_t n, float alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSaxpy(n, alpha, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXaxpy( - size_t n, double alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDaxpy(n, alpha, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXaxpy( - size_t n, float2 alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - return clblasCaxpy(n, cl_alpha, - x_vec, 
x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXaxpy( - size_t n, double2 alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - return clblasZaxpy(n, cl_alpha, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for CDOTU/ZDOTU +template +clblasStatus clblasXdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasCdotu(n, + dot_buffer, dot_offset, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 
+ cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasZdotu(n, + dot_buffer, dot_offset, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CDOTC/ZDOTC +template +clblasStatus clblasXdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasCdotc(n, + dot_buffer, dot_offset, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = 
queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasZdotc(n, + dot_buffer, dot_offset, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); } // ================================================================================================= // BLAS level-2 (matrix-vector) routines +// ================================================================================================= -// Calls {clblasSgemv, clblasDgemv, clblasCgemv, clblasZgemv} with the arguments forwarded. -clblasStatus clblasXgemv( - clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, float beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSgemv(layout, a_transpose, m, n, alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXgemv( - clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, double beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDgemv(layout, a_transpose, m, n, alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXgemv( - clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t 
n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, float2 beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasCgemv(layout, a_transpose, m, n, cl_alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), cl_beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXgemv( - clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, double2 beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZgemv(layout, a_transpose, m, n, cl_alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), cl_beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} - -// Calls {clblasChemv, clblasZhemv} with the arguments forwarded. 
-clblasStatus clblasXhemv( - clblasOrder layout, clblasUplo triangle, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, float2 beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasChemv(layout, triangle, n, cl_alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), cl_beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXhemv( - clblasOrder layout, clblasUplo triangle, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, double2 beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZhemv(layout, triangle, n, cl_alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), cl_beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} - -// Calls {clblasSsymv, clblasDsymv} with the arguments forwarded. 
-clblasStatus clblasXsymv( - clblasOrder layout, clblasUplo triangle, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, float beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSsymv(layout, triangle, n, alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXsymv( - clblasOrder layout, clblasUplo triangle, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, double beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDsymv(layout, triangle, n, alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSgemv(layout, a_transpose, + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); 
+} +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDgemv(layout, a_transpose, + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgemv(layout, a_transpose, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, 
cl_event *events) { + return clblasZgemv(layout, a_transpose, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgbmv(const clblasOrder layout, const 
clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgbmv(layout, a_transpose, + m, n, kl, ku, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgbmv(layout, a_transpose, + m, n, kl, ku, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHEMV/ZHEMV +clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t 
y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChemv(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhemv(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHBMV/ZHBMV +clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChbmv(layout, triangle, + n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, 
num_wait_events, wait_events, events); +} +clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhbmv(layout, triangle, + n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHPMV/ZHPMV +clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChpmv(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + 
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhpmv(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYMV/DSYMV +clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsymv(layout, triangle, + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsymv(layout, triangle, + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSBMV/DSBMV +clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const float alpha, 
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsbmv(layout, triangle, + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsbmv(layout, triangle, + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSPMV/DSPMV +clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSspmv(layout, triangle, + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, 
wait_events, events); +} +clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDspmv(layout, triangle, + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV +template +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasStrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus 
clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasDtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasCtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context 
= queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasZtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV +template +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasStbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const 
cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasDtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasCtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasZtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// 
Forwards the clBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV +template +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasStpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasDtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus 
clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasCtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasZtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV +template +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus 
clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag 
diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STBSV/DTBSV/CTBSV/ZTBSV +template +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint 
num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STPSV/DTPSV/CTPSV/ZTPSV +template +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event 
*wait_events, cl_event *events); +template <> +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag 
diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SGER/DGER +clblasStatus clblasXger(const clblasOrder layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSger(layout, + m, n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXger(const clblasOrder layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDger(layout, + m, n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CGERU/ZGERU +clblasStatus clblasXgeru(const clblasOrder layout, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgeru(layout, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgeru(const clblasOrder layout, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgeru(layout, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CGERC/ZGERC +clblasStatus clblasXgerc(const clblasOrder layout, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgerc(layout, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgerc(const 
clblasOrder layout, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgerc(layout, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHER/ZHER +clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCher(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZher(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHPR/ZHPR +clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChpr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhpr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHER2/ZHER2 +clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCher2(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const 
size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZher2(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHPR2/ZHPR2 +clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChpr2(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhpr2(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYR/DSYR +clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSPR/DSPR +clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSspr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDspr(layout, triangle, + n, + alpha, + x_buffer, x_offset, 
static_cast(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYR2/DSYR2 +clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSPR2/DSPR2 +clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSspr2(layout, 
triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDspr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); } // ================================================================================================= // BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// Forwards the clBLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, 
const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgemm(layout, a_transpose, b_transpose, + m, n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, 
const cl_event *wait_events, cl_event *events) { + return clblasZgemm(layout, a_transpose, b_transpose, + m, n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsymm(layout, side, triangle, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsymm(layout, side, triangle, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const float2 alpha, + 
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCsymm(layout, side, triangle, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZsymm(layout, side, triangle, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHEMM/ZHEMM +clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChemm(layout, side, 
triangle, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhemm(layout, side, triangle, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyrk(layout, triangle, a_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + 
cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyrk(layout, triangle, a_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCsyrk(layout, triangle, a_transpose, + n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZsyrk(layout, triangle, a_transpose, + n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} -// This calls {clblasSgemm, clblasDgemm, clblasCgemm, clblasZgemm} with the arguments forwarded. 
-clblasStatus clblasXgemm( - clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose, - size_t m, size_t n, size_t k, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSgemm(layout, a_transpose, b_transpose, - m, n, k, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXgemm( - clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose, - size_t m, size_t n, size_t k, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDgemm(layout, a_transpose, b_transpose, - m, n, k, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXgemm( - clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose, - size_t m, size_t n, size_t k, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasCgemm(layout, a_transpose, b_transpose, - m, n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, 
b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXgemm( - clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose, - size_t m, size_t n, size_t k, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZgemm(layout, a_transpose, b_transpose, - m, n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} - -// This calls {clblasSsymm, clblasDsymm, clblasCsymm, clblasZsymm} with the arguments forwarded. 
-clblasStatus clblasXsymm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSsymm(layout, side, triangle, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXsymm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDsymm(layout, side, triangle, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXsymm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasCsymm(layout, side, triangle, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus 
clblasXsymm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZsymm(layout, side, triangle, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} - -// This calls {clblasChemm, clblasZhemm} with the arguments forwarded. -clblasStatus clblasXhemm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasChemm(layout, side, triangle, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXhemm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = 
cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZhemm(layout, side, triangle, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} - -// This calls {clblasSsyrk, clblasDsyrk, clblasCsyrk, clblasZsyrk} with the arguments forwarded. -clblasStatus clblasXsyrk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSsyrk(layout, triangle, a_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXsyrk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDsyrk(layout, triangle, a_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXsyrk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = 
cl_float2{{beta.real(), beta.imag()}}; - return clblasCsyrk(layout, triangle, a_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXsyrk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZsyrk(layout, triangle, a_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} - -// This calls {clblasCherk, clblasZherk} with the arguments forwarded. -clblasStatus clblasXherk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasCherk(layout, triangle, a_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXherk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasZherk(layout, triangle, a_transpose, - n, k, alpha, - 
a_mat, a_offset, a_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} - -// This calls {clblasSsyr2k, clblasDsyr2k, clblasCsyr2k, clblasZsyr2k} with the arguments forwarded. -clblasStatus clblasXsyr2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSsyr2k(layout, triangle, ab_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXsyr2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDsyr2k(layout, triangle, ab_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXsyr2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - 
auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasCsyr2k(layout, triangle, ab_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXsyr2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZsyr2k(layout, triangle, ab_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} - -// This calls {clblasCher2k, clblasZher2k} with the arguments forwarded. 
-clblasStatus clblasXher2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - return clblasCher2k(layout, triangle, ab_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXher2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - return clblasZher2k(layout, triangle, ab_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); -} - -// This calls {clblasStrmm, clblasDtrmm, clblasCtrmm, clblasZtrmm} with the arguments forwarded. 
-clblasStatus clblasXtrmm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasStrmm(layout, side, triangle, a_transpose, diagonal, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXtrmm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDtrmm(layout, side, triangle, a_transpose, diagonal, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXtrmm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - return clblasCtrmm(layout, side, triangle, a_transpose, diagonal, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXtrmm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, 
- size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - return clblasZtrmm(layout, side, triangle, a_transpose, diagonal, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} - -// This calls {clblasStrsm, clblasDtrsm, clblasCtrsm, clblasZtrsm} with the arguments forwarded. -clblasStatus clblasXtrsm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasStrsm(layout, side, triangle, a_transpose, diagonal, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXtrsm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXtrsm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, float2 alpha, 
- const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - return clblasCtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXtrsm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - return clblasZtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for CHERK/ZHERK +clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCherk(layout, triangle, a_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const double 
alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZherk(layout, triangle, a_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, 
wait_events, events); +} +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCsyr2k(layout, triangle, ab_transpose, + n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZsyr2k(layout, triangle, ab_transpose, + n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHER2K/ZHER2K (alpha becomes cl_float2/cl_double2; beta is real) +clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + 
const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCher2k(layout, triangle, ab_transpose, + n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZher2k(layout, triangle, ab_transpose, + n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STRMM/DTRMM/CTRMM/ZTRMM (complex alpha becomes cl_float2/cl_double2) +clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus 
clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + 
b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM (complex alpha becomes cl_float2/cl_double2) +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return 
clblasCtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); } // =================================================================================================