diff --git a/CMakeLists.txt b/CMakeLists.txt index 1477d5981..aab8141a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,11 +12,24 @@ endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE -if( NOT CMAKE_CONFIGURATION_TYPES ) - set( CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE ) +if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) + set( CMAKE_BUILD_TYPE "Debug" CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.") + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS None Debug Release RelWithDebInfo MinSizeRel) endif() -list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) +# Append our library helper cmake path and the cmake path for hip (for convenience) +# Users may override HIP path by specifying their own in CMAKE_MODULE_PATH +list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake /opt/rocm/hip/cmake ) + +# Temp fix, there is a problem building our test program using googletest with gcc +# Change compiler default to clang +if( NOT DEFINED CMAKE_CXX_COMPILER AND NOT DEFINED ENV{CXX} ) + set( CMAKE_CXX_COMPILER clang++ ) +endif( ) + +if( NOT DEFINED CMAKE_C_COMPILER AND NOT DEFINED ENV{CC} ) + set( CMAKE_C_COMPILER clang ) +endif( ) # The superbuild does not build anything itself, all compiling is done in external projects project( rocblas-superbuild NONE ) @@ -24,16 +37,26 @@ project( rocblas-superbuild NONE ) # Everything is initially off, so that cache is not initialized until user elects to build option( BUILD_LIBRARY "Build rocBLAS library" OFF ) option( BUILD_CLIENTS "Build rocBLAS clients" OFF ) -option( BUILD_WITH_TENSILE "Building rocBLAS with Tensile or not" ON ) + +option( BUILD_WITH_TENSILE "Building rocBLAS with Tensile or 
not" ON) + +# which benchmark solution schedule +set( Tensile_ROOT "" CACHE STRING "Local path of Tensile.") +if ( BUILD_WITH_TENSILE ) + set( Tensile_LOGIC "FijiROCm14" CACHE STRING "Tensile to use which logic?") + set_property( CACHE Tensile_LOGIC PROPERTY STRINGS + FijiROCm14 + ) + option( Tensile_MERGE_FILES "Tensile to merge kernels and solutions files?" OFF) + option( Tensile_SHORT_FILENAMES "Tensile to use short file names? Use if compiler complains they're too long." OFF) + option( Tensile_PRINT_DEBUG "Tensile to print runtime debug info?" OFF) +endif() # BUILD_SHARED_LIBS is a cmake built-in; we make it an explicit option such that it shows in cmake-gui option( BUILD_SHARED_LIBS "Build rocBLAS as a shared library" OFF ) -set( HOST_TOOLCHAIN_NAME "clang" CACHE STRING "Compiler toolchain: (clang gcc)" ) -set_property( CACHE HOST_TOOLCHAIN_NAME PROPERTY STRINGS clang gcc ) - -set( DEVICE_TOOLCHAIN_NAME "hipcc" CACHE STRING "Compiler toolchain for library: (hcc hipcc)" ) -set_property( CACHE DEVICE_TOOLCHAIN_NAME PROPERTY STRINGS hcc hipcc ) +set( HIP_ROOT /opt/rocm/hip CACHE PATH "Specify hip installation dir") +set( BOOST_ROOT /opt/boost CACHE PATH "Specify boost installation dir") # set( rocblas_INSTALL_DIR ${CMAKE_INSTALL_PREFIX} ) set( rocblas_INSTALL_DIR "${PROJECT_BINARY_DIR}/package" ) @@ -44,7 +67,7 @@ set( rocblas_INSTALL_COMMAND INSTALL_COMMAND ${CMAKE_COMMAND} -E echo_append ) # Clients are programs provided in this repository, that make use of the library as a library client. This can include # but is not limited to benchmarks, tests and samples. 
if( BUILD_CLIENTS ) - # Clients need to find and link rocfft; we install it locally instead of globally + # Clients need to find and link rocblas; we install it locally instead of globally unset( rocblas_INSTALL_COMMAND ) endif() @@ -54,60 +77,91 @@ include( ExternalProject ) # This captures all of the dependencies cmake builds itself set( rocblas_dependencies ) -set( DEVICE_TOOLCHAIN_FILE "" ) -if( DEVICE_TOOLCHAIN_NAME STREQUAL "hipcc" ) - if( NOT DEFINED HIP_ROOT ) - include( cmake/external-hip.cmake ) - list( APPEND rocblas_dependencies HIP ) - endif( ) +set( BASE_CMAKE_ARGS ) + +# Noramlize the different ways of specifying a c++ compiler through -DCMAKE_CXX_COMPILER +if( DEFINED CMAKE_CXX_COMPILER ) + message( STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER} " ) + list( APPEND BASE_CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} ) +elseif( DEFINED ENV{CXX} ) + message( STATUS "ENV{CXX}: $ENV{CXX} " ) + list( APPEND BASE_CMAKE_ARGS -DCMAKE_CXX_COMPILER=$ENV{CXX} ) +endif( ) - # configure toolchain file to find the hip compiler with the hip we just downloaded - configure_file( "${PROJECT_SOURCE_DIR}/cmake/${DEVICE_TOOLCHAIN_NAME}-toolchain.cmake.in" - "${PROJECT_BINARY_DIR}/cmake/${DEVICE_TOOLCHAIN_NAME}-toolchain.cmake" ) +# Noramlize the different ways of specifying a c compiler through -DCMAKE_C_COMPILER +if( DEFINED CMAKE_C_COMPILER ) + message( STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}" ) + list( APPEND BASE_CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} ) +elseif( DEFINED ENV{CC} ) + message( STATUS "ENV{CC}: $ENV{CC} " ) + list( APPEND BASE_CMAKE_ARGS -DCMAKE_C_COMPILER=$ENV{CC} ) +endif( ) - set( DEVICE_TOOLCHAIN_FILE "${PROJECT_BINARY_DIR}/cmake/${DEVICE_TOOLCHAIN_NAME}-toolchain.cmake" ) -else( ) - set( DEVICE_TOOLCHAIN_FILE "${PROJECT_SOURCE_DIR}/cmake/${DEVICE_TOOLCHAIN_NAME}-toolchain.cmake" ) +if( DEFINED CMAKE_CXX_FLAGS ) + message( STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS} " ) + list( APPEND BASE_CMAKE_ARGS 
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} ) endif( ) -# Default arguments that get passed down into all external projects -set( BASE_CMAKE_ARGS - -DCMAKE_INSTALL_PREFIX= - -DCMAKE_MODULE_PATH=${CMAKE_MODULE_PATH} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - ) +if( DEFINED CMAKE_C_FLAGS ) + message( STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}" ) + list( APPEND BASE_CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} ) +endif( ) # CMAKE_BUILD_TYPE only applies to single configuration build systems -if( NOT CMAKE_CONFIGURATION_TYPES ) - list( APPEND BASE_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} ) +if( DEFINED CMAKE_BUILD_TYPE ) + list( APPEND BASE_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} ) endif( ) +if( NOT DEFINED DEVICE_CXX_COMPILER ) + find_package( HIP REQUIRED ) + set( DEVICE_CXX_COMPILER ${HIP_ROOT_DIR}/bin/hipcc ) +endif() if( BUILD_LIBRARY ) if( BUILD_WITH_TENSILE ) - # defines - # Tensile_INCLUDE_DIRS - # TensileLib_LIBRARIES - # TensileLogger_LIBRARIES - - include( cmake/external-Tensile.cmake ) - list( APPEND rocblas_dependencies Tensile ) - message( STATUS "Tensile_ROOT=${Tensile_ROOT}" ) - - set(TENSILE_CMAKE_ARGS - -DCMAKE_PREFIX_PATH=${Tensile_ROOT} - -DBUILD_WITH_TENSILE=${BUILD_WITH_TENSILE} - ) + # defines + # Tensile_INCLUDE_DIRS + # TensileLib_LIBRARIES + # TensileLogger_LIBRARIES + if (Tensile_ROOT) + message(STATUS "Tensile_ROOT=${Tensile_ROOT} specified") + else() + find_package(Tensile QUIET) + if (Tensile_FOUND) + message(STATUS "Tensile package found.") + else() + include(cmake/external-Tensile.cmake ) + list( APPEND rocblas_dependencies Tensile ) + list( APPEND CMAKE_PREFIX_PATH ${Tensile_ROOT} ) + message( STATUS "Tensile not installed; will download to Tensile_ROOT=${Tensile_ROOT}" ) + endif() + endif() endif() - set( LIBRARY_CMAKE_ARGS - ${BASE_CMAKE_ARGS} - -DHOST_TOOLCHAIN_FILE=${HOST_TOOLCHAIN_FILE} - -DDEVICE_TOOLCHAIN_FILE=${DEVICE_TOOLCHAIN_FILE} + # WARNING: do not surround CMAKE_PREFIX_PATH with 
quotes, it breaks + # Replace all occurances of ; with ^^, which we elect to use a path seperator + string(REGEX REPLACE ";" "^^" LIBRARY_PREFIX_PATH "${CMAKE_PREFIX_PATH}" ) + string(REGEX REPLACE ";" "^^" LIBRARY_MODULE_PATH "${CMAKE_MODULE_PATH}" ) + + set( LIBRARY_CMAKE_ARGS + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_PREFIX_PATH=${LIBRARY_PREFIX_PATH} + -DCMAKE_MODULE_PATH=${LIBRARY_MODULE_PATH} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} - ${TENSILE_CMAKE_ARGS} + -DCMAKE_CXX_COMPILER=${DEVICE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DBUILD_WITH_TENSILE=${BUILD_WITH_TENSILE} + -DTensile_LOGIC=${Tensile_LOGIC} + -DTensile_MERGE_FILES=${Tensile_MERGE_FILES} + -DTensile_SHORT_FILENAMES=${Tensile_SHORT_FILENAMES} + -DTensile_PRINT_DEBUG=${Tensile_PRINT_DEBUG} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} ) + if (Tensile_FOUND) + set( LIBRARY_CMAKE_ARGS ${LIBRARY_CMAKE_ARGS} -DTensile_FOUND=${Tensile_FOUND}) + else() + set( LIBRARY_CMAKE_ARGS ${LIBRARY_CMAKE_ARGS} -DTensile_ROOT=${Tensile_ROOT}) + endif() # Build the library as an external project ExternalProject_Add( rocblas @@ -115,42 +169,52 @@ if( BUILD_LIBRARY ) SOURCE_DIR ${PROJECT_SOURCE_DIR}/library BINARY_DIR library-build INSTALL_DIR library-package + LIST_SEPARATOR ^^ CMAKE_ARGS ${LIBRARY_CMAKE_ARGS} ${rocblas_INSTALL_COMMAND} ) -endif() +endif( ) # Build clients of the library if( BUILD_CLIENTS ) - include( clients/cmake/build-options.cmake ) - - # Default arguments that get passed down into all external projects - set( CLIENTS_CMAKE_ARGS - ${BASE_CMAKE_ARGS} - -DBUILD_CLIENTS_SAMPLES=${BUILD_CLIENTS_SAMPLES} - -DBUILD_CLIENTS_BENCHMARKS=${BUILD_CLIENTS_BENCHMARKS} - -DBUILD_CLIENTS_TESTS=${BUILD_CLIENTS_TESTS} - -DCMAKE_TOOLCHAIN_FILE=${PROJECT_SOURCE_DIR}/cmake/${HOST_TOOLCHAIN_NAME}-toolchain.cmake - -DDEVICE_TOOLCHAIN_FILE=${DEVICE_TOOLCHAIN_FILE} - ) - - if( DEFINED BOOST_ROOT ) - list( APPEND CLIENTS_CMAKE_ARGS -DBOOST_ROOT=${BOOST_ROOT} ) - endif( ) - - if( BUILD_LIBRARY ) - 
ExternalProject_Get_Property( rocblas install_dir ) - list( APPEND CLIENTS_CMAKE_ARGS -DCMAKE_PREFIX_PATH=${install_dir} ) - endif( ) - - # Clients are set up as an external project to take advantage of specifying toolchain files. - # We want cmake to go through it's usual discovery process + include( clients/cmake/build-options.cmake ) + + if( BUILD_LIBRARY ) + ExternalProject_Get_Property( rocblas install_dir ) + list( APPEND CMAKE_PREFIX_PATH ${install_dir} ) + endif( ) + + # WARNING: do not surround CMAKE_PREFIX_PATH with quotes, it breaks + # Replace all occurances of ; with ^^, which we elect to use a path seperator + string(REGEX REPLACE ";" "^^" CLIENT_PREFIX_PATH "${CMAKE_PREFIX_PATH}" ) + string(REGEX REPLACE ";" "^^" CLIENT_MODULE_PATH "${CMAKE_MODULE_PATH}" ) + + # Default arguments that get passed down into all external projects + set( CLIENTS_CMAKE_ARGS + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DDEVICE_CXX_COMPILER=${DEVICE_CXX_COMPILER} + -DCMAKE_PREFIX_PATH=${CLIENT_PREFIX_PATH} + -DCMAKE_MODULE_PATH=${CLIENT_MODULE_PATH} + -DBUILD_CLIENTS_SAMPLES=${BUILD_CLIENTS_SAMPLES} + -DBUILD_CLIENTS_BENCHMARKS=${BUILD_CLIENTS_BENCHMARKS} + -DBUILD_CLIENTS_TESTS=${BUILD_CLIENTS_TESTS} + -DBUILD_WITH_TENSILE=${BUILD_WITH_TENSILE} + -DDEVICE_CXX_COMPILER=${DEVICE_CXX_COMPILER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + + ) + + # Clients are set up as an external project to take advantage of specifying toolchain files. 
+ # We want cmake to go through it's usual discovery process ExternalProject_Add( rocblas-clients DEPENDS rocblas SOURCE_DIR ${PROJECT_SOURCE_DIR}/clients BINARY_DIR clients-build INSTALL_DIR clients-package + LIST_SEPARATOR ^^ CMAKE_ARGS ${CLIENTS_CMAKE_ARGS} - INSTALL_COMMAND "" + INSTALL_COMMAND "" ) endif( ) diff --git a/Jenkinsfile b/Jenkinsfile index c1d179c03..3e9c44489 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,8 +1,24 @@ #!/usr/bin/env groovy -currentBuild.result = "SUCCESS" -node('rocm-1.3 && fiji') +// Generated from snippet generator 'properties; set job properties' +properties([buildDiscarder(logRotator( + artifactDaysToKeepStr: '', + artifactNumToKeepStr: '', + daysToKeepStr: '', + numToKeepStr: '10')), + disableConcurrentBuilds()]) + +def build_type="Debug" +def build_type_postfix="-d" + +// Currently, YADP (yet-another-docker-plugin v0.1.0-rc30) does not load balance between clouds with the same label +// They recommend to use docker swarm, but not yet work with docker 1.12 'swarm mode' +// Manually load balance by picking a particular machine +node('rocm-1.3 && hawaii') { + def node_list = env.NODE_LABELS.tokenize() + // sh "echo node_list: ${node_list}" + def scm_dir = pwd() def build_dir_debug = "${scm_dir}/../build/debug" def build_dir_release = "${scm_dir}/../build/release" @@ -15,11 +31,24 @@ node('rocm-1.3 && fiji') try { dir("${scm_dir}") { - stage("Clone") { + stage("Clone") + { checkout scm + + if( fileExists( 'cmake/build-version.cmake' ) ) + { + def cmake_version_file = readFile( 'cmake/build-version.cmake' ).trim() + //echo "cmake_version_file:\n${cmake_version_file}" + + cmake_version_file = cmake_version_file.replaceAll(/(\d+\.)(\d+\.)(\d+\.)\d+/, "\$1\$2\$3${env.BUILD_ID}") + cmake_version_file = cmake_version_file.replaceAll(/VERSION_TWEAK\s+\d+/, "VERSION_TWEAK ${env.BUILD_ID}") + //echo "cmake_version_file:\n${cmake_version_file}" + writeFile( file: 'cmake/build-version.cmake', text: cmake_version_file ) + } } } + 
withEnv(["PATH=${PATH}:/opt/rocm/bin"]) { // Record important versions of software layers we use @@ -27,46 +56,76 @@ node('rocm-1.3 && fiji') cmake --version hcc --version hipconfig --version - ''' - - dir("${build_dir_release}") { - stage("configure clang release") { - // withEnv(['CXXFLAGS=-I /usr/include/c++/4.8 -I /usr/include/x86_64-linux-gnu/c++/4.8 -I /usr/include/x86_64-linux-gnu', 'HIP_PATH=/opt/rocm/hip']) { - // --amdgpu-target=AMD:AMDGPU:8:0:3 - sh "cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_LIBRARY=ON -DBUILD_CLIENTS=ON -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_WITH_TENSILE=ON -DHIP_ROOT=/opt/rocm/hip -DBOOST_ROOT=/opt/boost/clang ${scm_dir}" - // } - } + ''' - stage("Build") { - // withEnv(['HCC_AMDGPU_TARGET=AMD:AMDGPU:7:0:1,AMD:AMDGPU:8:0:3']) { - sh 'make -j 8' - // } - } + //Jenkins plugin that adds color terminal support to output 'Console Output'; requires bash shell + wrap([$class: 'AnsiColorBuildWrapper', 'colorMapName': 'XTerm']) + { - stage("Package Debian") { - sh 'cd library-build; make package' - archive includes: 'library-build/*.deb' - } + dir("${build_dir_release}") + { + stage("configure clang release") { + sh """#!/usr/bin/env bash + sudo apt-get update + sudo apt-get install -y python-yaml + cmake -DCMAKE_BUILD_TYPE=${build_type} -DCMAKE_PREFIX_PATH=/opt/boost/clang -DBUILD_LIBRARY=ON -DBUILD_WITH_TENSILE=ON \ + -DBUILD_CLIENTS=ON -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON ${scm_dir} + """ + } - stage("unit tests") { - // To trim test time, only execute single digit tests - sh ''' - cd clients-build/tests-build/staging - ./rocblas-test-d --gtest_output=xml --gtest_filter=*/? 
- ''' - junit 'clients-build/tests-build/staging/*.xml' - } + stage("Build") + { + if (env.NODE_LABELS ==~ /.*fiji.*/) + { + sh 'echo Target Fiji ISA' + withEnv(['HCC_AMDGPU_TARGET=AMD:AMDGPU:8:0:3']) + { + sh '''#!/usr/bin/env bash + make -j 8 + ''' + } + } + else if (env.NODE_LABELS ==~ /.*hawaii.*/) + { + sh 'echo Target Hawaii ISA' + withEnv(['HCC_AMDGPU_TARGET=AMD:AMDGPU:7:0:1']) + { + sh '''#!/usr/bin/env bash + make -j 8 + ''' + } + } + } + + stage("Package Debian") { + sh 'cd library-build; make package' + archive includes: 'library-build/*.deb' + } + + // Cap the maximum amount of testing to be a few hours; assume failure if the time limit is hit + timeout(time: 1, unit: 'HOURS') + { + stage("unit tests") { + sh """#!/usr/bin/env bash + cd clients-build/tests-build/staging + ./rocblas-test${build_type_postfix} --gtest_output=xml + """ + junit 'clients-build/tests-build/staging/*.xml' + } + + stage("samples") + { + sh "cd clients-build/samples-build; ./example-sscal${build_type_postfix}" + } + } - stage("samples") { - sh "cd clients-build/samples-build; ./example-sscal-d" } + } } } catch( err ) { - currentBuild.result = "FAILURE" - def email_list = emailextrecipients([ [$class: 'CulpritsRecipientProvider'] ]) @@ -77,8 +136,8 @@ node('rocm-1.3 && fiji') // body: "Node: ${env.NODE_NAME}\nSee ${env.BUILD_URL}\n\n" + err.toString() // Disable email for now - mail to: "kent.knox@amd.com, david.tanner@amd.com, tingxing.dong@amd.com", - subject: "${env.JOB_NAME} finished with ${currentBuild.result}", + mail to: "kent.knox@amd.com, david.tanner@amd.com, tingxing.dong@amd.com, andrew.chapman@amd.com", + subject: "${env.JOB_NAME} finished with FAILUREs", body: "Node: ${env.NODE_NAME}\nSee ${env.BUILD_URL}\n\n" + err.toString() throw err diff --git a/README.md b/README.md index ae7816e19..76d1fe99c 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,66 @@ # rocBLAS -Radeon Open Compute BLAS implementation on top of AMD [ROCm][] runtime. 
-rocBLAS is implemented with the [HIP][] programming language and optimized for AMD latest discrete GPUs. -It can also run on Nvidia GPUs as long as the CUDA enviroment is configured correctly. +A BLAS implementation on top of AMD's Radeon Open Compute [ROCm][] runtime and toolchains. rocBLAS is implemented in +the [HIP][] programming language, optimized for AMD's latest discrete GPUs and allowing it to run on CUDA enabled GPUs. ## Migrating libraries to ROCm from OpenCL -A substantial investment has been made by AMD in developing and promoting OpenCL libraries to accelerate common math domains, such as [clBLAS][], [clFFT][], clRNG and clSparse. These libraries have demonstrated significant performance benefits of data parallel (GPU) computation, but primarily remain in the domain of expert programmers. As AMD simplifies the programming model with ROCm, it would be beneficial to leverage the performance and learning present in the OpenCL libraries and carry that forward. +[clBLAS][] demonstrated significant performance benefits of data parallel (GPU) computation when applied to solving dense +linear algebra problems, but OpenCL primarily remains in the domain of expert programmers. The ROCm model introduces a +single source paradigm for integrating device and host code together in a single source file, thereby simplifying the +entire development process for heterogeneous computing. Compilers will get smarter, catching errors at compile/build time +and native profilers/debuggers will better integrate into the development process. As AMD simplifies the +programming model with ROCm (using HCC and HIP), it is the intent of this library to expose that simplified programming +model to end users. ## rocBLAS interface -In general, rocBLAS interface is compatible with [Netlib BLAS][] and cuBLAS-v2 API except Netlib BLAS does not have handle and cuBLAS' cublasHandle_t is replaced with rocblas_handle everywhere. 
Thus porting a CUDA application calling cuBLAS API to a HIP application calling rocBLAS API is straightforward. -For example, the rocBLAS SGEMV interface is +In general, the rocBLAS interface is compatible with [Netlib BLAS][] and the cuBLAS-v2 API, with the explicit exception that +Netlib BLAS does not have a handle. cuBLAS's cublasHandle_t is replaced with rocblas_handle everywhere. Thus, porting a +CUDA application which originally calls the cuBLAS API to a HIP application calling the rocBLAS API should be relatively +straightforward. For example, the rocBLAS SGEMV interface is ```c rocblas_status rocblas_sgemv(rocblas_handle handle, rocblas_operation trans, rocblas_int m, rocblas_int n, - const float *alpha, - const float *A, rocblas_int lda, - const float *x, rocblas_int incx, - const float *beta, - float *y, rocblas_int incy); + const float* alpha, + const float* A, rocblas_int lda, + const float* x, rocblas_int incx, + const float* beta, + float* y, rocblas_int incy); ``` -where rocblas_int is an alias of int, rocblas_operation is a rocBLAS defined enum to specify the non/transpose operation in BLAS. rocBLAS assumed the required matrices A and vectors x, y are already allocated in the GPU memory space filled with data. -Users are repsonsible for copying the data from/to the host CPU memory to/from the GPU memory. HIP provides API to offload and retrieve data from the GPU. + +rocBLAS assumes matrices A and vectors x, y are already allocated in GPU memory and filled with data. Users are +responsible for copying data between host and device memory. HIP provides memcpy-style APIs to facilitate data +management. ##Asynchronous API -Except a few routines (like TRSM) having memory allocation inside preventing asynchronicity, most of the library routines (like BLAS-1 SCAL, BLAS-2 GEMV, BLAS-3 GEMM) are configured to operate in an asynchronous fashion to CPU, meaning that these library function calls will return to users immediately. 
+Except for a few routines (like TRSM) whose internal memory allocation prevents asynchronicity, most of the library routines +(like BLAS-1 SCAL, BLAS-2 GEMV, BLAS-3 GEMM) are configured to operate asynchronously with respect to the CPU, +meaning that these library function calls return immediately. + +##Batched and strided GEMM API +rocBLAS GEMM can process matrices in batches with regular strides. There are several permutations of these APIs; the +following is an example that takes the full set of parameters + +```c +rocblas_status +rocblas_sgemm_strided_batched( + rocblas_handle handle, + rocblas_order order, + rocblas_operation transa, rocblas_operation transb, + rocblas_int m, rocblas_int n, rocblas_int k, + const float* alpha, + const float* A, rocblas_int ls_a, rocblas_int ld_a, rocblas_int bs_a, + const float* B, rocblas_int ls_b, rocblas_int ld_b, rocblas_int bs_b, + const float* beta, + float* C, rocblas_int ls_c, rocblas_int ld_c, rocblas_int bs_c, + rocblas_int batch_count ) +``` ##rocBLAS Wiki -The [wiki][] has helpful information about building rocblas library, samples and testing files. +The [wiki][] has helpful information about building the rocblas library, samples and tests. 
[wiki]: https://github.com/RadeonOpenCompute/rocBLAS/wiki @@ -41,6 +71,3 @@ The [wiki][] has helpful information about building rocblas library, samples and [Netlib BLAS]: http://www.netlib.org/blas/ [clBLAS]: https://github.com/clMathLibraries/clBLAS - -[clFFT]: https://github.com/clMathLibraries/clFFT - diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt index b41569fa3..b27e63bcc 100644 --- a/clients/CMakeLists.txt +++ b/clients/CMakeLists.txt @@ -5,10 +5,6 @@ # Natively available on including Ubuntu 14.04, OpenSUSE 13.2, CentOS 6.6 cmake_minimum_required( VERSION 2.8.12 ) -# if( NOT CMAKE_TOOLCHAIN_FILE ) -# message( FATAL_ERROR "This project expects to know what compilers it should use through CMAKE_TOOLCHAIN_FILE" ) -# endif( ) - # This project may compile dependencies for clients project( rocblas-clients CXX ) @@ -22,39 +18,70 @@ set( rocblas_clients_dependencies ) # If the user does not explicitely specify BOOST_ROOT, build our # own on the fly -if( NOT DEFINED BOOST_ROOT ) - include( external-boost ) - list( APPEND rocblas_clients_dependencies boost ) -endif( ) - -if( NOT DEFINED GTEST_ROOT ) - include( external-gtest ) - list( APPEND rocblas_clients_dependencies googletest ) -endif( ) +#if( BUILD_CLIENTS_BENCHMARKS ) +# if( NOT DEFINED BOOST_ROOT ) +# include( external-boost ) +# list( APPEND rocblas_clients_dependencies boost ) +# endif( ) +#endif() -if( NOT DEFINED LAPACK ) - include( external-lapack ) - list( APPEND rocblas_clients_dependencies lapack ) -endif( ) - -list( APPEND CMAKE_PREFIX_PATH ${HIP_ROOT} ) +if( BUILD_CLIENTS_TESTS ) + if( NOT DEFINED GTEST_ROOT ) + include( external-gtest ) + list( APPEND rocblas_clients_dependencies googletest ) + endif( ) +endif() + +if( BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS ) + if( NOT DEFINED LAPACK_ROOT ) + include( external-lapack ) + list( APPEND rocblas_clients_dependencies lapack ) + endif( ) +endif() + +# if( DEFINED HIP_ROOT ) +# list( APPEND CMAKE_MODULE_PATH ${HIP_ROOT}/cmake ) +# # 
list( APPEND CMAKE_PREFIX_PATH ${HIP_ROOT} ) +# endif( ) # WARNING: do not surround CMAKE_PREFIX_PATH with quotes, it breaks # Replace all occurances of ; with ^^, which we elect to use a path seperator string(REGEX REPLACE ";" "^^" CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH}" ) string(REGEX REPLACE ";" "^^" CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" ) -string(REGEX REPLACE ";" "^^" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" ) -string(REGEX REPLACE ";" "^^" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" ) set( CLIENTS_CMAKE_ARGS - -DCMAKE_TOOLCHAIN_FILE=${DEVICE_TOOLCHAIN_FILE} -DCMAKE_MODULE_PATH=${CMAKE_MODULE_PATH} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DBUILD_64=${BUILD_64} ) -if( NOT CMAKE_CONFIGURATION_TYPES ) +# Linking rocblas dynamically should allow the user to use any host compiler +# Linking rocblas statically requires the user to use the hcc compiler +if( BUILD_SHARED_LIBS ) + # Pass through compiler if explicitely set, otherwise let default handling + if( DEFINED CMAKE_CXX_COMPILER ) + list( APPEND CLIENTS_CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} ) + endif() +else() + if( DEFINED DEVICE_CXX_COMPILER ) + list( APPEND CLIENTS_CMAKE_ARGS -DCMAKE_CXX_COMPILER=${DEVICE_CXX_COMPILER} ) + else() + message( FATAL_ERROR "Statically linking rocblas requires DEVICE_CXX_COMPILER to be defined, typically hipcc") + endif() +endif( ) + +if( DEFINED CMAKE_C_COMPILER ) + list( APPEND CLIENTS_CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} ) +endif( ) + +if( DEFINED CMAKE_CXX_FLAGS ) + list( APPEND CLIENTS_CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} ) +endif( ) + +if( DEFINED CMAKE_C_FLAGS ) + list( APPEND CLIENTS_CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} ) +endif( ) + +if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND DEFINED CMAKE_BUILD_TYPE ) list( APPEND CLIENTS_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} ) endif( ) @@ -69,7 +96,6 @@ if( BUILD_CLIENTS_SAMPLES ) ) ExternalProject_Add( samples - DEPENDS 
${rocblas_clients_dependencies} SOURCE_DIR ${PROJECT_SOURCE_DIR}/samples BINARY_DIR samples-build CMAKE_ARGS ${SAMPLES_CMAKE_ARGS} @@ -82,6 +108,7 @@ if( BUILD_CLIENTS_BENCHMARKS ) set( BENCH_CMAKE_ARGS ${CLIENTS_CMAKE_ARGS} -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}^^${BOOST_ROOT} + -DBUILD_WITH_TENSILE=${BUILD_WITH_TENSILE} ) ExternalProject_Add( benchmarks @@ -98,6 +125,7 @@ if( BUILD_CLIENTS_TESTS ) set( TESTS_CMAKE_ARGS ${CLIENTS_CMAKE_ARGS} -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}^^${BOOST_ROOT}^^${GTEST_ROOT}^^${LAPACK_ROOT} + -DBUILD_WITH_TENSILE=${BUILD_WITH_TENSILE} ) ExternalProject_Add( tests diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt index b35e8d6af..63565afc5 100644 --- a/clients/benchmarks/CMakeLists.txt +++ b/clients/benchmarks/CMakeLists.txt @@ -5,40 +5,10 @@ # PUBLIC keywords cmake_minimum_required( VERSION 2.8.12 ) -# Depending on whether we are building for 64 or 32 bit, construct common paths and names that subdirectories can reference for their use -if( rocBLAS_BUILD64 ) - set( CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${rocBLAS-clients_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-x64" ) - set( INCLUDE_DIR include ) -else( ) - set( CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${rocBLAS-clients_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-x32" ) - set( INCLUDE_DIR include ) -endif( ) - -# The following code is setting variables to control the behavior of CPack to generate our -if( WIN32 ) - set( CPACK_SOURCE_GENERATOR "ZIP" ) - set( CPACK_GENERATOR "ZIP" ) -else( ) - set( CPACK_SOURCE_GENERATOR "TGZ" ) - set( CPACK_GENERATOR "TGZ" ) -endif( ) - -set( CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${rocBLAS-clients_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-Source") -# set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt") -set( CPACK_PACKAGE_VERSION_MAJOR ${rocBLAS-clients_VERSION_MAJOR} ) -set( CPACK_PACKAGE_VERSION_MINOR ${rocBLAS-clients_VERSION_MINOR} ) -set( CPACK_PACKAGE_VERSION_PATCH 
${rocBLAS-clients_VERSION_PATCH} ) -set( CPACK_PACKAGE_VERSION_TWEAK ${rocBLAS-clients_VERSION_TWEAK} ) -set( CPACK_PACKAGE_DESCRIPTION_SUMMARY "Radeon Open Compute BLAS library package") -set( CPACK_PACKAGE_VENDOR "AMD") -set( CPACK_SOURCE_IGNORE_FILES "/\\\\.git/;/\\\\.hg/;/\\\\.svn/;" ) - -# Define all variables that influence CPack before including CPack, such as install targets -include( CPack ) +include( build-version ) +project_version( NAME rocblas-client LANGUAGES CXX ) #==================================NEW============================ - - set( CMAKE_EXPORT_COMPILE_COMMANDS ON ) find_package( rocblas REQUIRED CONFIG ) @@ -58,16 +28,32 @@ if( NOT Boost_FOUND ) endif( ) + set( rocblas_benchmark_common ../common/utility.cpp ../common/cblas_interface.cpp ../common/flops.cpp ../common/norm.cpp ../common/unit.cpp + ../common/rocblas_template_specialization.cpp ) add_executable( client client.cpp ${rocblas_benchmark_common} ) + +add_subdirectory ( ./perf_script ) + + + +if( BUILD_WITH_TENSILE ) + target_compile_definitions( client PRIVATE BUILD_WITH_TENSILE=1 ) + message(STATUS "Build Tensil equal 1") +else() + target_compile_definitions( client PRIVATE BUILD_WITH_TENSILE=0 ) + message(STATUS "Build Tensil equal 0") +endif() + + # Try to test for specific compiler features if cmake version is recent enough if( CMAKE_VERSION VERSION_GREATER "3.0" ) target_compile_features( client PRIVATE cxx_static_assert cxx_nullptr cxx_lambdas cxx_auto_type ) @@ -89,7 +75,6 @@ target_include_directories( client $ ) - target_link_libraries( client rocblas ${Boost_LIBRARIES} blas lapack) # Ubuntu systems need to explicitely link to pthreads lib because of --as-needed @@ -99,7 +84,36 @@ if( UNIX ) target_link_libraries( client pthread ) endif( ) - - set_target_properties( client PROPERTIES DEBUG_POSTFIX "-d" ) # set_target_properties( client PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) + +# Depending on whether we are building for 64 or 32 bit, construct 
common paths and names that subdirectories can reference for their use +if( rocBLAS_BUILD64 ) + set( CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${rocBLAS-clients_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-x64" ) + set( INCLUDE_DIR include ) +else( ) + set( CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${rocBLAS-clients_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-x32" ) + set( INCLUDE_DIR include ) +endif( ) + +# The following code is setting variables to control the behavior of CPack to generate our +if( WIN32 ) + set( CPACK_SOURCE_GENERATOR "ZIP" ) + set( CPACK_GENERATOR "ZIP" ) +else( ) + set( CPACK_SOURCE_GENERATOR "TGZ" ) + set( CPACK_GENERATOR "TGZ" ) +endif( ) + +set( CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${rocBLAS-clients_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-Source") +# set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt") +set( CPACK_PACKAGE_VERSION_MAJOR ${rocBLAS-clients_VERSION_MAJOR} ) +set( CPACK_PACKAGE_VERSION_MINOR ${rocBLAS-clients_VERSION_MINOR} ) +set( CPACK_PACKAGE_VERSION_PATCH ${rocBLAS-clients_VERSION_PATCH} ) +set( CPACK_PACKAGE_VERSION_TWEAK ${rocBLAS-clients_VERSION_TWEAK} ) +set( CPACK_PACKAGE_DESCRIPTION_SUMMARY "Radeon Open Compute BLAS library package") +set( CPACK_PACKAGE_VENDOR "AMD") +set( CPACK_SOURCE_IGNORE_FILES "/\\\\.git/;/\\\\.hg/;/\\\\.svn/;" ) + +# Define all variables that influence CPack before including CPack, such as install targets +include( CPack ) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 830bc7b7a..d19dbb823 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -9,14 +9,21 @@ #include "rocblas.h" #include "utility.h" +#include "rocblas.hpp" #include "testing_scal.hpp" #include "testing_dot.hpp" #include "testing_nrm2.hpp" #include "testing_asum.hpp" #include "testing_amax.hpp" #include "testing_gemv.hpp" +#include "testing_ger.hpp" #include "testing_trtri.hpp" -#include "testing_gemm.hpp" +#include "testing_trtri_batched.hpp" 
+#if BUILD_WITH_TENSILE + #include "testing_gemm.hpp" + #include "testing_gemm_batched.hpp" + #include "testing_trsm.hpp" +#endif namespace po = boost::program_options; @@ -40,16 +47,20 @@ int main(int argc, char *argv[]) ( "sizem,m", po::value( &argus.M )->default_value(128), "Specific matrix size testing: sizem is only applicable to BLAS-2 & BLAS-3: the number of rows." ) ( "sizen,n", po::value( &argus.N )->default_value(128), "Specific matrix/vector size testing: BLAS-1: the length of the vector. BLAS-2 & BLAS-3: the number of columns" ) ( "sizek,k", po::value( &argus.K )->default_value(128), "Specific matrix size testing:sizek is only applicable to BLAS-3: the number of columns in A & C and rows in B." ) + ( "lda", po::value( &argus.lda )->default_value(128), "Specific leading dimension of matrix A, is only applicable to BLAS-2 & BLAS-3: the number of rows." ) + ( "ldb", po::value( &argus.ldb )->default_value(128), "Specific leading dimension of matrix B, is only applicable to BLAS-2 & BLAS-3: the number of rows." ) + ( "ldc", po::value( &argus.ldc )->default_value(128), "Specific leading dimension of matrix C, is only applicable to BLAS-2 & BLAS-3: the number of rows." ) ( "alpha", po::value( &argus.alpha)->default_value(1.0), "specifies the scalar alpha" ) ( "beta", po::value( &argus.beta )->default_value(0.0), "specifies the scalar beta" ) ( "order,o", po::value(&argus.order_option )->default_value(1), "0 = row major, 1 = column major. Right now, only column major is supported" ) - ( "function,f", po::value( &function )->default_value("gemv"), "BLAS function to test. Options: gemv, trsm, trmm, gemv, symv, syrk, syr2k" ) + ( "function,f", po::value( &function )->default_value("gemv"), "BLAS function to test. 
Options: gemv, ger, trsm, trmm, symv, syrk, syr2k" ) ( "precision,r", po::value( &precision )->default_value('s'), "Options: s,d,c,z" ) ( "transposeA", po::value( &argus.transA_option )->default_value('N'), "N = no transpose, T = transpose, C = conjugate transpose" ) ( "transposeB", po::value( &argus.transB_option )->default_value('N'), "N = no transpose, T = transpose, C = conjugate transpose" ) ( "side", po::value( &argus.side_option )->default_value('L'), "L = left, R = right. Only applicable to certain routines" ) ( "uplo", po::value( &argus.uplo_option )->default_value('U'), "U = upper, L = lower. Only applicable to certain routines" ) // xsymv xsyrk xsyr2k xtrsm xtrmm ( "diag", po::value( &argus.diag_option )->default_value('N'), "U = unit diagonal, N = non unit diagonal. Only applicable to certain routines" ) // xtrsm xtrmm + ( "batch", po::value( &argus.batch_count )->default_value(10), "Number of matrices. Only applicable to batched routines" ) // xtrsm xtrmm ( "verify,v", po::value(&argus.norm_check)->default_value(0), "Validate GPU results with CPU? 0 = No, 1 = Yes (default: No)") ( "device", po::value(&device_id)->default_value(0), "Set default device to be used for subsequent program runs") ; @@ -86,12 +97,10 @@ int main(int argc, char *argv[]) printf("Invalide matrix dimension\n"); } - //adjust dimension for BLAS-3 routines, may not appplicable to BLAS-1 and certain BLAS-2 routines - argus.transA_option == 'N' ? argus.lda = argus.M : argus.lda = argus.K; - argus.transB_option == 'N' ? 
argus.ldb = argus.K : argus.ldb = argus.N; - argus.ldc = argus.M; + argus.start = range[0]; argus.step = range[1]; argus.end = range[2]; + if (function == "scal"){ if (precision == 's') testing_scal( argus ); @@ -128,11 +137,11 @@ int main(int argc, char *argv[]) else if (precision == 'd') testing_gemv( argus ); } - else if (function == "gemm"){ + else if (function == "ger"){ if (precision == 's') - testing_gemm( argus ); + testing_ger( argus ); else if (precision == 'd') - testing_gemm( argus ); + testing_ger( argus ); } else if (function == "trtri"){ if (precision == 's') @@ -140,10 +149,47 @@ int main(int argc, char *argv[]) else if (precision == 'd') testing_trtri( argus ); } + else if (function == "trtri_batched"){ + if (precision == 's') + testing_trtri_batched( argus ); + else if (precision == 'd') + testing_trtri_batched( argus ); + } +#if BUILD_WITH_TENSILE + else if (function == "gemm"){ + + //adjust dimension for GEMM routines + argus.transA_option == 'N' ? argus.lda = argus.M : argus.lda = argus.K; + argus.transB_option == 'N' ? argus.ldb = argus.K : argus.ldb = argus.N; + argus.ldc = argus.M; + + if (precision == 's') + testing_gemm( argus ); + else if (precision == 'd') + testing_gemm( argus ); + } + else if (function == "gemm_batched"){ + //adjust dimension for GEMM routines + argus.transA_option == 'N' ? argus.lda = argus.M : argus.lda = argus.K; + argus.transB_option == 'N' ? 
argus.ldb = argus.K : argus.ldb = argus.N; + argus.ldc = argus.M; + if (precision == 's') + testing_gemm_batched( argus ); + else if (precision == 'd') + testing_gemm_batched( argus ); + } + else if (function == "trsm"){ + if (precision == 's') + testing_trsm( argus ); + else if (precision == 'd') + testing_trsm( argus ); + } +#endif else{ printf("Invalid value for --function \n"); return -1; } + return 0; } diff --git a/clients/benchmarks/perf_script/CMakeLists.txt b/clients/benchmarks/perf_script/CMakeLists.txt new file mode 100644 index 000000000..e8ccb04e5 --- /dev/null +++ b/clients/benchmarks/perf_script/CMakeLists.txt @@ -0,0 +1,23 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# +# ######################################################################## + + +set(GRAPHING_SCRIPTS measurePerformance.py + plotPerformance.py + blasPerformanceTesting.py + errorHandler.py + performanceUtility.py + README.txt + ) +foreach(SCRIPT ${GRAPHING_SCRIPTS}) +#Copy a file to another location. 
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SCRIPT} ${CMAKE_CURRENT_BINARY_DIR}/${SCRIPT} @ONLY) +endforeach() + +if( WIN32 ) + install( FILES ${GRAPHING_SCRIPTS} DESTINATION bin${SUFFIX_BIN} ) +else ( ) + install( FILES ${GRAPHING_SCRIPTS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR} ) +endif( ) diff --git a/clients/benchmarks/perf_script/README.txt b/clients/benchmarks/perf_script/README.txt new file mode 100644 index 000000000..44ef01f5c --- /dev/null +++ b/clients/benchmarks/perf_script/README.txt @@ -0,0 +1,13 @@ + +Instructions for running the script that collects performance data + +1) Copy client.exe (Windows; client on Linux) to this folder + +2) Run "python measurePerformance.py --help" to see how to use the command-line mode + For example, "python measurePerformance.py -s 64-5760:64 -f gemm -r s --transa none --transb transpose > sgemm.txt" + will collect the performance of sgemm NT on square matrices [64:5760] with step 64. + All the output will be dumped into sgemm.txt + +3) Open sgemm.txt and use "grep" to fetch the performance line. You can extract the Gflop/s number from the performance line + +4) You can ignore the other automatically generated .txt files and folders created during the run. diff --git a/clients/benchmarks/perf_script/blasPerformanceTesting.py b/clients/benchmarks/perf_script/blasPerformanceTesting.py new file mode 100644 index 000000000..a7e30037b --- /dev/null +++ b/clients/benchmarks/perf_script/blasPerformanceTesting.py @@ -0,0 +1,320 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. 
+#
+# ########################################################################
+
+import itertools
+import re#gex
+import subprocess
+import os
+import sys
+from datetime import datetime
+
+# Common data and functions for the performance suite.
+# NOTE: this module is Python 2 code (it uses the `long` and `file` builtins).
+
+class TestCombination:
+    """Parameter set describing one test configuration.
+
+    Every argument is stored unchanged. __str__ joins the fields with '+',
+    so it only works when all of them are strings.
+    """
+    def __init__(self,
+                 lengthx, lengthy, lengthz, batchsize,
+                 device, inlayout, outlayout, placeness,
+                 ldscomplex, ldsfraction, cachesize, xfactor,
+                 label):
+        self.x = lengthx
+        self.y = lengthy
+        self.z = lengthz
+        self.batchsize = batchsize
+        self.device = device
+        self.inlayout = inlayout
+        self.outlayout = outlayout
+        self.placeness = placeness
+        self.ldscomplex = ldscomplex
+        self.ldsfraction = ldsfraction
+        self.cachesize = cachesize
+        self.xfactor = xfactor
+        self.label = label
+
+    def __str__(self):
+        return self.x + 'x' + self.y + 'x' + self.z + ':' + self.batchsize + ', ' + self.device + ', ' + self.inlayout + '/' + self.outlayout + ', ' + self.placeness + ', LDS comp(' + self.ldscomplex + '), LDS frac(' + self.ldsfraction + '), cachesz(' + self.cachesize + '), X-factor(' + self.xfactor + ') -- ' + self.label
+
+class GraphPoint:
+    """One measured point; `problemsize` is str(x * y * z * batchsize).
+
+    The four size arguments must be convertible with int(); __str__ needs
+    every field to be a string.
+    """
+    def __init__(self,
+                 lengthx, lengthy, lengthz, batchsize,
+                 ldsfraction, device, label,
+                 gflops):
+        self.x = lengthx
+        self.y = lengthy
+        self.z = lengthz
+        self.batchsize = batchsize
+        self.device = device
+        self.label = label
+        self.ldsfraction = ldsfraction
+        self.gflops = gflops
+        self.problemsize = str(int(self.x) * int(self.y) * int(self.z) * int(self.batchsize))
+
+    def __str__(self):
+        # ALL members must be represented here (x, y, z, batch, device, label, ldsfraction, etc)
+        return self.x + 'x' + self.y + 'x' + self.z + ':' + self.batchsize + ', ' + self.device + ', LDS fraction = ' + self.ldsfraction + ' -- ' + self.label + '; ' + self.gflops
+
+class TableRow:
+    """Pairs a parameter object with its gflops string for table output."""
+    # parameters = class TestCombination instantiation
+    def __init__(self, parameters, gflops):
+        self.parameters = parameters
+        self.gflops = gflops
+
+    def __str__(self):
+        return self.parameters.__str__() + '; ' + self.gflops
+
+def transformDimension(x,y,z):
+    """Return 3 if z != 1, else 2 if y != 1, else 1 if x != 1.
+
+    Falls through and returns None when all three lengths equal 1.
+    """
+    if int(z) != 1:
+        return 3
+    elif int(y) != 1:
+        return 2
+    elif int(x) != 1:
+        return 1
+
+# Base name of the client program for each supported library; the
+# platform-specific prefix/suffix is added in executable().
+_CLIENT_NAMES = {
+    'rocblas': 'client',
+    'acmlblas': 'ACMLBlas_client',
+}
+
+def executable(library):
+    """Return the client program for `library` ('client.exe' or './client' style).
+
+    Prints an error and exits with status 1 when `library` is not a str,
+    the platform is neither Windows nor Linux, no client is defined for the
+    library, or the client file is missing from the current directory.
+    """
+    if type(library) != str:
+        print('ERROR: expected library name to be a string')
+        sys.exit(1)
+
+    is_windows = sys.platform == 'win32'
+    # Python 2 reports 'linux2' while Python 3 reports 'linux'; accept both
+    if not is_windows and not sys.platform.startswith('linux'):
+        print('ERROR: unknown operating system')
+        sys.exit(1)
+
+    if library not in _CLIENT_NAMES:
+        if library == 'null':
+            # 'null' used to pass validation and then die with a NameError
+            # because no executable name was ever assigned for it
+            print('ERROR: no client executable is defined for library null')
+        else:
+            print('ERROR: unknown library -- cannot determine executable name ' + library)
+        sys.exit(1)
+
+    if is_windows:
+        exe = _CLIENT_NAMES[library] + '.exe'
+    else:
+        exe = './' + _CLIENT_NAMES[library]
+
+    if not os.path.isfile(exe):
+        error_message = 'ERROR: could not find client named ' + exe
+        print(error_message)
+        sys.exit(1)
+
+    return exe
+
+def max_mem_available_in_bytes(exe, device):
+    """Run `exe -i device` and return the number ending its first MAX_MEM_ALLOC_SIZE line.
+
+    Raises ValueError when no such line exists or it does not end in digits;
+    subprocess.CalledProcessError propagates if the client exits non-zero.
+    """
+    arguments = [exe, '-i', device]
+
+    output = subprocess.check_output(arguments, stderr=subprocess.STDOUT)
+    # splitlines() handles '\n' and '\r\n' alike, whatever the host's os.linesep
+    deviceInfo = [line for line in output.splitlines() if line.count('MAX_MEM_ALLOC_SIZE')]
+    if not deviceInfo:
+        raise ValueError('client output does not report MAX_MEM_ALLOC_SIZE')
+    maxMemoryAvailable = re.search(r'\d+$', deviceInfo[0])
+    if maxMemoryAvailable is None:
+        raise ValueError('cannot parse MAX_MEM_ALLOC_SIZE from: ' + deviceInfo[0])
+    return int(maxMemoryAvailable.group(0))
+
+def max_problem_size(exe, device):
+    """Return how many 8-byte data points fit in the device's maximum allocation."""
+    numbers_in_one_datapoint = 2 # (i.e.: real or complex?)
+    bytes_in_one_number = 4 # (i.e.: single or double precision?)
+    return max_mem_available_in_bytes(exe, device) // (numbers_in_one_datapoint * bytes_in_one_number)
+
+def maxBatchSize(lengthx, lengthy, lengthz, exe, device):
+    """Return, as a string, the largest batch count that fits on the device."""
+    problemSize = int(lengthx) * int(lengthy) * int(lengthz)
+    largestBatch = max_problem_size(exe, device) // problemSize
+    if int(lengthx) == pow(2,16) or int(lengthx) == pow(2,17):
+        # special cases in the kernel. extra padding is added in, so we need to shrink the batch size to accommodate
+        return str(largestBatch // 2)
+    else:
+        return str(largestBatch)
+
+def create_ini_file_if_requested(args):
+    """If --createini was given, write the other options to that file and exit.
+
+    Options whose value is None or a file object, and options whose name
+    contains 'File', are left out. Each entry is written as '--name value; '.
+    """
+    if args.createIniFilename:
+        for x in vars(args):
+            value = getattr(args, x)
+            if type(value) != file and value != None and x.count('File') == 0:
+                args.createIniFilename.write('--' + x + ' ')
+                args.createIniFilename.write(str(value) + '; ')
+        # flush the settings to disk before leaving
+        args.createIniFilename.close()
+        sys.exit(0)
+
+def load_ini_file_if_requested(args, parser):
+    """If --ini was given, re-parse the options stored in that file.
+
+    Returns the new namespace, or `args` unchanged when no ini file is in use.
+    """
+    if args.useIniFilename:
+        argument_list = args.useIniFilename.readlines()
+        arg_string = str()
+        for a in argument_list:
+            arg_string += a
+        arg_string = arg_string.replace(';', '')
+        arg_string = arg_string.split()
+        args = parser.parse_args(arg_string)
+    return args
+
+
+def is_numeric_type(x):
+    """True when x is exactly an int, long or float (subclasses such as bool excluded)."""
+    return type(x) == int or type(x) == long or type(x) == float
+
+def split_up_comma_delimited_lists(args):
+    """Turn every attribute of `args` into a list, in place, and return `args`.
+
+    None becomes [None], a number becomes [number], a str is split on ','.
+    Attributes of any other type are left as they are.
+    """
+    for x in vars(args):
+        attr = getattr(args, x)
+        if attr == None:
+            setattr(args, x, [None])
+        elif is_numeric_type(attr):
+            setattr(args, x, [attr])
+        elif type(attr) == str:
+            setattr(args, x, attr.split(','))
+    return args
+
+class Range:
+    """Expand range specifications into a sorted list of unique values.
+
+    `ranges` is either a non-list value, which is handed straight back in
+    `self.expanded`, or a list. [None] stays [None]. Every other item is
+    converted with str() and then handled as follows:
+      '+N'   -- kept as that string
+      'max'  -- kept as the string 'max'
+      'B', 'B-E', 'B-E:S', 'B-E:xS'
+             -- expanded from B to E inclusive, adding S each time, or
+                multiplying by S when the step is written 'xS'; the step
+                defaults to `defaultStep`
+
+    Raises ValueError for a step that could never reach the end of the range.
+    """
+    def __init__(self, ranges, defaultStep='+1'):
+        # we might be passed in a single value or a list of strings
+        # if we receive a single value, we want to feed it right back
+        if type(ranges) != list:
+            self.expanded = ranges
+        elif ranges[0] == None:
+            self.expanded = [None]
+        else:
+            self.expanded = []
+            for thisRange in ranges:
+                thisRange = str(thisRange)
+                if re.search(r'^\+\d+$', thisRange):
+                    self.expanded = self.expanded + [thisRange]
+                elif thisRange == 'max':
+                    self.expanded = self.expanded + ['max']
+                else:
+                #elif thisRange != 'max':
+                    if thisRange.count(':'):
+                        self._stepAmount = thisRange.split(':')[1]
+                    else:
+                        self._stepAmount = defaultStep
+                    thisRange = thisRange.split(':')[0]
+
+                    if self._stepAmount.count('x'):
+                        self._stepper = '_mult'
+                    else:
+                        self._stepper = '_add'
+                    self._stepAmount = self._stepAmount.lstrip('+x')
+                    self._stepAmount = int(self._stepAmount)
+
+                    if thisRange.count('-'):
+                        self.begin = int(thisRange.split('-')[0])
+                        self.end = int(thisRange.split('-')[1])
+                    else:
+                        self.begin = int(thisRange.split('-')[0])
+                        self.end = int(thisRange.split('-')[0])
+                    self.current = self.begin
+
+                    if self.begin == 0 and self._stepper == '_mult':
+                        self.expanded = self.expanded + [0]
+                    else:
+                        # adding less than 1 or multiplying by less than 2
+                        # never advances, so the loop below would spin forever
+                        minimumStep = 2 if self._stepper == '_mult' else 1
+                        if self.current <= self.end and self._stepAmount < minimumStep:
+                            raise ValueError('range step does not advance: ' + str(self._stepAmount))
+                        while self.current <= self.end:
+                            self.expanded = self.expanded + [self.current]
+                            self._step()
+
+            # now we want to uniquify and sort the expanded range
+            self.expanded = list(set(self.expanded))
+            self.expanded.sort()
+
+    # advance current value to next
+    def _step(self):
+        getattr(self, self._stepper)()
+
+    def _mult(self):
+        self.current = self.current * self._stepAmount
+
+    def _add(self):
+        self.current = self.current + self._stepAmount
+
+def expand_range(a_range):
+    """Shorthand for Range(a_range).expanded."""
+    return Range(a_range).expanded
+
+def decode_parameter_problemsize(problemsize):
+    """Split each 'MxNxK[:AxBxC]' entry into [[M, N, K], [A, B, C]], in place.
+
+    The list is returned untouched when it contains None.
+    """
+    if not problemsize.count(None):
+        for i, entry in enumerate(problemsize):
+            problemsize[i] = [part.split('x') for part in entry.split(':')]
+
+    return problemsize
+
+def blas_table_header():
+    """Return the comma-separated column names of the results table."""
+    return 'm,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS'
+
+class BlasTestCombination:
+    """All parameters of one BLAS client invocation.
+
+    Sizes, leading dimensions, offsets, alpha and beta are stored as str();
+    the remaining fields are stored as given.
+    """
+    def __init__(self,
+                 sizem, sizen, sizek,
+                 lda, ldb, ldc,
+                 offa, offb, offc,
+                 alpha, beta, order,
+                 transa, transb,
+                 side, uplo, diag,
+                 function, precision,
+                 device, library, label):
+        self.sizem = str(sizem)
+        self.sizen = str(sizen)
+        self.sizek = str(sizek)
+        self.lda = str(lda)
+        self.ldb = str(ldb)
+        self.ldc = str(ldc)
+        self.offa = str(offa)
+        self.offb = str(offb)
+        self.offc = str(offc)
+        self.alpha = str(alpha)
+        self.beta = str(beta)
+        self.order = order
+        self.transa = transa
+        self.transb = transb
+        self.side = side
+        self.uplo = uplo
+        self.diag = diag
+        self.function = function
+        self.precision = precision
+        self.device = device
+        self.library = library
+        self.label = label
+
+    def __str__(self):
+        # ':' between ldc and offa keeps the two groups apart; without it
+        # they ran together (e.g. ldc 128 + offa 0 printed as '1280')
+        return self.sizem + 'x' + self.sizen + 'x' + self.sizek + ':' + self.lda + 'x' + self.ldb + 'x' + self.ldc + ':' + self.offa + 'x' + self.offb + 'x' + self.offc + ', ' + self.device + ', ' + self.precision + self.function + ', ' + self.library + ', alpha(' + self.alpha + '), beta(' + self.beta + '), order(' + self.order + '), transa(' + self.transa + '), transb(' + self.transb + '), side(' + self.side + '), uplo(' + self.uplo + '), diag(' + self.diag + ') -- ' + self.label
+
+class BlasGraphPoint:
+    """One BLAS measurement; every argument is stored unchanged."""
+    def __init__(self,
+                 sizem, sizen, sizek,
+                 lda, ldb, ldc,
+                 offa, offb, offc,
+                 device, order, transa, transb,
+                 function, library, label,
+                 gflops):
+        self.sizem = sizem
+        self.sizen = sizen
+        self.sizek = sizek
+        self.lda = lda
+        self.ldb = ldb
+        self.ldc = ldc
+        self.offa = offa
+        self.offb = offb
+        self.offc = offc
+        self.device = device
+        self.order = order
+        self.transa = transa
+        self.transb = transb
+        self.function = function
+        self.library = library
+        self.label = label
+        self.gflops = gflops
+
+    def __str__(self):
+        # ALL members must be represented here (x, y, z, batch, device, label, ldsfraction, etc)
+        return self.sizem + 'x' + self.sizen + 'x' + self.sizek + ':' + self.device + ', ' + self.function + ', ' + self.library + ', order(' + self.order + '), transa(' + self.transa + '), transb(' + self.transb + ') -- ' + self.label + '; ' + self.gflops + ' gflops'
+
+def open_file( filename ):
+    """Open the results file for writing and return the file object.
+
+    A list argument contributes its first element. None selects
+    'results<timestamp>.txt'. If the chosen name already exists, a timestamp
+    is appended and the new name is printed.
+    """
+    if type(filename) == list:
+        filename = filename[0]
+    if filename == None:
+        filename = 'results' + datetime.now().isoformat().replace(':','.') + '.txt'
+    else:
+        if os.path.isfile(filename):
+            oldname = filename
+            filename = filename + datetime.now().isoformat().replace(':','.')
+            message = 'A file with the name ' + oldname + ' already exists. Changing filename to ' + filename
+            print(message)
+
+    return open(filename, 'w')
diff --git a/clients/benchmarks/perf_script/errorHandler.py b/clients/benchmarks/perf_script/errorHandler.py
new file mode 100644
index 000000000..cc9938825
--- /dev/null
+++ b/clients/benchmarks/perf_script/errorHandler.py
@@ -0,0 +1,57 @@
+# ########################################################################
+# Copyright 2016 Advanced Micro Devices, Inc.
+#
+# ########################################################################
+
+#---------------------------------File Note------------------------------------
+#Date: 27 January 2012
+#This file defines all the error code and error handler mechanism
+#--------------------------------Global Variables------------------------------
+
+UINS_CAT = 100
+WIN_REG_SEARCH_FAIL = 101
+UNIMPL_APP = 200
+SYS_ERR = 300
+TIME_OUT = 400
+DIM_INCO_FILE_FMT = 500 #incorrect file format for dimension
+DIM_FILE_VAL_INCO = 501 #Value coming from dimension file is incorrect
+
+#__errorTable : Defines all the errors in the system.
Add a new error code and
+# error message here
+# The error table is meant to be private to this module.
+errorTable = {
+    UINS_CAT: 'Application is not able to find the installed catalyst',
+    WIN_REG_SEARCH_FAIL: 'Windows Registry search for catalysts version is unsuccessful',
+    UNIMPL_APP: 'Unimplemented Application requirement',
+    SYS_ERR: 'System error occurred - Please check the source code',
+    TIME_OUT: 'Operation is timed out',
+    DIM_INCO_FILE_FMT: 'incorrect file format for dimension - Not able to find dimension',
+    DIM_FILE_VAL_INCO: 'Value coming from dimension file is incorrect'
+    }
+
+#--------------------------------Class Definitions-----------------------------
+class TimeoutException(Exception):
+    """Marker exception type for timeouts; it adds nothing to Exception."""
+    pass
+
+class ApplicationException(Exception):
+    """Base class for handling all the application generated exception.
+
+    fileName -- name of the file reporting the error
+    errno    -- error code looked up in errorTable; a code that is not in
+                the table is reported as 'Unknown error'
+    msg      -- optional text appended to the table message
+    """
+
+    def __init__(self, fileName, errno, msg = ""):
+        self.fileName = fileName
+        self.errno = errno
+        # .get() keeps an unregistered code from raising KeyError while the
+        # real error is being reported
+        self.mess = errorTable.get(errno, 'Unknown error') + msg
+        message = 'Application ERROR:'+repr(self.fileName+'-'+str(self.errno)+'-'+self.mess)
+        # initialise the base class so that .args carries the message
+        Exception.__init__(self, message)
+        self.message = message
+
+    def __str__(self):
+        return repr(self.fileName+'-'+str(self.errno)+'-'+self.mess)
+
+
+#--------------------------------Global Function-------------------------------
+if __name__ == '__main__':
+    #print errorTable
+    try:
+        raise ApplicationException('errorHandler', SYS_ERR)
+
+    # catch Exception rather than everything, so Ctrl-C and sys.exit() still work
+    except Exception:
+        print('Generic exception')
+
diff --git a/clients/benchmarks/perf_script/measurePerformance.py b/clients/benchmarks/perf_script/measurePerformance.py
new file mode 100644
index 000000000..8804c4587
--- /dev/null
+++ b/clients/benchmarks/perf_script/measurePerformance.py
@@ -0,0 +1,527 @@
+# ########################################################################
+# Copyright 2016 Advanced Micro Devices, Inc.
+# +# ######################################################################## + +import sys +import argparse +import subprocess +import itertools +import re#gex +import os +from threading import Timer, Thread +import thread, time +from platform import system +from datetime import datetime + +import errorHandler +from blasPerformanceTesting import * +from performanceUtility import timeout, log + +IAM = 'BLAS' +TIMOUT_VAL = 900 #In seconds + +""" +define and parse parameters +""" +devicevalues = ['gpu', 'cpu'] +libraryvalues = ['rocblas','acmlblas'] +ordervalues = ['row','column'] +transvalues = ['none','transpose','conj'] +sidevalues = ['left','right'] +uplovalues = ['upper','lower'] +diagvalues = ['unit','nonunit'] +functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv', 'symm', 'hemm', 'herk', 'her2k' ] +precisionvalues = ['s', 'd', 'c', 'z'] +roundtripvalues = ['roundtrip','noroundtrip','both'] +memallocvalues = ['default','alloc_host_ptr','use_host_ptr','copy_host_ptr','use_persistent_mem_amd'] + +parser = argparse.ArgumentParser(description='Measure performance of the rocblas library') +parser.add_argument('--device', + dest='device', default='gpu', + help='device(s) to run on; may be a comma-delimited list. choices are ' + str(devicevalues) + '. (default gpu)') +parser.add_argument('-m', '--sizem', + dest='sizem', default=None, + help='size(s) of m to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 1024 or 100-800:100 or 15,2048-3000') +parser.add_argument('-n', '--sizen', + dest='sizen', default=None, + help='size(s) of n to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 1024 or 100-800:100 or 15,2048-3000') +parser.add_argument('-k', '--sizek', + dest='sizek', default=None, + help='size(s) of k to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. 
e.g., 1024 or 100-800:100 or 15,2048-3000') +parser.add_argument('-s', '--square', + dest='square', default=None, + help='size(s) of m=n=k to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. this option sets lda = ldb = ldc to the values indicated with --lda for all problems set with --square. e.g., 1024 or 100-800:100 or 15,2048-3000') +parser.add_argument('--problemsize', + dest='problemsize', default=None, + help='additional problems of a set size. may be used in addition to sizem/n/k and lda/b/c. each indicated problem size will be added to the list of problems to complete. should be entered in MxNxK:AxBxC format (where :AxBxC specifies lda/b/c. :AxBxC is optional. if included, lda/b/c are subject to the same range restrictions as indicated in the lda/b/c section of this help. if omitted, :0x0x0 is assumed). may enter multiple in a comma-delimited list. e.g., 2x2x2:4x6x9,3x3x3 or 1024x800x333') +parser.add_argument('--lda', + dest='lda', default=0, + help='value of lda; may include ranges and comma-delimited lists. stepping may be indicated with a colon. if transA = \'n\', lda must be >= \'m\'. otherwise, lda must be >= \'k\'. if this is violated, the problem will be skipped. if lda is 0, it will be automatically set to match either \'m\' (if transA = \'n\') or \'k\' (otherwise). may indicate relative size with +X, where X is the offset relative to M or K (depending on transA). e.g., 1024 or 100-800:100 or 15,2048-3000 or +10 (if transA = \'n\' and M = 100, lda = 110) (default 0)') +parser.add_argument('--ldb', + dest='ldb', default=0, + help='value of ldb; may include ranges and comma-delimited lists. stepping may be indicated with a colon. if transB = \'n\', ldb must be >= \'k\'. otherwise, ldb must be >= \'n\'. if this is violated, the problem will be skipped. if ldb is 0, it will be automatically set to match either \'k\' (if transB = \'n\') or \'n\' (otherwise). 
may indicate relative size with +X, where X is the offset relative to K or N (depending on transB). e.g., 1024 or 100-800:100 or 15,2048-3000 or +100 (if transB = \'n\' and K = 2000, ldb = 2100) (default 0)') +parser.add_argument('--ldc', + dest='ldc', default=0, + help='value of ldc; may include ranges and comma-delimited lists. stepping may be indicated with a colon. ldc must be >= \'m\'. if this is violated, the problem will be skipped. if ldc is 0, it will be automatically set to match \'m\'. may indicate relative size with +X, where X is the offset relative to M. e.g., 1024 or 100-800:100 or 15,2048-3000 or +5 (if M = 15, ldc = 20) (default 0)') +parser.add_argument('--offa', + dest='offa', default=0, + help='offset of the matrix A in memory; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 0-31 or 100-128:2 or 42 (default 0)') +parser.add_argument('--offb', + dest='offb', default=0, + help='offset of the matrix B or vector X in memory; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 0-31 or 100-128:2 or 42 (default 0)') +parser.add_argument('--offc', + dest='offc', default=0, + help='offset of the matrix C or vector Y in memory; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 0-31 or 100-128:2 or 42 (default 0)') +parser.add_argument('-a', '--alpha', + dest='alpha', default=1.0, type=float, + help='specifies the scalar alpha') +parser.add_argument('-b', '--beta', + dest='beta', default=1.0, type=float, + help='specifies the scalar beta') +parser.add_argument('-f', '--function', + dest='function', default='gemm', + help='indicates the function(s) to use. may be a comma delimited list. choices are ' + str(functionvalues) + ' (default gemm)') +parser.add_argument('-r', '--precision', + dest='precision', default='s', + help='specifies the precision for the function. may be a comma delimited list. 
choices are ' + str(precisionvalues) + ' (default s)') +parser.add_argument('-o', '--order', + dest='order', default='row', + help='select row or column major. may be a comma delimited list. choices are ' + str(ordervalues) + ' (default row)') +parser.add_argument('--transa', + dest='transa', default='none', + help='select none, transpose, or conjugate transpose for matrix A. may be a comma delimited list. choices are ' + str(transvalues) + ' (default none)') +parser.add_argument('--transb', + dest='transb', default='none', + help='select none, transpose, or conjugate transpose for matrix B. may be a comma delimited list. choices are ' + str(transvalues) + ' (default none)') +parser.add_argument('--side', + dest='side', default='left', + help='select side, left or right for TRMM and TRSM. may be a comma delimited list. choices are ' + str(sidevalues) + ' (default left)') +parser.add_argument('--uplo', + dest='uplo', default='upper', + help='select uplo, upper or lower triangle. may be a comma delimited list. choices are ' + str(uplovalues) + ' (default upper)') +parser.add_argument('--diag', + dest='diag', default='unit', + help='select diag, whether set diagonal elements to one. may be a comma delimited list. choices are ' + str(diagvalues) + ' (default unit)') +parser.add_argument('--library', + dest='library', default='rocblas', + help='indicates the library to use. choices are ' + str(libraryvalues) + ' (default rocblas)') +parser.add_argument('--label', + dest='label', default=None, + help='a label to be associated with all transforms performed in this run. if LABEL includes any spaces, it must be in \"double quotes\". note that the label is not saved to an .ini file. 
e.g., --label cayman may indicate that a test was performed on a cayman card or --label \"Windows 32\" may indicate that the test was performed on Windows 32') +parser.add_argument('--tablefile', + dest='tableOutputFilename', default=None, + help='save the results to a plaintext table with the file name indicated. this can be used with rocblas.plotPerformance.py to generate graphs of the data (default: table prints to screen)') +parser.add_argument('--roundtrip', + dest='roundtrip', default='noroundtrip', + help='whether measure the roundtrips or not. choices are ' + str(roundtripvalues) + '. (default noroundtrip); should not be specified when calling ACML') +parser.add_argument('--memalloc', + dest='memalloc', default='default', + help='set the flags for OpenCL memory allocation. Choices are ' + str(memallocvalues) + '. (default is default); do not need to set when calling ACML or if roundtrip is not set') +ini_group = parser.add_mutually_exclusive_group() +ini_group.add_argument('--createini', + dest='createIniFilename', default=None, type=argparse.FileType('w'), + help='create an .ini file with the given name that saves the other parameters given at the command line, then quit. 
e.g., \'rocblas.measurePerformance.py -m 10 -n 100 -k 1000-1010 -f sgemm --createini my_favorite_setup.ini\' will create an .ini file that will save the configuration for an sgemm of the indicated sizes.') +ini_group.add_argument('--ini', + dest='useIniFilename', default=None, type=argparse.FileType('r'), + help='use the parameters in the named .ini file instead of the command line parameters.') + +args = parser.parse_args() + +label = str(args.label) +roundtrip = str(args.roundtrip) +library = str(args.library) +memalloc = str(args.memalloc) + +subprocess.call('mkdir perfLog', shell = True) +logfile = os.path.join('perfLog', (label+'-'+'blasMeasurePerfLog.txt')) + +def printLog(txt): + print txt + log(logfile, txt) +printLog("=========================MEASURE PERFORMANCE START===========================") +printLog("Process id of Measure Performance:"+str(os.getpid())) + + +#This function is defunct now +@timeout(5, "fileName") # timeout is 15 minutes, 15*60 = 300 secs +def checkTimeOutPut2(args): + global currCommandProcess + #ret = subprocess.check_output(args, stderr=subprocess.STDOUT) + #return ret + currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + printLog("Curr Command Process id = "+str(currCommandProcess.pid)) + ret = currCommandProcess.communicate() + if(ret[0] == None or ret[0] == ''): + errCode = currCommandProcess.poll() + raise subprocess.CalledProcessError(errCode, args, output=ret[1]) + return ret[0] + +#Spawns a separate thread to execute the library command and wait for that thread to complete +#This wait is of 900 seconds (15 minutes). 
If still the thread is alive then we kill the thread +def checkTimeOutPut(args): + t = None + global currCommandProcess + global stde + global stdo + stde = None + stdo = None + def executeCommand(): + global currCommandProcess + global stdo + global stde + try: + stdo, stde = currCommandProcess.communicate() + printLog('stdout:\n'+str(stdo)) + printLog('stderr:\n'+str(stde)) + except: + printLog("ERROR: UNKNOWN Exception - +checkWinTimeOutPut()::executeCommand()") + + currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + thread = Thread(target=executeCommand) + thread.start() + thread.join(TIMOUT_VAL) #wait for the thread to complete + if thread.is_alive(): + printLog('ERROR: Killing the process - terminating thread because it is taking too much of time to execute') + currCommandProcess.kill() + printLog('ERROR: Timed out exception') + raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT) + if stdo == "" or stdo==None: + errCode = currCommandProcess.poll() + printLog('ERROR: @@@@@Raising Called processor exception') + raise subprocess.CalledProcessError(errCode, args, output=stde) + return stdo + +printLog('Executing measure performance for label: '+str(label)) + +create_ini_file_if_requested(args) +args = load_ini_file_if_requested(args, parser) +args = split_up_comma_delimited_lists(args) + + +""" +check parameters for sanity +""" +if args.sizem.count(None) == 0 and (args.sizen.count(None) or args.sizek.count(None)): + printLog( 'ERROR: if any of m, n, or k are specified, all of m, n, and k must be specified') + quit() +if args.sizen.count(None) == 0 and (args.sizem.count(None) or args.sizek.count(None)): + printLog( 'ERROR: if any of m, n, or k are specified, all of m, n, and k must be specified') + quit() +if args.sizek.count(None) == 0 and (args.sizem.count(None) or args.sizen.count(None)): + printLog( 'ERROR: if any of m, n, or k are specified, all of m, n, and k must be specified') + quit() + +if 
args.square.count(None) and args.problemsize.count(None) and args.sizem.count(None) and args.sizen.count(None) and args.sizek.count(None): + printLog( 'ERROR: at least one of [--square] or [--problemsize] or [-m, -n, and -k] must be specified') + quit() + +args.sizem = expand_range(args.sizem) +args.sizen = expand_range(args.sizen) +args.sizek = expand_range(args.sizek) +args.square = expand_range(args.square) +args.lda = expand_range(args.lda) +args.ldb = expand_range(args.ldb) +args.ldc = expand_range(args.ldc) +args.offa = expand_range(args.offa) +args.offb = expand_range(args.offb) +args.offc = expand_range(args.offc) +args.problemsize = decode_parameter_problemsize(args.problemsize) + +""" +create the problem size combinations for each run of the client +""" +if not args.sizem.count(None): + # we only need to do make combinations of problem sizes if m,n,k have been specified explicitly + problem_size_combinations = itertools.product(args.sizem, args.sizen, args.sizek, + args.lda, args.ldb, args.ldc) + problem_size_combinations = list(itertools.islice(problem_size_combinations, None)) +else: + problem_size_combinations = [] + +""" +add manually entered problem sizes to the list of problems to crank out +""" +manual_test_combinations = [] + + +if not args.problemsize.count(None): + for n in args.problemsize: + sizem = [] + sizen = [] + sizek = [] + lda = [] + ldb = [] + ldc = [] + + sizem.append(int(n[0][0])) + sizen.append(int(n[0][1])) + sizek.append(int(n[0][2])) + if len(n) > 1: + lda.append(int(n[1][0])) + ldb.append(int(n[1][1])) + ldc.append(int(n[1][2])) + else: + lda.append(0) + ldb.append(0) + ldc.append(0) + + combos = itertools.product(sizem,sizen,sizek,lda,ldb,ldc) + combos = list(itertools.islice(combos, None)) + for n in combos: + manual_test_combinations.append(n) + +""" +add square problem sizes to the list of problems to crank out +""" +square_test_combinations = [] + +if not args.square.count(None): + for n in args.square: + combos = 
itertools.product([n],[n],[n],args.lda) # only lda is considered with --square, and lda/b/c are all set to the values specified by lda + combos = list(itertools.islice(combos, None)) + for n in combos: + square_test_combinations.append((n[0],n[1],n[2],n[3],n[3],n[3])) # set lda/b/c = lda + +problem_size_combinations = problem_size_combinations + manual_test_combinations + square_test_combinations + +""" +create final list of all transformations (with problem sizes and transform properties) +""" +test_combinations = itertools.product(problem_size_combinations, args.offa, args.offb, args.offc, args.alpha, args.beta, args.order, args.transa, args.transb, args.side, args.uplo, args.diag, args.function, args.precision, args.device, args.library) +test_combinations = list(itertools.islice(test_combinations, None)) + +test_combinations = [BlasTestCombination(params[0][0], params[0][1], params[0][2], params[0][3], params[0][4], params[0][5], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9], params[10], params[11], params[12], params[13], params[14], params[15], label) for params in test_combinations] + + +""" +open output file and write the header +""" +table = open_file(args.tableOutputFilename) +table.write(blas_table_header() + '\n') +table.flush() + +""" +turn each test combination into a command, run the command, and then stash the gflops +""" +result = [] # this is where we'll store the results for the table + +printLog( 'Total combinations = '+str(len(test_combinations))) + +vi = 0 +#test_combinations = test_combinations[:5] +for params in test_combinations: + vi = vi+1 + printLog('preparing command: '+ str(vi)) + device = params.device + sizem = params.sizem + sizen = params.sizen + sizek = params.sizek + lda = params.lda + ldb = params.ldb + ldc = params.ldc + offa = params.offa + offb = params.offb + offc = params.offc + alpha = params.alpha + beta = params.beta + function = params.function + precision = 
params.precision + library = params.library + label = params.label + + if params.order == 'row': + order = str(0) + elif params.order == 'column': + order = str(1) + else: + printLog( 'ERROR: unknown value for order') + quit() + + if params.side == 'left': + side = 'L' + elif params.side == 'right': + side = 'R' + else: + printLog( 'ERROR: unknown value for side') + quit() + + if params.uplo == 'upper': + uplo = 'U' + elif params.uplo == 'lower': + uplo = 'L' + else: + printLog( 'ERROR: unknown value for uplo') + quit() + + if params.diag == 'unit': + diag = 'U' + elif params.diag == 'nonunit': + diag = 'N' + else: + printLog( 'ERROR: unknown value for diag') + quit() + + if re.search('^\+\d+$', lda): + if params.transa == 'none': + lda = str(int(lda.lstrip('+')) + int(sizem)) + else: + lda = str(int(lda.lstrip('+')) + int(sizek)) + + if re.search('^\+\d+$', ldb): + if params.transb == 'none': + ldb = str(int(ldb.lstrip('+')) + int(sizek)) + else: + ldb = str(int(ldb.lstrip('+')) + int(sizen)) + + if re.search('^\+\d+$', ldc): + ldc = str(int(ldc.lstrip('+')) + int(sizem)) + + if params.transa == 'none': + transa = 'N' + elif params.transa == 'transpose': + transa = 'T' + elif params.transa == 'conj': + transa = 'C' + else: + printLog( 'ERROR: unknown value for transa') + + if params.transb == 'none': + transb = 'N' + elif params.transb == 'transpose': + transb = 'T' + elif params.transb == 'conj': + transb = 'C' + else: + printLog( 'ERROR: unknown value for transb') + + if library == 'acmlblas': + arguments = [executable(library), + '-m', sizem, + '-n', sizen, + '-k', sizek, + '--lda', lda, + '--ldb', ldb, + '--ldc', ldc, + '--alpha', alpha, + '--beta', beta, + '--order', order, + '--transposeA', transa, + '--transposeB', transb, + '--side', side, + '--uplo', uplo, + '--diag', diag, + '--function', function, + '--precision', precision, + '-p', '10', + '--roundtrip', roundtrip] + elif library == 'rocblas': + arguments = [executable(library), + '-m', sizem, + '-n', 
sizen, + '-k', sizek, + '--lda', lda, + '--ldb', ldb, + '--ldc', ldc, + '--alpha', alpha, + '--beta', beta, + '--order', order, + '--transposeA', transa, + '--transposeB', transb, + '--side', side, + '--uplo', uplo, + '--diag', diag, + '--function', function, + '--precision', precision] + + else: + printLog( 'ERROR: unknown library:"' +library+ '" can\'t assemble command') + quit() + + writeline = True + + try: + printLog('Executing Command: '+str(arguments)) + output = checkTimeOutPut(arguments); + output = output.split(os.linesep); + printLog('Execution Successfull---------------\n') + except errorHandler.ApplicationException as ae: + writeline = False + #Killing the process + #if system() != 'Windows': + # currCommandProcess.kill() + # printLog('ERROR: Killed process') + printLog('ERROR: Command is taking too much of time-- '+ae.message+'\n'+'Command: \n'+str(arguments)) + except subprocess.CalledProcessError as clientCrash: + if clientCrash.output.count('bad_alloc'): + writeline = False + printLog( 'Omitting line from table - problem is too large') + elif clientCrash.output.count('CL_INVALID_BUFFER_SIZE'): + writeline = False + printLog( 'Omitting line from table - problem is too large') + elif clientCrash.output.count('CL_INVALID_WORK_GROUP_SIZE'): + writeline = False + printLog( 'Omitting line from table - workgroup size is invalid') + elif clientCrash.output.count('lda must be set to 0 or a value >='): + writeline = False + printLog( 'Omitting line from table - lda is too small') + elif clientCrash.output.count('ldb must be set to 0 or a value >='): + writeline = False + printLog( 'Omitting line from table - ldb is too small') + elif clientCrash.output.count('ldc must be set to 0 or a value >='): + writeline = False + printLog( 'Omitting line from table - ldc is too small') + else: + writeline = False + printLog('ERROR: client crash.\n') + printLog(str(clientCrash.output)) + printLog( str(clientCrash)) + printLog('In original code we quit here - 1') + 
continue + #quit() + + if writeline: + gflopsoutput = itertools.ifilter( lambda x: x.count('Gflops'), output) + gflopsoutput = list(itertools.islice(gflopsoutput, None)) + thisResult = re.search('\d+\.*\d*e*-*\d*$', gflopsoutput[0]) + if thisResult != None: + thisResult = float(thisResult.group(0)) + thisResult = (params.sizem, + params.sizen, + params.sizek, + params.lda, + params.ldb, + params.ldc, + params.offa, + params.offb, + params.offc, + params.alpha, + params.beta, + params.order, + params.transa, + params.transb, + params.side, + params.uplo, + params.diag, + params.precision + params.function, + params.device, + params.library, + params.label, + thisResult) + + outputRow = '' + for x in thisResult: + outputRow = outputRow + str(x) + ',' + outputRow = outputRow.rstrip(',') + table.write(outputRow + '\n') + table.flush() + else: + if 'nan' in gflopsoutput[0] or 'inf' in gflopsoutput[0]: # membership test: str.find() returns -1 (truthy) when the text is absent + printLog( 'WARNING: output from client was funky for this run. skipping table row') + else: + printLog( 'ERROR: output from client makes no sense') + printLog(str( gflopsoutput[0])) + printLog('In original code we quit here - 2') + continue + #quit() +printLog("=========================MEASURE PERFORMANCE ENDS===========================\n") diff --git a/clients/benchmarks/perf_script/performanceUtility.py b/clients/benchmarks/perf_script/performanceUtility.py new file mode 100644 index 000000000..c565f5f24 --- /dev/null +++ b/clients/benchmarks/perf_script/performanceUtility.py @@ -0,0 +1,86 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. 
+# +# ######################################################################## + +#This file contains a number of utilities function which could be independent of +#any specific domain concept + +import signal +from subprocess import check_output +import errorHandler +from datetime import datetime + +def currentUser(): + try: + return check_output("who", shell = True).split()[0]; + except: + print 'Unhandled Exception at performanceUtility::currentUser()' + raise + +#Details: Generate sorted numbers in radices of 2,3 and 5 upto a given upper limit number +def generate235Radices(maxSize): + sizeList = list() + i = 0 + j = 0 + k = 0 + SUM = int() + sumj = int() + sumk = int() + sumi = 1 + while(True): + sumj = 1 + j = 0 + while(True): + sumk = 1 + k = 0 + while(True): + SUM = sumi*sumj*sumk + if ( SUM > maxSize ): break + sizeList.append(SUM) + k += 1 + sumk *= 2 + if (k == 0): break + j += 1 + sumj *= 3 + if ( j == 0 and k == 0): break + i += 1 + sumi *= 5 + sizeList.sort() + return sizeList + + +def timeout(timeout_time, default): + def timeout_function(f): + def f2(args): + def timeout_handler(signum, frame): + raise errorHandler.TimeoutException() + + old_handler = signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(timeout_time) # triger alarm in timeout_time seconds + retval = "" + try: + retval = f(args) + except errorHandler.TimeoutException: + raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT) + except: + signal.alarm(0) + raise + finally: + #print 'executing finally' + signal.signal(signal.SIGALRM, old_handler) + signal.alarm(0) + return retval + return f2 + return timeout_function + + +def logTxtOutput(fileName, mode, txt): + todayFile = fileName+'-'+datetime.now().strftime('%Y-%b-%d')+'.txt' + with open(todayFile, mode) as f: + f.write('------\n'+txt+'\n') + +def log(filename, txt): + with open(filename, 'a') as f: + f.write(datetime.now().ctime()+'# '+txt+'\n') + diff --git 
a/clients/benchmarks/perf_script/plotPerformance.py b/clients/benchmarks/perf_script/plotPerformance.py new file mode 100644 index 000000000..b2dd58bda --- /dev/null +++ b/clients/benchmarks/perf_script/plotPerformance.py @@ -0,0 +1,298 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# +# ######################################################################## + +# to use this script, you will need to download and install the 32-BIT VERSION of: +# - Python 2.7 x86 (32-bit) - http://www.python.org/download/releases/2.7.1 +# +# you will also need the 32-BIT VERSIONS of the following packages as not all the packages are available in 64bit at the time of this writing +# The ActiveState python distribution is recommended for windows +# (make sure to get the python 2.7-compatible packages): +# - NumPy 1.5.1 (32-bit, 64-bit unofficial, supports Python 2.4 - 2.7 and 3.1 - 3.2.) - http://sourceforge.net/projects/numpy/files/NumPy/ +# - matplotlib 1.0.1 (32-bit & 64-bit, supports Python 2.4 - 2.7) - http://sourceforge.net/projects/matplotlib/files/matplotlib/ +# +# For ActiveState Python, all that one should need to type is 'pypm install matplotlib' + +import datetime +import sys +import argparse +import subprocess +import itertools +import os +import matplotlib +import pylab +from matplotlib.backends.backend_pdf import PdfPages +from blasPerformanceTesting import * + +def plotGraph(dataForAllPlots, title, plottype, plotkwargs, xaxislabel, yaxislabel): + """ + display a pretty graph + """ + colors = ['k','y','m','c','r','b','g'] + #plottype = 'plot' + for thisPlot in dataForAllPlots: + getattr(pylab, plottype)(thisPlot.xdata, thisPlot.ydata, + '{}.-'.format(colors.pop()), + label=thisPlot.label, **plotkwargs) + if len(dataForAllPlots) > 1: + pylab.legend(loc='best') + + pylab.title(title) + pylab.xlabel(xaxislabel) + pylab.ylabel(yaxislabel) + pylab.grid(True) + + if args.outputFilename == None: + 
# if no pdf output is requested, spit the graph to the screen . . . + pylab.show() + else: + pylab.savefig(args.outputFilename,dpi=(1024/8)) + # . . . otherwise, gimme gimme pdf + #pdf = PdfPages(args.outputFilename) + #pdf.savefig() + #pdf.close() + +######## plotFromDataFile() Function to plot from data file begins ######## +def plotFromDataFile(): + data = [] + """ + read in table(s) from file(s) + """ + for thisFile in args.datafile: + if not os.path.isfile(thisFile): + print 'No file with the name \'{}\' exists. Please indicate another filename.'.format(thisFile) + quit() + + results = open(thisFile, 'r') + results_contents = results.read() + results_contents = results_contents.rstrip().split('\n') + + firstRow = results_contents.pop(0) + print firstRow + print blas_table_header() + print firstRow.rstrip()==blas_table_header() + if firstRow.rstrip() != blas_table_header(): + print 'ERROR: input file \'{}\' does not match expected format.'.format(thisFile) + quit() + + for row in results_contents: + row = row.split(',') + row = TableRow(BlasTestCombination(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7],row[8],row[9],row[10],row[11],row[12],row[13],row[14], row[15], row[16], row[17][1:], row[17][0], row[18], row[19], row[20]), row[21]) + data.append(BlasGraphPoint(row.parameters.sizem, row.parameters.sizen, row.parameters.sizek, row.parameters.lda, row.parameters.ldb, row.parameters.ldc, row.parameters.offa , row.parameters.offb , row.parameters.offc , row.parameters.device, row.parameters.order, row.parameters.transa, row.parameters.transb, row.parameters.precision + row.parameters.function, row.parameters.library, row.parameters.label, row.gflops)) + + """ + data sanity check + """ + # if multiple plotvalues have > 1 value among the data rows, the user must specify which to plot + multiplePlotValues = [] + for option in plotvalues: + values = [] + for point in data: + values.append(getattr(point, option)) + multiplePlotValues.append(len(set(values)) > 
1) + if multiplePlotValues.count(True) > 1 and args.plot == None: + print 'ERROR: more than one parameter of {} has multiple values. Please specify which parameter to plot with --plot'.format(plotvalues) + quit() + + # if args.graphxaxis is not 'problemsize', the user should know that the results might be strange + #if args.graphxaxis != 'problemsize': + # xaxisvalueSet = [] + # for option in xaxisvalues: + # if option != 'problemsize': + # values = [] + # for point in data: + # values.append(getattr(point, option)) + # xaxisvalueSet.append(len(set(values)) > 1) + # if xaxisvalueSet.count(True) > 1: + # print 'WARNING: more than one parameter of {} is varied. unexpected results may occur. please double check your graphs for accuracy.'.format(xaxisvalues) + + # multiple rows should not have the same input values + #pointInputs = [] + #for point in data: + # pointInputs.append(point.__str__().split(';')[0]) + #if len(set(pointInputs)) != len(data): + # print 'ERROR: imported table has duplicate rows with identical input parameters' + # quit() + + """ + figure out if we have multiple plots on this graph (and what they should be) + """ + if args.plot != None: + multiplePlots = args.plot + elif multiplePlotValues.count(True) == 1 and plotvalues[multiplePlotValues.index(True)] != 'sizek': + # we don't ever want to default to sizek, because it's probably going to vary for most plots + # we'll require the user to explicitly request multiple plots on sizek if necessary + multiplePlots = plotvalues[multiplePlotValues.index(True)] + else: + # default to device if none of the options to plot have multiple values + multiplePlots = 'device' + + """ + assemble data for the graphs + """ + data.sort(key=lambda row: int(getattr(row, args.graphxaxis))) + + # choose scale for x axis + if args.xaxisscale == None: + # user didn't specify. 
autodetect + if int(getattr(data[len(data)-1], args.graphxaxis)) > 10000: # bigger numbers on x-axis; tested first so the log2 branch cannot shadow it + args.xaxisscale = 'log10' + elif int(getattr(data[len(data)-1], args.graphxaxis)) > 2000: # big numbers on x-axis + args.xaxisscale = 'log2' + else: # small numbers on x-axis + args.xaxisscale = 'linear' + + if args.xaxisscale == 'linear': + plotkwargs = {} + plottype = 'plot' + elif args.xaxisscale == 'log2': + plottype = 'semilogx' + plotkwargs = {'basex':2} + elif args.xaxisscale == 'log10': + plottype = 'semilogx' + plotkwargs = {'basex':10} + else: + print 'ERROR: invalid value for x-axis scale' + quit() + + plots = set(getattr(row, multiplePlots) for row in data) + + class DataForOnePlot: + def __init__(self, inlabel, inxdata, inydata): + self.label = inlabel + self.xdata = inxdata + self.ydata = inydata + + dataForAllPlots = [] + for plot in plots: + dataForThisPlot = itertools.ifilter( lambda x: getattr(x, multiplePlots) == plot, data) + dataForThisPlot = list(itertools.islice(dataForThisPlot, None)) + #if args.graphxaxis == 'problemsize': + # xdata = [int(row.x) * int(row.y) * int(row.z) * int(row.batchsize) for row in dataForThisPlot] + #else: + xdata = [getattr(row, args.graphxaxis) for row in dataForThisPlot] + ydata = [getattr(row, args.graphyaxis) for row in dataForThisPlot] + dataForAllPlots.append(DataForOnePlot(plot,xdata,ydata)) + + """ + assemble labels for the graph or use the user-specified ones + """ + if args.graphtitle: + # use the user selection + title = args.graphtitle + else: + # autogen a lovely title + title = 'Performance vs. 
' + args.graphxaxis.capitalize() + + if args.xaxislabel: + # use the user selection + xaxislabel = args.xaxislabel + else: + # autogen a lovely x-axis label + if args.graphxaxis == 'cachesize': + units = '(bytes)' + else: + units = '(datapoints)' + + xaxislabel = args.graphxaxis + ' ' + units + + if args.yaxislabel: + # use the user selection + yaxislabel = args.yaxislabel + else: + # autogen a lovely y-axis label + if args.graphyaxis == 'gflops': + units = 'GFLOPS' + yaxislabel = 'Performance (' + units + ')' + + """ + display a pretty graph + """ + colors = ['k','y','m','c','r','b','g'] + + for thisPlot in dataForAllPlots: + getattr(pylab, plottype)(thisPlot.xdata, thisPlot.ydata, '{}.-'.format(colors.pop()), label=thisPlot.label, **plotkwargs) + + if len(dataForAllPlots) > 1: + pylab.legend(loc='best') + + pylab.title(title) + pylab.xlabel(xaxislabel) + pylab.ylabel(yaxislabel) + pylab.grid(True) + + if args.outputFilename == None: + # if no pdf output is requested, spit the graph to the screen . . . + pylab.show() + else: + # . . . otherwise, gimme gimme pdf + #pdf = PdfPages(args.outputFilename) + #pdf.savefig() + #pdf.close() + pylab.savefig(args.outputFilename,dpi=(1024/8)) +######### plotFromDataFile() Function to plot from data file ends ######### + + + +######## "main" program begins ##### +""" +define and parse parameters +""" +xaxisvalues = ['sizem','sizen','sizek'] +yaxisvalues = ['gflops'] +plotvalues = ['lda','ldb','ldc','sizek','device','label','order','transa','transb','function','library'] + + + +parser = argparse.ArgumentParser(description='Plot performance of the clblas\ + library. clblas.plotPerformance.py reads in data tables from clblas.\ + measurePerformance.py and plots their values') +fileOrDb = parser.add_mutually_exclusive_group(required=True) +fileOrDb.add_argument('-d', '--datafile', + dest='datafile', action='append', default=None, required=False, + help='indicate a file to use as input. 
must be in the format output by\ + clblas.measurePerformance.py. may be used multiple times to indicate\ + multiple input files. e.g., -d cypressOutput.txt -d caymanOutput.txt') +parser.add_argument('-x', '--x_axis', + dest='graphxaxis', default=None, choices=xaxisvalues, required=True, + help='indicate which value will be represented on the x axis. problemsize\ + is defined as x*y*z*batchsize') +parser.add_argument('-y', '--y_axis', + dest='graphyaxis', default='gflops', choices=yaxisvalues, + help='indicate which value will be represented on the y axis') +parser.add_argument('--plot', + dest='plot', default=None, choices=plotvalues, + help='indicate which of {} should be used to differentiate multiple plots.\ + this will be chosen automatically if not specified'.format(plotvalues)) +parser.add_argument('--title', + dest='graphtitle', default=None, + help='the desired title for the graph generated by this execution. if\ + GRAPHTITLE contains any spaces, it must be entered in \"double quotes\".\ + if this option is not specified, the title will be autogenerated') +parser.add_argument('--x_axis_label', + dest='xaxislabel', default=None, + help='the desired label for the graph\'s x-axis. if XAXISLABEL contains\ + any spaces, it must be entered in \"double quotes\". if this option\ + is not specified, the x-axis label will be autogenerated') +parser.add_argument('--x_axis_scale', + dest='xaxisscale', default=None, choices=['linear','log2','log10'], + help='the desired scale for the graph\'s x-axis. if nothing is specified,\ + it will be selected automatically') +parser.add_argument('--y_axis_label', + dest='yaxislabel', default=None, + help='the desired label for the graph\'s y-axis. if YAXISLABEL contains any\ + spaces, it must be entered in \"double quotes\". if this option is not\ + specified, the y-axis label will be autogenerated') +parser.add_argument('--outputfile', + dest='outputFilename', default=None, + help='name of the file to output graphs. 
Supported formats: emf, eps, pdf, png, ps, raw, rgba, svg, svgz.') + +args = parser.parse_args() + +if args.datafile != None: + plotFromDataFile() +else: + print "Atleast specify if you want to use text files or database for plotting graphs. Use -h or --help option for more details" + quit() + diff --git a/clients/cmake/external-gtest.cmake b/clients/cmake/external-gtest.cmake index 6407bc01e..db4ae61ed 100644 --- a/clients/cmake/external-gtest.cmake +++ b/clients/cmake/external-gtest.cmake @@ -7,24 +7,20 @@ include( ExternalProject ) # set( gtest_version "1.7.0" CACHE STRING "gtest version to download/use" ) # mark_as_advanced( gtest_version ) -# -# message( STATUS "gtest_version: " ${gtest_version} ) -# -# if( DEFINED ENV{GMOCK_URL} ) -# set( ext.gtest_URL "$ENV{GMOCK_URL}" CACHE STRING "URL to download gtest from" ) -# else( ) -# set( ext.gtest_URL "https://github.com/google/googletest/archive/release-${gtest_version}.zip" CACHE STRING "URL to download gtest from" ) -# endif( ) -# mark_as_advanced( ext.gtest_URL ) + +# If the user does not specify an explicit C compiler, fall back to the system default cc +if( NOT DEFINED CMAKE_C_COMPILER ) + set( CMAKE_C_COMPILER cc ) +endif( ) set( gtest_git_repository "https://github.com/google/googletest.git" CACHE STRING "URL to download gtest from" ) set( gtest_git_tag "master" CACHE STRING "URL to download gtest from" ) # -DCMAKE_ARCHIVE_OUTPUT_DIRECTORY:PATH=${LIB_DIR} -set( gtest_cmake_args -DCMAKE_INSTALL_PREFIX=/package -DCMAKE_DEBUG_POSTFIX=d -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} ) +set( gtest_cmake_args -DCMAKE_INSTALL_PREFIX=/package ) if( MSVC ) - list( APPEND gtest_cmake_args -Dgtest_force_shared_crt=ON ) + list( APPEND gtest_cmake_args -Dgtest_force_shared_crt=ON -DCMAKE_DEBUG_POSTFIX=d ) else( ) # GTEST_USE_OWN_TR1_TUPLE necessary to compile with hipcc set( EXTRA_FLAGS "-DGTEST_USE_OWN_TR1_TUPLE=1" ) @@ -66,7 +62,7 @@ else( ) # WARNING: find_package( gtest ) only works if it can find release binaries # Even if you 
want to link against debug binaries, you must build release binaries too - list( APPEND gtest_cmake_args -DCMAKE_BUILD_TYPE=Release ) + list( APPEND gtest_cmake_args -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} ) message( STATUS "ExternalGmock using ( " ${Cores} " ) cores to build with" ) endif( ) diff --git a/clients/cmake/external-lapack.cmake b/clients/cmake/external-lapack.cmake index 55064e82e..c9fccc037 100644 --- a/clients/cmake/external-lapack.cmake +++ b/clients/cmake/external-lapack.cmake @@ -6,9 +6,19 @@ message( STATUS "Configuring lapack external dependency" ) include( ExternalProject ) set( lapack_git_repository "https://github.com/Reference-LAPACK/lapack-release" CACHE STRING "URL to download lapack from" ) -set( lapack_git_tag "lapack-3.6.1" CACHE STRING "git branch" ) +set( lapack_git_tag "lapack-3.7.0" CACHE STRING "git branch" ) -set( lapack_cmake_args -DCMAKE_INSTALL_PREFIX=/package -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} ) +# If the user does not specify an explicit fortran compiler, assume gfortran +if( NOT DEFINED CMAKE_Fortran_COMPILER ) + set( CMAKE_Fortran_COMPILER gfortran ) +endif( ) + +# If the user does not specify an explicit C compiler, fall back to the system default cc +if( NOT DEFINED CMAKE_C_COMPILER ) + set( CMAKE_C_COMPILER cc ) +endif( ) + +set( lapack_cmake_args -DCMAKE_INSTALL_PREFIX=/package -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER} ) # message( STATUS "lapack_make ( " ${lapack_make} " ) " ) # message( STATUS "lapack_cmake_args ( " ${lapack_cmake_args} " ) " ) diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp index 1803df8a5..861c5d7a7 100644 --- a/clients/common/cblas_interface.cpp +++ b/clients/common/cblas_interface.cpp @@ -22,6 +22,11 @@ extern "C" { void ctrtri_(char* uplo, char* diag, 
int* n, rocblas_float_complex* A, int* lda, int *info); void ztrtri_(char* uplo, char* diag, int* n, rocblas_double_complex* A, int* lda, int *info); + void sgetrf_(int* m, int* n, float* A, int* lda, int* ipiv, int *info); + void dgetrf_(int* m, int* n, double* A, int* lda, int* ipiv, int *info); + void cgetrf_(int* m, int* n, rocblas_float_complex* A, int* lda, int* ipiv, int *info); + void zgetrf_(int* m, int* n, rocblas_double_complex* A, int* lda, int* ipiv, int *info); + #ifdef __cplusplus } #endif @@ -201,7 +206,7 @@ extern "C" { } - //nrm2 + //asum template<> void cblas_asum( rocblas_int n, const float *x, rocblas_int incx, @@ -352,6 +357,26 @@ extern "C" { cblas_zhemv(CblasColMajor, (CBLAS_UPLO)uplo, n, &alpha, A, lda, x, incx, &beta, y, incy); } + template<> + void cblas_ger( rocblas_int m, rocblas_int n, + float alpha, + float *x, rocblas_int incx, + float *y, rocblas_int incy, + float *A, rocblas_int lda) + { + cblas_sger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda); + } + + template<> + void cblas_ger(rocblas_int m, rocblas_int n, + double alpha, + double *x, rocblas_int incx, + double *y, rocblas_int incy, + double *A, rocblas_int lda) + { + cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda); + } + /* * =========================================================================== * level 3 BLAS @@ -409,7 +434,7 @@ extern "C" { rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, float alpha, - float *A, rocblas_int lda, + const float *A, rocblas_int lda, float *B, rocblas_int ldb) { //just directly cast, since transA, transB are integers in the enum @@ -421,7 +446,7 @@ extern "C" { rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, double alpha, - double *A, rocblas_int lda, + const double *A, rocblas_int lda, double *B, rocblas_int ldb) { //just directly cast, since transA, transB are integers in the enum @@ -433,7 +458,7 @@ extern "C" { rocblas_operation transA, 
rocblas_diagonal diag, rocblas_int m, rocblas_int n, rocblas_float_complex alpha, - rocblas_float_complex *A, rocblas_int lda, + const rocblas_float_complex *A, rocblas_int lda, rocblas_float_complex *B, rocblas_int ldb) { //just directly cast, since transA, transB are integers in the enum @@ -445,7 +470,7 @@ extern "C" { rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, rocblas_double_complex alpha, - rocblas_double_complex *A, rocblas_int lda, + const rocblas_double_complex *A, rocblas_int lda, rocblas_double_complex *B, rocblas_int ldb) { //just directly cast, since transA, transB are integers in the enum @@ -476,3 +501,99 @@ extern "C" { dtrtri_(&uplo, &diag, &n, A, &lda, &info); return info; } + + + //trmm + template<> + void cblas_trmm( rocblas_side side, rocblas_fill uplo, + rocblas_operation transA, rocblas_diagonal diag, + rocblas_int m, rocblas_int n, + float alpha, + const float *A, rocblas_int lda, + float *B, rocblas_int ldb) + { + //just directly cast, since transA, transB are integers in the enum + cblas_strmm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)transA, (CBLAS_DIAG)diag, m, n, alpha, A, lda, B, ldb); + } + + template<> + void cblas_trmm(rocblas_side side, rocblas_fill uplo, + rocblas_operation transA, rocblas_diagonal diag, + rocblas_int m, rocblas_int n, + double alpha, + const double *A, rocblas_int lda, + double *B, rocblas_int ldb) + { + //just directly cast, since transA, transB are integers in the enum + cblas_dtrmm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)transA, (CBLAS_DIAG)diag, m, n, alpha, A, lda, B, ldb); + } + + template<> + void cblas_trmm( rocblas_side side, rocblas_fill uplo, + rocblas_operation transA, rocblas_diagonal diag, + rocblas_int m, rocblas_int n, + rocblas_float_complex alpha, + const rocblas_float_complex *A, rocblas_int lda, + rocblas_float_complex *B, rocblas_int ldb) + { + //just directly cast, since transA, transB are integers in 
the enum + cblas_ctrmm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)transA, (CBLAS_DIAG)diag, m, n, &alpha, A, lda, B, ldb); + } + + template<> + void cblas_trmm( rocblas_side side, rocblas_fill uplo, + rocblas_operation transA, rocblas_diagonal diag, + rocblas_int m, rocblas_int n, + rocblas_double_complex alpha, + const rocblas_double_complex *A, rocblas_int lda, + rocblas_double_complex *B, rocblas_int ldb) + { + //just directly cast, since transA, transB are integers in the enum + cblas_ztrmm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)transA, (CBLAS_DIAG)diag, m, n, &alpha, A, lda, B, ldb); + } + + //getrf + template<> + rocblas_int cblas_getrf(rocblas_int m, + rocblas_int n, + float *A, rocblas_int lda, + rocblas_int *ipiv) + { + rocblas_int info; + sgetrf_(&m, &n, A, &lda, ipiv, &info); + return info; + } + + template<> + rocblas_int cblas_getrf(rocblas_int m, + rocblas_int n, + double *A, rocblas_int lda, + rocblas_int *ipiv) + { + rocblas_int info; + dgetrf_(&m, &n, A, &lda, ipiv, &info); + return info; + } + + template<> + rocblas_int cblas_getrf(rocblas_int m, + rocblas_int n, + rocblas_float_complex *A, rocblas_int lda, + rocblas_int *ipiv) + { + rocblas_int info; + cgetrf_(&m, &n, A, &lda, ipiv, &info); + return info; + } + + template<> + rocblas_int cblas_getrf(rocblas_int m, + rocblas_int n, + rocblas_double_complex *A, rocblas_int lda, + rocblas_int *ipiv) + { + rocblas_int info; + zgetrf_(&m, &n, A, &lda, ipiv, &info); + return info; + } + diff --git a/clients/common/norm.cpp b/clients/common/norm.cpp index d6581239a..8c930bc12 100644 --- a/clients/common/norm.cpp +++ b/clients/common/norm.cpp @@ -80,7 +80,7 @@ double norm_check_general(char norm_type, rocblas_int M, rocblas_int N, double cpu_norm = dlange_(&norm_type, &M, &N, hCPU, &lda, work); daxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - double error = dlange_(&norm_type, &M, &N, hGPU, &lda, work)/cpu_norm; + double error = dlange_(&norm_type, 
&M, &N, hGPU, &lda, work)/cpu_norm; return error; } @@ -99,7 +99,7 @@ double norm_check_general(char norm_type, rocblas_int M, float cpu_norm = clange_(&norm_type, &M, &N, hCPU, &lda, work); caxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - float error = clange_(&norm_type, &M, &N, hGPU, &lda, work)/cpu_norm; + float error = clange_(&norm_type, &M, &N, hGPU, &lda, work)/cpu_norm; return (double)error; } @@ -119,7 +119,7 @@ rocblas_double_complex *hGPU) double cpu_norm = zlange_(&norm_type, &M, &N, hCPU, &lda, work); zaxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - double error = zlange_(&norm_type, &M, &N, hGPU, &lda, work)/cpu_norm; + double error = zlange_(&norm_type, &M, &N, hGPU, &lda, work)/cpu_norm; return error; } diff --git a/clients/common/rocblas_template_specialization.cpp b/clients/common/rocblas_template_specialization.cpp new file mode 100644 index 000000000..31deff068 --- /dev/null +++ b/clients/common/rocblas_template_specialization.cpp @@ -0,0 +1,530 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * + * ************************************************************************/ + + +#include +#include "rocblas.h" +#include "rocblas.hpp" + +/*!\file + * \brief provide template functions interfaces to ROCBLAS C89 interfaces +*/ + + + + /* + * =========================================================================== + * level 1 BLAS + * =========================================================================== + */ + //scal + template<> + rocblas_status + rocblas_scal(rocblas_handle handle, + rocblas_int n, + const float *alpha, + float *x, rocblas_int incx){ + + return rocblas_sscal(handle, n, alpha, x, incx); + } + + template<> + rocblas_status + rocblas_scal(rocblas_handle handle, + rocblas_int n, + const double *alpha, + double *x, rocblas_int incx){ + + return rocblas_dscal(handle, n, alpha, x, incx); + } + + template<> + rocblas_status + rocblas_scal(rocblas_handle handle, + rocblas_int n, + const rocblas_float_complex *alpha, + rocblas_float_complex *x, rocblas_int incx){ + + return rocblas_cscal(handle, n, alpha, x, incx); + } + + template<> + rocblas_status + rocblas_scal(rocblas_handle handle, + rocblas_int n, + const rocblas_double_complex *alpha, + rocblas_double_complex *x, rocblas_int incx){ + + return rocblas_zscal(handle, n, alpha, x, incx); + } + + //swap + template<> + rocblas_status + rocblas_swap( rocblas_handle handle, rocblas_int n, + float *x, rocblas_int incx, + float *y, rocblas_int incy) + { + return rocblas_sswap(handle, n, x, incx, y, incy); + } + + template<> + rocblas_status + rocblas_swap( rocblas_handle handle, rocblas_int n, + double *x, rocblas_int incx, + double *y, rocblas_int incy) + { + return rocblas_dswap(handle, n, x, incx, y, incy); + } + + template<> + rocblas_status + rocblas_swap( rocblas_handle handle, rocblas_int n, + rocblas_float_complex *x, rocblas_int incx, + rocblas_float_complex *y, rocblas_int incy) + { + return rocblas_cswap(handle, n, x, incx, y, incy); + } + + template<> + rocblas_status + rocblas_swap( 
rocblas_handle handle, rocblas_int n, + rocblas_double_complex *x, rocblas_int incx, + rocblas_double_complex *y, rocblas_int incy) + { + return rocblas_zswap(handle, n, x, incx, y, incy); + } + + //copy + template<> + rocblas_status + rocblas_copy( rocblas_handle handle, rocblas_int n, + const float *x, rocblas_int incx, + float *y, rocblas_int incy) + { + return rocblas_scopy(handle, n, x, incx, y, incy); + } + + template<> + rocblas_status + rocblas_copy( rocblas_handle handle, rocblas_int n, + const double *x, rocblas_int incx, + double *y, rocblas_int incy) + { + return rocblas_dcopy(handle, n, x, incx, y, incy); + } + + template<> + rocblas_status + rocblas_copy( rocblas_handle handle, rocblas_int n, + const rocblas_float_complex *x, rocblas_int incx, + rocblas_float_complex *y, rocblas_int incy) + { + return rocblas_ccopy(handle, n, x, incx, y, incy); + } + + template<> + rocblas_status + rocblas_copy( rocblas_handle handle, rocblas_int n, + const rocblas_double_complex *x, rocblas_int incx, + rocblas_double_complex *y, rocblas_int incy) + { + return rocblas_zcopy(handle, n, x, incx, y, incy); + } + + //dot + template<> + rocblas_status + rocblas_dot( rocblas_handle handle, rocblas_int n, + const float *x, rocblas_int incx, + const float *y, rocblas_int incy, + float *result) + { + return rocblas_sdot(handle, n, x, incx, y, incy, result); + } + + template<> + rocblas_status + rocblas_dot( rocblas_handle handle, rocblas_int n, + const double *x, rocblas_int incx, + const double *y, rocblas_int incy, + double *result) + { + return rocblas_ddot(handle, n, x, incx, y, incy, result); + } + + template<> + rocblas_status + rocblas_dot( rocblas_handle handle, rocblas_int n, + const rocblas_float_complex *x, rocblas_int incx, + const rocblas_float_complex *y, rocblas_int incy, + rocblas_float_complex *result) + { + return rocblas_cdotu(handle, n, x, incx, y, incy, result); + } + + template<> + rocblas_status + rocblas_dot( rocblas_handle handle, rocblas_int n, + 
const rocblas_double_complex *x, rocblas_int incx, + const rocblas_double_complex *y, rocblas_int incy, + rocblas_double_complex *result) + { + return rocblas_zdotu(handle, n, x, incx, y, incy, result); + } + + + //asum + template<> + rocblas_status + rocblas_asum(rocblas_handle handle, + rocblas_int n, + const float *x, rocblas_int incx, + float *result){ + + return rocblas_sasum(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_asum(rocblas_handle handle, + rocblas_int n, + const double *x, rocblas_int incx, + double *result){ + + return rocblas_dasum(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_asum(rocblas_handle handle, + rocblas_int n, + const rocblas_float_complex *x, rocblas_int incx, + float *result){ + + return rocblas_scasum(handle, n, x, incx, result); + } + + //nrm2 + template<> + rocblas_status + rocblas_nrm2(rocblas_handle handle, + rocblas_int n, + const float *x, rocblas_int incx, + float *result){ + + return rocblas_snrm2(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_nrm2(rocblas_handle handle, + rocblas_int n, + const double *x, rocblas_int incx, + double *result){ + + return rocblas_dnrm2(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_nrm2(rocblas_handle handle, + rocblas_int n, + const rocblas_float_complex *x, rocblas_int incx, + float *result){ + + return rocblas_scnrm2(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_nrm2(rocblas_handle handle, + rocblas_int n, + const rocblas_double_complex *x, rocblas_int incx, + double *result){ + + return rocblas_dznrm2(handle, n, x, incx, result); + } + + + //amin + template<> + rocblas_status + rocblas_amin(rocblas_handle handle, + rocblas_int n, + const float *x, rocblas_int incx, + rocblas_int *result){ + + return rocblas_samin(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_amin(rocblas_handle handle, + rocblas_int n, + const double *x, 
rocblas_int incx, + rocblas_int *result){ + + return rocblas_damin(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_amin(rocblas_handle handle, + rocblas_int n, + const rocblas_float_complex *x, rocblas_int incx, + rocblas_int *result){ + + return rocblas_scamin(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_amin(rocblas_handle handle, + rocblas_int n, + const rocblas_double_complex *x, rocblas_int incx, + rocblas_int *result){ + + return rocblas_dzamin(handle, n, x, incx, result); + } + + //amax + template<> + rocblas_status + rocblas_amax(rocblas_handle handle, + rocblas_int n, + const float *x, rocblas_int incx, + rocblas_int *result){ + + return rocblas_samax(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_amax(rocblas_handle handle, + rocblas_int n, + const double *x, rocblas_int incx, + rocblas_int *result){ + + return rocblas_damax(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_amax(rocblas_handle handle, + rocblas_int n, + const rocblas_float_complex *x, rocblas_int incx, + rocblas_int *result){ + + return rocblas_scamax(handle, n, x, incx, result); + } + + template<> + rocblas_status + rocblas_amax(rocblas_handle handle, + rocblas_int n, + const rocblas_double_complex *x, rocblas_int incx, + rocblas_int *result){ + + return rocblas_dzamax(handle, n, x, incx, result); + } + + /* + * =========================================================================== + * level 2 BLAS + * =========================================================================== + */ + + template<> + rocblas_status + rocblas_gemv( rocblas_handle handle, + rocblas_operation transA, rocblas_int m, rocblas_int n, + const float *alpha, + const float *A, rocblas_int lda, + const float *x, rocblas_int incx, + const float *beta, float *y, rocblas_int incy) + { + return rocblas_sgemv(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy); + } + + template<> + rocblas_status + 
rocblas_gemv( rocblas_handle handle, + rocblas_operation transA, rocblas_int m, rocblas_int n, + const double *alpha, + const double *A, rocblas_int lda, + const double *x, rocblas_int incx, + const double *beta, double *y, rocblas_int incy) + { + return rocblas_dgemv(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy); + } + + + + template<> + rocblas_status + rocblas_ger(rocblas_handle handle, + rocblas_int m, rocblas_int n, + const float *alpha, + const float *x, rocblas_int incx, + const float *y, rocblas_int incy, + float *A, rocblas_int lda){ + + return rocblas_sger(handle, m, n, alpha, x, incx, y, incy, A, lda); + } + + template<> + rocblas_status + rocblas_ger(rocblas_handle handle, + rocblas_int m, rocblas_int n, + const double *alpha, + const double *x, rocblas_int incx, + const double *y, rocblas_int incy, + double *A, rocblas_int lda){ + + return rocblas_dger(handle, m, n, alpha, x, incx, y, incy, A, lda); + } + + /* + * =========================================================================== + * level 3 BLAS + * =========================================================================== + */ + + // + + template<> + rocblas_status + rocblas_trtri(rocblas_handle handle, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + float *A, rocblas_int lda, + float *invA, rocblas_int ldinvA){ + return rocblas_strtri(handle, uplo, diag, n, A, lda, invA, ldinvA); + } + + template<> + rocblas_status + rocblas_trtri(rocblas_handle handle, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + double *A, rocblas_int lda, + double *invA, rocblas_int ldinvA){ + return rocblas_dtrtri(handle, uplo, diag, n, A, lda, invA, ldinvA); + } + + template<> + rocblas_status + rocblas_trtri_batched(rocblas_handle handle, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + float *A, rocblas_int lda, rocblas_int bsa, + float *invA, rocblas_int ldinvA, rocblas_int bsinvA, + rocblas_int batch_count){ + return rocblas_strtri_batched(handle, 
uplo, diag, n, A, lda, bsa, invA, ldinvA, bsinvA, batch_count); + } + + template<> + rocblas_status + rocblas_trtri_batched(rocblas_handle handle, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + double *A, rocblas_int lda, rocblas_int bsa, + double *invA, rocblas_int ldinvA, rocblas_int bsinvA, + rocblas_int batch_count){ + return rocblas_dtrtri_batched(handle, uplo, diag, n, A, lda, bsa, invA, ldinvA, bsinvA, batch_count); + } + +#if BUILD_WITH_TENSILE + + template<> + rocblas_status rocblas_gemm(rocblas_handle handle, + rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, + const float *alpha, + const float *A, rocblas_int lda, + const float *B, rocblas_int ldb, + const float *beta, + float *C, rocblas_int ldc){ + return rocblas_sgemm(handle, rocblas_order_column_major, transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + + template<> + rocblas_status rocblas_gemm(rocblas_handle handle, + rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, + const double *alpha, + const double *A, rocblas_int lda, + const double *B, rocblas_int ldb, + const double *beta, + double *C, rocblas_int ldc){ + return rocblas_dgemm(handle, rocblas_order_column_major, transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + + + template<> + rocblas_status rocblas_gemm_batched( + rocblas_handle handle, + rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, + const float *alpha, + const float *A, rocblas_int lda, rocblas_int bsa, + const float *B, rocblas_int ldb, rocblas_int bsb, + const float *beta, + float *C, rocblas_int ldc, rocblas_int bsc, + rocblas_int batch_count){ + + return rocblas_sgemm_batched(handle, rocblas_order_column_major, transA, transB, m, n, k, alpha, A, lda, bsa, B, ldb, bsb, beta, C, ldc, bsc, batch_count); + } + + template<> + rocblas_status rocblas_gemm_batched( + rocblas_handle 
handle, + rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, + const double *alpha, + const double *A, rocblas_int lda, rocblas_int bsa, + const double *B, rocblas_int ldb, rocblas_int bsb, + const double *beta, + double *C, rocblas_int ldc, rocblas_int bsc, + rocblas_int batch_count){ + + return rocblas_dgemm_batched(handle, rocblas_order_column_major, transA, transB, m, n, k, alpha, A, lda, bsa, B, ldb, bsb, beta, C, ldc, bsc, batch_count); + } + + + template<> + rocblas_status rocblas_trsm(rocblas_handle handle, + rocblas_side side, rocblas_fill uplo, + rocblas_operation transA, rocblas_diagonal diag, + rocblas_int m, rocblas_int n, + const float* alpha, + float* A, rocblas_int lda, + float* B, rocblas_int ldb){ + return rocblas_strsm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb); + } + + + template<> + rocblas_status rocblas_trsm(rocblas_handle handle, + rocblas_side side, rocblas_fill uplo, + rocblas_operation transA, rocblas_diagonal diag, + rocblas_int m, rocblas_int n, + const double* alpha, + double* A, rocblas_int lda, + double* B, rocblas_int ldb){ + return rocblas_dtrsm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb); + } + +#endif + + + // + + diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index 450803512..0e6e48c2f 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -17,7 +17,9 @@ template<> void unit_check_general(rocblas_int M, rocblas_int N, rocblas_int lda, float *hCPU, float *hGPU){ + #pragma unroll for(rocblas_int j=0; j void unit_check_general(rocblas_int M, rocblas_int N, rocblas_int lda, double *hCPU, double *hGPU){ + #pragma unroll for(rocblas_int j=0; j void unit_check_general(rocblas_int M, rocblas_int N, rocblas_int lda, rocblas_float_complex *hCPU, rocblas_float_complex *hGPU){ + #pragma unroll for(rocblas_int j=0; j void unit_check_general(rocblas_int M, rocblas_int N, rocblas_int lda, rocblas_double_complex *hCPU, 
rocblas_double_complex *hGPU){ + #pragma unroll for(rocblas_int j=0; j void unit_check_general(rocblas_int M, rocblas_int N, rocblas_int lda, rocblas_int *hCPU, rocblas_int *hGPU){ + #pragma unroll for(rocblas_int j=0; j + float get_trsm_tolerance(){ + return 5*1e-5; + } + + template<> + double get_trsm_tolerance(){ + return 1e-12; + } + + /*! \brief Template: gtest unit compare two matrices float/double/complex */ + //Do not put a wrapper over ASSERT_FLOAT_EQ, sincer assert exit the current function NOT the test case + // a wrapper will cause the loop keep going + + //trsm has division, must use near to suppress the false failure + template<> + void unit_check_trsm(rocblas_int M, rocblas_int N, rocblas_int lda, double hGPU, float tolerance){ + +#ifdef GOOGLE_TEST + ASSERT_LE(hGPU, tolerance); +#endif + } + + template<> + void unit_check_trsm(rocblas_int M, rocblas_int N, rocblas_int lda, double hGPU, double tolerance){ + +#ifdef GOOGLE_TEST + ASSERT_LE(hGPU, tolerance); +#endif + } + + + + + + + + diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index 92bccc18b..103a677e7 100644 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -20,32 +20,8 @@ if( NOT MSVC_IDE AND NOT CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
) endif() -# Check if cmake supports the new VERSION tag for project() commands -# rocblas becomes the name of the project with a particular version -if( POLICY CMP0048 ) - cmake_policy( SET CMP0048 NEW ) - project( rocblas-test VERSION 0.0.2.0 LANGUAGES C CXX ) -else( ) - project( rocblas-test CXX ) - # Define a version for the code - if( NOT DEFINED rocblas_VERSION_MAJOR ) - set( rocblas-test_VERSION_MAJOR 0 ) - endif( ) - - if( NOT DEFINED rocblas_VERSION_MINOR ) - set( rocblas-test_VERSION_MINOR 0 ) - endif( ) - - if( NOT DEFINED rocblas_VERSION_PATCH ) - set( rocblas-test_VERSION_PATCH 2 ) - endif( ) - - if( NOT DEFINED rocblas_VERSION_TWEAK ) - set( rocblas-test_VERSION_TWEAK 0 ) - endif( ) - - set( rocblas-test_VERSION "${rocblas-test_VERSION_MAJOR}.${rocblas-test_VERSION_MINOR}.${rocblas-test_VERSION_PATCH}.${rocblas-test_VERSION_TWEAK}") -endif( ) +include( build-version ) +project_version( NAME rocblas-test LANGUAGES CXX ) # Modify the global find property to help us find libraries like Boost in the correct paths for 64-bit # Essentially, find_library calls will look for /lib64 instead of /lib; works for windows and linux @@ -97,19 +73,26 @@ if( NOT Boost_FOUND ) find_package( Boost REQUIRED COMPONENTS program_options ) endif( ) +# message( STATUS "CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) +# message( STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}" ) + find_package( GTest REQUIRED ) find_package( cblas REQUIRED CONFIG ) find_package( rocblas REQUIRED CONFIG ) +find_package( HIP REQUIRED ) - set(Tensile_TEST_SRC - gemm_gtest.cpp - trsm_gtest.cpp - ) - +if( BUILD_WITH_TENSILE ) +set(Tensile_TEST_SRC + gemm_gtest.cpp + trsm_gtest.cpp + ) +endif() set(rocblas_test_source rocblas_gtest_main.cpp blas1_gtest.cpp + gemv_gtest.cpp + ger_gtest.cpp ${Tensile_TEST_SRC} ) @@ -120,10 +103,21 @@ set( rocblas_benchmark_common ../common/flops.cpp ../common/norm.cpp ../common/unit.cpp + ../common/rocblas_template_specialization.cpp ) +# link_directories( /opt/rocm/lib ) 
add_executable( rocblas-test ${rocblas_test_source} ${rocblas_benchmark_common} ) +if( BUILD_WITH_TENSILE ) + target_compile_definitions( rocblas-test PRIVATE BUILD_WITH_TENSILE=1 ) + message(STATUS "Build Tensil equal 1") +else() + target_compile_definitions( rocblas-test PRIVATE BUILD_WITH_TENSILE=0 ) + message(STATUS "Build Tensil equal 0") +endif() + + # Try to test for specific compiler features if cmake version is recent enough if( CMAKE_VERSION VERSION_GREATER "3.0" ) target_compile_features( rocblas-test PRIVATE cxx_static_assert cxx_nullptr cxx_lambdas cxx_auto_type) @@ -137,17 +131,19 @@ else( ) endif( ) endif( ) -target_compile_definitions( rocblas-test PRIVATE GTEST_USE_OWN_TR1_TUPLE=1 -DGOOGLE_TEST) +# target_compile_definitions( rocblas-test PRIVATE GTEST_USE_OWN_TR1_TUPLE=1 -DGOOGLE_TEST -D__HIP_PLATFORM_HCC__) +target_compile_definitions( rocblas-test PRIVATE GTEST_USE_OWN_TR1_TUPLE=1 -DGOOGLE_TEST ) target_include_directories( rocblas-test PRIVATE # $ $ $ $ + $ ) -#target_link_libraries( rocblas-test rocblas ${Boost_LIBRARIES} ) -target_link_libraries( rocblas-test rocblas gfortran lapack ${CBLAS_LIBRARIES} ${GTEST_LIBRARIES} ) +# target_link_libraries( rocblas-test rocblas ${Boost_LIBRARIES} ${CBLAS_LIBRARIES} ${GTEST_LIBRARIES} libmcwamp.a libhip_hcc.a CXLActivityLogger hsa-runtime64 hc_am hsakmt dl lapack c++ gfortran) +target_link_libraries( rocblas-test rocblas ${CBLAS_LIBRARIES} ${GTEST_LIBRARIES} lapack gfortran) # Ubuntu systems need to explicitely link to pthreads lib because of --as-needed # https://github.com/google/googletest/issues/391#issuecomment-125645879 diff --git a/clients/gtest/blas1_gtest.cpp b/clients/gtest/blas1_gtest.cpp index 706122105..7eeb3e029 100644 --- a/clients/gtest/blas1_gtest.cpp +++ b/clients/gtest/blas1_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. + * Copyright 2016 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -9,6 +9,9 @@ #include #include #include "testing_scal.hpp" +#include "testing_dot.hpp" +#include "testing_asum.hpp" +#include "testing_amax.hpp" #include "utility.h" using ::testing::TestWithParam; @@ -52,7 +55,7 @@ Yet, the goal of this file is to verify result correctness not argument-checkers Representative sampling is sufficient, endless brute-force sampling is not necessary =================================================================== */ -int N_range[] = {-1, 10, 500, 1000}; +int N_range[] = {-1, 10, 500, 1000, 7111, 10000}; //vector of vector, each pair is a {alpha, beta}; //add/delete this list in pairs, like {2.0, 4.0} @@ -73,7 +76,7 @@ vector> incx_incy_range = { {1, 1}, /* ===================================================================== - BLAS-1: Scal, Swap, Copy + BLAS-1: scal, dot, asum, amax =================================================================== */ class blas1_gtest: public :: TestWithParam @@ -120,9 +123,7 @@ TEST_P(blas1_gtest, scal_float) // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. Arguments arg = setup_blas1_arguments( GetParam() ); - rocblas_status status = testing_scal( arg ); - // if not success, then the input argument is problematic, so detect the error message if(status != rocblas_status_success){ if( arg.N < 0 ){ @@ -132,26 +133,67 @@ TEST_P(blas1_gtest, scal_float) EXPECT_EQ(rocblas_status_invalid_size, status); } } - } - - -TEST_P(blas1_gtest, swap_float) +TEST_P(blas1_gtest, dot_float) { - // argument automatically transferred to testing_swap - //testing_swap( GetParam() ); - //testing_swap( GetParam() ); + // GetParam return a tuple. Tee setup routine unpack the tuple + // and initializes arg(Arguments) which will be passed to testing routine + // The Arguments data struture have physical meaning associated. + // while the tuple is non-intuitive. 
+ Arguments arg = setup_blas1_arguments( GetParam() ); + rocblas_status status = testing_dot( arg ); + // if not success, then the input argument is problematic, so detect the error message + if(status != rocblas_status_success){ + if( arg.N < 0 ){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + else if( arg.incx < 0){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + else if( arg.incy < 0){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + } } - -TEST_P(blas1_gtest, copy_float) +TEST_P(blas1_gtest, asum_float) { - // argument automatically transferred to testing_copy - //testing_copy( GetParam() ); - //testing_copy( GetParam() ); + // GetParam return a tuple. Tee setup routine unpack the tuple + // and initializes arg(Arguments) which will be passed to testing routine + // The Arguments data struture have physical meaning associated. + // while the tuple is non-intuitive. + Arguments arg = setup_blas1_arguments( GetParam() ); + rocblas_status status = testing_asum( arg ); + // if not success, then the input argument is problematic, so detect the error message + if(status != rocblas_status_success){ + if( arg.N < 0 ){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + else if( arg.incx < 0){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + } } +TEST_P(blas1_gtest, amax_float) +{ + // GetParam return a tuple. Tee setup routine unpack the tuple + // and initializes arg(Arguments) which will be passed to testing routine + // The Arguments data struture have physical meaning associated. + // while the tuple is non-intuitive. 
+ Arguments arg = setup_blas1_arguments( GetParam() ); + rocblas_status status = testing_amax( arg ); + // if not success, then the input argument is problematic, so detect the error message + if(status != rocblas_status_success){ + if( arg.N < 0 ){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + else if( arg.incx < 0){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + } +} //Values is for a single item; ValuesIn is for an array //notice we are using vector of vector diff --git a/clients/gtest/gemm_gtest.cpp b/clients/gtest/gemm_gtest.cpp index fd2486cd1..bb133260d 100644 --- a/clients/gtest/gemm_gtest.cpp +++ b/clients/gtest/gemm_gtest.cpp @@ -29,21 +29,13 @@ README: This file contains testers to verify the correctness of Normal users only need to get the library routines without testers =================================================================== */ - -/* ===================================================================== -Advance users only: BrainStorm the parameters but do not make artificial one which invalidates the matrix. -like lda pairs with M, and "lda must >= M". case "lda < M" will be guarded by argument-checkers inside API of course. -Yet, the goal of this file is to verify result correctness not argument-checkers. 
- -Representative sampling is sufficient, endless brute-force sampling is not necessary -=================================================================== */ - - //vector of vector, each vector is a {M, N, K, lda, ldb, ldc}; //add/delete as a group const vector> matrix_size_range = { {-1, -1, -1, -1, 1, 1}, + { 3, 33, 3, 33, 35, 35}, + { 5, 5, 5, 5, 5, 5}, {10, 10, 20, 100, 10, 10}, {600,500, 500, 500, 600, 500}, {1024, 1024, 1024, 1024, 1024, 1024} @@ -51,17 +43,16 @@ vector> matrix_size_range = { const vector> full_matrix_size_range = { + {192, 192, 192, 192, 192, 192}, + {640, 640, 640, 640, 640, 640}, {1000, 1000, 1000, 1000, 1000, 1000}, - {2000, 2000, 2000, 2000, 2000, 2000}, {4011, 4011, 4011, 4011, 4011, 4011}, - {8000, 8000, 8000, 8000, 8000, 8000}, }; //vector of vector, each pair is a {alpha, beta}; //add/delete this list in pairs, like {2.0, 4.0} const vector> alpha_beta_range = { {1.0, 0.0}, - {-1.0, -1.0}, }; @@ -221,9 +212,11 @@ INSTANTIATE_TEST_CASE_P(rocblas_gemm_matrix_size, ); //THis function mainly test the scope of alpha_beta, transA_transB,.the scope of matrix_size_range is small + INSTANTIATE_TEST_CASE_P(rocblas_gemm_scalar_transpose, gemm_gtest, Combine( ValuesIn(matrix_size_range), ValuesIn(full_alpha_beta_range), ValuesIn(transA_transB_range) ) ); + diff --git a/clients/gtest/gemv_gtest.cpp b/clients/gtest/gemv_gtest.cpp index abaee4f5d..4b47488aa 100644 --- a/clients/gtest/gemv_gtest.cpp +++ b/clients/gtest/gemv_gtest.cpp @@ -44,8 +44,8 @@ Representative sampling is sufficient, endless brute-force sampling is not neces const vector> matrix_size_range = { {-1, -1, -1}, - {10, 10, 2}, - {600,500, 500}, + /* {10, 10, 2}, */ + /* {600,500, 500}, */ {1000, 1000, 1000}, {2000, 2000, 2000}, {4011, 4011, 4011}, @@ -59,7 +59,7 @@ vector> incx_incy_range = { {1, 1}, {0, -1}, {2, 1}, - {10, 100} + {10, 100}, }; //vector of vector, each pair is a {alpha, beta}; @@ -69,16 +69,16 @@ vector> alpha_beta_range = { {1.0, 0.0}, {-1.0, -1.0}, {2.0, 1.0}, - 
{0.0, 1.0} + {0.0, 1.0}, }; //for single/double precision, 'C'(conjTranspose) will downgraded to 'T' (transpose) internally in sgemv/dgemv, const vector transA_range = { - {'N'}, - {'T'}, - {'C'}, + 'N', + 'T', + 'C', }; diff --git a/clients/gtest/ger_gtest.cpp b/clients/gtest/ger_gtest.cpp new file mode 100644 index 000000000..425e67d5f --- /dev/null +++ b/clients/gtest/ger_gtest.cpp @@ -0,0 +1,172 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + + +#include +#include +#include +#include +#include "testing_ger.hpp" +#include "utility.h" + +using ::testing::TestWithParam; +using ::testing::Values; +using ::testing::ValuesIn; +using ::testing::Combine; +using namespace std; + +//only GCC/VS 2010 comes with std::tr1::tuple, but it is unnecessary, std::tuple is good enough; + +typedef std::tuple, vector, double> ger_tuple; + +/* ===================================================================== +README: This file contains testers to verify the correctness of + BLAS routines with google test + + It is supposed to be played/used by advance / expert users + Normal users only need to get the library routines without testers + =================================================================== */ + + +/* ===================================================================== +Advance users only: BrainStorm the parameters but do not make artificial one which invalidates the matrix. +like lda pairs with M, and "lda must >= M". case "lda < M" will be guarded by argument-checkers inside API of course. +Yet, the goal of this file is to verify result correctness not argument-checkers. 
+ +Representative sampling is sufficient, endless brute-force sampling is not necessary +=================================================================== */ + + +//vector of vector, each vector is a {M, N, lda}; +//add/delete as a group +const +vector> matrix_size_range = { + {-1, -1, -1}, + {11, 11, 11}, + {16, 16, 16}, + {32, 32, 32}, + {65, 65, 65}, + /* {10, 10, 2}, */ + /* {600,500, 500}, */ + {1000, 1000, 1000}, + {2000, 2000, 2000}, + {4011, 4011, 4011}, + {8000, 8000, 8000} + }; + +//vector of vector, each pair is a {incx, incy}; +//add/delete this list in pairs, like {1, 1} +const +vector> incx_incy_range = { + {1, 1}, + {0, -1}, + {2, 1}, + {10, 100} + }; + +//vector, each entry is {alpha}; +//add/delete single values, like {2.0} +const +vector alpha_range = { + -0.5, + 2.0, + 0.0 + }; + + +/* ===============Google Unit Test==================================================== */ + + +/* ===================================================================== + BLAS-2 ger: +=================================================================== */ + +/* ============================Setup Arguments======================================= */ + +//Please use "class Arguments" (see utility.hpp) to pass parameters to templated testers; +//Some routines may not touch/use certain "members" of objects "argus". +//like BLAS-1 Scal does not have lda, BLAS-2 GEMV does not have ldb, ldc; +//That is fine. These testers & routines will leave untouched members alone. 
+//Do not use std::tuple to directly pass parameters to testers +//by std:tuple, you have unpack it with extreme care for each one by like "std::get<0>" which is not intuitive and error-prone + +Arguments setup_ger_arguments(ger_tuple tup) +{ + + vector matrix_size = std::get<0>(tup); + vector incx_incy = std::get<1>(tup); + double alpha = std::get<2>(tup); + + Arguments arg; + + // see the comments about matrix_size_range above + arg.M = matrix_size[0]; + arg.N = matrix_size[1]; + arg.lda = matrix_size[2]; + + // see the comments about matrix_size_range above + arg.incx = incx_incy[0]; + arg.incy = incx_incy[1]; + + arg.alpha = alpha; + + arg.timing = 0; + + return arg; +} + + +class ger_gtest: public :: TestWithParam +{ + protected: + ger_gtest(){} + virtual ~ger_gtest(){} + virtual void SetUp(){} + virtual void TearDown(){} +}; + + +TEST_P(ger_gtest, ger_gtest_float) +{ + // GetParam return a tuple. Tee setup routine unpack the tuple + // and initializes arg(Arguments) which will be passed to testing routine + // The Arguments data struture have physical meaning associated. + // while the tuple is non-intuitive. 
+ + + Arguments arg = setup_ger_arguments( GetParam() ); + + rocblas_status status = testing_ger( arg ); + + // if not success, then the input argument is problematic, so detect the error message + if(status != rocblas_status_success){ + + if( arg.M < 0 || arg.N < 0 ){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + else if(arg.lda < arg.M){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + else if(arg.incx <= 0){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + else if(arg.incy <= 0){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + } + +} + +//notice we are using vector of vector +//so each elment in xxx_range is a avector, +//ValuesIn take each element (a vector) and combine them and feed them to test_p +// The combinations are { {M, N, lda}, {incx,incy} {alpha} } + +INSTANTIATE_TEST_CASE_P(rocblas_ger, + ger_gtest, + Combine( + ValuesIn(matrix_size_range), ValuesIn(incx_incy_range), ValuesIn(alpha_range) + ) + ); diff --git a/clients/gtest/trsm_gtest.cpp b/clients/gtest/trsm_gtest.cpp index b867b4ff4..5057019aa 100644 --- a/clients/gtest/trsm_gtest.cpp +++ b/clients/gtest/trsm_gtest.cpp @@ -45,20 +45,20 @@ const vector> matrix_size_range = { {-1, -1, 1, 1}, {10, 10, 20, 100}, - {600,500, 600, 500}, + {600, 500, 600, 600}, {1024, 1024, 1024, 1024} }; const vector> full_matrix_size_range = { + {192, 192, 192, 192}, + {640, 640, 960, 960}, {1000, 1000, 1000, 1000}, {2000, 2000, 2000, 2000}, - {4011, 4011, 4011, 4011}, - {8000, 8000, 8000, 8000}, }; const -vector alpha_range = {1.0, 0.0, -1.0}; +vector alpha_range = {1.0, -5.0}; //vector of vector, each pair is a {side, uplo, transA, diag}; @@ -75,7 +75,6 @@ const vector> side_uplo_transA_diag_range = { {'L', 'L', 'N', 'N'}, {'R', 'L', 'N', 'N'}, - {'R', 'U', 'T', 'U'}, {'L', 'U', 'C', 'N'}, }; @@ -182,6 +181,34 @@ TEST_P(trsm_gtest, trsm_gtest_float) } +TEST_P(trsm_gtest, trsm_gtest_double) +{ + // GetParam return a tuple. 
Tee setup routine unpack the tuple + // and initializes arg(Arguments) which will be passed to testing routine + // The Arguments data struture have physical meaning associated. + // while the tuple is non-intuitive. + + + Arguments arg = setup_trsm_arguments( GetParam() ); + + rocblas_status status = testing_trsm( arg ); + + // if not success, then the input argument is problematic, so detect the error message + if(status != rocblas_status_success){ + + if( arg.M < 0 || arg.N < 0 ){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + else if(arg.side_option == 'L' ? arg.lda < arg.M : arg.lda < arg.N){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + else if(arg.ldb < arg.M){ + EXPECT_EQ(rocblas_status_invalid_size, status); + } + } + +} + //notice we are using vector of vector //so each elment in xxx_range is a avector, //ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/include/cblas_interface.h b/clients/include/cblas_interface.h index 544fae79f..ddfbb80d8 100644 --- a/clients/include/cblas_interface.h +++ b/clients/include/cblas_interface.h @@ -73,6 +73,13 @@ T *x, rocblas_int incx, T beta, T *y, rocblas_int incy); + template + void cblas_ger( rocblas_int m, rocblas_int n, + T alpha, + T *x, rocblas_int incx, + T *y, rocblas_int incy, + T *A, rocblas_int lda); + template void cblas_hemv( rocblas_fill uplo, rocblas_int n, T alpha, @@ -92,13 +99,27 @@ rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, T alpha, - T *A, rocblas_int lda, + const T *A, rocblas_int lda, T *B, rocblas_int ldb); template rocblas_int cblas_trtri(char uplo, char diag, rocblas_int n, T *A, rocblas_int lda); + + template + void cblas_trmm( rocblas_side side, rocblas_fill uplo, + rocblas_operation transA, rocblas_diagonal diag, + rocblas_int m, rocblas_int n, + T alpha, + const T *A, rocblas_int lda, + T *B, rocblas_int ldb); + + template + rocblas_int cblas_getrf(rocblas_int m, + rocblas_int n, + T *A, 
rocblas_int lda, + rocblas_int *ipiv); /* ============================================================================================ */ diff --git a/clients/include/flops.h b/clients/include/flops.h index b72c15db7..ccfc17867 100644 --- a/clients/include/flops.h +++ b/clients/include/flops.h @@ -32,6 +32,12 @@ return (2.0 * n * n)/1e9; } + /* \brief floating point counts of GER */ + template + double ger_gflop_count(rocblas_int m, rocblas_int n){ + return (2.0 * m * n)/1e9; + } + /* * =========================================================================== * level 3 BLAS diff --git a/library/include/rocblas.hpp b/clients/include/rocblas.hpp similarity index 87% rename from library/include/rocblas.hpp rename to clients/include/rocblas.hpp index 09b90bba5..2e1efe4a9 100644 --- a/library/include/rocblas.hpp +++ b/clients/include/rocblas.hpp @@ -8,11 +8,7 @@ #define _ROCBLAS_HPP_ /* library headers */ -#include "rocblas-export.h" -#include "rocblas-version.h" -#include "rocblas_types.h" -#include "rocblas_auxiliary.h" -#include "rocblas_functions.h" +#include "rocblas.h" /*!\file @@ -95,6 +91,15 @@ const T *x, rocblas_int incx, T *y, rocblas_int incy); + template + rocblas_status + rocblas_ger(rocblas_handle handle, + rocblas_int m, rocblas_int n, + const T *alpha, + const T *x, rocblas_int incx, + const T *y, rocblas_int incy, + T *A, rocblas_int lda); + template rocblas_status rocblas_gemv(rocblas_handle handle, @@ -146,26 +151,29 @@ rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, const T* alpha, - const T* A, rocblas_int lda, - T* B, rocblas_int ldb); + T* A, rocblas_int lda, + T* B, rocblas_int ldb); + + template rocblas_status rocblas_trtri(rocblas_handle handle, - rocblas_fill uplo, rocblas_diagonal diag, + rocblas_fill uplo, + rocblas_diagonal diag, rocblas_int n, - T *A, rocblas_int lda, + T *A, rocblas_int lda, T *invA, rocblas_int ldinvA); template rocblas_status rocblas_trtri_batched(rocblas_handle handle, - rocblas_fill 
uplo, - rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, rocblas_int bsa, - T *invA, rocblas_int ldinvA, rocblas_int bsinvA, - rocblas_int batch_count); + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, rocblas_int bsa, + T *invA, rocblas_int ldinvA, rocblas_int bsinvA, + rocblas_int batch_count); template rocblas_status @@ -177,5 +185,4 @@ - #endif // _ROCBLAS_HPP_ diff --git a/clients/include/testing_gemm.hpp b/clients/include/testing_gemm.hpp index 3f50a28e7..0a3e81395 100644 --- a/clients/include/testing_gemm.hpp +++ b/clients/include/testing_gemm.hpp @@ -65,7 +65,7 @@ rocblas_status testing_gemm(Arguments argus) A_size = lda * A_col; B_size = ldb * B_col; C_size = ldc * N; //check here to prevent undefined memory allocation error - if( M < 0 || N < 0 || K < 0 || lda < 0 || ldb < 0 || ldc < 0 ){ + if( M < 0 || N < 0 || K < 0 || lda < A_row || ldb < B_row || ldc < M ){ return rocblas_status_invalid_size; } //Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice @@ -109,13 +109,6 @@ rocblas_status testing_gemm(Arguments argus) dB, ldb, &beta, dC, ldc); - if (status != rocblas_status_success) { - hipFree(dA); - hipFree(dB); - hipFree(dC); - return status; - } - // sleep(1); if(argus.timing){ gpu_time_used = get_time_us() - gpu_time_used; rocblas_gflops = gemm_gflop_count (M, N, K) / gpu_time_used * 1e6; @@ -134,23 +127,22 @@ rocblas_status testing_gemm(Arguments argus) cpu_time_used = get_time_us(); } - - cblas_gemm( + if(status != rocblas_status_invalid_size){//only valid size compare with cblas + cblas_gemm( transA, transB, M, N, K, alpha, hA.data(), lda, hB.data(), ldb, beta, hC_copy.data(), ldc); - + } if(argus.timing){ cpu_time_used = get_time_us() - cpu_time_used; cblas_gflops = gemm_gflop_count(M, N, K) / cpu_time_used * 1e6; } - for(int i=0;i(handle, transA, transB, M, N, K, - &alpha, dA, lda, bsa + &alpha, dA, lda, bsa, dB, ldb, bsb, &beta, dC, ldc, bsc, batch_count); - if (status != rocblas_status_success) { - hipFree(dA); - hipFree(dB); - hipFree(dC); - return status; - } -#endif + // sleep(1); if(argus.timing){ gpu_time_used = get_time_us() - gpu_time_used; @@ -177,7 +170,7 @@ rocblas_status testing_gemm_batched(Arguments argus) } cout << endl; - cout << "GG," << batch_count << M <<','<< N <<',' << K <<',' << lda <<','<< ldb <<',' << ldc <<',' << rocblas_gflops << "(" << gpu_time_used << "),"; + cout << "GG," << batch_count <<',' << M <<','<< N <<',' << K <<',' << lda <<','<< ldb <<',' << ldc <<',' << rocblas_gflops << "(" << gpu_time_used << "),"; if(argus.norm_check){ cout << cblas_gflops << "(" << cpu_time_used << "),"; diff --git a/clients/include/testing_ger.hpp b/clients/include/testing_ger.hpp new file mode 100644 index 000000000..3adb67d78 --- /dev/null +++ b/clients/include/testing_ger.hpp @@ -0,0 +1,186 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * + * ************************************************************************ */ + +#include +#include +#include +#include + +#include "rocblas.hpp" +#include "utility.h" +#include "cblas_interface.h" +#include "norm.h" +#include "unit.h" +#include "flops.h" + +using namespace std; + + +/* ============================================================================================ */ + +template +rocblas_status testing_ger(Arguments argus) +{ + + rocblas_int M = argus.M; + rocblas_int N = argus.N; + rocblas_int incx = argus.incx; + rocblas_int incy = argus.incy; + rocblas_int lda = argus.lda; + + rocblas_int A_size = lda * N; + + rocblas_status status = rocblas_status_success; + + + //argument sanity check, quick return if input parameters are invalid before allocating invalid memory + if ( M < 0 ){ + status = rocblas_status_invalid_size; + return status; + } + if ( N < 0 ){ + status = rocblas_status_invalid_size; + return status; + } + else if ( lda < 0 ){ + status = rocblas_status_invalid_size; + return status; + } + else if ( incx <= 0 ){ + status = rocblas_status_invalid_size; + return status; + } + else if ( incy <= 0 ){ + status = rocblas_status_invalid_size; + return status; + } + + //Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory + vector hA(A_size); + vector hB(A_size); + vector hx(M * incx); + vector hy(N * incy); + + T *dA, *dx, *dy; + + double gpu_time_used, cpu_time_used; + double rocblas_gflops, cblas_gflops, rocblas_bandwidth; + double rocblas_error; + + T alpha = (T)argus.alpha; + + rocblas_handle handle; + + rocblas_create_handle(&handle); + + //allocate memory on device + CHECK_HIP_ERROR(hipMalloc(&dA, A_size * sizeof(T))); + CHECK_HIP_ERROR(hipMalloc(&dx, M * incx * sizeof(T))); + CHECK_HIP_ERROR(hipMalloc(&dy, N * incy * sizeof(T))); + + //Initial Data on CPU + srand(1); + rocblas_init(hA, M, N, lda); + rocblas_init(hx, 1, M, incx); + rocblas_init(hy, 1, N, incy); + + //copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS + hB = hA; + + //copy data from CPU to device + hipMemcpy(dA, hA.data(), sizeof(T)*lda*N, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), sizeof(T)*M * incx, hipMemcpyHostToDevice); + hipMemcpy(dy, hy.data(), sizeof(T)*N * incy, hipMemcpyHostToDevice); + + /* ===================================================================== + ROCBLAS + =================================================================== */ + if(argus.timing){ + gpu_time_used = get_time_us();// in microseconds + } + + for(int iter=0;iter<1;iter++){ + + status = rocblas_ger(handle, + M, N, + (T*)&alpha, + dx, incx, + dy, incy, + dA, lda); + + if (status != rocblas_status_success) { + CHECK_HIP_ERROR(hipFree(dA)); + CHECK_HIP_ERROR(hipFree(dx)); + CHECK_HIP_ERROR(hipFree(dy)); + rocblas_destroy_handle(handle); + return status; + } + } + if(argus.timing){ + gpu_time_used = get_time_us() - gpu_time_used; + rocblas_gflops = ger_gflop_count (M, N) / gpu_time_used * 1e6 * 1; + rocblas_bandwidth = (2.0 * M * N) * sizeof(T)/ gpu_time_used / 1e3; + } + + //copy output from device to CPU + hipMemcpy(hA.data(), dA, sizeof(T)*N*lda, hipMemcpyDeviceToHost); + + if(argus.unit_check || argus.norm_check){ + /* 
===================================================================== + CPU BLAS + =================================================================== */ + if(argus.timing){ + cpu_time_used = get_time_us(); + } + + cblas_ger(M, N, + alpha, + hx.data(), incx, + hy.data(), incy, + hB.data(), lda); + + if(argus.timing){ + cpu_time_used = get_time_us() - cpu_time_used; + cblas_gflops = ger_gflop_count(M, N) / cpu_time_used * 1e6; + } + + //enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their order + if(argus.unit_check){ + unit_check_general(M, N, lda, hB.data(), hA.data()); + } + + + //if enable norm check, norm check is invasive + //any typeinfo(T) will not work here, because template deduction is matched in compilation time + if(argus.norm_check){ + rocblas_error = norm_check_general('F', M, N, lda, hB.data(), hA.data()); + } + } + + if(argus.timing){ + //only norm_check return an norm error, unit check won't return anything + cout << "M, N, lda, rocblas-Gflops, rocblas-GB/s, "; + if(argus.norm_check){ + cout << "CPU-Gflops, norm-error" ; + } + cout << endl; + + cout << "GGG,"<< M << ',' << N <<',' << lda <<','<< rocblas_gflops << ',' << rocblas_bandwidth << ',' ; + + if(argus.norm_check){ + cout << cblas_gflops << ','; + cout << rocblas_error; + } + + cout << endl; + } + + CHECK_HIP_ERROR(hipFree(dA)); + CHECK_HIP_ERROR(hipFree(dx)); + CHECK_HIP_ERROR(hipFree(dy)); + rocblas_destroy_handle(handle); + return rocblas_status_success; +} diff --git a/clients/include/testing_trsm.hpp b/clients/include/testing_trsm.hpp index 230d5bf02..f72c068e0 100644 --- a/clients/include/testing_trsm.hpp +++ b/clients/include/testing_trsm.hpp @@ -54,7 +54,8 @@ rocblas_status testing_trsm(Arguments argus) vector hA(A_size); vector hB(B_size); vector hB_copy(B_size); - + vector hX(B_size); + T *dA, *dB; double gpu_time_used, cpu_time_used; @@ -72,13 +73,39 @@ rocblas_status testing_trsm(Arguments argus) 
//Initial Data on CPU srand(1); rocblas_init_symmetric(hA, K, lda); + //proprocess the matrix to avoid ill-conditioned matrix + vector ipiv(K); + cblas_getrf(K, K, hA.data(), lda, ipiv.data()); rocblas_init(hB, M, N, ldb); + + for(int i=0;i(hB, M, N, ldb); + hX = hB;//original solution hX + //Calculate hB = hA*hX; + cblas_trmm( + side, uplo, + transA, diag, + M, N, 1.0/alpha, + (const T*)hA.data(), lda, + hB.data(), ldb); + + hB_copy = hB; //copy data from CPU to device CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T)*A_size, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(T)*B_size, hipMemcpyHostToDevice)); + /* ===================================================================== ROCBLAS =================================================================== */ @@ -86,7 +113,7 @@ rocblas_status testing_trsm(Arguments argus) gpu_time_used = get_time_us();// in microseconds } -/* + status = rocblas_trsm(handle, side, uplo, transA, diag, @@ -94,13 +121,6 @@ rocblas_status testing_trsm(Arguments argus) &alpha, dA,lda, dB,ldb); -*/ - if (status != rocblas_status_success) { - CHECK_HIP_ERROR(hipFree(dA)); - CHECK_HIP_ERROR(hipFree(dB)); - rocblas_destroy_handle(handle); - return status; - } if(argus.timing){ gpu_time_used = get_time_us() - gpu_time_used; @@ -108,9 +128,9 @@ rocblas_status testing_trsm(Arguments argus) } //copy output from device to CPU - CHECK_HIP_ERROR(hipMemcpy(hA.data(), dA, sizeof(T)*A_size, hipMemcpyDeviceToHost)); CHECK_HIP_ERROR(hipMemcpy(hB.data(), dB, sizeof(T)*B_size, hipMemcpyDeviceToHost)); + if(argus.unit_check || argus.norm_check){ /* ===================================================================== CPU BLAS @@ -123,7 +143,7 @@ rocblas_status testing_trsm(Arguments argus) side, uplo, transA, diag, M, N, alpha, - hA.data(), lda, + (const T*)hA.data(), lda, hB_copy.data(), ldb); if(argus.timing){ @@ -131,28 +151,36 @@ rocblas_status testing_trsm(Arguments argus) cblas_gflops = trsm_gflop_count(M, N, K) / 
cpu_time_used * 1e6; } - //enable unit check, notice unit check is not invasive, but norm check is, - // unit check and norm check can not be interchanged their order - if(argus.unit_check){ - unit_check_general(M, N, ldb, hB_copy.data(), hB.data()); - } + + print_matrix(hB_copy, hB, min(M, 3), min(N,3), ldb); + + //if enable norm check, norm check is invasive //any typeinfo(T) will not work here, because template deduction is matched in compilation time - if(argus.norm_check){ - rocblas_error = norm_check_general('F', M, N, ldb, hB_copy.data(), hB.data()); + rocblas_error = norm_check_general('F', M, N, ldb, hB_copy.data(), hB.data()); + + + //enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their order + if(argus.unit_check){ + T tolerance = get_trsm_tolerance();// see unit.h for the tolerance + unit_check_trsm(M, N, ldb, rocblas_error, tolerance ); } } + if(argus.timing){ //only norm_check return an norm error, unit check won't return anything - cout << "M, N, lda, rocblas-Gflops (us) "; + cout << "M, N, lda, ldb, side, uplo, transA, diag, rocblas-Gflops (us) "; if(argus.norm_check){ cout << "CPU-Gflops(us), norm-error" ; } cout << endl; - cout << M <<','<< N <<',' << lda <<','<< rocblas_gflops << "(" << gpu_time_used << "),"; + cout << M << ',' << N <<',' << lda <<','<< ldb <<',' << char_side << ',' << char_uplo << ',' + << char_transA << ',' << char_diag << ',' << + rocblas_gflops << "(" << gpu_time_used << "),"; if(argus.norm_check){ cout << cblas_gflops << "(" << cpu_time_used << "),"; diff --git a/clients/include/testing_trtri.hpp b/clients/include/testing_trtri.hpp index 47e448007..cf9987324 100644 --- a/clients/include/testing_trtri.hpp +++ b/clients/include/testing_trtri.hpp @@ -65,6 +65,19 @@ rocblas_status testing_trtri(Arguments argus) //Initial Data on CPU srand(1); rocblas_init_symmetric(hA, N, lda); + + //proprocess the matrix to avoid ill-conditioned matrix + for(int 
i=0;i (N) / gpu_time_used * 1e6 ; @@ -119,6 +125,10 @@ rocblas_status testing_trtri(Arguments argus) cblas_gflops = trtri_gflop_count(N) / cpu_time_used * 1e6; } + #ifndef NDEBUG + print_matrix(hB, hA, N, N, lda); + #endif + //enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check){ diff --git a/clients/include/unit.h b/clients/include/unit.h index d1e60ca2b..3b1b8fe47 100644 --- a/clients/include/unit.h +++ b/clients/include/unit.h @@ -35,4 +35,10 @@ void unit_check_general(rocblas_int M, rocblas_int N, rocblas_int lda, T *hCPU, T *hGPU); + template + void unit_check_trsm(rocblas_int M, rocblas_int N, rocblas_int lda, double hGPU, T tolerance); + + template + T get_trsm_tolerance(); + #endif diff --git a/clients/include/utility.h b/clients/include/utility.h index c1be134a6..36d2b9d7e 100644 --- a/clients/include/utility.h +++ b/clients/include/utility.h @@ -93,6 +93,17 @@ using namespace std; template char type2char(); + /* ============================================================================================ */ + /*! 
\brief Debugging purpose, print out CPU and GPU result matrix, not valid in complex number */ + template + void print_matrix(vector CPU_result, vector GPU_result, rocblas_int m, rocblas_int n, rocblas_int lda){ + for(int i=0;i ) +target_include_directories( example-scal-template + PRIVATE + $ +) #target_link_libraries( example-openmp rocblas ) target_link_libraries( example-sscal rocblas ) +target_link_libraries( example-scal-template rocblas ) # Ubuntu systems need to explicitely link to pthreads lib because of --as-needed # https://github.com/google/googletest/issues/391#issuecomment-125645879 @@ -114,10 +95,12 @@ target_link_libraries( example-sscal rocblas ) if( UNIX ) # target_link_libraries( example-openmp pthread ) target_link_libraries( example-sscal pthread ) + target_link_libraries( example-scal-template pthread ) endif( ) #set_target_properties( example-openmp PROPERTIES DEBUG_POSTFIX "-d" ) # set_target_properties( example-openmp PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) set_target_properties( example-sscal PROPERTIES DEBUG_POSTFIX "-d" ) +set_target_properties( example-scal-template PROPERTIES DEBUG_POSTFIX "-d" ) # set_target_properties( example-sscal PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) diff --git a/clients/samples/example_scal_template.cpp b/clients/samples/example_scal_template.cpp new file mode 100644 index 000000000..7336107a5 --- /dev/null +++ b/clients/samples/example_scal_template.cpp @@ -0,0 +1,83 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * + * ************************************************************************ */ + +#include +#include +#include + +#include "rocblas.hpp" +#include "utility.h" + +using namespace std; + +/* ============================================================================================ */ + +int main() +{ + + rocblas_int N = 10240; + rocblas_status status = rocblas_status_success; + float alpha = 10.0; + + //Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice + vector hx(N); + vector hz(N); + float *dx; + + double gpu_time_used; + + rocblas_handle handle; + rocblas_create_handle(&handle); + + //allocate memory on device + hipMalloc(&dx, N * sizeof(float)); + + //Initial Data on CPU + srand(1); + rocblas_init(hx, 1, N, 1); + + //copy vector is easy in STL; hz = hx: save a copy in hz which will be output of CPU BLAS + hz = hx; + + hipMemcpy(dx, hx.data(), sizeof(float)*N, hipMemcpyHostToDevice); + + printf("N rocblas(us) \n"); + + gpu_time_used = get_time_us();// in microseconds + + + /* ===================================================================== + ROCBLAS C++ template interface + =================================================================== */ + + status = rocblas_scal(handle, + N, + &alpha, + dx, 1); + if (status != rocblas_status_success) { + return status; + } + + + gpu_time_used = get_time_us() - gpu_time_used; + + //copy output from device to CPU + hipMemcpy(hx.data(), dx, sizeof(float)*N, hipMemcpyDeviceToHost); + + //verify rocblas_scal result + for(rocblas_int i=0;i/package - LOG_BUILD 1 + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" ) -set_property( TARGET Tensile PROPERTY FOLDER "extern") -ExternalProject_Get_Property( Tensile install_dir ) - # For use by the user of external-Tensile.cmake -set( Tensile_ROOT ${install_dir}/package ) +set( Tensile_ROOT ${CMAKE_BINARY_DIR}/extern/Tensile/src/Tensile) diff --git a/cmake/external-hip.cmake b/cmake/external-hip.cmake 
index 1bb7f5a7c..9d734585e 100644 --- a/cmake/external-hip.cmake +++ b/cmake/external-hip.cmake @@ -12,25 +12,21 @@ endif( ) set( hip_git_repository "https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP.git" CACHE STRING "URL to download hip from" ) set( hip_git_tag "master" CACHE STRING "URL to download hip from" ) -set( HOST_TOOLCHAIN_FILE "${PROJECT_SOURCE_DIR}/cmake/${HOST_TOOLCHAIN_NAME}-toolchain.cmake" ) +set( hip_cmake_args -DCMAKE_INSTALL_PREFIX=/package -DCMAKE_BUILD_TYPE=Release ) -set( hip_cmake_args -DCMAKE_INSTALL_PREFIX=/package -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${HOST_TOOLCHAIN_FILE} ) - -if( ${BUILD_SHARED_LIBS} ) - message( STATUS "Compiling HIP as SHARED library") - # list( APPEND hip_cmake_args -DHIP_USE_SHARED_LIBRARY=1 ) - list( APPEND hip_cmake_args -DCMAKE_CXX_FLAGS=-fPIC ) -endif() +if( DEFINED CMAKE_CXX_COMPILER ) + list( APPEND hip_cmake_args -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} ) +endif( ) - ExternalProject_Add( - HIP - PREFIX ${CMAKE_BINARY_DIR}/extern/hip +ExternalProject_Add( + HIP + PREFIX ${CMAKE_BINARY_DIR}/extern/hip GIT_REPOSITORY ${hip_git_repository} GIT_TAG ${hip_git_tag} - CMAKE_ARGS ${hip_cmake_args} - LOG_BUILD 1 - LOG_INSTALL 1 - ) + CMAKE_ARGS ${hip_cmake_args} + LOG_BUILD 1 + LOG_INSTALL 1 +) set_property( TARGET HIP PROPERTY FOLDER "extern") ExternalProject_Get_Property( HIP install_dir ) diff --git a/cmake/gcc-toolchain.cmake b/cmake/gcc-toolchain.cmake deleted file mode 100644 index 69ed7925f..000000000 --- a/cmake/gcc-toolchain.cmake +++ /dev/null @@ -1,6 +0,0 @@ -# Trivial toolchain file to help project pick up appropriate compilers -set( CMAKE_C_COMPILER gcc ) -set( CMAKE_CXX_COMPILER g++ ) - -# we use gfortran to resolve fortran dependencies for lapack -set( CMAKE_Fortran_COMPILER gfortran ) diff --git a/cmake/hcc-toolchain.cmake b/cmake/hcc-toolchain.cmake deleted file mode 100644 index 074a49a51..000000000 --- a/cmake/hcc-toolchain.cmake +++ /dev/null @@ -1,3 +0,0 @@ -# Trivial 
toolchain file to help project pick up appropriate compilers -set( CMAKE_C_COMPILER hcc ) -set( CMAKE_CXX_COMPILER hcc ) diff --git a/cmake/hipcc-toolchain.cmake.in b/cmake/hipcc-toolchain.cmake.in deleted file mode 100644 index 794f54218..000000000 --- a/cmake/hipcc-toolchain.cmake.in +++ /dev/null @@ -1,3 +0,0 @@ -# Trivial toolchain file to help project pick up appropriate compilers -set( CMAKE_C_COMPILER @HOST_TOOLCHAIN_NAME@ ) -set( CMAKE_CXX_COMPILER @HIP_ROOT@/bin/hipcc ) \ No newline at end of file diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index a363c53c5..b33a475b9 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -6,6 +6,11 @@ # PUBLIC keywords cmake_minimum_required( VERSION 2.8.12 ) +# MACOSX_RPATH is enabled by default policy +if( POLICY CMP0042 ) + cmake_policy( SET CMP0042 NEW ) +endif( ) + # CMP0063: Honor visibility properties for all target types [cmake 3.3] if( POLICY CMP0063 ) cmake_policy( SET CMP0063 NEW ) @@ -22,48 +27,12 @@ endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE -if( NOT CMAKE_CONFIGURATION_TYPES ) +if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
) endif() -if( DEFINED HOST_TOOLCHAIN_FILE ) -endif( ) - -if( DEFINED DEVICE_TOOLCHAIN_FILE ) - set( CMAKE_TOOLCHAIN_FILE "${DEVICE_TOOLCHAIN_FILE}" ) -endif( ) - -# Check if cmake supports the new VERSION tag for project() commands -# rocblas becomes the name of the project with a particular version -if( POLICY CMP0048 ) - cmake_policy( SET CMP0048 NEW ) - project( rocblas VERSION 0.0.2.0 LANGUAGES CXX C ) -else( ) - project( rocblas CXX C ) - # Define a version for the code - if( NOT DEFINED rocblas_VERSION_MAJOR ) - set( rocblas_VERSION_MAJOR 0 ) - endif( ) - - if( NOT DEFINED rocblas_VERSION_MINOR ) - set( rocblas_VERSION_MINOR 0 ) - endif( ) - - if( NOT DEFINED rocblas_VERSION_PATCH ) - set( rocblas_VERSION_PATCH 2 ) - endif( ) - - if( NOT DEFINED rocblas_VERSION_TWEAK ) - set( rocblas_VERSION_TWEAK 0 ) - endif( ) - - set( rocblas_VERSION "${rocblas_VERSION_MAJOR}.${rocblas_VERSION_MINOR}.${rocblas_VERSION_PATCH}.${rocblas_VERSION_TWEAK}") -endif( ) - -# MACOSX_RPATH is enabled by default policy -if( POLICY CMP0042 ) - cmake_policy( SET CMP0042 NEW ) -endif( ) +include( build-version ) +project_version( NAME rocblas LANGUAGES CXX ) message( STATUS "rocblas_VERSION= ${rocblas_VERSION}" ) message( STATUS "CMAKE_BUILD_TYPE= ${CMAKE_BUILD_TYPE}" ) @@ -94,7 +63,9 @@ if( MSVC ) string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}" ) endif( ) -include ( build-bitness ) +set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic") + +include( build-bitness ) # Print out compiler flags for viewing/debug message( STATUS "CMAKE_CXX_COMPILER flags: " ${CMAKE_CXX_FLAGS} ) @@ -110,7 +81,7 @@ configure_file( "${PROJECT_SOURCE_DIR}/include/rocblas-version.h.in" "${PROJECT_ set( rocblas_headers_public include/rocblas.h - include/rocblas.hpp + # include/rocblas.hpp include/rocblas_types.h include/rocblas_auxiliary.h include/rocblas_functions.h diff --git a/library/include/rocblas_functions.h 
b/library/include/rocblas_functions.h index dffc9eac5..6ce9b7dd8 100644 --- a/library/include/rocblas_functions.h +++ b/library/include/rocblas_functions.h @@ -435,6 +435,52 @@ rocblas_dzamax(rocblas_handle handle, const rocblas_double_complex *x, rocblas_int incx, rocblas_int *result); +/*! \brief BLAS Level 1 API + + \details + amin finds the first index of the element of minimum magnitude of real vector x + or the sum of magnitude of the real and imaginary parts of elements if x is a complex vector + + @param[in] + handle rocblas_handle. + handle to the rocblas library context queue. + @param[in] + n rocblas_int. + @param[in] + x pointer storing vector x on the GPU. + @param[in] + incx rocblas_int + specifies the increment for the elements of y. + @param[inout] + result + store the amin product. either on the host CPU or device GPU. + return is 0.0 if n, incx<=0. + ********************************************************************/ + +ROCBLAS_EXPORT rocblas_status +rocblas_samin(rocblas_handle handle, + rocblas_int n, + const float *x, rocblas_int incx, + rocblas_int *result); + +ROCBLAS_EXPORT rocblas_status +rocblas_damin(rocblas_handle handle, + rocblas_int n, + const double *x, rocblas_int incx, + rocblas_int *result); + +ROCBLAS_EXPORT rocblas_status +rocblas_scamin(rocblas_handle handle, + rocblas_int n, + const rocblas_float_complex *x, rocblas_int incx, + rocblas_int *result); + +ROCBLAS_EXPORT rocblas_status +rocblas_dzamin(rocblas_handle handle, + rocblas_int n, + const rocblas_double_complex *x, rocblas_int incx, + rocblas_int *result); + /* * =========================================================================== * level 2 BLAS @@ -604,12 +650,272 @@ rocblas_zhemv(rocblas_handle handle, rocblas_double_complex *y, rocblas_int incy); +/*! \brief BLAS Level 2 API + + \details + xGER performs the matrix-vector operations + + A := A + alpha*x*y**T + + where alpha is a scalars, x and y are vectors, and A is an + m by n matrix. 
+ + @param[in] + handle rocblas_handle. + handle to the rocblas library context queue. + @param[in] + m rocblas_int + @param[in] + n rocblas_int + @param[in] + alpha + specifies the scalar alpha. + @param[in] + x pointer storing vector x on the GPU. + @param[in] + incx rocblas_int + specifies the increment for the elements of x. + @param[in] + y pointer storing vector y on the GPU. + @param[in] + incy rocblas_int + specifies the increment for the elements of y. + @param[inout] + A pointer storing matrix A on the GPU. + @param[in] + lda rocblas_int + specifies the leading dimension of A. + + ********************************************************************/ + +ROCBLAS_EXPORT rocblas_status +rocblas_sger(rocblas_handle handle, + rocblas_int m, rocblas_int n, + const float *alpha, + const float *x, rocblas_int incx, + const float *y, rocblas_int incy, + float *A, rocblas_int lda); + +ROCBLAS_EXPORT rocblas_status +rocblas_dger(rocblas_handle handle, + rocblas_int m, rocblas_int n, + const double *alpha, + const double *x, rocblas_int incx, + const double *y, rocblas_int incy, + double *A, rocblas_int lda); + /* * =========================================================================== * level 3 BLAS * =========================================================================== */ +/*! \brief BLAS Level 3 API + + \details + trtri compute the inverse of a matrix A, namely, invA + + and write the result into invA; + + @param[in] + handle rocblas_handle. + handle to the rocblas library context queue. + @param[in] + uplo rocblas_fill. + specifies whether the upper 'rocblas_fill_upper' or lower 'rocblas_fill_lower' + if rocblas_fill_upper, the lower part of A is not referenced + if rocblas_fill_lower, the upper part of A is not referenced + @param[in] + diag rocblas_diagonal. + = 'rocblas_diagonal_non_unit', A is non-unit triangular; + = 'rocblas_diagonal_unit', A is unit triangular; + @param[in] + n rocblas_int. 
+ size of matrix A and invA + @param[in] + A pointer storing matrix A on the GPU. + @param[in] + lda rocblas_int + specifies the leading dimension of A. + @param[output] + invA pointer storing matrix invA on the GPU. + @param[in] + ldinvA rocblas_int + specifies the leading dimension of invA. + +********************************************************************/ + +ROCBLAS_EXPORT rocblas_status +rocblas_strtri(rocblas_handle handle, + rocblas_fill uplo, rocblas_diagonal diag, + rocblas_int n, + float *A, rocblas_int lda, + float *invA, rocblas_int ldinvA); + +ROCBLAS_EXPORT rocblas_status +rocblas_dtrtri(rocblas_handle handle, + rocblas_fill uplo, rocblas_diagonal diag, + rocblas_int n, + double *A, rocblas_int lda, + double *invA, rocblas_int ldinvA); + + +/*! \brief BLAS Level 3 API + + \details + trtri compute the inverse of a matrix A + + inv(A); + + @param[in] + handle rocblas_handle. + handle to the rocblas library context queue. + @param[in] + uplo rocblas_fill. + specifies whether the upper 'rocblas_fill_upper' or lower 'rocblas_fill_lower' + @param[in] + diag rocblas_diagonal. + = 'rocblas_diagonal_non_unit', A is non-unit triangular; + = 'rocblas_diagonal_unit', A is unit triangular; + @param[in] + n rocblas_int. + @param[in] + A pointer storing matrix A on the GPU. + @param[in] + lda rocblas_int + specifies the leading dimension of A. + @param[in] + bsa rocblas_int + "batch stride a": stride from the start of one "A" matrix to the next + @param[output] + invA pointer storing the inverse matrix A on the GPU. + @param[in] + ldinvA rocblas_int + specifies the leading dimension of invA. 
+ @param[in] + bsinvA rocblas_int + "batch stride invA": stride from the start of one "invA" matrix to the next + @param[in] + batch_count rocblas_int + numbers of matrices in the batch + ********************************************************************/ + +ROCBLAS_EXPORT rocblas_status +rocblas_strtri_batched(rocblas_handle handle, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + float *A, rocblas_int lda, rocblas_int bsa, + float *invA, rocblas_int ldinvA, rocblas_int bsinvA, + rocblas_int batch_count); + + +ROCBLAS_EXPORT rocblas_status +rocblas_dtrtri_batched(rocblas_handle handle, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + double *A, rocblas_int lda, rocblas_int bsa, + double *invA, rocblas_int ldinvA, rocblas_int bsinvA, + rocblas_int batch_count); + + + +/*! \brief BLAS Level 3 API + + \details + + trsm solves + + op(A)*X = alpha*B or X*op(A) = alpha*B, + + where alpha is a scalar, X and B are m by n matrices, + A is triangular matrix and op(A) is one of + + op( A ) = A or op( A ) = A^T or op( A ) = A^H. + + The matrix X is overwritten on B. + + @param[in] + handle rocblas_handle. + handle to the rocblas library context queue. + + @param[in] + side rocblas_side. + rocblas_side_left: op(A)*X = alpha*B. + rocblas_side_right: X*op(A) = alpha*B. + + @param[in] + uplo rocblas_fill. + rocblas_fill_upper: A is an upper triangular matrix. + rocblas_fill_lower: A is a lower triangular matrix. + + @param[in] + transA rocblas_operation. + transB: op(A) = A. + rocblas_operation_transpose: op(A) = A^T. + rocblas_operation_conjugate_transpose: op(A) = A^H. + + @param[in] + diag rocblas_diagonal. + rocblas_diagonal_unit: A is assumed to be unit triangular. + rocblas_diagonal_non_unit: A is not assumed to be unit triangular. + + @param[in] + m rocblas_int. + m specifies the number of rows of B. m >= 0. + + @param[in] + n rocblas_int. + n specifies the number of columns of B. n >= 0. 
+ + @param[in] + alpha + alpha specifies the scalar alpha. When alpha is + &zero then A is not referenced and B need not be set before + entry. + + @param[in] + A pointer storing matrix A on the GPU. + of dimension ( lda, k ), where k is m + when rocblas_side_left and + is n when rocblas_side_right + only the upper/lower triangular part is accessed. + + @param[in] + lda rocblas_int. + lda specifies the first dimension of A. + if side = rocblas_side_left, lda >= max( 1, m ), + if side = rocblas_side_right, lda >= max( 1, n ). + + @param[in,output] + B pointer storing matrix B on the GPU. + + @param[in] + ldb rocblas_int. + ldb specifies the first dimension of B. ldb >= max( 1, m ). + + ********************************************************************/ + +ROCBLAS_EXPORT rocblas_status +rocblas_strsm(rocblas_handle handle, + rocblas_side side, rocblas_fill uplo, + rocblas_operation transA, rocblas_diagonal diag, + rocblas_int m, rocblas_int n, + const float* alpha, + float* A, rocblas_int lda, + float* B, rocblas_int ldb); + + +ROCBLAS_EXPORT rocblas_status +rocblas_dtrsm(rocblas_handle handle, + rocblas_side side, rocblas_fill uplo, + rocblas_operation transA, rocblas_diagonal diag, + rocblas_int m, rocblas_int n, + const double* alpha, + double* A, rocblas_int lda, + double* B, rocblas_int ldb); + /*! 
\brief BLAS Level 3 API diff --git a/library/include/rocblas_types.h b/library/include/rocblas_types.h index 24f82370f..5e9d070dd 100644 --- a/library/include/rocblas_types.h +++ b/library/include/rocblas_types.h @@ -97,9 +97,9 @@ extern "C" { rocblas_status_invalid_handle = 1, /**< handle not initialized, invalid or null */ rocblas_status_not_implemented = 2, /**< function is not implemented */ rocblas_status_invalid_pointer = 3, /**< invalid pointer parameter */ - rocblas_status_invalid_size = 3, /**< invalid size parameter */ - rocblas_status_memory_error = 4, /**< failed internal memory allocation, copy or dealloc */ - rocblas_status_internal_error = 5, /**< other internal library failure */ + rocblas_status_invalid_size = 4, /**< invalid size parameter */ + rocblas_status_memory_error = 5, /**< failed internal memory allocation, copy or dealloc */ + rocblas_status_internal_error = 6, /**< other internal library failure */ } rocblas_status; diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 044ad6253..5f3107dcb 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -21,22 +21,54 @@ endfunction( ) # ######################################################################## # Set up Tensile Dependency - if( BUILD_WITH_TENSILE ) - find_package(Tensile REQUIRED CONFIG) - - add_tensile_lib( rocblas-tensile - SOLUTIONS ${CMAKE_CURRENT_SOURCE_DIR}/blas3/Tensile/XML_SolutionTimes - BACKEND HIP ) + set(Tensile_BACKEND "HIP") + message(STATUS "Tensile_LOGIC=${Tensile_LOGIC}") + message(STATUS "Tensile_FOUND=${Tensile_FOUND}") + if (Tensile_FOUND) # was it found at the top level + find_package(Tensile) + message(STATUS "Tensile package found") + TensileCreateLibrary( + ${CMAKE_CURRENT_SOURCE_DIR}/blas3/Tensile/Logic/${Tensile_LOGIC} + ${Tensile_BACKEND} + ${Tensile_MERGE_FILES} + ${Tensile_SHORT_FILENAMES} + ${Tensile_PRINT_DEBUG} + ) + else() + message(STATUS "Tensile_ROOT=${Tensile_ROOT} specified; manually including 
TensileConfig.cmake") + include(${Tensile_ROOT}/Tensile/Source/TensileConfig.cmake) + TensileCreateLibrary( + ${CMAKE_CURRENT_SOURCE_DIR}/blas3/Tensile/Logic/${Tensile_LOGIC} + ${Tensile_BACKEND} + ${Tensile_MERGE_FILES} + ${Tensile_SHORT_FILENAMES} + ${Tensile_PRINT_DEBUG} + Tensile_ROOT ${Tensile_ROOT} + ) + endif() + + #message(STATUS "Tensile_ROOT=${Tensile_ROOT}") + #find_package(Tensile) + #if (Tensile_FOUND) + # message(STATUS "Tensile package found.") + #else() + # message(STATUS "Tensile NOT found; including TensileConfig.cmake.") + # include(${Tensile_ROOT}/Source/TensileConfig.cmake) + #endif() + #TensileCreateLibrary( + # ${CMAKE_CURRENT_SOURCE_DIR}/blas3/Tensile/Logic/${Tensile_LOGIC} + # ${Tensile_BACKEND} # HIP or OCL + # ${Tensile_MERGE_FILES} # ON or OFF + # ${Tensile_SHORT_FILENAMES} # ON or OFF + # ${Tensile_PRINT_DEBUG} # ON or OFF + # ${Tensile_ROOT} + # ) #rocblas_gemm and rocblas_trsm require tensile set( Tensile_SRC blas3/Tensile/gemm.cpp - blas3/Tensile/Tensile_status.cpp - blas3/rocblas_gemm.cpp -# blas3/rocblas_trmm.cpp -# blas3/rocblas_trsm.cpp -# blas3/rocblas_trtri_trsm.cpp + blas3/rocblas_trsm.cpp ) set(Tensile_INC @@ -56,17 +88,19 @@ set( rocblas_auxiliary_source ) set( rocblas_blas3_source - blas3/rocblas_trtri_batched.cpp blas3/rocblas_trtri.cpp + blas3/rocblas_trtri_batched.cpp ${Tensile_SRC} ) set( rocblas_blas2_source blas2/rocblas_gemv.cpp + blas2/rocblas_ger.cpp ) set( rocblas_blas1_source blas1/fetch_template.cpp + blas1/rocblas_amin.cpp blas1/rocblas_amax.cpp blas1/rocblas_asum.cpp blas1/rocblas_axpy.cpp @@ -114,7 +148,7 @@ target_include_directories( rocblas ) if( BUILD_WITH_TENSILE ) - target_link_libraries( rocblas PRIVATE rocblas-tensile ) + target_link_libraries( rocblas PRIVATE Tensile) target_compile_definitions( rocblas PRIVATE BUILD_WITH_TENSILE=1 ) endif() @@ -159,7 +193,7 @@ write_basic_package_version_file( ) if( BUILD_WITH_TENSILE ) - install( TARGETS rocblas rocblas-tensile + install( TARGETS rocblas Tensile 
EXPORT rocblas-targets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/library/src/blas1/rocblas_amax.cpp b/library/src/blas1/rocblas_amax.cpp index 4fbaea4e4..77afaf6e9 100644 --- a/library/src/blas1/rocblas_amax.cpp +++ b/library/src/blas1/rocblas_amax.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "status.h" #include "definitions.h" #include "device_template.h" @@ -222,58 +222,6 @@ rocblas_amax_template(rocblas_handle handle, -/* ============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - -template<> -rocblas_status -rocblas_amax(rocblas_handle handle, - rocblas_int n, - const float *x, rocblas_int incx, - rocblas_int *result){ - - return rocblas_amax_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_amax(rocblas_handle handle, - rocblas_int n, - const double *x, rocblas_int incx, - rocblas_int *result){ - - return rocblas_amax_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_amax(rocblas_handle handle, - rocblas_int n, - const rocblas_float_complex *x, rocblas_int incx, - rocblas_int *result){ - - return rocblas_amax_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_amax(rocblas_handle handle, - rocblas_int n, - const rocblas_double_complex *x, rocblas_int incx, - rocblas_int *result){ - - return rocblas_amax_template(handle, n, x, incx, result); -} - - - /* ============================================================================================ */ /* @@ -290,7 +238,7 @@ rocblas_samax(rocblas_handle handle, const float *x, rocblas_int incx, rocblas_int *result){ - return rocblas_amax(handle, n, x, incx, result); + return 
rocblas_amax_template(handle, n, x, incx, result); } @@ -301,7 +249,7 @@ rocblas_damax(rocblas_handle handle, const double *x, rocblas_int incx, rocblas_int *result){ - return rocblas_amax(handle, n, x, incx, result); + return rocblas_amax_template(handle, n, x, incx, result); } @@ -312,7 +260,7 @@ rocblas_scamax(rocblas_handle handle, const rocblas_float_complex *x, rocblas_int incx, rocblas_int *result){ - return rocblas_amax(handle, n, x, incx, result); + return rocblas_amax_template(handle, n, x, incx, result); } extern "C" @@ -322,7 +270,7 @@ rocblas_dzamax(rocblas_handle handle, const rocblas_double_complex *x, rocblas_int incx, rocblas_int *result){ - return rocblas_amax(handle, n, x, incx, result); + return rocblas_amax_template(handle, n, x, incx, result); } diff --git a/library/src/blas1/rocblas_amin.cpp b/library/src/blas1/rocblas_amin.cpp index 8068deca1..142f8ea7c 100644 --- a/library/src/blas1/rocblas_amin.cpp +++ b/library/src/blas1/rocblas_amin.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "status.h" #include "definitions.h" #include "device_template.h" @@ -222,57 +222,6 @@ rocblas_amin_template(rocblas_handle handle, -/* ============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - -template<> -rocblas_status -rocblas_amin(rocblas_handle handle, - rocblas_int n, - const float *x, rocblas_int incx, - rocblas_int *result){ - - return rocblas_amin_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_amin(rocblas_handle handle, - rocblas_int n, - const double *x, rocblas_int incx, - rocblas_int *result){ - - return rocblas_amin_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_amin(rocblas_handle handle, - 
rocblas_int n, - const rocblas_float_complex *x, rocblas_int incx, - rocblas_int *result){ - - return rocblas_amin_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_amin(rocblas_handle handle, - rocblas_int n, - const rocblas_double_complex *x, rocblas_int incx, - rocblas_int *result){ - - return rocblas_amin_template(handle, n, x, incx, result); -} - - /* ============================================================================================ */ @@ -290,7 +239,7 @@ rocblas_samin(rocblas_handle handle, const float *x, rocblas_int incx, rocblas_int *result){ - return rocblas_amin(handle, n, x, incx, result); + return rocblas_amin_template(handle, n, x, incx, result); } @@ -301,7 +250,7 @@ rocblas_damin(rocblas_handle handle, const double *x, rocblas_int incx, rocblas_int *result){ - return rocblas_amin(handle, n, x, incx, result); + return rocblas_amin_template(handle, n, x, incx, result); } @@ -312,7 +261,7 @@ rocblas_scamin(rocblas_handle handle, const rocblas_float_complex *x, rocblas_int incx, rocblas_int *result){ - return rocblas_amin(handle, n, x, incx, result); + return rocblas_amin_template(handle, n, x, incx, result); } extern "C" @@ -322,5 +271,5 @@ rocblas_dzamin(rocblas_handle handle, const rocblas_double_complex *x, rocblas_int incx, rocblas_int *result){ - return rocblas_amin(handle, n, x, incx, result); + return rocblas_amin_template(handle, n, x, incx, result); } diff --git a/library/src/blas1/rocblas_asum.cpp b/library/src/blas1/rocblas_asum.cpp index d5c4a7c0b..a530b4941 100644 --- a/library/src/blas1/rocblas_asum.cpp +++ b/library/src/blas1/rocblas_asum.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "status.h" #include "definitions.h" #include "device_template.h" @@ -204,56 +204,6 @@ rocblas_asum_template(rocblas_handle handle, -/* ============================================================================================ */ - - /* - * 
=========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - -template<> -rocblas_status -rocblas_asum(rocblas_handle handle, - rocblas_int n, - const float *x, rocblas_int incx, - float *result){ - - return rocblas_asum_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_asum(rocblas_handle handle, - rocblas_int n, - const double *x, rocblas_int incx, - double *result){ - - return rocblas_asum_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_asum(rocblas_handle handle, - rocblas_int n, - const rocblas_float_complex *x, rocblas_int incx, - float *result){ - - return rocblas_asum_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_asum(rocblas_handle handle, - rocblas_int n, - const rocblas_double_complex *x, rocblas_int incx, - double *result){ - - return rocblas_asum_template(handle, n, x, incx, result); -} - /* ============================================================================================ */ @@ -272,7 +222,7 @@ rocblas_sasum(rocblas_handle handle, const float *x, rocblas_int incx, float *result){ - return rocblas_asum(handle, n, x, incx, result); + return rocblas_asum_template(handle, n, x, incx, result); } @@ -283,7 +233,7 @@ rocblas_dasum(rocblas_handle handle, const double *x, rocblas_int incx, double *result){ - return rocblas_asum(handle, n, x, incx, result); + return rocblas_asum_template(handle, n, x, incx, result); } @@ -294,7 +244,7 @@ rocblas_scasum(rocblas_handle handle, const rocblas_float_complex *x, rocblas_int incx, float *result){ - return rocblas_asum(handle, n, x, incx, result); + return rocblas_asum_template(handle, n, x, incx, result); } extern "C" @@ -304,7 +254,7 @@ rocblas_dzasum(rocblas_handle handle, const rocblas_double_complex *x, rocblas_int incx, double *result){ - return 
rocblas_asum(handle, n, x, incx, result); + return rocblas_asum_template(handle, n, x, incx, result); } diff --git a/library/src/blas1/rocblas_axpy.cpp b/library/src/blas1/rocblas_axpy.cpp index 8a510ea70..b263f5a7e 100644 --- a/library/src/blas1/rocblas_axpy.cpp +++ b/library/src/blas1/rocblas_axpy.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "definitions.h" #define NB_X 256 @@ -117,59 +117,6 @@ rocblas_axpy_template(rocblas_handle handle, return rocblas_status_success; } -/* ============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - -template<> -rocblas_status -rocblas_axpy(rocblas_handle handle, - rocblas_int n, - const float *alpha, - const float *x, rocblas_int incx, - float *y, rocblas_int incy){ - - return rocblas_axpy_template(handle, n, alpha, x, incx, y, incy); -} - -template<> -rocblas_status -rocblas_axpy(rocblas_handle handle, - rocblas_int n, - const double *alpha, - const double *x, rocblas_int incx, - double *y, rocblas_int incy){ - - return rocblas_axpy_template(handle, n, alpha, x, incx, y, incy); -} - -template<> -rocblas_status -rocblas_axpy(rocblas_handle handle, - rocblas_int n, - const rocblas_float_complex *alpha, - const rocblas_float_complex *x, rocblas_int incx, - rocblas_float_complex *y, rocblas_int incy){ - - return rocblas_axpy_template(handle, n, alpha, x, incx, y, incy); -} - -template<> -rocblas_status -rocblas_axpy(rocblas_handle handle, - rocblas_int n, - const rocblas_double_complex *alpha, - const rocblas_double_complex *x, rocblas_int incx, - rocblas_double_complex *y, rocblas_int incy){ - - return rocblas_axpy_template(handle, n, alpha, x, incx, y, incy); -} /* 
============================================================================================ */ @@ -189,7 +136,7 @@ rocblas_saxpy(rocblas_handle handle, const float *x, rocblas_int incx, float *y, rocblas_int incy){ - return rocblas_axpy(handle, n, alpha, x, incx, y, incy); + return rocblas_axpy_template(handle, n, alpha, x, incx, y, incy); } extern "C" @@ -200,7 +147,7 @@ rocblas_daxpy(rocblas_handle handle, const double *x, rocblas_int incx, double *y, rocblas_int incy){ - return rocblas_axpy(handle, n, alpha, x, incx, y, incy); + return rocblas_axpy_template(handle, n, alpha, x, incx, y, incy); } extern "C" @@ -211,7 +158,7 @@ rocblas_caxpy(rocblas_handle handle, const rocblas_float_complex *x, rocblas_int incx, rocblas_float_complex *y, rocblas_int incy){ - return rocblas_axpy(handle, n, alpha, x, incx, y, incy); + return rocblas_axpy_template(handle, n, alpha, x, incx, y, incy); } extern "C" @@ -222,7 +169,7 @@ rocblas_zaxpy(rocblas_handle handle, const rocblas_double_complex *x, rocblas_int incx, rocblas_double_complex *y, rocblas_int incy){ - return rocblas_axpy(handle, n, alpha, x, incx, y, incy); + return rocblas_axpy_template(handle, n, alpha, x, incx, y, incy); } diff --git a/library/src/blas1/rocblas_copy.cpp b/library/src/blas1/rocblas_copy.cpp index 5ef7c7db7..5a65720e0 100644 --- a/library/src/blas1/rocblas_copy.cpp +++ b/library/src/blas1/rocblas_copy.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "definitions.h" #define NB_X 256 @@ -91,55 +91,7 @@ rocblas_copy_template(rocblas_handle handle, return rocblas_status_success; } -/* ============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - -template<> -rocblas_status -rocblas_copy(rocblas_handle handle, - 
rocblas_int n, - const float *x, rocblas_int incx, - float* y, rocblas_int incy){ - - return rocblas_copy_template(handle, n, x, incx, y, incy); -} - -template<> -rocblas_status -rocblas_copy(rocblas_handle handle, - rocblas_int n, - const double *x, rocblas_int incx, - double* y, rocblas_int incy){ - return rocblas_copy_template(handle, n, x, incx, y, incy); -} - -template<> -rocblas_status -rocblas_copy(rocblas_handle handle, - rocblas_int n, - const rocblas_float_complex *x, rocblas_int incx, - rocblas_float_complex* y, rocblas_int incy){ - - return rocblas_copy_template(handle, n, x, incx, y, incy); -} - -template<> -rocblas_status -rocblas_copy(rocblas_handle handle, - rocblas_int n, - const rocblas_double_complex *x, rocblas_int incx, - rocblas_double_complex* y, rocblas_int incy){ - - return rocblas_copy_template(handle, n, x, incx, y, incy); -} /* ============================================================================================ */ /* @@ -156,7 +108,7 @@ rocblas_scopy(rocblas_handle handle, const float *x, rocblas_int incx, float* y, rocblas_int incy){ - return rocblas_copy(handle, n, x, incx, y, incy); + return rocblas_copy_template(handle, n, x, incx, y, incy); } @@ -167,7 +119,7 @@ rocblas_dcopy(rocblas_handle handle, const double *x, rocblas_int incx, double* y, rocblas_int incy){ - return rocblas_copy(handle, n, x, incx, y, incy); + return rocblas_copy_template(handle, n, x, incx, y, incy); } extern "C" @@ -177,7 +129,7 @@ rocblas_ccopy(rocblas_handle handle, const rocblas_float_complex *x, rocblas_int incx, rocblas_float_complex* y, rocblas_int incy){ - return rocblas_copy(handle, n, x, incx, y, incy); + return rocblas_copy_template(handle, n, x, incx, y, incy); } extern "C" @@ -187,7 +139,7 @@ rocblas_zcopy(rocblas_handle handle, const rocblas_double_complex *x, rocblas_int incx, rocblas_double_complex* y, rocblas_int incy){ - return rocblas_copy(handle, n, x, incx, y, incy); + return rocblas_copy_template(handle, n, x, incx, y, incy); } 
diff --git a/library/src/blas1/rocblas_dot.cpp b/library/src/blas1/rocblas_dot.cpp index a6fd054a6..664327237 100644 --- a/library/src/blas1/rocblas_dot.cpp +++ b/library/src/blas1/rocblas_dot.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "status.h" #include "definitions.h" #include "device_template.h" @@ -216,62 +216,6 @@ rocblas_dot_template(rocblas_handle handle, -/* ============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - -template<> -rocblas_status -rocblas_dot(rocblas_handle handle, - rocblas_int n, - const float *x, rocblas_int incx, - const float *y, rocblas_int incy, - float *result){ - - return rocblas_dot_template(handle, n, x, incx, y, incy, result); -} - - - -template<> -rocblas_status -rocblas_dot(rocblas_handle handle, - rocblas_int n, - const double *x, rocblas_int incx, - const double *y, rocblas_int incy, - double *result){ - - return rocblas_dot_template(handle, n, x, incx, y, incy, result); -} - -template<> -rocblas_status -rocblas_dot(rocblas_handle handle, - rocblas_int n, - const rocblas_float_complex *x, rocblas_int incx, - const rocblas_float_complex *y, rocblas_int incy, - rocblas_float_complex *result){ - - return rocblas_dot_template(handle, n, x, incx, y, incy, result); -} - -template<> -rocblas_status -rocblas_dot(rocblas_handle handle, - rocblas_int n, - const rocblas_double_complex *x, rocblas_int incx, - const rocblas_double_complex *y, rocblas_int incy, - rocblas_double_complex *result){ - - return rocblas_dot_template(handle, n, x, incx, y, incy, result); -} - @@ -292,7 +236,7 @@ rocblas_sdot(rocblas_handle handle, const float *y, rocblas_int incy, float *result){ - return rocblas_dot(handle, n, x, incx, y, incy, result); + return 
rocblas_dot_template(handle, n, x, incx, y, incy, result); } extern "C" @@ -303,7 +247,7 @@ rocblas_ddot(rocblas_handle handle, const double *y, rocblas_int incy, double *result){ - return rocblas_dot(handle, n, x, incx, y, incy, result); + return rocblas_dot_template(handle, n, x, incx, y, incy, result); } @@ -315,7 +259,7 @@ rocblas_cdotu(rocblas_handle handle, const rocblas_float_complex *y, rocblas_int incy, rocblas_float_complex *result){ - return rocblas_dot(handle, n, x, incx, y, incy, result); + return rocblas_dot_template(handle, n, x, incx, y, incy, result); } @@ -327,7 +271,7 @@ rocblas_zdotu(rocblas_handle handle, const rocblas_double_complex *y, rocblas_int incy, rocblas_double_complex *result){ - return rocblas_dot(handle, n, x, incx, y, incy, result); + return rocblas_dot_template(handle, n, x, incx, y, incy, result); } diff --git a/library/src/blas1/rocblas_nrm2.cpp b/library/src/blas1/rocblas_nrm2.cpp index 906ba16ef..b3e172539 100644 --- a/library/src/blas1/rocblas_nrm2.cpp +++ b/library/src/blas1/rocblas_nrm2.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "status.h" #include "definitions.h" #include "device_template.h" @@ -204,57 +204,6 @@ rocblas_nrm2_template(rocblas_handle handle, -/* ============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - -template<> -rocblas_status -rocblas_nrm2(rocblas_handle handle, - rocblas_int n, - const float *x, rocblas_int incx, - float *result){ - - return rocblas_nrm2_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_nrm2(rocblas_handle handle, - rocblas_int n, - const double *x, rocblas_int incx, - double *result){ - - return rocblas_nrm2_template(handle, n, x, incx, result); -} - -template<> 
-rocblas_status -rocblas_nrm2(rocblas_handle handle, - rocblas_int n, - const rocblas_float_complex *x, rocblas_int incx, - float *result){ - - return rocblas_nrm2_template(handle, n, x, incx, result); -} - -template<> -rocblas_status -rocblas_nrm2(rocblas_handle handle, - rocblas_int n, - const rocblas_double_complex *x, rocblas_int incx, - double *result){ - - return rocblas_nrm2_template(handle, n, x, incx, result); -} - - /* ============================================================================================ */ @@ -272,7 +221,7 @@ rocblas_snrm2(rocblas_handle handle, const float *x, rocblas_int incx, float *result){ - return rocblas_nrm2(handle, n, x, incx, result); + return rocblas_nrm2_template(handle, n, x, incx, result); } @@ -283,7 +232,7 @@ rocblas_dnrm2(rocblas_handle handle, const double *x, rocblas_int incx, double *result){ - return rocblas_nrm2(handle, n, x, incx, result); + return rocblas_nrm2_template(handle, n, x, incx, result); } @@ -294,7 +243,7 @@ rocblas_scnrm2(rocblas_handle handle, const rocblas_float_complex *x, rocblas_int incx, float *result){ - return rocblas_nrm2(handle, n, x, incx, result); + return rocblas_nrm2_template(handle, n, x, incx, result); } extern "C" @@ -304,7 +253,7 @@ rocblas_dznrm2(rocblas_handle handle, const rocblas_double_complex *x, rocblas_int incx, double *result){ - return rocblas_nrm2(handle, n, x, incx, result); + return rocblas_nrm2_template(handle, n, x, incx, result); } diff --git a/library/src/blas1/rocblas_scal.cpp b/library/src/blas1/rocblas_scal.cpp index fc2901ace..26415335c 100644 --- a/library/src/blas1/rocblas_scal.cpp +++ b/library/src/blas1/rocblas_scal.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "definitions.h" #define NB_X 256 @@ -107,59 +107,11 @@ rocblas_scal_template(rocblas_handle handle, /* ============================================================================================ */ - /* - * 
=========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - -template<> -rocblas_status -rocblas_scal(rocblas_handle handle, - rocblas_int n, - const float *alpha, - float *x, rocblas_int incx){ - - return rocblas_scal_template(handle, n, alpha, x, incx); -} -template<> -rocblas_status -rocblas_scal(rocblas_handle handle, - rocblas_int n, - const double *alpha, - double *x, rocblas_int incx){ - - return rocblas_scal_template(handle, n, alpha, x, incx); -} - -template<> -rocblas_status -rocblas_scal(rocblas_handle handle, - rocblas_int n, - const rocblas_float_complex *alpha, - rocblas_float_complex *x, rocblas_int incx){ - - return rocblas_scal_template(handle, n, alpha, x, incx); -} - -template<> -rocblas_status -rocblas_scal(rocblas_handle handle, - rocblas_int n, - const rocblas_double_complex *alpha, - rocblas_double_complex *x, rocblas_int incx){ - - return rocblas_scal_template(handle, n, alpha, x, incx); -} - -/* ============================================================================================ */ /* * =========================================================================== - * C89 wrapper + * C wrapper * =========================================================================== */ @@ -171,7 +123,7 @@ rocblas_sscal(rocblas_handle handle, const float *alpha, float *x, rocblas_int incx){ - return rocblas_scal(handle, n, alpha, x, incx); + return rocblas_scal_template(handle, n, alpha, x, incx); } extern "C" @@ -181,7 +133,7 @@ rocblas_dscal(rocblas_handle handle, const double *alpha, double *x, rocblas_int incx){ - return rocblas_scal(handle, n, alpha, x, incx); + return rocblas_scal_template(handle, n, alpha, x, incx); } @@ -192,7 +144,7 @@ rocblas_cscal(rocblas_handle handle, const rocblas_float_complex *alpha, rocblas_float_complex *x, rocblas_int incx){ - return rocblas_scal(handle, n, 
alpha, x, incx); + return rocblas_scal_template(handle, n, alpha, x, incx); } extern "C" @@ -202,7 +154,7 @@ rocblas_zscal(rocblas_handle handle, const rocblas_double_complex *alpha, rocblas_double_complex *x, rocblas_int incx){ - return rocblas_scal(handle, n, alpha, x, incx); + return rocblas_scal_template(handle, n, alpha, x, incx); } diff --git a/library/src/blas1/rocblas_swap.cpp b/library/src/blas1/rocblas_swap.cpp index c2249543f..4d540dbce 100644 --- a/library/src/blas1/rocblas_swap.cpp +++ b/library/src/blas1/rocblas_swap.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "definitions.h" @@ -97,55 +97,6 @@ rocblas_swap_template(rocblas_handle handle, return rocblas_status_success; } -/* ============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - -template<> -rocblas_status -rocblas_swap(rocblas_handle handle, - rocblas_int n, - float *x, rocblas_int incx, - float* y, rocblas_int incy){ - - return rocblas_swap_template(handle, n, x, incx, y, incy); -} - -template<> -rocblas_status -rocblas_swap(rocblas_handle handle, - rocblas_int n, - double *x, rocblas_int incx, - double* y, rocblas_int incy){ - - return rocblas_swap_template(handle, n, x, incx, y, incy); -} - -template<> -rocblas_status -rocblas_swap(rocblas_handle handle, - rocblas_int n, - rocblas_float_complex *x, rocblas_int incx, - rocblas_float_complex* y, rocblas_int incy){ - - return rocblas_swap_template(handle, n, x, incx, y, incy); -} - -template<> -rocblas_status -rocblas_swap(rocblas_handle handle, - rocblas_int n, - rocblas_double_complex *x, rocblas_int incx, - rocblas_double_complex* y, rocblas_int incy){ - - return rocblas_swap_template(handle, n, x, incx, y, incy); -} /* 
============================================================================================ */ @@ -163,7 +114,7 @@ rocblas_sswap(rocblas_handle handle, float *x, rocblas_int incx, float* y, rocblas_int incy){ - return rocblas_swap(handle, n, x, incx, y, incy); + return rocblas_swap_template(handle, n, x, incx, y, incy); } @@ -174,7 +125,7 @@ rocblas_dswap(rocblas_handle handle, double *x, rocblas_int incx, double* y, rocblas_int incy){ - return rocblas_swap(handle, n, x, incx, y, incy); + return rocblas_swap_template(handle, n, x, incx, y, incy); } @@ -185,7 +136,7 @@ rocblas_cswap(rocblas_handle handle, rocblas_float_complex *x, rocblas_int incx, rocblas_float_complex* y, rocblas_int incy){ - return rocblas_swap(handle, n, x, incx, y, incy); + return rocblas_swap_template(handle, n, x, incx, y, incy); } extern "C" @@ -195,7 +146,7 @@ rocblas_zswap(rocblas_handle handle, rocblas_double_complex *x, rocblas_int incx, rocblas_double_complex* y, rocblas_int incy){ - return rocblas_swap(handle, n, x, incx, y, incy); + return rocblas_swap_template(handle, n, x, incx, y, incy); } /* ============================================================================================ */ diff --git a/library/src/blas2/ger_device.h b/library/src/blas2/ger_device.h new file mode 100644 index 000000000..c6f09a00c --- /dev/null +++ b/library/src/blas2/ger_device.h @@ -0,0 +1,35 @@ + + /* + * =========================================================================== + * This file provide common device function for ger routines + * =========================================================================== + */ + +/* ============================================================================================ */ + + +#include "../blas1/device_template.h" + + + +template +static __device__ void +ger_device( + rocblas_int m, rocblas_int n, + T alpha, + const T * __restrict__ x, rocblas_int incx, + const T * __restrict__ y, rocblas_int incy, + T * A, rocblas_int lda) +{ + if (m <= 0 || n <= 
0) return; + + rocblas_int tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocblas_int ty = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + + if (tx < m && ty < n) { + A[tx + lda*ty] += (alpha) * x[tx*incx] * y[ty*incy]; + } +} + + + diff --git a/library/src/blas2/rocblas_gemv.cpp b/library/src/blas2/rocblas_gemv.cpp index 9503f2b26..63ea33285 100644 --- a/library/src/blas2/rocblas_gemv.cpp +++ b/library/src/blas2/rocblas_gemv.cpp @@ -7,7 +7,7 @@ #include "rocblas.h" -#include "rocblas.hpp" + #include "status.h" #include "definitions.h" #include "gemv_device.h" @@ -194,42 +194,6 @@ rocblas_gemv_template(rocblas_handle handle, } -/* ============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - - - -template<> -rocblas_status -rocblas_gemv(rocblas_handle handle, - rocblas_operation transA, rocblas_int m, rocblas_int n, - const float *alpha, - const float *A, rocblas_int lda, - const float *x, rocblas_int incx, - const float *beta, - float *y, rocblas_int incy){ - - return rocblas_gemv_template(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy); -} - -template<> -rocblas_status -rocblas_gemv(rocblas_handle handle, - rocblas_operation transA, rocblas_int m, rocblas_int n, - const double *alpha, - const double *A, rocblas_int lda, - const double *x, rocblas_int incx, - const double *beta, - double *y, rocblas_int incy){ - - return rocblas_gemv_template(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy); -} /* ============================================================================================ */ @@ -252,7 +216,7 @@ rocblas_sgemv(rocblas_handle handle, const float *beta, float *y, rocblas_int incy){ - return rocblas_gemv(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, 
incy); + return rocblas_gemv_template(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy); } @@ -266,6 +230,6 @@ rocblas_dgemv(rocblas_handle handle, const double *beta, double *y, rocblas_int incy){ - return rocblas_gemv(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy); + return rocblas_gemv_template(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy); } diff --git a/library/src/blas2/rocblas_ger.cpp b/library/src/blas2/rocblas_ger.cpp new file mode 100644 index 000000000..27e9f72c2 --- /dev/null +++ b/library/src/blas2/rocblas_ger.cpp @@ -0,0 +1,175 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ +#include + + + +#include "rocblas.h" +#include "status.h" +#include "definitions.h" +#include "ger_device.h" + +template +__global__ void +ger_kernel_host_pointer(hipLaunchParm lp, + rocblas_int m, rocblas_int n, + const T alpha, + const T * __restrict__ x, rocblas_int incx, + const T * __restrict__ y, rocblas_int incy, + T * A, rocblas_int lda) +{ + ger_device(m, n, alpha, x, incx, y, incy, A, lda); +} + +template +__global__ void +ger_kernel_device_pointer(hipLaunchParm lp, + rocblas_int m, rocblas_int n, + const T * alpha, + const T * __restrict__ x, rocblas_int incx, + const T * __restrict__ y, rocblas_int incy, + T * A, rocblas_int lda) +{ + ger_device(m, n, *alpha, x, incx, y, incy, A, lda); +} + + +/*! \brief BLAS Level 2 API + + \details + xGER performs the matrix-vector operations + + A := A + alpha*x*y**T + + where alpha is a scalars, x and y are vectors, and A is an + m by n matrix. + + @param[in] + handle rocblas_handle. + handle to the rocblas library context queue. + @param[in] + m rocblas_int + @param[in] + n rocblas_int + @param[in] + alpha + specifies the scalar alpha. + @param[in] + x pointer storing vector x on the GPU. 
+ @param[in] + incx rocblas_int + specifies the increment for the elements of x. + @param[in] + y pointer storing vector y on the GPU. + @param[in] + incy rocblas_int + specifies the increment for the elements of y. + @param[inout] + A pointer storing matrix A on the GPU. + @param[in] + lda rocblas_int + specifies the leading dimension of A. + + ********************************************************************/ + +template +rocblas_status +rocblas_ger_template(rocblas_handle handle, + rocblas_int m, rocblas_int n, + const T *alpha, + const T * x, rocblas_int incx, + const T * y, rocblas_int incy, + T * A, rocblas_int lda) +{ + + if(handle == nullptr) + return rocblas_status_invalid_handle; + else if ( m < 0 ) + return rocblas_status_invalid_size; + else if ( n < 0 ) + return rocblas_status_invalid_size; + else if ( x == nullptr ) + return rocblas_status_invalid_pointer; + else if ( incx < 0 ) + return rocblas_status_invalid_size; + else if ( y == nullptr ) + return rocblas_status_invalid_pointer; + else if ( incy < 0 ) + return rocblas_status_invalid_size; + else if ( A == nullptr ) + return rocblas_status_invalid_pointer; + else if ( lda < m ) + return rocblas_status_invalid_size; + + /* + * Quick return if possible. 
Not Argument error + */ + + if ( m==0 || n == 0 ) + return rocblas_status_success; + + hipStream_t rocblas_stream; + RETURN_IF_ROCBLAS_ERROR(rocblas_get_stream(handle, &rocblas_stream)); + + #define GEMV_DIM_X 128 + #define GEMV_DIM_Y 8 + rocblas_int blocksX = ((m-1) / GEMV_DIM_X) + 1; + rocblas_int blocksY = ((n-1) / GEMV_DIM_Y) + 1; + + dim3 ger_grid( blocksX, blocksY, 1 ); + dim3 ger_threads(GEMV_DIM_X, GEMV_DIM_Y, 1 ); + + if( rocblas_get_pointer_location((void*)alpha) == rocblas_mem_location_device ) + { + hipLaunchKernel(HIP_KERNEL_NAME(ger_kernel_device_pointer), dim3(ger_grid), dim3(ger_threads), 0, rocblas_stream, + m, n, alpha, x, incx, y, incy, A, lda); + } + else{ + T h_alpha_scalar = *alpha; + hipLaunchKernel(HIP_KERNEL_NAME(ger_kernel_host_pointer), dim3(ger_grid), dim3(ger_threads), 0, rocblas_stream, + m, n, h_alpha_scalar, x, incx, y, incy, A, lda); + } + #undef GEMV_DIM_X + #undef GEMV_DIM_Y + + return rocblas_status_success; +} + + +/* ============================================================================================ */ + + /* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + + + +extern "C" +rocblas_status +rocblas_sger(rocblas_handle handle, + rocblas_int m, rocblas_int n, + const float *alpha, + const float *x, rocblas_int incx, + const float *y, rocblas_int incy, + float *A, rocblas_int lda){ + + return rocblas_ger_template(handle, m, n, alpha, x, incx, y, incy, A, lda); + +} + +extern "C" +rocblas_status +rocblas_dger(rocblas_handle handle, + rocblas_int m, rocblas_int n, + const double *alpha, + const double *x, rocblas_int incx, + const double *y, rocblas_int incy, + double *A, rocblas_int lda){ + + return rocblas_ger_template(handle, m, n, alpha, x, incx, y, incy, A, lda); + +} diff --git a/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bjlk_DB.yaml 
b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bjlk_DB.yaml new file mode 100644 index 000000000..c60cb13fa --- /dev/null +++ b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bjlk_DB.yaml @@ -0,0 +1,351 @@ +- Fiji +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false +- - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: &id001 [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 48 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + 
TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 6 + ThreadTile1: 6 + ThreadTileEdge: 6 + ThreadTileShape: 0 + Valid: false + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: *id001 + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - 
AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: *id001 + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 16 + WorkGroupEdge: 16 + WorkGroupMapping: 1 + WorkGroupShape: 0 +- [2, 3, 0, 1] +- - - -1 + - - - -1 + - - - 32 + - - [32, 2] + - [128, 1] + - [928, 2] + - [1184, 1] + - [2144, 2] + - [2528, 1] + - [2944, 2] + - [3392, 1] + - [-1, 2] + - - 64 + - - [704, 1] + - [-1, 2] + - - 128 + - - [352, 1] + - [2528, 2] + - [2944, 1] + - [3392, 2] + - [-1, 1] + - - 224 + - - [32, 2] + - [128, 1] + - [928, 2] + - [1184, 1] + - [1792, 2] + - [2528, 1] + - [2944, 2] + - [3392, 1] + - [-1, 2] + - - 352 + - - [32, 2] + - [128, 1] + - [928, 2] + - 
[1472, 1] + - [3392, 2] + - [-1, 1] + - - 512 + - - [32, 2] + - [64, 1] + - [704, 2] + - [1184, 1] + - [3392, 2] + - [-1, 1] + - - 704 + - - [32, 2] + - [64, 1] + - [704, 2] + - [1184, 1] + - [1472, 0] + - [2944, 2] + - [3392, 1] + - [-1, 2] + - - 928 + - - [352, 2] + - [928, 1] + - [-1, 2] + - - 1184 + - - [352, 2] + - [512, 1] + - [2144, 2] + - [2528, 1] + - [-1, 2] + - - 1472 + - - [352, 2] + - [512, 1] + - [704, 0] + - [928, 2] + - [1184, 1] + - [-1, 2] + - - 1792 + - - [704, 2] + - [928, 1] + - [1792, 2] + - [2144, 0] + - [-1, 2] + - - 2144 + - - [224, 2] + - [352, 1] + - [-1, 2] + - - 2528 + - - [224, 2] + - [704, 1] + - [-1, 2] + - - -1 + - - [128, 2] + - [224, 1] + - [-1, 2] diff --git a/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bjlk_SB.yaml new file mode 100644 index 000000000..17229a8df --- /dev/null +++ b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bjlk_SB.yaml @@ -0,0 +1,879 @@ +- Fiji +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false +- - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + 
LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileEdge: 2 + ThreadTileShape: 2 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 64 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: 
[0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 32 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: -4 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + 
SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 16 + WorkGroupEdge: 16 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: -2 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 16 + 
EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 1 + ThreadTile1: 1 + ThreadTileEdge: 1 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 4 + WorkGroupEdge: 4 + WorkGroupMapping: 1 + WorkGroupShape: -4 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + 
ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 16 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 2 +- [2, 3, 0, 1] +- - - -1 + - - - -1 + - - - 16 + - - [1472, 4] + - [1696, 0] + - [1936, 4] + - [-1, 0] + - - 32 + - - [1936, 4] + - [3376, 0] + - [3712, 4] + - [-1, 0] + - - 64 + - - [896, 4] + - [4064, 0] + - [5216, 3] + - [5632, 0] + - [-1, 3] + - - 112 + - - [464, 4] + - [4432, 0] + - [-1, 3] + - - 176 + - - [352, 4] + - [1696, 0] + - [1936, 3] + - [2752, 0] + - [4064, 3] + - [5216, 5] + - [5632, 0] + - [6064, 5] + - [-1, 3] + - - 256 + - - [176, 4] + - [1936, 0] + - [-1, 3] + - - 352 + - - [176, 4] + - [1472, 0] + - [2464, 3] + - [2752, 0] + - [3056, 3] + - [3376, 5] + - [-1, 3] + - - 464 + - - [112, 4] + - [1072, 0] + - [1264, 3] + - [1472, 5] + - [1936, 3] + - [2192, 0] + - [6976, 3] + - [7456, 2] + - [-1, 3] + - - 592 + - - [64, 4] + - [736, 0] + - [1472, 3] + - [1696, 0] + - [1936, 3] + - [2192, 5] + - [6512, 3] + - [6976, 5] + - [-1, 3] + - - 736 + - - [64, 4] + - [592, 0] + - [7456, 3] + - [-1, 5] + - - 896 + - - [64, 4] + - [464, 0] + - [1936, 3] + - [2192, 5] + - [3712, 3] + - [4064, 1] + - [4432, 3] + - [4816, 1] + - [5216, 3] + - 
[5632, 2] + - [6064, 3] + - [6512, 5] + - [-1, 2] + - - 1072 + - - [32, 4] + - [464, 0] + - [2464, 3] + - [2752, 5] + - [3376, 3] + - [3712, 5] + - [4064, 2] + - [4432, 3] + - [5216, 2] + - [5632, 5] + - [-1, 2] + - - 1264 + - - [32, 4] + - [352, 0] + - [3712, 3] + - [4432, 2] + - [4816, 5] + - [5216, 3] + - [5632, 2] + - [7456, 3] + - [-1, 1] + - - 1472 + - - [32, 4] + - [352, 0] + - [2464, 3] + - [2752, 5] + - [3056, 2] + - [3712, 3] + - [4432, 2] + - [4816, 5] + - [5216, 3] + - [5632, 5] + - [6512, 2] + - [6976, 5] + - [7456, 1] + - [-1, 3] + - - 1696 + - - [16, 0] + - [32, 4] + - [256, 0] + - [464, 3] + - [592, 0] + - [1936, 3] + - [2192, 5] + - [2464, 3] + - [2752, 1] + - [3376, 3] + - [3712, 1] + - [4064, 2] + - [4432, 3] + - [5216, 2] + - [5632, 5] + - [6064, 3] + - [6512, 2] + - [6976, 3] + - [7456, 1] + - [-1, 3] + - - 1936 + - - [16, 0] + - [32, 4] + - [256, 0] + - [352, 3] + - [464, 5] + - [2192, 3] + - [2464, 2] + - [3056, 3] + - [3376, 2] + - [3712, 5] + - [4432, 2] + - [4816, 3] + - [5216, 1] + - [5632, 2] + - [-1, 3] + - - 2192 + - - [176, 0] + - [2464, 3] + - [3056, 2] + - [5216, 3] + - [5632, 1] + - [6064, 2] + - [6976, 3] + - [-1, 1] + - - 2464 + - - [176, 0] + - [1936, 3] + - [2464, 2] + - [2752, 3] + - [3376, 2] + - [3712, 5] + - [4064, 3] + - [4816, 2] + - [5216, 3] + - [5632, 2] + - [6064, 5] + - [6976, 2] + - [7456, 5] + - [-1, 3] + - - 2752 + - - [112, 0] + - [256, 3] + - [352, 0] + - [1936, 3] + - [2192, 2] + - [2464, 1] + - [3056, 3] + - [3376, 2] + - [3712, 3] + - [4064, 2] + - [4432, 1] + - [4816, 3] + - [5632, 5] + - [-1, 3] + - - 3056 + - - [112, 0] + - [256, 5] + - [1696, 3] + - [1936, 5] + - [2192, 3] + - [2464, 2] + - [4064, 3] + - [4432, 2] + - [5216, 3] + - [5632, 5] + - [6512, 3] + - [6976, 2] + - [-1, 5] + - - 3376 + - - [112, 0] + - [176, 5] + - [1472, 3] + - [1696, 2] + - [4064, 1] + - [4432, 3] + - [5216, 1] + - [5632, 5] + - [6064, 1] + - [6512, 3] + - [6976, 1] + - [-1, 3] + - - 3712 + - - [112, 0] + - [176, 5] + - [256, 3] 
+ - [352, 5] + - [464, 3] + - [592, 5] + - [1264, 3] + - [1472, 1] + - [1696, 2] + - [1936, 1] + - [3056, 3] + - [7456, 1] + - [-1, 3] + - - 4064 + - - [112, 0] + - [176, 5] + - [1264, 3] + - [1696, 5] + - [5632, 3] + - [6064, 2] + - [-1, 3] + - - 4432 + - - [64, 0] + - [112, 5] + - [1264, 3] + - [1936, 1] + - [2192, 3] + - [2464, 5] + - [2752, 3] + - [3056, 1] + - [3712, 3] + - [4432, 1] + - [5216, 3] + - [5632, 1] + - [6976, 3] + - [-1, 5] + - - 4816 + - - [64, 0] + - [112, 5] + - [352, 3] + - [464, 5] + - [1264, 3] + - [1472, 1] + - [1696, 2] + - [3712, 3] + - [4432, 2] + - [4816, 3] + - [5216, 1] + - [5632, 3] + - [6064, 5] + - [6512, 2] + - [6976, 3] + - [-1, 5] + - - 5216 + - - [64, 0] + - [112, 5] + - [1264, 3] + - [1936, 1] + - [2192, 2] + - [3376, 1] + - [3712, 3] + - [5632, 1] + - [6064, 3] + - [6512, 5] + - [7456, 3] + - [-1, 5] + - - 5632 + - - [64, 0] + - [112, 5] + - [1072, 3] + - [1264, 5] + - [1472, 3] + - [1696, 2] + - [2192, 3] + - [2464, 2] + - [2752, 3] + - [3056, 1] + - [3376, 3] + - [3712, 1] + - [4816, 3] + - [5216, 2] + - [5632, 5] + - [6064, 2] + - [6512, 3] + - [6976, 5] + - [7456, 3] + - [-1, 5] + - - 6064 + - - [64, 0] + - [112, 5] + - [176, 3] + - [256, 5] + - [464, 3] + - [592, 5] + - [1472, 3] + - [1936, 2] + - [2192, 3] + - [2464, 2] + - [3056, 1] + - [3712, 3] + - [4064, 1] + - [4816, 5] + - [5632, 3] + - [6064, 2] + - [6976, 3] + - [-1, 5] + - - 6512 + - - [64, 0] + - [256, 5] + - [1072, 3] + - [1264, 2] + - [1936, 1] + - [2192, 3] + - [3376, 1] + - [3712, 3] + - [4064, 1] + - [4432, 3] + - [4816, 1] + - [6976, 3] + - [7456, 1] + - [-1, 2] + - - 6976 + - - [64, 0] + - [112, 3] + - [176, 5] + - [736, 3] + - [896, 5] + - [1472, 3] + - [1696, 1] + - [1936, 2] + - [2192, 1] + - [2464, 3] + - [3056, 1] + - [3712, 3] + - [4064, 1] + - [4816, 2] + - [5216, 5] + - [5632, 2] + - [6064, 3] + - [6512, 5] + - [-1, 3] + - - 7456 + - - [64, 0] + - [112, 3] + - [176, 5] + - [1072, 3] + - [1264, 2] + - [2192, 1] + - [2464, 2] + - [2752, 3] + - 
[3056, 1] + - [4432, 3] + - [4816, 5] + - [6512, 2] + - [6976, 3] + - [7456, 1] + - [-1, 3] + - - -1 + - - [64, 0] + - [1072, 3] + - [1472, 2] + - [2752, 1] + - [3056, 5] + - [4064, 3] + - [4816, 5] + - [6976, 3] + - [7456, 5] + - [-1, 1] diff --git a/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bljk_DB.yaml b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bljk_DB.yaml new file mode 100644 index 000000000..e1018f6de --- /dev/null +++ b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bljk_DB.yaml @@ -0,0 +1,267 @@ +- Fiji +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false +- - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + 
IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + 
UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 16 + WorkGroupEdge: 16 + WorkGroupMapping: 1 + WorkGroupShape: 0 +- [2, 3, 0, 1] +- - - -1 + - - - -1 + - - - 32 + - - [32, 1] + - [128, 0] + - [2944, 1] + - [-1, 0] + - - 64 + - - [32, 0] + - [64, 1] + - [704, 0] + - [-1, 1] + - - 128 + - - [32, 0] + - [64, 1] + - [352, 0] + - [3392, 1] + - [-1, 0] + - - 224 + - - [32, 1] + - [128, 0] + - [928, 1] + - [1184, 0] + - [2528, 1] + - [-1, 0] + - - 352 + - - [32, 1] + - [128, 0] + - [1792, 1] + - [3392, 0] + - [-1, 1] + - - 512 + - - [32, 1] + - [64, 0] + - [1184, 1] + - [2528, 0] + - [3392, 1] + - [-1, 0] + - - 704 + - - [32, 1] + - [64, 0] + - [928, 1] + - [1472, 0] + - [-1, 1] + - - 928 + - - [928, 1] + - [1184, 0] + - [2528, 1] + - [2944, 0] + - [-1, 1] + - - 1184 + - - [512, 1] + - [1184, 0] + - [-1, 1] + - - 1472 + - - [512, 1] + - [704, 0] + - [-1, 1] + - - 1792 + - - [352, 1] + - [928, 0] + - [-1, 1] + - - 2144 + - - [352, 1] + - [512, 0] + - [-1, 1] + - - 2944 + - - [224, 1] + - [512, 0] + - [-1, 1] + - - 3392 + - - [224, 1] + - [352, 0] + - [928, 1] + - [1472, 0] + - [-1, 1] + - - -1 + - - [224, 1] + - [352, 0] + - [-1, 1] diff --git a/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bljk_SB.yaml new file mode 100644 index 000000000..4d74d5b57 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Ailk_Bljk_SB.yaml @@ -0,0 +1,854 @@ +- Fiji +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 
0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false +- - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 16 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 2 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + 
AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 64 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 32 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: -4 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PadLDS: 1 + Prefetch: false + ProblemType: + 
AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: -2 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: -2 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + 
NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 8 + 
WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: -2 +- [2, 3, 0, 1] +- - - -1 + - - - -1 + - - - 16 + - - [-1, 2] + - - 32 + - - [2464, 2] + - [2752, 4] + - [3712, 2] + - [4064, 4] + - [4432, 2] + - [4816, 3] + - [5632, 2] + - [6064, 3] + - [-1, 2] + - - 64 + - - [-1, 2] + - - 112 + - - [4064, 2] + - [5632, 3] + - [6064, 0] + - [6512, 3] + - [7456, 0] + - [-1, 4] + - - 176 + - - [2464, 2] + - [3056, 3] + - [3376, 0] + - [-1, 3] + - - 256 + - - [1936, 2] + - [2192, 0] + - [2464, 3] + - [2752, 0] + - [3056, 3] + - [3376, 0] + - [3712, 3] + - [4064, 4] + - [7456, 3] + - [-1, 0] + - - 352 + - - [1264, 2] + - [2464, 0] + - [4816, 3] + - [5216, 4] + - [5632, 3] + - [6064, 0] + - [7456, 3] + - [-1, 4] + - - 464 + - - [896, 2] + - [1072, 0] + - [1264, 3] + - [1936, 0] + - [3712, 3] + - [4064, 0] + - [4432, 3] + - [4816, 0] + - [5632, 3] + - [6064, 4] + - [7456, 3] + - [-1, 0] + - - 592 + - - [736, 2] + - [896, 3] + - [1072, 0] + - [1264, 3] + - [1696, 0] + - [2192, 3] + - [2464, 0] + - [4432, 3] + - [4816, 4] + - [5632, 3] + - [-1, 0] + - - 736 + - - [592, 2] + - [896, 3] + - [1072, 0] + - [3712, 3] + - [4064, 1] + - [4816, 3] + - [-1, 0] + - - 896 + - - [464, 2] + - [592, 0] + - [896, 3] + - [1072, 0] + - [1936, 3] + - [2464, 0] + - [3056, 3] + - [3376, 4] + - [4064, 3] + - [6512, 0] + - [6976, 4] + - [-1, 0] + - - 1072 + - - [464, 2] + - [592, 4] + - [1696, 3] + - [1936, 0] + - [2464, 3] + - [2752, 0] + - [3376, 3] + - [4432, 0] + - [4816, 3] + - [7456, 0] + - [-1, 3] + - - 1264 + - - [352, 2] + - [592, 0] + - [2752, 3] + - [6976, 0] + - [7456, 3] + - [-1, 0] + - - 1472 + - - [256, 2] + - [352, 3] + - [464, 0] + - [592, 3] + - [736, 0] + - [2464, 3] + - [7456, 0] + - [-1, 3] + - - 1696 + - - [256, 2] + - [352, 0] + - [464, 4] + - [592, 2] + - [1472, 3] + - [1696, 4] + - [1936, 3] + - [2192, 0] + - [2464, 3] + - [3056, 0] + - [3376, 3] + - [4064, 0] + - [4432, 3] + - [5632, 0] + - [6064, 3] + - [6976, 0] + - [-1, 3] + - - 1936 + - - [256, 2] + - [464, 0] + - 
[1696, 3] + - [1936, 4] + - [3056, 0] + - [3376, 3] + - [4816, 0] + - [5216, 3] + - [5632, 4] + - [6064, 0] + - [6512, 3] + - [-1, 0] + - - 2192 + - - [176, 2] + - [464, 0] + - [736, 3] + - [896, 0] + - [1936, 3] + - [2192, 0] + - [2464, 3] + - [4816, 0] + - [5632, 3] + - [6064, 1] + - [6976, 0] + - [7456, 1] + - [-1, 3] + - - 2464 + - - [176, 2] + - [464, 0] + - [1696, 3] + - [1936, 1] + - [2192, 3] + - [3056, 0] + - [3376, 3] + - [4064, 0] + - [4432, 3] + - [4816, 0] + - [5632, 3] + - [7456, 0] + - [-1, 1] + - - 2752 + - - [176, 2] + - [256, 3] + - [352, 2] + - [896, 3] + - [1072, 4] + - [1264, 3] + - [1472, 4] + - [1936, 3] + - [3056, 0] + - [3376, 3] + - [3712, 4] + - [4064, 0] + - [4816, 3] + - [5632, 0] + - [6064, 1] + - [6512, 4] + - [6976, 1] + - [-1, 3] + - - 3056 + - - [112, 2] + - [352, 0] + - [1696, 3] + - [2192, 0] + - [2464, 3] + - [3056, 0] + - [3376, 4] + - [3712, 0] + - [4432, 3] + - [4816, 0] + - [5216, 3] + - [5632, 1] + - [6064, 0] + - [6976, 4] + - [7456, 0] + - [-1, 3] + - - 3376 + - - [112, 2] + - [352, 0] + - [736, 3] + - [896, 0] + - [1264, 3] + - [1472, 4] + - [1696, 3] + - [1936, 4] + - [3056, 0] + - [3712, 3] + - [4064, 1] + - [5216, 0] + - [6064, 1] + - [7456, 0] + - [-1, 3] + - - 3712 + - - [112, 2] + - [256, 0] + - [896, 3] + - [1072, 1] + - [1472, 3] + - [1696, 0] + - [1936, 3] + - [2192, 0] + - [2464, 4] + - [2752, 3] + - [3056, 0] + - [3376, 3] + - [4064, 0] + - [4432, 4] + - [4816, 1] + - [5632, 4] + - [6976, 0] + - [-1, 3] + - - 4064 + - - [112, 2] + - [256, 0] + - [352, 3] + - [464, 0] + - [592, 3] + - [736, 0] + - [1072, 3] + - [1472, 4] + - [1696, 3] + - [1936, 1] + - [2464, 0] + - [2752, 3] + - [3056, 0] + - [3376, 1] + - [3712, 0] + - [4064, 3] + - [4432, 1] + - [4816, 4] + - [5216, 0] + - [5632, 3] + - [6064, 0] + - [6512, 4] + - [6976, 0] + - [7456, 4] + - [-1, 3] + - - 4432 + - - [112, 2] + - [736, 3] + - [896, 0] + - [1264, 3] + - [1696, 0] + - [1936, 1] + - [2192, 0] + - [2464, 1] + - [2752, 3] + - [3056, 1] + - [3712, 
0] + - [4064, 4] + - [4816, 1] + - [5216, 3] + - [5632, 1] + - [6512, 0] + - [7456, 4] + - [-1, 3] + - - 4816 + - - [64, 2] + - [112, 0] + - [352, 3] + - [592, 0] + - [896, 3] + - [1072, 0] + - [1264, 4] + - [1472, 1] + - [1696, 3] + - [1936, 0] + - [2464, 4] + - [2752, 3] + - [3056, 0] + - [3712, 3] + - [4432, 4] + - [6064, 0] + - [6512, 4] + - [-1, 0] + - - 5216 + - - [64, 2] + - [112, 0] + - [176, 1] + - [256, 3] + - [352, 0] + - [592, 3] + - [736, 0] + - [1264, 3] + - [1472, 0] + - [1696, 1] + - [1936, 0] + - [2464, 3] + - [2752, 4] + - [3376, 1] + - [3712, 0] + - [4432, 4] + - [5216, 0] + - [5632, 1] + - [6064, 4] + - [-1, 0] + - - 5632 + - - [64, 2] + - [112, 3] + - [176, 2] + - [736, 3] + - [896, 0] + - [1264, 4] + - [1696, 3] + - [1936, 0] + - [2752, 3] + - [3056, 1] + - [4064, 4] + - [4816, 0] + - [5216, 4] + - [5632, 3] + - [6976, 0] + - [7456, 4] + - [-1, 0] + - - 6064 + - - [64, 2] + - [176, 0] + - [352, 3] + - [464, 0] + - [1264, 3] + - [1472, 4] + - [1696, 0] + - [2464, 3] + - [2752, 4] + - [3376, 1] + - [4432, 0] + - [4816, 3] + - [5216, 0] + - [5632, 4] + - [6064, 3] + - [6512, 1] + - [-1, 0] + - - 6512 + - - [64, 2] + - [176, 0] + - [464, 3] + - [736, 0] + - [896, 3] + - [1072, 1] + - [1264, 4] + - [1696, 3] + - [1936, 4] + - [2192, 3] + - [2464, 0] + - [2752, 1] + - [3376, 0] + - [3712, 4] + - [4064, 0] + - [4816, 4] + - [5216, 3] + - [-1, 0] + - - 6976 + - - [64, 2] + - [176, 0] + - [256, 3] + - [464, 0] + - [736, 3] + - [896, 0] + - [1072, 3] + - [1264, 0] + - [1472, 4] + - [1696, 3] + - [2464, 0] + - [2752, 1] + - [3376, 0] + - [3712, 4] + - [4064, 3] + - [4432, 0] + - [4816, 1] + - [-1, 0] + - - 7456 + - - [64, 2] + - [176, 0] + - [896, 3] + - [1072, 1] + - [1696, 3] + - [1936, 0] + - [2192, 1] + - [2464, 0] + - [2752, 4] + - [4064, 0] + - [4432, 1] + - [5216, 3] + - [5632, 0] + - [6064, 1] + - [-1, 0] + - - -1 + - - [64, 2] + - [1696, 0] + - [1936, 3] + - [2192, 0] + - [2464, 4] + - [3376, 0] + - [4064, 1] + - [4432, 4] + - [5216, 1] + - 
[5632, 0] + - [6064, 3] + - [6512, 4] + - [6976, 0] + - [-1, 3] diff --git a/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bjlk_DB.yaml b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bjlk_DB.yaml new file mode 100644 index 000000000..4c8ee9533 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bjlk_DB.yaml @@ -0,0 +1,248 @@ +- Fiji +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false +- - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: &id001 [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + 
IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: *id001 + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + 
VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 16 + WorkGroupEdge: 16 + WorkGroupMapping: 1 + WorkGroupShape: 0 +- [2, 3, 0, 1] +- - - -1 + - - - -1 + - - - 32 + - - [2944, 1] + - [-1, 0] + - - 64 + - - [512, 1] + - [704, 0] + - [-1, 1] + - - 128 + - - [2944, 1] + - [3392, 0] + - [-1, 1] + - - 224 + - - [2144, 1] + - [-1, 0] + - - 352 + - - [1792, 1] + - [2528, 0] + - [2944, 1] + - [-1, 0] + - - 512 + - - [1184, 1] + - [1792, 0] + - [-1, 1] + - - 704 + - - [928, 1] + - [1472, 0] + - [-1, 1] + - - 1184 + - - [512, 1] + - [928, 0] + - [-1, 1] + - - 1472 + - - [512, 1] + - [704, 0] + - [-1, 1] + - - 1792 + - - [224, 1] + - [512, 0] + - [928, 1] + - [1184, 0] + - [-1, 1] + - - 2528 + - - [224, 1] + - [512, 0] + - [-1, 1] + - - 2944 + - - [224, 1] + - [352, 0] + - [1184, 1] + - [1472, 0] + - [-1, 1] + - - 3392 + - - [224, 1] + - [352, 0] + - [-1, 1] + - - -1 + - - [128, 1] + - [224, 0] + - [-1, 1] diff --git a/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bjlk_SB.yaml new file mode 100644 index 000000000..6fed9d816 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bjlk_SB.yaml @@ -0,0 +1,1044 @@ +- Fiji +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false +- - 
AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileEdge: 2 + ThreadTileShape: 2 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: -2 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + 
NumLoadsPerpendicularB: 4 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 16 + WorkGroupEdge: 16 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 16 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 8 + MacroTile1: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + 
IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 1 + ThreadTile1: 1 + ThreadTileEdge: 1 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + 
VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 16 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 2 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 256 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 32 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 4 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: 
true + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 64 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 8 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 32 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: -4 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 8 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + 
IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 +- [2, 3, 0, 1] +- - - -1 + - - - -1 + - - - 16 + - - [1472, 2] + - [-1, 0] + - - 32 + - - [1936, 2] + - [-1, 0] + - - 64 + - - [896, 2] + - [-1, 0] + - - 112 + - - [464, 2] + - [4064, 0] + - [4816, 4] + - [5216, 6] + - [6976, 4] + - [7456, 6] + - [-1, 4] + - - 176 + - - [352, 2] + - [2464, 0] + - [2752, 4] + - [3056, 6] + - [5632, 4] + - [6512, 0] + - [-1, 4] + - - 256 + - - [176, 2] + - [1936, 0] + - [3376, 6] + - [4064, 4] + - [4432, 0] + - [4816, 6] + - [5216, 4] + - [6064, 6] + - [6976, 4] + - [7456, 6] + - [-1, 4] + - - 352 + - - [176, 2] + - [1264, 0] + - [1472, 4] + - [1696, 6] + - [2464, 4] + - [2752, 0] + - [3376, 6] + - [4064, 4] + - [4432, 6] + - [5632, 4] + - [6976, 6] + - [-1, 4] + - - 464 + - - [112, 2] + - [896, 0] + - [1264, 6] + - [3376, 4] + - [3712, 6] + - [4064, 4] + - [4432, 6] + - [4816, 4] + - [5632, 6] + - [6064, 1] + - [6512, 6] + - [-1, 4] + - - 592 + - - [64, 2] + - [736, 0] + - [1072, 4] + - [1264, 6] + - [1472, 4] + - [1936, 0] + - [2464, 4] + - [3056, 6] + - [3376, 4] + - [3712, 6] + - [4064, 4] + - [4432, 6] + - [4816, 4] + - [5216, 6] + - [5632, 4] + - [6064, 6] + - [6512, 1] + - [6976, 6] + - [-1, 4] + - - 736 + - - [64, 2] + - [592, 0] + - [736, 6] + - [896, 4] + - 
[1696, 6] + - [2192, 4] + - [3376, 6] + - [4432, 4] + - [4816, 6] + - [5216, 4] + - [6064, 6] + - [6512, 1] + - [7456, 6] + - [-1, 4] + - - 896 + - - [64, 2] + - [464, 0] + - [1936, 6] + - [2192, 4] + - [3056, 6] + - [3376, 3] + - [4064, 6] + - [4432, 4] + - [5216, 6] + - [5632, 4] + - [-1, 6] + - - 1072 + - - [32, 2] + - [464, 0] + - [2464, 6] + - [2752, 4] + - [3376, 6] + - [3712, 4] + - [5216, 6] + - [5632, 4] + - [-1, 6] + - - 1264 + - - [16, 0] + - [32, 2] + - [352, 0] + - [3712, 6] + - [4064, 1] + - [4432, 6] + - [4816, 4] + - [5216, 6] + - [5632, 3] + - [6064, 6] + - [6512, 3] + - [7456, 6] + - [-1, 1] + - - 1472 + - - [16, 0] + - [32, 2] + - [352, 0] + - [2464, 6] + - [2752, 3] + - [3056, 6] + - [3376, 5] + - [3712, 6] + - [4064, 1] + - [5216, 6] + - [6064, 4] + - [6976, 6] + - [-1, 4] + - - 1696 + - - [16, 0] + - [32, 2] + - [256, 0] + - [464, 6] + - [592, 0] + - [736, 6] + - [896, 4] + - [1072, 6] + - [1264, 4] + - [1472, 6] + - [2192, 4] + - [3056, 6] + - [3376, 4] + - [3712, 6] + - [5632, 4] + - [6064, 6] + - [-1, 4] + - - 1936 + - - [16, 0] + - [32, 2] + - [256, 0] + - [1472, 4] + - [1696, 6] + - [1936, 4] + - [2192, 6] + - [2464, 4] + - [2752, 6] + - [3056, 1] + - [3376, 6] + - [4432, 4] + - [4816, 6] + - [5216, 3] + - [5632, 4] + - [6064, 3] + - [6512, 4] + - [6976, 5] + - [7456, 1] + - [-1, 4] + - - 2192 + - - [16, 0] + - [32, 2] + - [176, 0] + - [256, 4] + - [464, 6] + - [736, 4] + - [896, 5] + - [1072, 4] + - [1472, 6] + - [2192, 4] + - [2464, 6] + - [3056, 5] + - [3712, 6] + - [4432, 4] + - [4816, 5] + - [5216, 1] + - [5632, 6] + - [6512, 5] + - [6976, 4] + - [7456, 5] + - [-1, 4] + - - 2464 + - - [176, 0] + - [256, 4] + - [736, 6] + - [896, 4] + - [1072, 6] + - [1264, 4] + - [1472, 6] + - [1936, 4] + - [2192, 6] + - [2752, 4] + - [3376, 6] + - [4816, 4] + - [5632, 1] + - [6512, 4] + - [6976, 1] + - [-1, 4] + - - 2752 + - - [112, 0] + - [256, 6] + - [352, 0] + - [896, 6] + - [1072, 5] + - [1264, 6] + - [1472, 5] + - [2464, 6] + - [2752, 1] + - 
[3056, 6] + - [4064, 1] + - [4432, 3] + - [4816, 4] + - [5216, 5] + - [6976, 1] + - [7456, 6] + - [-1, 4] + - - 3056 + - - [16, 0] + - [32, 2] + - [112, 0] + - [1696, 6] + - [1936, 4] + - [2192, 1] + - [2752, 6] + - [3056, 4] + - [3376, 6] + - [3712, 1] + - [4064, 4] + - [4432, 5] + - [4816, 4] + - [5632, 6] + - [6064, 1] + - [6512, 6] + - [6976, 1] + - [7456, 6] + - [-1, 4] + - - 3376 + - - [112, 0] + - [736, 6] + - [896, 3] + - [1264, 6] + - [1472, 4] + - [1936, 6] + - [2192, 1] + - [2464, 6] + - [2752, 1] + - [3056, 4] + - [4432, 1] + - [4816, 6] + - [5216, 1] + - [5632, 4] + - [6064, 5] + - [6512, 1] + - [7456, 5] + - [-1, 4] + - - 3712 + - - [112, 0] + - [896, 6] + - [1072, 5] + - [2464, 6] + - [2752, 1] + - [3056, 4] + - [3712, 5] + - [4064, 4] + - [4432, 1] + - [5216, 5] + - [5632, 4] + - [6064, 6] + - [6976, 4] + - [7456, 3] + - [-1, 5] + - - 4064 + - - [64, 0] + - [176, 6] + - [256, 4] + - [352, 6] + - [464, 4] + - [592, 6] + - [736, 4] + - [1072, 6] + - [1472, 4] + - [2192, 1] + - [2464, 4] + - [2752, 1] + - [4064, 4] + - [4432, 1] + - [6064, 4] + - [6512, 1] + - [6976, 4] + - [7456, 3] + - [-1, 1] + - - 4432 + - - [64, 0] + - [736, 6] + - [896, 5] + - [1696, 6] + - [2752, 1] + - [3056, 4] + - [3712, 1] + - [4816, 4] + - [5216, 1] + - [6064, 4] + - [6512, 3] + - [7456, 4] + - [-1, 3] + - - 4816 + - - [64, 0] + - [464, 6] + - [592, 1] + - [1072, 6] + - [1264, 4] + - [1696, 6] + - [2752, 1] + - [3056, 4] + - [3376, 5] + - [4816, 1] + - [5632, 4] + - [6064, 1] + - [-1, 4] + - - 5216 + - - [64, 0] + - [112, 6] + - [176, 5] + - [256, 6] + - [352, 1] + - [592, 6] + - [736, 4] + - [1472, 6] + - [1696, 4] + - [1936, 1] + - [2192, 5] + - [2752, 1] + - [3376, 4] + - [4432, 1] + - [4816, 4] + - [5216, 3] + - [6064, 4] + - [7456, 3] + - [-1, 6] + - - 5632 + - - [64, 0] + - [736, 6] + - [896, 5] + - [1072, 6] + - [1264, 5] + - [1696, 6] + - [2192, 1] + - [2464, 5] + - [2752, 1] + - [3056, 4] + - [3712, 5] + - [4064, 4] + - [4432, 6] + - [4816, 4] + - [5216, 5] + - 
[6064, 4] + - [7456, 3] + - [-1, 5] + - - 6064 + - - [64, 0] + - [112, 3] + - [352, 6] + - [464, 3] + - [1072, 6] + - [1264, 3] + - [1472, 1] + - [1696, 6] + - [1936, 1] + - [2192, 6] + - [2464, 1] + - [2752, 6] + - [3376, 4] + - [3712, 5] + - [4064, 4] + - [5216, 3] + - [5632, 4] + - [6064, 6] + - [6512, 3] + - [7456, 4] + - [-1, 3] + - - 6512 + - - [64, 0] + - [464, 6] + - [736, 3] + - [1072, 6] + - [1264, 1] + - [1472, 6] + - [1696, 5] + - [2192, 1] + - [2464, 4] + - [2752, 1] + - [3056, 6] + - [3376, 3] + - [3712, 6] + - [4432, 4] + - [5216, 3] + - [5632, 4] + - [6512, 3] + - [6976, 6] + - [-1, 4] + - - 6976 + - - [64, 0] + - [736, 6] + - [896, 3] + - [1072, 6] + - [1264, 1] + - [1472, 6] + - [1696, 5] + - [2192, 1] + - [2464, 4] + - [2752, 6] + - [3056, 4] + - [3376, 1] + - [3712, 3] + - [4064, 4] + - [5216, 3] + - [5632, 4] + - [6064, 6] + - [6512, 3] + - [-1, 6] + - - 7456 + - - [64, 0] + - [896, 6] + - [1072, 5] + - [1264, 6] + - [1696, 4] + - [1936, 1] + - [3056, 4] + - [3376, 6] + - [5216, 3] + - [5632, 4] + - [6064, 6] + - [7456, 3] + - [-1, 4] + - - -1 + - - [64, 0] + - [112, 3] + - [176, 6] + - [256, 4] + - [352, 3] + - [464, 4] + - [592, 3] + - [736, 4] + - [896, 3] + - [1072, 6] + - [1264, 4] + - [1472, 3] + - [6064, 4] + - [6512, 3] + - [6976, 4] + - [7456, 3] + - [-1, 4] diff --git a/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bljk_DB.yaml b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bljk_DB.yaml new file mode 100644 index 000000000..635fa35d0 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bljk_DB.yaml @@ -0,0 +1,394 @@ +- Fiji +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + 
IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false +- - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: &id001 [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: 1 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + 
DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: *id001 + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 16 + WorkGroup1: 16 + WorkGroupEdge: 16 + WorkGroupMapping: 1 + WorkGroupShape: 0 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: *id001 + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + 
ComplexConjugateA: false + ComplexConjugateB: false + DataType: 1 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileEdge: 2 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 +- [2, 3, 0, 1] +- - - -1 + - - - -1 + - - - 32 + - - [2528, 1] + - [2944, 0] + - [3392, 1] + - [-1, 0] + - - 64 + - - [512, 1] + - [704, 2] + - [-1, 1] + - - 128 + - - [32, 1] + - [64, 2] + - [224, 1] + - [352, 2] + - [2144, 1] + - [2528, 2] + - [2944, 1] + - [-1, 0] + - - 224 + - - [32, 1] + - [64, 2] + - [704, 1] + - [1184, 2] + - [2144, 1] + - [2944, 0] + - [3392, 1] + - [-1, 0] + - - 352 + - - [32, 1] + - [128, 2] + - [512, 1] + - [704, 0] + - [928, 1] + - [2144, 0] + - [2528, 2] + - [2944, 0] + - [-1, 1] + - - 512 + - - [128, 1] + - [224, 0] + - [928, 1] + - [1184, 0] + - [1792, 2] + - [2144, 0] + - [-1, 1] + - - 704 + - - [32, 1] + - [64, 2] + - [224, 1] + - [352, 2] + - [512, 1] + - [1472, 0] + - [1792, 2] + - [2144, 0] + - [3392, 1] + - [-1, 0] + - - 928 + - - [32, 2] + - [128, 1] + - [224, 2] + - [352, 1] + - [512, 0] + - [704, 2] + - [1184, 1] + - [1472, 0] + - [2528, 1] + - [2944, 0] + - [-1, 1] + - - 1184 + - - [352, 1] + - [1184, 0] + - [1472, 1] + - [1792, 0] + - [2144, 2] + - [2528, 1] + - [2944, 0] + - [3392, 1] + - [-1, 0] + - - 
1472 + - - [352, 1] + - [512, 0] + - [1184, 1] + - [2144, 0] + - [2944, 1] + - [3392, 0] + - [-1, 1] + - - 1792 + - - [32, 1] + - [64, 0] + - [224, 1] + - [512, 0] + - [704, 1] + - [1184, 0] + - [1472, 1] + - [1792, 0] + - [2944, 1] + - [3392, 0] + - [-1, 1] + - - 2144 + - - [224, 1] + - [352, 0] + - [704, 1] + - [928, 0] + - [2944, 1] + - [3392, 0] + - [-1, 1] + - - 2528 + - - [928, 1] + - [1184, 0] + - [1472, 2] + - [-1, 1] + - - 2944 + - - [128, 1] + - [224, 0] + - [2144, 1] + - [2528, 0] + - [3392, 1] + - [-1, 0] + - - 3392 + - - [928, 1] + - [1792, 0] + - [-1, 1] + - - -1 + - - [224, 1] + - [352, 2] + - [512, 1] + - [704, 0] + - [928, 1] + - [1184, 0] + - [-1, 1] diff --git a/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bljk_SB.yaml new file mode 100644 index 000000000..06cf158d4 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/FijiROCm14/Fiji_Cijk_Alik_Bljk_SB.yaml @@ -0,0 +1,554 @@ +- Fiji +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false +- - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 
8 + MacroTile0: 64 + MacroTile1: 128 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 4 + NumLoadsCoalescedB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 16 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 2 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 256 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 2 + NumLoadsCoalescedB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 
2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 32 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 4 + - AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAccumulate: false + BenchmarkFork: 0 + DepthU: 8 + EdgeMultiKernel: false + EdgeType: Branch + KernelMaxSizes: [0, 0, 0] + KernelSerial: true + LoadMacInterleave: 4 + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 8 + NumLoadsCoalescedB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PadLDS: 1 + Prefetch: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumDimensionsC: 2 + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SplitU: 1 
+ ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileEdge: 8 + ThreadTileShape: 0 + Valid: true + VectorWidthGlobalLoad: 4 + VectorWidthGlobalStore: 4 + VectorWidthLocalLoad: 4 + VectorWidthLocalStore: 4 + WorkGroup0: 8 + WorkGroup1: 8 + WorkGroupEdge: 8 + WorkGroupMapping: 1 + WorkGroupShape: 0 +- [2, 3, 0, 1] +- - - -1 + - - - -1 + - - - 16 + - - [-1, 2] + - - 32 + - - [896, 2] + - [1072, 0] + - [1264, 2] + - [1472, 0] + - [-1, 2] + - - 64 + - - [-1, 2] + - - 112 + - - [7456, 2] + - [-1, 1] + - - 176 + - - [-1, 2] + - - 256 + - - [3712, 2] + - [4064, 1] + - [7456, 2] + - [-1, 1] + - - 352 + - - [-1, 2] + - - 464 + - - [1696, 2] + - [1936, 0] + - [3712, 2] + - [4064, 1] + - [-1, 2] + - - 592 + - - [7456, 2] + - [-1, 1] + - - 736 + - - [6064, 2] + - [6512, 1] + - [7456, 2] + - [-1, 1] + - - 896 + - - [1696, 2] + - [1936, 0] + - [4064, 2] + - [4432, 1] + - [5216, 2] + - [5632, 1] + - [7456, 2] + - [-1, 1] + - - 1072 + - - [1696, 2] + - [1936, 0] + - [6064, 2] + - [6512, 1] + - [6976, 2] + - [-1, 1] + - - 1264 + - - [32, 2] + - [64, 0] + - [3712, 2] + - [4064, 1] + - [4432, 2] + - [4816, 1] + - [5216, 2] + - [5632, 1] + - [7456, 2] + - [-1, 0] + - - 1472 + - - [2464, 2] + - [2752, 1] + - [3712, 2] + - [4064, 1] + - [5216, 2] + - [5632, 1] + - [-1, 2] + - - 1696 + - - [32, 2] + - [64, 1] + - [1472, 2] + - [1936, 0] + - [3712, 2] + - [4064, 1] + - [4816, 2] + - [5216, 1] + - [5632, 2] + - [6976, 1] + - [7456, 2] + - [-1, 0] + - - 1936 + - - [32, 2] + - [64, 0] + - [352, 2] + - [464, 0] + - [2192, 2] + - [2464, 1] + - [2752, 2] + - [3056, 1] + - [3712, 2] + - [4432, 1] + - [5632, 2] + - [6064, 1] + - [7456, 0] + - [-1, 1] + - - 2192 + - - [32, 2] + - [64, 0] + - [736, 2] + - [896, 0] + - [1936, 2] + - [2192, 1] + - [3712, 2] + - [4432, 1] + - [6976, 2] + - [-1, 0] + - - 2464 + - - [32, 2] + - [64, 0] + - [1264, 2] + - [1472, 0] + - [1696, 2] + - [1936, 0] + - [2192, 1] + - [2464, 2] + - [2752, 1] + - [3712, 2] + - [4064, 0] + - [4432, 2] + - [5216, 1] + - [6064, 0] + - [6512, 
2] + - [-1, 0] + - - 2752 + - - [1264, 2] + - [1472, 0] + - [3712, 2] + - [4064, 1] + - [4432, 2] + - [6512, 0] + - [6976, 2] + - [-1, 0] + - - 3056 + - - [32, 2] + - [64, 0] + - [1696, 2] + - [2192, 1] + - [2464, 2] + - [2752, 1] + - [3712, 2] + - [5216, 0] + - [5632, 2] + - [6976, 0] + - [-1, 2] + - - 3376 + - - [32, 2] + - [64, 0] + - [736, 2] + - [896, 0] + - [1264, 2] + - [1472, 0] + - [2752, 2] + - [3056, 1] + - [3712, 2] + - [4064, 0] + - [4432, 2] + - [6976, 0] + - [7456, 2] + - [-1, 0] + - - 3712 + - - [16, 2] + - [32, 0] + - [1264, 2] + - [1696, 0] + - [1936, 2] + - [2192, 1] + - [2752, 2] + - [3056, 0] + - [3376, 1] + - [3712, 0] + - [4064, 2] + - [4432, 1] + - [5632, 0] + - [6512, 2] + - [-1, 0] + - - 4064 + - - [32, 2] + - [64, 0] + - [176, 2] + - [256, 0] + - [352, 2] + - [464, 0] + - [592, 2] + - [736, 0] + - [1936, 2] + - [2192, 0] + - [2752, 2] + - [3376, 0] + - [3712, 1] + - [4064, 0] + - [4432, 2] + - [5632, 0] + - [6064, 2] + - [6512, 0] + - [6976, 2] + - [-1, 0] + - - 4432 + - - [736, 2] + - [896, 0] + - [2464, 2] + - [2752, 1] + - [3056, 0] + - [3376, 2] + - [3712, 0] + - [4064, 2] + - [5216, 0] + - [5632, 2] + - [-1, 0] + - - 4816 + - - [16, 2] + - [64, 0] + - [464, 2] + - [592, 0] + - [1264, 2] + - [1696, 0] + - [3056, 2] + - [4816, 0] + - [5216, 2] + - [-1, 0] + - - 5216 + - - [16, 2] + - [64, 0] + - [256, 2] + - [352, 0] + - [592, 2] + - [736, 0] + - [1472, 2] + - [1696, 0] + - [2192, 2] + - [2464, 0] + - [2752, 2] + - [-1, 0] + - - 5632 + - - [32, 2] + - [64, 0] + - [736, 2] + - [896, 0] + - [1072, 2] + - [1264, 0] + - [1472, 2] + - [1696, 0] + - [2192, 1] + - [2464, 2] + - [2752, 0] + - [3056, 2] + - [-1, 0] + - - 6064 + - - [352, 2] + - [464, 0] + - [1072, 2] + - [1472, 0] + - [1696, 2] + - [1936, 0] + - [2192, 1] + - [2464, 2] + - [-1, 0] + - - 6512 + - - [464, 2] + - [736, 0] + - [1072, 2] + - [1696, 0] + - [1936, 1] + - [2192, 2] + - [-1, 0] + - - 6976 + - - [1264, 2] + - [1936, 0] + - [2464, 2] + - [2752, 1] + - [-1, 0] + - - 7456 + 
- - [1264, 2] + - [1696, 0] + - [1936, 2] + - [-1, 0] + - - -1 + - - [64, 2] + - [112, 0] + - [176, 2] + - [896, 0] + - [1072, 2] + - [1264, 0] + - [1472, 2] + - [2752, 0] + - [3056, 2] + - [-1, 0] diff --git a/library/src/blas3/Tensile/Tensile_status.cpp b/library/src/blas3/Tensile/Tensile_status.cpp deleted file mode 100644 index 7115c2eeb..000000000 --- a/library/src/blas3/Tensile/Tensile_status.cpp +++ /dev/null @@ -1,42 +0,0 @@ - -#include "status.h" -#include -#include "Tensile.h" - -/******************************************************************************* - * \brief convert TensileStatus to rocblas_status - ******************************************************************************/ -rocblas_status -get_rocblas_status_for_tensile_status( TensileStatus status ) { - switch(status) { - - case tensileStatusSuccess: - return rocblas_status_success; - - case tensileStatusControlInvalid: - return rocblas_status_invalid_handle; - - case tensileStatusTensorNumDimensionsInvalid: - case tensileStatusTensorDimensionOrderInvalid: - case tensileStatusTensorDimensionStrideInvalid: - case tensileStatusTensorDimensionSizeInvalid: - case tensileStatusOperandNumDimensionsMismatch: - case tensileStatusOperationOperandNumIndicesMismatch: - case tensileStatusOperationIndexAssignmentInvalidA: - case tensileStatusOperationIndexAssignmentInvalidB: - case tensileStatusOperationIndexAssignmentDuplicateA: - case tensileStatusOperationIndexAssignmentDuplicateB: - case tensileStatusOperationNumFreeIndicesInvalid: - case tensileStatusOperationNumSummationIndicesInvalid: - case tensileStatusOperationIndexUnassigned: - case tensileStatusOperationSummationIndexAssignmentsInvalid: - case tensileStatusDeviceProfileNumDevicesInvalid: - case tensileStatusDeviceProfileNotSupported: // tensile should return a default implementation - case tensileStatusProblemNotSupported: // - case tensileStatusInvalidParameter: - default: - return rocblas_status_internal_error; - } -} - - diff --git 
a/library/src/blas3/Tensile/Tensile_status.h b/library/src/blas3/Tensile/Tensile_status.h deleted file mode 100644 index ff01b4d4b..000000000 --- a/library/src/blas3/Tensile/Tensile_status.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef TENSILE_STATUS_H -#define TENSILE_STATUS_H - -#include "rocblas.h" -#include "Tensile.h" - -/******************************************************************************* - * \brief convert TensileStatus to rocblas_status - ******************************************************************************/ -rocblas_status -get_rocblas_status_for_tensile_status( TensileStatus status ); - - -#endif diff --git a/library/src/blas3/Tensile/XML_Problems/list_of_xmls.txt b/library/src/blas3/Tensile/XML_Problems/list_of_xmls.txt deleted file mode 100644 index 2420192dd..000000000 --- a/library/src/blas3/Tensile/XML_Problems/list_of_xmls.txt +++ /dev/null @@ -1 +0,0 @@ -The contents of this file isn't read; it only exists for CMake to check the timestamp for rebuilding target TensileGenBenchmark.py. TensileGenBackend.py will use all XMLs in this directory. diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bjk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bjk_min.xml deleted file mode 100644 index dbe24ffc8..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bjk_min.xml +++ /dev/null @@ -1,2271 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bjk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bjk_strided_min.xml deleted file mode 100644 index f9abd5e64..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bjk_strided_min.xml +++ /dev/null @@ -1,2049 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bkj_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bkj_min.xml deleted file mode 100644 index 37e07caf2..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bkj_min.xml +++ /dev/null @@ -1,2190 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bkj_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bkj_strided_min.xml deleted file mode 100644 index ce405d096..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aik_Bkj_strided_min.xml +++ /dev/null @@ -1,1764 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bjk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bjk_min.xml deleted file mode 100644 index d83aa3172..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bjk_min.xml +++ /dev/null @@ -1,1800 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bjk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bjk_strided_min.xml deleted file mode 100644 index 9c6e5795b..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bjk_strided_min.xml +++ /dev/null @@ -1,1971 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bkj_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bkj_min.xml deleted file mode 100644 index 7799dd1b7..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bkj_min.xml +++ /dev/null @@ -1,4134 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bkj_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bkj_strided_min.xml deleted file mode 100644 index f86990781..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cij_Aki_Bkj_strided_min.xml +++ /dev/null @@ -1,2481 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bjlk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bjlk_min.xml deleted file mode 100644 index f04c3ed6a..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bjlk_min.xml +++ /dev/null @@ -1,1905 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bjlk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bjlk_strided_min.xml deleted file mode 100644 index 98284fde6..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bjlk_strided_min.xml +++ /dev/null @@ -1,1527 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bljk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bljk_min.xml deleted file mode 100644 index d4b78a777..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bljk_min.xml +++ /dev/null @@ -1,1476 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bljk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bljk_strided_min.xml deleted file mode 100644 index c86238e92..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Ailk_Bljk_strided_min.xml +++ /dev/null @@ -1,1326 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bjlk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bjlk_min.xml deleted file mode 100644 index 405c81c30..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bjlk_min.xml +++ /dev/null @@ -1,1776 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bjlk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bjlk_strided_min.xml deleted file mode 100644 index 2908c4b65..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bjlk_strided_min.xml +++ /dev/null @@ -1,1191 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bljk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bljk_min.xml deleted file mode 100644 index b59785b21..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bljk_min.xml +++ /dev/null @@ -1,2739 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bljk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bljk_strided_min.xml deleted file mode 100644 index e792d7b31..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_DDDDD_Cijk_Alik_Bljk_strided_min.xml +++ /dev/null @@ -1,1641 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bjk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bjk_min.xml deleted file mode 100644 index 33a4c9be1..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bjk_min.xml +++ /dev/null @@ -1,2529 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bjk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bjk_strided_min.xml deleted file mode 100644 index a2f1ac071..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bjk_strided_min.xml +++ /dev/null @@ -1,2310 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bkj_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bkj_min.xml deleted file mode 100644 index c8d15c311..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bkj_min.xml +++ /dev/null @@ -1,2439 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bkj_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bkj_strided_min.xml deleted file mode 100644 index ad15eceb2..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aik_Bkj_strided_min.xml +++ /dev/null @@ -1,2514 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bjk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bjk_min.xml deleted file mode 100644 index aa0b1d1e7..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bjk_min.xml +++ /dev/null @@ -1,2550 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bjk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bjk_strided_min.xml deleted file mode 100644 index 508f2c68d..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bjk_strided_min.xml +++ /dev/null @@ -1,2676 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bkj_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bkj_min.xml deleted file mode 100644 index c98bad36e..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bkj_min.xml +++ /dev/null @@ -1,7260 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bkj_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bkj_strided_min.xml deleted file mode 100644 index a4b1b3d59..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cij_Aki_Bkj_strided_min.xml +++ /dev/null @@ -1,4800 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bjlk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bjlk_min.xml deleted file mode 100644 index 8fcb776c3..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bjlk_min.xml +++ /dev/null @@ -1,2151 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bjlk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bjlk_strided_min.xml deleted file mode 100644 index cf53c5e82..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bjlk_strided_min.xml +++ /dev/null @@ -1,1869 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bljk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bljk_min.xml deleted file mode 100644 index a8c690b6c..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bljk_min.xml +++ /dev/null @@ -1,1674 +0,0 @@ - - - - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bljk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bljk_strided_min.xml deleted file mode 100644 index 1df307a38..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Ailk_Bljk_strided_min.xml +++ /dev/null @@ -1,1578 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bjlk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bjlk_min.xml deleted file mode 100644 index 9b440c465..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bjlk_min.xml +++ /dev/null @@ -1,1473 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bjlk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bjlk_strided_min.xml deleted file mode 100644 index 7934c8aca..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bjlk_strided_min.xml +++ /dev/null @@ -1,1794 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bljk_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bljk_min.xml deleted file mode 100644 index 70a7ab990..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bljk_min.xml +++ /dev/null @@ -1,4056 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bljk_strided_min.xml b/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bljk_strided_min.xml deleted file mode 100644 index cd6ff90d4..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/Hawaii_CT_SSSSS_Cijk_Alik_Bljk_strided_min.xml +++ /dev/null @@ -1,3375 +0,0 @@ - - - - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - - - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
- - -

- - - - - - - - -

- - - -
- -
-
diff --git a/library/src/blas3/Tensile/XML_SolutionTimes/list_of_xmls.txt b/library/src/blas3/Tensile/XML_SolutionTimes/list_of_xmls.txt deleted file mode 100644 index 0fb1ad986..000000000 --- a/library/src/blas3/Tensile/XML_SolutionTimes/list_of_xmls.txt +++ /dev/null @@ -1 +0,0 @@ -The contents of this file isn't read; it only exists for CMake to check the timestamp for rebuilding target TensileGenBackend.py. TensileGenBackend.py will use all XMLs in this directory. diff --git a/library/src/blas3/Tensile/gemm.cpp b/library/src/blas3/Tensile/gemm.cpp index 9d7fd6b28..3c18c53f6 100644 --- a/library/src/blas3/Tensile/gemm.cpp +++ b/library/src/blas3/Tensile/gemm.cpp @@ -1,208 +1,387 @@ -/* ************************************************************************ +/************************************************************************** * Copyright 2016 Advanced Micro Devices, Inc. - * ************************************************************************ */ -#include + ************************************************************************** */ +#include +#include #include "rocblas.h" #include "Tensile.h" #include "gemm.h" #include "definitions.h" #include "handle.h" + +/******************************************************************************* + * API Args + ******************************************************************************/ +#define ARGS(TYPE) \ + rocblas_handle handle, \ + rocblas_order order, \ + rocblas_operation trans_a, \ + rocblas_operation trans_b, \ + rocblas_int m, \ + rocblas_int n, \ + rocblas_int k, \ + const TYPE *alpha, \ + const TYPE *A, \ + rocblas_int ld_a, \ + const TYPE *B, \ + rocblas_int ld_b, \ + const TYPE *beta, \ + TYPE *C, \ + rocblas_int ld_c + +#define ARGS_BATCHED(TYPE) \ + rocblas_handle handle, \ + rocblas_order order, \ + rocblas_operation trans_a, \ + rocblas_operation trans_b, \ + rocblas_int m, \ + rocblas_int n, \ + rocblas_int k, \ + const TYPE *alpha, \ + const TYPE *A, \ + rocblas_int ld_a, \ + 
rocblas_int bs_a, \ + const TYPE *B, \ + rocblas_int ld_b, \ + rocblas_int bs_b, \ + const TYPE *beta, \ + TYPE *C, \ + rocblas_int ld_c, \ + rocblas_int bs_c, \ + rocblas_int b_c + + +/******************************************************************************* + * Preamble Code + ******************************************************************************/ +#define PREAMBLE \ + rocblas_int b_c = 1; \ + rocblas_int bs_c; \ + rocblas_int bs_a; \ + rocblas_int bs_b; \ + infer_batch_strides( order, trans_a, trans_b, m, n, k, \ + ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); \ + rocblas_status validArgs = validateArgs( handle, order, trans_a, trans_b, \ + m, n, k, alpha, A, ld_a, bs_a, B, ld_b, bs_b, beta, C, ld_c, bs_c, b_c); \ + if (validArgs != rocblas_status_success) return validArgs; \ + \ + unsigned int strideC1 = static_cast(ld_c); \ + unsigned int strideC2 = static_cast(bs_c); \ + unsigned int strideA1 = static_cast(ld_a); \ + unsigned int strideA2 = static_cast(bs_a); \ + unsigned int strideB1 = static_cast(ld_b); \ + unsigned int strideB2 = static_cast(bs_b); \ + unsigned int sizeI = (order==rocblas_order_column_major) \ + ? static_cast(m) : static_cast(n) ; \ + unsigned int sizeJ = (order==rocblas_order_column_major) \ + ? static_cast(n) : static_cast(m) ; \ + unsigned int sizeK = b_c; \ + unsigned int sizeL = static_cast(k); + +#define PREAMBLE_BATCHED \ + rocblas_status validArgs = validateArgs( handle, order, trans_a, trans_b, \ + m, n, k, alpha, A, ld_a, bs_a, B, ld_b, bs_b, beta, C, ld_c, bs_c, b_c); \ + if (validArgs != rocblas_status_success) return validArgs; \ + \ + unsigned int strideC1 = static_cast(ld_c); \ + unsigned int strideC2 = static_cast(bs_c); \ + unsigned int strideA1 = static_cast(ld_a); \ + unsigned int strideA2 = static_cast(bs_a); \ + unsigned int strideB1 = static_cast(ld_b); \ + unsigned int strideB2 = static_cast(bs_b); \ + unsigned int sizeI = (order==rocblas_order_column_major) \ + ? 
static_cast(m) : static_cast(n) ; \ + unsigned int sizeJ = (order==rocblas_order_column_major) \ + ? static_cast(n) : static_cast(m) ; \ + unsigned int sizeK = static_cast(b_c); \ + unsigned int sizeL = static_cast(k); + + + +/******************************************************************************* + * Calling Tensile + ******************************************************************************/ +#define TENSILE_NN(SCHEDULE, PREC) \ + tensile_ ## SCHEDULE ##_Cijk_Ailk_Bljk_ ## PREC ## B +#define TENSILE_NT(SCHEDULE, PREC) \ + tensile_ ## SCHEDULE ##_Cijk_Ailk_Bjlk_ ## PREC ## B +#define TENSILE_TN(SCHEDULE, PREC) \ + tensile_ ## SCHEDULE ##_Cijk_Alik_Bljk_ ## PREC ## B +#define TENSILE_TT(SCHEDULE, PREC) \ + tensile_ ## SCHEDULE ##_Cijk_Alik_Bjlk_ ## PREC ## B + +#define TENSILE_CALLS(SCHEDULE, PREC) \ + hipError_t status; \ + if ( trans_a == rocblas_operation_none) { \ + if (trans_b == rocblas_operation_none) { /*NN*/ \ + status = TENSILE_NN(SCHEDULE,PREC)( C, A, B, *alpha, *beta, \ + 0, 0, 0, strideC1, strideC2, strideA1, strideA2, \ + strideB1, strideB2, sizeI, sizeJ, sizeK, sizeL, \ + handle->rocblas_stream, 0, nullptr, nullptr); \ + } else { /*NT*/ \ + status = TENSILE_NT(SCHEDULE,PREC)( C, A, B, *alpha, *beta, \ + 0, 0, 0, strideC1, strideC2, strideA1, strideA2, \ + strideB1, strideB2, sizeI, sizeJ, sizeK, sizeL, \ + handle->rocblas_stream, 0, nullptr, nullptr); \ + } \ + } else { /*TN*/ \ + if (trans_b == rocblas_operation_none) { \ + status = TENSILE_TN(SCHEDULE,PREC)( C, A, B, *alpha, *beta, \ + 0, 0, 0, strideC1, strideC2, strideA1, strideA2, \ + strideB1, strideB2, sizeI, sizeJ, sizeK, sizeL, \ + handle->rocblas_stream, 0, nullptr, nullptr); \ + } else { /*TT*/ \ + status = TENSILE_TT(SCHEDULE,PREC)( C, A, B, *alpha, *beta, \ + 0, 0, 0, strideC1, strideC2, strideA1, strideA2, \ + strideB1, strideB2, sizeI, sizeJ, sizeK, sizeL, \ + handle->rocblas_stream, 0, nullptr, nullptr); \ + } \ + } \ + return get_rocblas_status_for_hip_status( status ); + 
+ +/******************************************************************************* + * GEMM APIs + ******************************************************************************/ +#define GEMM_API(prec, PREC, TYPE, SCHEDULE) \ + rocblas_status rocblas_ ## prec ## gemm( ARGS(TYPE) ) { \ + PREAMBLE \ + TENSILE_CALLS(SCHEDULE, PREC) \ + } + +#define GEMM_API_BATCHED(prec, PREC, TYPE, SCHEDULE) \ + rocblas_status rocblas_ ## prec ## gemm_batched( ARGS_BATCHED(TYPE) ) { \ + PREAMBLE_BATCHED \ + TENSILE_CALLS(SCHEDULE, PREC) \ + } + +GEMM_API(s, S, float, Fiji ) +GEMM_API(d, D, double, Fiji ) +GEMM_API_BATCHED(s, S, float, Fiji ) +GEMM_API_BATCHED(d, D, double, Fiji ) + +#if 0 + #define COMPLEX 0 + /******************************************************************************* * GEMM wrapper around Tensile ******************************************************************************/ rocblas_status xgemm_tensile( - rocblas_handle handle, - rocblas_order order, - rocblas_operation trans_a, rocblas_operation trans_b, - rocblas_int m, rocblas_int n, rocblas_int k, - TensileDataType type_alpha, const void *alpha, - TensileDataType type_a, const void *a, rocblas_int ls_a, rocblas_int ld_a, rocblas_int bs_a, - TensileDataType type_b, const void *b, rocblas_int ls_b, rocblas_int ld_b, rocblas_int bs_b, - TensileDataType type_beta, const void *beta, - TensileDataType type_c, void *c, rocblas_int ls_c, rocblas_int ld_c, rocblas_int bs_c, - rocblas_int batch_count ) { - - // quick return 0 is valid in BLAS - if ( m == 0 || n == 0 || k == 0 || batch_count == 0) { - return rocblas_status_success; - } + rocblas_handle handle, + rocblas_order order, + rocblas_operation trans_a, rocblas_operation trans_b, + rocblas_int m, rocblas_int n, rocblas_int k, + TensileDataType type_alpha, const void *alpha, + TensileDataType type_a, const void *a, rocblas_int ls_a, rocblas_int ld_a, rocblas_int bs_a, + TensileDataType type_b, const void *b, rocblas_int ls_b, rocblas_int ld_b, rocblas_int 
bs_b, + TensileDataType type_beta, const void *beta, + TensileDataType type_c, void *c, rocblas_int ls_c, rocblas_int ld_c, rocblas_int bs_c, + rocblas_int b_c ) { + + // quick return 0 is valid in BLAS + if ( m == 0 || n == 0 || k == 0 || b_c == 0) { + return rocblas_status_success; + } - // sizes must not be negative - if ( m < 0 || n < 0 || k < 0 || batch_count < 0) { - return rocblas_status_invalid_size; - } + // sizes must not be negative + if ( m < 0 || n < 0 || k < 0 || b_c < 0) { + return rocblas_status_invalid_size; + } - // handle must be valid - if (handle == nullptr) { - return rocblas_status_invalid_handle; - } + // handle must be valid + if (handle == nullptr) { + return rocblas_status_invalid_handle; + } - // pointers must be valid - if ( c == nullptr - || a == nullptr - || b == nullptr - || alpha == nullptr - || beta == nullptr ) { - return rocblas_status_invalid_pointer; - } + // pointers must be valid + if ( c == nullptr + || a == nullptr + || b == nullptr + || alpha == nullptr + || beta == nullptr ) { + return rocblas_status_invalid_pointer; + } - // tensor dimensions in rows/cols - int num_cols_c = m; - int num_rows_c = n; - int num_cols_a = (trans_a == rocblas_operation_none) ? k : m; - int num_rows_a = (trans_a == rocblas_operation_none) ? m : k; - int num_cols_b = (trans_b == rocblas_operation_none) ? n : k; - int num_rows_b = (trans_b == rocblas_operation_none) ? k : n; - - - /* create tensile tensors - * - translates rows/cols into dim0/dim1 - * - dim0 is dimension with shortest stride (necessary for performance) - * - dim1 is other matrix dimension - * - dim2 is batch dimension - */ - - // create tensor c - TensileTensor tensor_c; - tensor_c.dataType = type_c; - tensor_c.numDimensions = (batch_count > 1) ? 3 : 2; - tensor_c.dimensions[0].stride = ls_c; - tensor_c.dimensions[0].size = (order==rocblas_order_column_major) ? 
num_rows_c : num_cols_c; - tensor_c.dimensions[1].stride = ld_c; - tensor_c.dimensions[1].size = (order==rocblas_order_column_major) ? num_cols_c : num_rows_c; - tensor_c.dimensions[2].stride = bs_c; - tensor_c.dimensions[2].size = batch_count; - // validate tensor c - if (tensor_c.dimensions[0].stride < 1) { - // user gave invalid ls_c - return rocblas_status_invalid_size; - } - if (tensor_c.dimensions[1].stride < tensor_c.dimensions[0].stride * tensor_c.dimensions[0].size ) { - // user gave invalid ld_c - return rocblas_status_invalid_size; - } - if (tensor_c.dimensions[2].stride < tensor_c.dimensions[1].stride * tensor_c.dimensions[1].size ) { - // user gave invalid bs_c - return rocblas_status_invalid_size; - } + // tensor dimensions in rows/cols + int num_cols_c = n; + int num_rows_c = m; + int num_cols_a = (trans_a == rocblas_operation_none) ? k : m; + int num_rows_a = (trans_a == rocblas_operation_none) ? m : k; + int num_cols_b = (trans_b == rocblas_operation_none) ? n : k; + int num_rows_b = (trans_b == rocblas_operation_none) ? k : n; + + if(order==rocblas_order_column_major){ + if( num_rows_a > ld_a + || num_rows_b > ld_b + || num_rows_c > ld_c) { + return rocblas_status_invalid_size; + } + } - // create tensor a - TensileTensor tensor_a; - tensor_a.dataType = conjugate_if_necessary( type_a, trans_a ); - tensor_a.numDimensions = (batch_count > 1) ? 3 : 2; - tensor_a.dimensions[0].stride = ls_a; - tensor_a.dimensions[0].size = (order==rocblas_order_column_major) ? num_rows_a : num_cols_a; - tensor_a.dimensions[1].stride = ld_a; - tensor_a.dimensions[1].size = (order==rocblas_order_column_major) ? 
num_cols_a : num_rows_a; - tensor_a.dimensions[2].stride = bs_a; - tensor_a.dimensions[2].size = batch_count; - // validate tensor a - if (tensor_a.dimensions[0].stride < 1) { - // user gave invalid ls_a - return rocblas_status_invalid_size; - } - if (tensor_a.dimensions[1].stride < tensor_a.dimensions[0].stride * tensor_a.dimensions[0].size ) { - // user gave invalid ld_a - return rocblas_status_invalid_size; - } - if (tensor_a.dimensions[2].stride < tensor_a.dimensions[1].stride * tensor_a.dimensions[1].size ) { - // user gave invalid bs_a - return rocblas_status_invalid_size; - } + /* create tensile tensors + * - translates rows/cols into dim0/dim1 + * - dim0 is dimension with shortest stride (necessary for performance) + * - dim1 is other matrix dimension + * - dim2 is batch dimension + */ + + // create tensor c + TensileTensor tensor_c; + tensor_c.dataType = type_c; + tensor_c.numDimensions = (b_c > 1) ? 3 : 2; + tensor_c.dimensions[0].stride = ls_c; + tensor_c.dimensions[0].size = (order==rocblas_order_column_major) ? num_rows_c : num_cols_c; + tensor_c.dimensions[1].stride = ld_c; + tensor_c.dimensions[1].size = (order==rocblas_order_column_major) ? num_cols_c : num_rows_c; + tensor_c.dimensions[2].stride = bs_c; + tensor_c.dimensions[2].size = b_c; + // validate tensor c + if (tensor_c.dimensions[0].stride < 1) { + // user gave invalid ls_c + return rocblas_status_invalid_size; + } + if (tensor_c.dimensions[1].stride < tensor_c.dimensions[0].stride * tensor_c.dimensions[0].size ) { + // user gave invalid ld_c + return rocblas_status_invalid_size; + } + if (tensor_c.dimensions[2].stride < tensor_c.dimensions[1].stride * tensor_c.dimensions[1].size ) { + // user gave invalid bs_c + return rocblas_status_invalid_size; + } - // create tensor b - TensileTensor tensor_b; - tensor_b.dataType = conjugate_if_necessary( type_b, trans_b ); - tensor_b.numDimensions = (batch_count > 1) ? 
3 : 2; - tensor_b.dimensions[0].stride = ls_b; - tensor_b.dimensions[0].size = (order==rocblas_order_column_major) ? num_rows_b : num_cols_b; - tensor_b.dimensions[1].stride = ld_b; - tensor_b.dimensions[1].size = (order==rocblas_order_column_major) ? num_cols_b : num_rows_b; - tensor_b.dimensions[2].stride = bs_b; - tensor_b.dimensions[2].size = batch_count; - // validate tensor b - if (tensor_b.dimensions[0].stride < 1) { - // user gave invalid ls_b - return rocblas_status_invalid_size; - } - if (tensor_b.dimensions[1].stride < tensor_b.dimensions[0].stride * tensor_b.dimensions[0].size ) { - // user gave invalid ld_b - return rocblas_status_invalid_size; - } - if (tensor_b.dimensions[2].stride < tensor_b.dimensions[1].stride * tensor_b.dimensions[1].size ) { - // user gave invalid bs_b - return rocblas_status_invalid_size; - } + // create tensor a + TensileTensor tensor_a; + tensor_a.dataType = conjugate_if_necessary( type_a, trans_a ); + tensor_a.numDimensions = (b_c > 1) ? 3 : 2; + tensor_a.dimensions[0].stride = ls_a; + tensor_a.dimensions[0].size = (order==rocblas_order_column_major) ? num_rows_a : num_cols_a; + tensor_a.dimensions[1].stride = ld_a; + tensor_a.dimensions[1].size = (order==rocblas_order_column_major) ? num_cols_a : num_rows_a; + tensor_a.dimensions[2].stride = bs_a; + tensor_a.dimensions[2].size = b_c; + // validate tensor a + if (tensor_a.dimensions[0].stride < 1) { + // user gave invalid ls_a + return rocblas_status_invalid_size; + } + if (tensor_a.dimensions[1].stride < tensor_a.dimensions[0].stride * tensor_a.dimensions[0].size ) { + // user gave invalid ld_a + return rocblas_status_invalid_size; + } + if (tensor_a.dimensions[2].stride < tensor_a.dimensions[1].stride * tensor_a.dimensions[1].size ) { + // user gave invalid bs_a + return rocblas_status_invalid_size; + } + // create tensor b + TensileTensor tensor_b; + tensor_b.dataType = conjugate_if_necessary( type_b, trans_b ); + tensor_b.numDimensions = (b_c > 1) ? 
3 : 2; + tensor_b.dimensions[0].stride = ls_b; + tensor_b.dimensions[0].size = (order==rocblas_order_column_major) ? num_rows_b : num_cols_b; + tensor_b.dimensions[1].stride = ld_b; + tensor_b.dimensions[1].size = (order==rocblas_order_column_major) ? num_cols_b : num_rows_b; + tensor_b.dimensions[2].stride = bs_b; + tensor_b.dimensions[2].size = b_c; + // validate tensor b + if (tensor_b.dimensions[0].stride < 1) { + // user gave invalid ls_b + return rocblas_status_invalid_size; + } + if (tensor_b.dimensions[1].stride < tensor_b.dimensions[0].stride * tensor_b.dimensions[0].size ) { + // user gave invalid ld_b + return rocblas_status_invalid_size; + } + if (tensor_b.dimensions[2].stride < tensor_b.dimensions[1].stride * tensor_b.dimensions[1].size ) { + // user gave invalid bs_b + return rocblas_status_invalid_size; + } - // TODO - do assignments depend on order? - // index assignments - unsigned int index_assignments_a[3]; - unsigned int index_assignments_b[3]; - if ( batch_count > 1) { - index_assignments_a[0] = trans_a == rocblas_operation_none ? 0 : 3; - index_assignments_a[1] = trans_a == rocblas_operation_none ? 3 : 0; - index_assignments_a[2] = 2; - index_assignments_b[0] = trans_b == rocblas_operation_none ? 3 : 1; - index_assignments_b[1] = trans_b == rocblas_operation_none ? 1 : 3; - index_assignments_b[2] = 2; - } else { - index_assignments_a[0] = trans_a == rocblas_operation_none ? 0 : 2; - index_assignments_a[1] = trans_a == rocblas_operation_none ? 2 : 0; - index_assignments_b[0] = trans_b == rocblas_operation_none ? 2 : 1; - index_assignments_b[1] = trans_b == rocblas_operation_none ? 1 : 2; - } - // create problem - TensileProblem problem; - RETURN_IF_TENSILE_ERROR( tensileCreateProblem( - &problem, - tensor_c, - tensor_a, - tensor_b, - index_assignments_a, - index_assignments_b, - tensileOperationTypeContraction, - type_alpha, - type_beta, - false, // Use offsets? No. Only OpenCL needed them for generality; HIP doesn't. 
- handle->tensile_device_profile) ); - -#ifdef _DEBUG - // thorough validation that problem was created correctly - RETURN_IF_TENSILE_ERROR( tensileValidateProblem(problem) ); -#endif + // TODO - do assignments depend on order? + // index assignments + unsigned int index_assignments_a[3]; + unsigned int index_assignments_b[3]; + if ( b_c > 1) { + index_assignments_a[0] = trans_a == rocblas_operation_none ? 0 : 3; + index_assignments_a[1] = trans_a == rocblas_operation_none ? 3 : 0; + index_assignments_a[2] = 2; + index_assignments_b[0] = trans_b == rocblas_operation_none ? 3 : 1; + index_assignments_b[1] = trans_b == rocblas_operation_none ? 1 : 3; + index_assignments_b[2] = 2; + } else { + index_assignments_a[0] = trans_a == rocblas_operation_none ? 0 : 2; + index_assignments_a[1] = trans_a == rocblas_operation_none ? 2 : 0; + index_assignments_b[0] = trans_b == rocblas_operation_none ? 2 : 1; + index_assignments_b[1] = trans_b == rocblas_operation_none ? 1 : 2; + } - // lookup solution - TensileSolution solution; - RETURN_IF_TENSILE_ERROR( tensileGetSolutionForProblem( &solution, problem ) ); - - // wrap pointers and enqueue solution - TensileTensorData tensor_data_c{ c, 0 }; - TensileTensorDataConst tensor_data_a{ a, 0 }; - TensileTensorDataConst tensor_data_b{ b, 0 }; - TensileScalarData scalar_data_alpha{ alpha }; - TensileScalarData scalar_data_beta{ beta }; - RETURN_IF_TENSILE_ERROR( tensileEnqueueSolution( - solution, - tensor_data_c, - tensor_data_a, - tensor_data_b, - scalar_data_alpha, - scalar_data_beta, - &handle->tensile_control) ); - - // cleanup - RETURN_IF_TENSILE_ERROR( tensileDestroyProblem(problem) ); - RETURN_IF_TENSILE_ERROR( tensileDestroySolution(solution) ); - - // TODO - put events into handle, if necessary - - // success - return rocblas_status_success; + #ifndef NDEBUG + printf("creating problem \n"); + #endif + // create problem + TensileProblem problem; + // DONOT SIMPLY return a TENSILE error, the return type is rocblas_status, 
otherwise cause stalling + PRINT_IF_TENSILE_ERROR( tensileCreateProblem( + &problem, + tensor_c, + tensor_a, + tensor_b, + index_assignments_a, + index_assignments_b, + tensileOperationTypeContraction, + type_alpha, + type_beta, + false, // Use offsets? No. Only OpenCL needed them for generality; HIP doesn't. + handle->tensile_device_profile) ); + + #ifndef NDEBUG + // thorough validation that problem was created correctly + PRINT_IF_TENSILE_ERROR( tensileValidateProblem(problem) ); + #endif + + #ifndef NDEBUG + // lookup solution + printf("looking up solution \n"); + struct timeval tv; + gettimeofday(&tv, NULL); + double begin = (tv.tv_sec * 1000 * 1000) + tv.tv_usec ; + #endif + + TensileSolution solution; + PRINT_IF_TENSILE_ERROR( tensileGetSolutionForProblem( &solution, problem ) ); + + #ifndef NDEBUG + gettimeofday(&tv, NULL); + double end = (tv.tv_sec * 1000 * 1000) + tv.tv_usec ; + double time_used_in_us = (end - begin); + printf("It takes %f us to get the solution \n", time_used_in_us); + #endif + + // wrap pointers and enqueue solution + TensileTensorData tensor_data_c{ c, 0 }; + TensileTensorDataConst tensor_data_a{ a, 0 }; + TensileTensorDataConst tensor_data_b{ b, 0 }; + TensileScalarData scalar_data_alpha{ alpha }; + TensileScalarData scalar_data_beta{ beta }; + PRINT_IF_TENSILE_ERROR( tensileEnqueueSolution( + solution, + tensor_data_c, + tensor_data_a, + tensor_data_b, + scalar_data_alpha, + scalar_data_beta, + &handle->tensile_control) ); + + // cleanup + PRINT_IF_TENSILE_ERROR( tensileDestroyProblem(problem) ); + PRINT_IF_TENSILE_ERROR( tensileDestroySolution(solution) ); + + // success + return rocblas_status_success; } @@ -210,116 +389,51 @@ rocblas_status xgemm_tensile( /******************************************************************************* * API Functions : ******************************************************************************/ -// fp16 (hgemm) is available -// rocblas_status rocblas_hgemm( -// rocblas_handle handle, -// 
rocblas_order order, -// rocblas_operation transa, rocblas_operation transb, -// rocblas_int m, rocblas_int n, rocblas_int k, -// const rocblas_half *alpha, -// const rocblas_half *A, rocblas_int ld_a, -// const rocblas_half *B, rocblas_int ld_b, -// const rocblas_half *beta, -// rocblas_half *C, rocblas_int ld_c) { -// -// TensileDataType type_c = tensileDataTypeHalf; -// TensileDataType type_a = tensileDataTypeHalf; -// TensileDataType type_b = tensileDataTypeHalf; -// TensileDataType type_alpha = tensileDataTypeHalf; -// TensileDataType type_beta = tensileDataTypeHalf; -// -// rocblas_int ls_c = 1; -// rocblas_int ls_a = 1; -// rocblas_int ls_b = 1; -// -// rocblas_int bs_c; -// rocblas_int bs_a; -// rocblas_int bs_b; -// -// infer_batch_strides( order, transa, transb, m, n, k, -// ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); -// rocblas_int batch_count = 1; -// -// return xgemm_tensile( handle, order, transa, transb, -// m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, -// type_b, B, ls_b, ld_b, bs_b, type_beta, beta, -// type_c, C, ls_c, ld_c, bs_c, batch_count ); -// } - -rocblas_status rocblas_sgemm( - rocblas_handle handle, - rocblas_order order, - rocblas_operation transa, rocblas_operation transb, - rocblas_int m, rocblas_int n, rocblas_int k, - const float *alpha, - const float *A, rocblas_int ld_a, - const float *B, rocblas_int ld_b, - const float *beta, - float *C, rocblas_int ld_c) { - - TensileDataType type_c = tensileDataTypeSingle; - TensileDataType type_a = tensileDataTypeSingle; - TensileDataType type_b = tensileDataTypeSingle; - TensileDataType type_alpha = tensileDataTypeSingle; - TensileDataType type_beta = tensileDataTypeSingle; - - rocblas_int ls_c = 1; - rocblas_int ls_a = 1; - rocblas_int ls_b = 1; - - rocblas_int bs_c; - rocblas_int bs_a; - rocblas_int bs_b; - - infer_batch_strides( order, transa, transb, m, n, k, - ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); - rocblas_int batch_count = 1; - - return xgemm_tensile( handle, order, transa, 
transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); +#if 0 + rocblas_status rocblas_hgemm( + rocblas_handle handle, + rocblas_order order, + rocblas_operation trans_a, rocblas_operation trans_b, + rocblas_int m, rocblas_int n, rocblas_int k, + const rocblas_half *alpha, + const rocblas_half *A, rocblas_int ld_a, + const rocblas_half *B, rocblas_int ld_b, + const rocblas_half *beta, + rocblas_half *C, rocblas_int ld_c) { + + TensileDataType type_c = tensileDataTypeHalf; + TensileDataType type_a = tensileDataTypeHalf; + TensileDataType type_b = tensileDataTypeHalf; + TensileDataType type_alpha = tensileDataTypeHalf; + TensileDataType type_beta = tensileDataTypeHalf; + + rocblas_int ls_c = 1; + rocblas_int ls_a = 1; + rocblas_int ls_b = 1; + + rocblas_int bs_c; + rocblas_int bs_a; + rocblas_int bs_b; + + infer_batch_strides( order, trans_a, trans_b, m, n, k, + ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); + rocblas_int b_c = 1; + + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } +#endif -rocblas_status rocblas_dgemm( - rocblas_handle handle, - rocblas_order order, - rocblas_operation transa, rocblas_operation transb, - rocblas_int m, rocblas_int n, rocblas_int k, - const double *alpha, - const double *A, rocblas_int ld_a, - const double *B, rocblas_int ld_b, - const double *beta, - double *C, rocblas_int ld_c) { - - TensileDataType type_c = tensileDataTypeDouble; - TensileDataType type_a = tensileDataTypeDouble; - TensileDataType type_b = tensileDataTypeDouble; - TensileDataType type_alpha = tensileDataTypeDouble; - TensileDataType type_beta = tensileDataTypeDouble; - - rocblas_int ls_c = 1; - rocblas_int ls_a = 1; - rocblas_int ls_b = 1; - - rocblas_int bs_c; - rocblas_int bs_a; - rocblas_int bs_b; - - 
infer_batch_strides( order, transa, transb, m, n, k, - ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); - rocblas_int batch_count = 1; - - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + +rocblas_status rocblas_dgemm( ARGS(double) ) { } // rocblas_status rocblas_qgemm( // rocblas_handle handle, // rocblas_order order, -// rocblas_operation transa, rocblas_operation transb, +// rocblas_operation trans_a, rocblas_operation trans_b, // rocblas_int m, rocblas_int n, rocblas_int k, // const rocblas_half_complex *alpha, // const rocblas_half_complex *A, rocblas_int ld_a, @@ -327,35 +441,35 @@ rocblas_status rocblas_dgemm( // const rocblas_half_complex *beta, // rocblas_half_complex *C, rocblas_int ld_c) { -// TensileDataType type_c = tensileDataTypeComplexHalf; -// TensileDataType type_a = tensileDataTypeComplexHalf; -// TensileDataType type_b = tensileDataTypeComplexHalf; -// TensileDataType type_alpha = tensileDataTypeComplexHalf; -// TensileDataType type_beta = tensileDataTypeComplexHalf; + // TensileDataType type_c = tensileDataTypeComplexHalf; + // TensileDataType type_a = tensileDataTypeComplexHalf; + // TensileDataType type_b = tensileDataTypeComplexHalf; + // TensileDataType type_alpha = tensileDataTypeComplexHalf; + // TensileDataType type_beta = tensileDataTypeComplexHalf; -// rocblas_int ls_c = 1; -// rocblas_int ls_a = 1; -// rocblas_int ls_b = 1; + // rocblas_int ls_c = 1; + // rocblas_int ls_a = 1; + // rocblas_int ls_b = 1; -// rocblas_int bs_c; -// rocblas_int bs_a; -// rocblas_int bs_b; + // rocblas_int bs_c; + // rocblas_int bs_a; + // rocblas_int bs_b; -// infer_batch_strides( order, transa, transb, m, n, k, -// ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); -// rocblas_int batch_count = 1; + // infer_batch_strides( order, trans_a, trans_b, m, n, k, + // ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); + // 
rocblas_int b_c = 1; -// return xgemm_tensile( handle, order, transa, transb, -// m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, -// type_b, B, ls_b, ld_b, bs_b, type_beta, beta, -// type_c, C, ls_c, ld_c, bs_c, batch_count ); + // return xgemm_tensile( handle, order, trans_a, trans_b, + // m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + // type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + // type_c, C, ls_c, ld_c, bs_c, b_c ); // } #if COMPLEX rocblas_status rocblas_cgemm( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const rocblas_float_complex *alpha, const rocblas_float_complex *A, rocblas_int ld_a, @@ -363,34 +477,34 @@ rocblas_status rocblas_cgemm( const rocblas_float_complex *beta, rocblas_float_complex *C, rocblas_int ld_c) { - TensileDataType type_c = tensileDataTypeComplexSingle; - TensileDataType type_a = tensileDataTypeComplexSingle; - TensileDataType type_b = tensileDataTypeComplexSingle; - TensileDataType type_alpha = tensileDataTypeComplexSingle; - TensileDataType type_beta = tensileDataTypeComplexSingle; + TensileDataType type_c = tensileDataTypeComplexSingle; + TensileDataType type_a = tensileDataTypeComplexSingle; + TensileDataType type_b = tensileDataTypeComplexSingle; + TensileDataType type_alpha = tensileDataTypeComplexSingle; + TensileDataType type_beta = tensileDataTypeComplexSingle; - rocblas_int ls_c = 1; - rocblas_int ls_a = 1; - rocblas_int ls_b = 1; + rocblas_int ls_c = 1; + rocblas_int ls_a = 1; + rocblas_int ls_b = 1; - rocblas_int bs_c; - rocblas_int bs_a; - rocblas_int bs_b; + rocblas_int bs_c; + rocblas_int bs_a; + rocblas_int bs_b; - infer_batch_strides( order, transa, transb, m, n, k, - ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); - rocblas_int batch_count = 1; + infer_batch_strides( order, trans_a, trans_b, m, n, k, + ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); + 
rocblas_int b_c = 1; - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } rocblas_status rocblas_zgemm( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const rocblas_double_complex *alpha, const rocblas_double_complex *A, rocblas_int ld_a, @@ -398,28 +512,28 @@ rocblas_status rocblas_zgemm( const rocblas_double_complex *beta, rocblas_double_complex *C, rocblas_int ld_c) { - TensileDataType type_c = tensileDataTypeComplexDouble; - TensileDataType type_a = tensileDataTypeComplexDouble; - TensileDataType type_b = tensileDataTypeComplexDouble; - TensileDataType type_alpha = tensileDataTypeComplexDouble; - TensileDataType type_beta = tensileDataTypeComplexDouble; + TensileDataType type_c = tensileDataTypeComplexDouble; + TensileDataType type_a = tensileDataTypeComplexDouble; + TensileDataType type_b = tensileDataTypeComplexDouble; + TensileDataType type_alpha = tensileDataTypeComplexDouble; + TensileDataType type_beta = tensileDataTypeComplexDouble; - rocblas_int ls_c = 1; - rocblas_int ls_a = 1; - rocblas_int ls_b = 1; + rocblas_int ls_c = 1; + rocblas_int ls_a = 1; + rocblas_int ls_b = 1; - rocblas_int bs_c; - rocblas_int bs_a; - rocblas_int bs_b; + rocblas_int bs_c; + rocblas_int bs_a; + rocblas_int bs_b; - infer_batch_strides( order, transa, transb, m, n, k, - ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); - rocblas_int batch_count = 1; + infer_batch_strides( order, trans_a, trans_b, m, n, k, + ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); + rocblas_int b_c = 1; - return 
xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } #endif @@ -432,7 +546,7 @@ rocblas_status rocblas_zgemm( // rocblas_status rocblas_hgemm_strided( // rocblas_handle handle, // rocblas_order order, -// rocblas_operation transa, rocblas_operation transb, +// rocblas_operation trans_a, rocblas_operation trans_b, // rocblas_int m, rocblas_int n, rocblas_int k, // const rocblas_half *alpha, // const rocblas_half *A, rocblas_int ls_a, rocblas_int ld_a, @@ -440,30 +554,30 @@ rocblas_status rocblas_zgemm( // const rocblas_half *beta, // rocblas_half *C, rocblas_int ls_c, rocblas_int ld_c) { // -// TensileDataType type_c = tensileDataTypeHalf; -// TensileDataType type_a = tensileDataTypeHalf; -// TensileDataType type_b = tensileDataTypeHalf; -// TensileDataType type_alpha = tensileDataTypeHalf; -// TensileDataType type_beta = tensileDataTypeHalf; -// -// rocblas_int bs_c; -// rocblas_int bs_a; -// rocblas_int bs_b; -// -// infer_batch_strides( order, transa, transb, m, n, k, -// ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); -// rocblas_int batch_count = 1; -// -// return xgemm_tensile( handle, order, transa, transb, -// m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, -// type_b, B, ls_b, ld_b, bs_b, type_beta, beta, -// type_c, C, ls_c, ld_c, bs_c, batch_count ); + // TensileDataType type_c = tensileDataTypeHalf; + // TensileDataType type_a = tensileDataTypeHalf; + // TensileDataType type_b = tensileDataTypeHalf; + // TensileDataType type_alpha = tensileDataTypeHalf; + // TensileDataType type_beta = tensileDataTypeHalf; + // + // rocblas_int bs_c; + // rocblas_int bs_a; + // rocblas_int bs_b; + // + // infer_batch_strides( 
order, trans_a, trans_b, m, n, k, + // ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); + // rocblas_int b_c = 1; + // + // return xgemm_tensile( handle, order, trans_a, trans_b, + // m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + // type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + // type_c, C, ls_c, ld_c, bs_c, b_c ); // } rocblas_status rocblas_sgemm_strided( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const float *alpha, const float *A, rocblas_int ls_a, rocblas_int ld_a, @@ -471,30 +585,30 @@ rocblas_status rocblas_sgemm_strided( const float *beta, float *C, rocblas_int ls_c, rocblas_int ld_c) { - TensileDataType type_c = tensileDataTypeSingle; - TensileDataType type_a = tensileDataTypeSingle; - TensileDataType type_b = tensileDataTypeSingle; - TensileDataType type_alpha = tensileDataTypeSingle; - TensileDataType type_beta = tensileDataTypeSingle; + TensileDataType type_c = tensileDataTypeSingle; + TensileDataType type_a = tensileDataTypeSingle; + TensileDataType type_b = tensileDataTypeSingle; + TensileDataType type_alpha = tensileDataTypeSingle; + TensileDataType type_beta = tensileDataTypeSingle; - rocblas_int bs_c; - rocblas_int bs_a; - rocblas_int bs_b; + rocblas_int bs_c; + rocblas_int bs_a; + rocblas_int bs_b; - infer_batch_strides( order, transa, transb, m, n, k, - ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); - rocblas_int batch_count = 1; + infer_batch_strides( order, trans_a, trans_b, m, n, k, + ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); + rocblas_int b_c = 1; - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, 
ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } rocblas_status rocblas_dgemm_strided( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const double *alpha, const double *A, rocblas_int ls_a, rocblas_int ld_a, @@ -502,30 +616,30 @@ rocblas_status rocblas_dgemm_strided( const double *beta, double *C, rocblas_int ls_c, rocblas_int ld_c) { - TensileDataType type_c = tensileDataTypeDouble; - TensileDataType type_a = tensileDataTypeDouble; - TensileDataType type_b = tensileDataTypeDouble; - TensileDataType type_alpha = tensileDataTypeDouble; - TensileDataType type_beta = tensileDataTypeDouble; + TensileDataType type_c = tensileDataTypeDouble; + TensileDataType type_a = tensileDataTypeDouble; + TensileDataType type_b = tensileDataTypeDouble; + TensileDataType type_alpha = tensileDataTypeDouble; + TensileDataType type_beta = tensileDataTypeDouble; - rocblas_int bs_c; - rocblas_int bs_a; - rocblas_int bs_b; + rocblas_int bs_c; + rocblas_int bs_a; + rocblas_int bs_b; - infer_batch_strides( order, transa, transb, m, n, k, - ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); - rocblas_int batch_count = 1; + infer_batch_strides( order, trans_a, trans_b, m, n, k, + ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); + rocblas_int b_c = 1; - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } // rocblas_status rocblas_qgemm_strided( // rocblas_handle handle, // rocblas_order order, -// rocblas_operation transa, rocblas_operation transb, +// rocblas_operation trans_a, 
rocblas_operation trans_b, // rocblas_int m, rocblas_int n, rocblas_int k, // const rocblas_half_complex *alpha, // const rocblas_half_complex *A, rocblas_int ls_a, rocblas_int ld_a, @@ -543,21 +657,21 @@ rocblas_status rocblas_dgemm_strided( // rocblas_int bs_a; // rocblas_int bs_b; -// infer_batch_strides( order, transa, transb, m, n, k, +// infer_batch_strides( order, trans_a, trans_b, m, n, k, // ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); -// rocblas_int batch_count = 1; +// rocblas_int b_c = 1; -// return xgemm_tensile( handle, order, transa, transb, +// return xgemm_tensile( handle, order, trans_a, trans_b, // m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, // type_b, B, ls_b, ld_b, bs_b, type_beta, beta, -// type_c, C, ls_c, ld_c, bs_c, batch_count ); +// type_c, C, ls_c, ld_c, bs_c, b_c ); // } #if COMPLEX rocblas_status rocblas_cgemm_strided( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const rocblas_float_complex *alpha, const rocblas_float_complex *A, rocblas_int ls_a, rocblas_int ld_a, @@ -565,30 +679,30 @@ rocblas_status rocblas_cgemm_strided( const rocblas_float_complex *beta, rocblas_float_complex *C, rocblas_int ls_c, rocblas_int ld_c) { - TensileDataType type_c = tensileDataTypeComplexSingle; - TensileDataType type_a = tensileDataTypeComplexSingle; - TensileDataType type_b = tensileDataTypeComplexSingle; - TensileDataType type_alpha = tensileDataTypeComplexSingle; - TensileDataType type_beta = tensileDataTypeComplexSingle; + TensileDataType type_c = tensileDataTypeComplexSingle; + TensileDataType type_a = tensileDataTypeComplexSingle; + TensileDataType type_b = tensileDataTypeComplexSingle; + TensileDataType type_alpha = tensileDataTypeComplexSingle; + TensileDataType type_beta = tensileDataTypeComplexSingle; - rocblas_int bs_c; - rocblas_int bs_a; - rocblas_int bs_b; + rocblas_int bs_c; + 
rocblas_int bs_a; + rocblas_int bs_b; - infer_batch_strides( order, transa, transb, m, n, k, - ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); - rocblas_int batch_count = 1; + infer_batch_strides( order, trans_a, trans_b, m, n, k, + ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); + rocblas_int b_c = 1; - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } rocblas_status rocblas_zgemm_strided( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const rocblas_double_complex *alpha, const rocblas_double_complex *A, rocblas_int ls_a, rocblas_int ld_a, @@ -596,24 +710,24 @@ rocblas_status rocblas_zgemm_strided( const rocblas_double_complex *beta, rocblas_double_complex *C, rocblas_int ls_c, rocblas_int ld_c) { - TensileDataType type_c = tensileDataTypeComplexDouble; - TensileDataType type_a = tensileDataTypeComplexDouble; - TensileDataType type_b = tensileDataTypeComplexDouble; - TensileDataType type_alpha = tensileDataTypeComplexDouble; - TensileDataType type_beta = tensileDataTypeComplexDouble; + TensileDataType type_c = tensileDataTypeComplexDouble; + TensileDataType type_a = tensileDataTypeComplexDouble; + TensileDataType type_b = tensileDataTypeComplexDouble; + TensileDataType type_alpha = tensileDataTypeComplexDouble; + TensileDataType type_beta = tensileDataTypeComplexDouble; - rocblas_int bs_c; - rocblas_int bs_a; - rocblas_int bs_b; + rocblas_int bs_c; + rocblas_int bs_a; + rocblas_int bs_b; - infer_batch_strides( order, transa, transb, m, n, k, - ld_a, &bs_a, ld_b, &bs_b, ld_c, 
&bs_c ); - rocblas_int batch_count = 1; + infer_batch_strides( order, trans_a, trans_b, m, n, k, + ld_a, &bs_a, ld_b, &bs_b, ld_c, &bs_c ); + rocblas_int b_c = 1; - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } #endif @@ -622,19 +736,19 @@ rocblas_status rocblas_zgemm_strided( * bs_a - "batch stride a": stride from the start of one "A" matrix to the next * bs_b * bs_c - * batch_count - numbers of gemm's in the batch + * b_c - numbers of gemm's in the batch **************************************************************************/ // rocblas_status rocblas_hgemm_batched( // rocblas_handle handle, // rocblas_order order, -// rocblas_operation transa, rocblas_operation transb, +// rocblas_operation trans_a, rocblas_operation trans_b, // rocblas_int m, rocblas_int n, rocblas_int k, // const rocblas_half *alpha, // const rocblas_half *A, rocblas_int ld_a, rocblas_int bs_a, // const rocblas_half *B, rocblas_int ld_b, rocblas_int bs_b, // const rocblas_half *beta, // rocblas_half *C, rocblas_int ld_c, rocblas_int bs_c, -// rocblas_int batch_count ) { +// rocblas_int b_c ) { // // TensileDataType type_c = tensileDataTypeHalf; // TensileDataType type_a = tensileDataTypeHalf; @@ -646,151 +760,151 @@ rocblas_status rocblas_zgemm_strided( // rocblas_int ls_a = 1; // rocblas_int ls_b = 1; // -// return xgemm_tensile( handle, order, transa, transb, +// return xgemm_tensile( handle, order, trans_a, trans_b, // m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, // type_b, B, ls_b, ld_b, bs_b, type_beta, beta, -// type_c, C, ls_c, ld_c, bs_c, batch_count ); +// type_c, C, ls_c, ld_c, bs_c, b_c ); // } rocblas_status 
rocblas_sgemm_batched( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const float *alpha, const float *A, rocblas_int ld_a, rocblas_int bs_a, const float *B, rocblas_int ld_b, rocblas_int bs_b, const float *beta, float *C, rocblas_int ld_c, rocblas_int bs_c, - rocblas_int batch_count ) { - - TensileDataType type_c = tensileDataTypeSingle; - TensileDataType type_a = tensileDataTypeSingle; - TensileDataType type_b = tensileDataTypeSingle; - TensileDataType type_alpha = tensileDataTypeSingle; - TensileDataType type_beta = tensileDataTypeSingle; - - rocblas_int ls_c = 1; - rocblas_int ls_a = 1; - rocblas_int ls_b = 1; - - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + rocblas_int b_c ) { + + TensileDataType type_c = tensileDataTypeSingle; + TensileDataType type_a = tensileDataTypeSingle; + TensileDataType type_b = tensileDataTypeSingle; + TensileDataType type_alpha = tensileDataTypeSingle; + TensileDataType type_beta = tensileDataTypeSingle; + + rocblas_int ls_c = 1; + rocblas_int ls_a = 1; + rocblas_int ls_b = 1; + + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } rocblas_status rocblas_dgemm_batched( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const double *alpha, const double *A, rocblas_int ld_a, rocblas_int bs_a, const double *B, rocblas_int ld_b, rocblas_int bs_b, const double *beta, double *C, rocblas_int ld_c, rocblas_int bs_c, - rocblas_int 
batch_count ) { - - TensileDataType type_c = tensileDataTypeDouble; - TensileDataType type_a = tensileDataTypeDouble; - TensileDataType type_b = tensileDataTypeDouble; - TensileDataType type_alpha = tensileDataTypeDouble; - TensileDataType type_beta = tensileDataTypeDouble; - - rocblas_int ls_c = 1; - rocblas_int ls_a = 1; - rocblas_int ls_b = 1; - - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + rocblas_int b_c ) { + + TensileDataType type_c = tensileDataTypeDouble; + TensileDataType type_a = tensileDataTypeDouble; + TensileDataType type_b = tensileDataTypeDouble; + TensileDataType type_alpha = tensileDataTypeDouble; + TensileDataType type_beta = tensileDataTypeDouble; + + rocblas_int ls_c = 1; + rocblas_int ls_a = 1; + rocblas_int ls_b = 1; + + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } // rocblas_status rocblas_qgemm_batched( // rocblas_handle handle, // rocblas_order order, -// rocblas_operation transa, rocblas_operation transb, +// rocblas_operation trans_a, rocblas_operation trans_b, // rocblas_int m, rocblas_int n, rocblas_int k, // const rocblas_half_complex *alpha, // const rocblas_half_complex *A, rocblas_int ld_a, rocblas_int bs_a, // const rocblas_half_complex *B, rocblas_int ld_b, rocblas_int bs_b, // const rocblas_half_complex *beta, // rocblas_half_complex *C, rocblas_int ld_c, rocblas_int bs_c, -// rocblas_int batch_count ) { - -// TensileDataType type_c = tensileDataTypeComplexHalf; -// TensileDataType type_a = tensileDataTypeComplexHalf; -// TensileDataType type_b = tensileDataTypeComplexHalf; -// TensileDataType type_alpha = tensileDataTypeComplexHalf; -// TensileDataType type_beta = tensileDataTypeComplexHalf; - -// rocblas_int 
ls_c = 1; -// rocblas_int ls_a = 1; -// rocblas_int ls_b = 1; - -// return xgemm_tensile( handle, order, transa, transb, -// m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, -// type_b, B, ls_b, ld_b, bs_b, type_beta, beta, -// type_c, C, ls_c, ld_c, bs_c, batch_count ); +// rocblas_int b_c ) { + + // TensileDataType type_c = tensileDataTypeComplexHalf; + // TensileDataType type_a = tensileDataTypeComplexHalf; + // TensileDataType type_b = tensileDataTypeComplexHalf; + // TensileDataType type_alpha = tensileDataTypeComplexHalf; + // TensileDataType type_beta = tensileDataTypeComplexHalf; + + // rocblas_int ls_c = 1; + // rocblas_int ls_a = 1; + // rocblas_int ls_b = 1; + + // return xgemm_tensile( handle, order, trans_a, trans_b, + // m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + // type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + // type_c, C, ls_c, ld_c, bs_c, b_c ); // } #if COMPLEX rocblas_status rocblas_cgemm_batched( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const rocblas_float_complex *alpha, const rocblas_float_complex *A, rocblas_int ld_a, rocblas_int bs_a, const rocblas_float_complex *B, rocblas_int ld_b, rocblas_int bs_b, const rocblas_float_complex *beta, rocblas_float_complex *C, rocblas_int ld_c, rocblas_int bs_c, - rocblas_int batch_count ) { - - TensileDataType type_c = tensileDataTypeComplexSingle; - TensileDataType type_a = tensileDataTypeComplexSingle; - TensileDataType type_b = tensileDataTypeComplexSingle; - TensileDataType type_alpha = tensileDataTypeComplexSingle; - TensileDataType type_beta = tensileDataTypeComplexSingle; - - rocblas_int ls_c = 1; - rocblas_int ls_a = 1; - rocblas_int ls_b = 1; - - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, 
ls_c, ld_c, bs_c, batch_count ); + rocblas_int b_c ) { + + TensileDataType type_c = tensileDataTypeComplexSingle; + TensileDataType type_a = tensileDataTypeComplexSingle; + TensileDataType type_b = tensileDataTypeComplexSingle; + TensileDataType type_alpha = tensileDataTypeComplexSingle; + TensileDataType type_beta = tensileDataTypeComplexSingle; + + rocblas_int ls_c = 1; + rocblas_int ls_a = 1; + rocblas_int ls_b = 1; + + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } rocblas_status rocblas_zgemm_batched( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const rocblas_double_complex *alpha, const rocblas_double_complex *A, rocblas_int ld_a, rocblas_int bs_a, const rocblas_double_complex *B, rocblas_int ld_b, rocblas_int bs_b, const rocblas_double_complex *beta, rocblas_double_complex *C, rocblas_int ld_c, rocblas_int bs_c, - rocblas_int batch_count ) { - - TensileDataType type_c = tensileDataTypeComplexDouble; - TensileDataType type_a = tensileDataTypeComplexDouble; - TensileDataType type_b = tensileDataTypeComplexDouble; - TensileDataType type_alpha = tensileDataTypeComplexDouble; - TensileDataType type_beta = tensileDataTypeComplexDouble; - - rocblas_int ls_c = 1; - rocblas_int ls_a = 1; - rocblas_int ls_b = 1; - - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + rocblas_int b_c ) { + + TensileDataType type_c = tensileDataTypeComplexDouble; + TensileDataType type_a = tensileDataTypeComplexDouble; + TensileDataType type_b = tensileDataTypeComplexDouble; + TensileDataType type_alpha = 
tensileDataTypeComplexDouble; + TensileDataType type_beta = tensileDataTypeComplexDouble; + + rocblas_int ls_c = 1; + rocblas_int ls_a = 1; + rocblas_int ls_b = 1; + + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } #endif @@ -802,199 +916,155 @@ rocblas_status rocblas_zgemm_batched( * bs_a - "batch stride a": stride from the start of one "A" matrix to the next * bs_b * bs_c - * batch_count - numbers of gemm's in the batch + * b_c - numbers of gemm's in the batch **************************************************************************/ // rocblas_status rocblas_hgemm_strided_batched( // rocblas_handle handle, // rocblas_order order, -// rocblas_operation transa, rocblas_operation transb, +// rocblas_operation trans_a, rocblas_operation trans_b, // rocblas_int m, rocblas_int n, rocblas_int k, // const rocblas_half *alpha, // const rocblas_half *A, rocblas_int ls_a, rocblas_int ld_a, rocblas_int bs_a, // const rocblas_half *B, rocblas_int ls_b, rocblas_int ld_b, rocblas_int bs_b, // const rocblas_half *beta, // rocblas_half *C, rocblas_int ls_c, rocblas_int ld_c, rocblas_int bs_c, -// rocblas_int batch_count ) { -// -// TensileDataType type_c = tensileDataTypeHalf; -// TensileDataType type_a = tensileDataTypeHalf; -// TensileDataType type_b = tensileDataTypeHalf; -// TensileDataType type_alpha = tensileDataTypeHalf; -// TensileDataType type_beta = tensileDataTypeHalf; +// rocblas_int b_c ) { // -// return xgemm_tensile( handle, order, transa, transb, -// m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, -// type_b, B, ls_b, ld_b, bs_b, type_beta, beta, -// type_c, C, ls_c, ld_c, bs_c, batch_count ); + // TensileDataType type_c = tensileDataTypeHalf; + // TensileDataType type_a = tensileDataTypeHalf; + // TensileDataType type_b = tensileDataTypeHalf; + // TensileDataType type_alpha = tensileDataTypeHalf; + 
// TensileDataType type_beta = tensileDataTypeHalf; + // + // return xgemm_tensile( handle, order, trans_a, trans_b, + // m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + // type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + // type_c, C, ls_c, ld_c, bs_c, b_c ); // } rocblas_status rocblas_sgemm_strided_batched( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const float *alpha, const float *A, rocblas_int ls_a, rocblas_int ld_a, rocblas_int bs_a, const float *B, rocblas_int ls_b, rocblas_int ld_b, rocblas_int bs_b, const float *beta, float *C, rocblas_int ls_c, rocblas_int ld_c, rocblas_int bs_c, - rocblas_int batch_count ) { - - TensileDataType type_c = tensileDataTypeSingle; - TensileDataType type_a = tensileDataTypeSingle; - TensileDataType type_b = tensileDataTypeSingle; - TensileDataType type_alpha = tensileDataTypeSingle; - TensileDataType type_beta = tensileDataTypeSingle; - - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + rocblas_int b_c ) { + + TensileDataType type_c = tensileDataTypeSingle; + TensileDataType type_a = tensileDataTypeSingle; + TensileDataType type_b = tensileDataTypeSingle; + TensileDataType type_alpha = tensileDataTypeSingle; + TensileDataType type_beta = tensileDataTypeSingle; + + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } rocblas_status rocblas_dgemm_strided_batched( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, 
const double *alpha, const double *A, rocblas_int ls_a, rocblas_int ld_a, rocblas_int bs_a, const double *B, rocblas_int ls_b, rocblas_int ld_b, rocblas_int bs_b, const double *beta, double *C, rocblas_int ls_c, rocblas_int ld_c, rocblas_int bs_c, - rocblas_int batch_count ) { - - TensileDataType type_c = tensileDataTypeDouble; - TensileDataType type_a = tensileDataTypeDouble; - TensileDataType type_b = tensileDataTypeDouble; - TensileDataType type_alpha = tensileDataTypeDouble; - TensileDataType type_beta = tensileDataTypeDouble; - - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + rocblas_int b_c ) { + + TensileDataType type_c = tensileDataTypeDouble; + TensileDataType type_a = tensileDataTypeDouble; + TensileDataType type_b = tensileDataTypeDouble; + TensileDataType type_alpha = tensileDataTypeDouble; + TensileDataType type_beta = tensileDataTypeDouble; + + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } // rocblas_status rocblas_qgemm_strided_batched( // rocblas_handle handle, // rocblas_order order, -// rocblas_operation transa, rocblas_operation transb, +// rocblas_operation trans_a, rocblas_operation trans_b, // rocblas_int m, rocblas_int n, rocblas_int k, // const rocblas_half_complex *alpha, // const rocblas_half_complex *A, rocblas_int ls_a, rocblas_int ld_a, rocblas_int bs_a, // const rocblas_half_complex *B, rocblas_int ls_b, rocblas_int ld_b, rocblas_int bs_b, // const rocblas_half_complex *beta, // rocblas_half_complex *C, rocblas_int ls_c, rocblas_int ld_c, rocblas_int bs_c, -// rocblas_int batch_count ) { - -// TensileDataType type_c = tensileDataTypeComplexHalf; -// TensileDataType type_a = tensileDataTypeComplexHalf; -// TensileDataType 
type_b = tensileDataTypeComplexHalf; -// TensileDataType type_alpha = tensileDataTypeComplexHalf; -// TensileDataType type_beta = tensileDataTypeComplexHalf; - -// return xgemm_tensile( handle, order, transa, transb, -// m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, -// type_b, B, ls_b, ld_b, bs_b, type_beta, beta, -// type_c, C, ls_c, ld_c, bs_c, batch_count ); +// rocblas_int b_c ) { + + // TensileDataType type_c = tensileDataTypeComplexHalf; + // TensileDataType type_a = tensileDataTypeComplexHalf; + // TensileDataType type_b = tensileDataTypeComplexHalf; + // TensileDataType type_alpha = tensileDataTypeComplexHalf; + // TensileDataType type_beta = tensileDataTypeComplexHalf; + + // return xgemm_tensile( handle, order, trans_a, trans_b, + // m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + // type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + // type_c, C, ls_c, ld_c, bs_c, b_c ); // } #if COMPLEX rocblas_status rocblas_cgemm_strided_batched( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const rocblas_float_complex *alpha, const rocblas_float_complex *A, rocblas_int ls_a, rocblas_int ld_a, rocblas_int bs_a, const rocblas_float_complex *B, rocblas_int ls_b, rocblas_int ld_b, rocblas_int bs_b, const rocblas_float_complex *beta, rocblas_float_complex *C, rocblas_int ls_c, rocblas_int ld_c, rocblas_int bs_c, - rocblas_int batch_count ) { - - TensileDataType type_c = tensileDataTypeComplexSingle; - TensileDataType type_a = tensileDataTypeComplexSingle; - TensileDataType type_b = tensileDataTypeComplexSingle; - TensileDataType type_alpha = tensileDataTypeComplexSingle; - TensileDataType type_beta = tensileDataTypeComplexSingle; - - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, 
C, ls_c, ld_c, bs_c, batch_count ); + rocblas_int b_c ) { + + TensileDataType type_c = tensileDataTypeComplexSingle; + TensileDataType type_a = tensileDataTypeComplexSingle; + TensileDataType type_b = tensileDataTypeComplexSingle; + TensileDataType type_alpha = tensileDataTypeComplexSingle; + TensileDataType type_beta = tensileDataTypeComplexSingle; + + return xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } rocblas_status rocblas_zgemm_strided_batched( rocblas_handle handle, rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, const rocblas_double_complex *alpha, const rocblas_double_complex *A, rocblas_int ls_a, rocblas_int ld_a, rocblas_int bs_a, const rocblas_double_complex *B, rocblas_int ls_b, rocblas_int ld_b, rocblas_int bs_b, const rocblas_double_complex *beta, rocblas_double_complex *C, rocblas_int ls_c, rocblas_int ld_c, rocblas_int bs_c, - rocblas_int batch_count ) { - - TensileDataType type_c = tensileDataTypeComplexDouble; - TensileDataType type_a = tensileDataTypeComplexDouble; - TensileDataType type_b = tensileDataTypeComplexDouble; - TensileDataType type_alpha = tensileDataTypeComplexDouble; - TensileDataType type_beta = tensileDataTypeComplexDouble; - - return xgemm_tensile( handle, order, transa, transb, - m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, - type_b, B, ls_b, ld_b, bs_b, type_beta, beta, - type_c, C, ls_c, ld_c, bs_c, batch_count ); + rocblas_int b_c ) { + + TensileDataType type_c = tensileDataTypeComplexDouble; + TensileDataType type_a = tensileDataTypeComplexDouble; + TensileDataType type_b = tensileDataTypeComplexDouble; + TensileDataType type_alpha = tensileDataTypeComplexDouble; + TensileDataType type_beta = tensileDataTypeComplexDouble; + + return 
xgemm_tensile( handle, order, trans_a, trans_b, + m, n, k, type_alpha, alpha, type_a, A, ls_a, ld_a, bs_a, + type_b, B, ls_b, ld_b, bs_b, type_beta, beta, + type_c, C, ls_c, ld_c, bs_c, b_c ); } +// eliminating complex #endif -/******************************************************************************* - * Helper Functions - ******************************************************************************/ -TensileDataType conjugate_if_necessary( TensileDataType type, rocblas_operation trans ) { - if ( trans == rocblas_operation_conjugate_transpose ) { - switch ( type ) { - // case tensileDataTypeComplexHalf: - // return tensileDataTypeComplexConjugateHalf; - case tensileDataTypeComplexSingle: - return tensileDataTypeComplexConjugateSingle; - case tensileDataTypeComplexDouble: - return tensileDataTypeComplexConjugateDouble; - default: - // code should never come here, so create an error - return tensileDataTypeNone; - } - } else { - // not conjugate transposing - return type; - } -} -/******************************************************************************* - * Infer Batch Strides - ******************************************************************************/ -void infer_batch_strides( - rocblas_order order, - rocblas_operation transa, rocblas_operation transb, - rocblas_int m, rocblas_int n, rocblas_int k, - rocblas_int ld_a, rocblas_int *bs_a, - rocblas_int ld_b, rocblas_int *bs_b, - rocblas_int ld_c, rocblas_int *bs_c ) { - - int num_cols_a = (transa == rocblas_operation_none ? k : m); - int num_rows_a = (transa == rocblas_operation_none ? m : k); - int num_cols_b = (transb == rocblas_operation_none ? n : k); - int num_rows_b = (transb == rocblas_operation_none ? k : n); - int num_cols_c = m; - int num_rows_c = n; - - int dim1_size_a = (order==rocblas_order_column_major) ? num_cols_a : num_rows_a; - int dim1_size_b = (order==rocblas_order_column_major) ? num_cols_b : num_rows_b; - int dim1_size_c = (order==rocblas_order_column_major) ? 
num_cols_c : num_rows_c; - - *bs_a = ld_a * dim1_size_a; - *bs_b = ld_b * dim1_size_b; - *bs_c = ld_c * dim1_size_c; -} +// eliminating old code +#endif diff --git a/library/src/blas3/Tensile/gemm.h b/library/src/blas3/Tensile/gemm.h index 06818f847..3426cdfc5 100644 --- a/library/src/blas3/Tensile/gemm.h +++ b/library/src/blas3/Tensile/gemm.h @@ -1,12 +1,148 @@ #include "rocblas_types.h" #include "Tensile.h" -TensileDataType conjugate_if_necessary( TensileDataType type, rocblas_operation trans ); - -void infer_batch_strides( +/******************************************************************************* + * Infer Batch Strides + ******************************************************************************/ +inline void infer_batch_strides( rocblas_order order, - rocblas_operation transa, rocblas_operation transb, + rocblas_operation trans_a, rocblas_operation trans_b, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_int ld_a, rocblas_int *bs_a, rocblas_int ld_b, rocblas_int *bs_b, - rocblas_int ld_c, rocblas_int *bs_c ); + rocblas_int ld_c, rocblas_int *bs_c ) { + + rocblas_int num_cols_c = n; + rocblas_int num_rows_c = m; + rocblas_int num_cols_a = (trans_a == rocblas_operation_none ? k : m); + rocblas_int num_rows_a = (trans_a == rocblas_operation_none ? m : k); + rocblas_int num_cols_b = (trans_b == rocblas_operation_none ? n : k); + rocblas_int num_rows_b = (trans_b == rocblas_operation_none ? k : n); + + rocblas_int dim1_size_a = (order==rocblas_order_column_major) + ? num_cols_a : num_rows_a; + rocblas_int dim1_size_b = (order==rocblas_order_column_major) + ? num_cols_b : num_rows_b; + rocblas_int dim1_size_c = (order==rocblas_order_column_major) + ? 
num_cols_c : num_rows_c; + + *bs_a = ld_a * dim1_size_a; + *bs_b = ld_b * dim1_size_b; + *bs_c = ld_c * dim1_size_c; + +} // infer batched strides + + +/******************************************************************************* + * Validate Arguments + ******************************************************************************/ +inline rocblas_status validateArgs( + rocblas_handle handle, + rocblas_order order, + rocblas_operation trans_a, rocblas_operation trans_b, + rocblas_int m, rocblas_int n, rocblas_int k, + const void *alpha, + const void *a, rocblas_int ld_a, rocblas_int bs_a, + const void *b, rocblas_int ld_b, rocblas_int bs_b, + const void *beta, + void *c, rocblas_int ld_c, rocblas_int bs_c, rocblas_int b_c + ) { + + // quick return 0 is valid in BLAS + if ( m == 0 || n == 0 || k == 0 || b_c == 0) { + return rocblas_status_success; + } + + // sizes must not be negative + if ( m < 0 || n < 0 || k < 0 || b_c < 0) { + return rocblas_status_invalid_size; + } + + // strides must not be negative + if ( m < 0 || n < 0 || k < 0 || b_c < 0) { + return rocblas_status_invalid_size; + } + + // handle must be valid + if (handle == nullptr) { + return rocblas_status_invalid_handle; + } + + // pointers must be valid + if ( c == nullptr + || a == nullptr + || b == nullptr + || alpha == nullptr + || beta == nullptr ) { + return rocblas_status_invalid_pointer; + } + + rocblas_int num_cols_c = n; + rocblas_int num_rows_c = m; + rocblas_int num_cols_a = (trans_a == rocblas_operation_none) ? k : m; + rocblas_int num_rows_a = (trans_a == rocblas_operation_none) ? m : k; + rocblas_int num_cols_b = (trans_b == rocblas_operation_none) ? n : k; + rocblas_int num_rows_b = (trans_b == rocblas_operation_none) ? 
k : n; + + // valid strides + if(order==rocblas_order_column_major){ + if( num_rows_a > ld_a + || num_rows_b > ld_b + || num_rows_c > ld_c) { + return rocblas_status_invalid_size; + } + } else { + if( num_cols_a > ld_a + || num_cols_b > ld_b + || num_cols_c > ld_c) { + return rocblas_status_invalid_size; + } + } + + // TODO re-write these checks +#if 0 + // validate tensor c + if (tensor_c.dimensions[0].stride < 1) { + // user gave invalid ls_c + return rocblas_status_invalid_size; + } + if (tensor_c.dimensions[1].stride < tensor_c.dimensions[0].stride * tensor_c.dimensions[0].size ) { + // user gave invalid ld_c + return rocblas_status_invalid_size; + } + if (tensor_c.dimensions[2].stride < tensor_c.dimensions[1].stride * tensor_c.dimensions[1].size ) { + // user gave invalid bs_c + return rocblas_status_invalid_size; + } + + // validate tensor a + if (tensor_a.dimensions[0].stride < 1) { + // user gave invalid ls_a + return rocblas_status_invalid_size; + } + if (tensor_a.dimensions[1].stride < tensor_a.dimensions[0].stride * tensor_a.dimensions[0].size ) { + // user gave invalid ld_a + return rocblas_status_invalid_size; + } + if (tensor_a.dimensions[2].stride < tensor_a.dimensions[1].stride * tensor_a.dimensions[1].size ) { + // user gave invalid bs_a + return rocblas_status_invalid_size; + } + + // validate tensor b + if (tensor_b.dimensions[0].stride < 1) { + // user gave invalid ls_b + return rocblas_status_invalid_size; + } + if (tensor_b.dimensions[1].stride < tensor_b.dimensions[0].stride * tensor_b.dimensions[0].size ) { + // user gave invalid ld_b + return rocblas_status_invalid_size; + } + if (tensor_b.dimensions[2].stride < tensor_b.dimensions[1].stride * tensor_b.dimensions[1].size ) { + // user gave invalid bs_b + return rocblas_status_invalid_size; + } +#endif + return rocblas_status_success; +} // validate parameters + diff --git a/library/src/blas3/rocblas_gemm.cpp b/library/src/blas3/gemm.hpp similarity index 85% rename from 
library/src/blas3/rocblas_gemm.cpp rename to library/src/blas3/gemm.hpp index 0208a80c1..5a1d03c6e 100644 --- a/library/src/blas3/rocblas_gemm.cpp +++ b/library/src/blas3/gemm.hpp @@ -2,11 +2,36 @@ * Copyright 2016 Advanced Micro Devices, Inc. * * ************************************************************************ */ + + +#pragma once +#ifndef _GEMM_HPP_ +#define _GEMM_HPP_ + #include + template + rocblas_status rocblas_gemm_template(rocblas_handle handle, + rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, + const T *alpha, + const T *A, rocblas_int lda, + const T *B, rocblas_int ldb, + const T *beta, + T *C, rocblas_int ldc); + + template + rocblas_status rocblas_gemm_batched_template( + rocblas_handle handle, + rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, + const T *alpha, + const T *A, rocblas_int lda, rocblas_int bsa, + const T *B, rocblas_int ldb, rocblas_int bsb, + const T *beta, + T *C, rocblas_int ldc, rocblas_int bsc, + rocblas_int batch_count); -#include "rocblas.h" -#include "rocblas.hpp" #define COMPLEX 0 @@ -76,7 +101,7 @@ template<> rocblas_status -rocblas_gemm(rocblas_handle handle, +rocblas_gemm_template(rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, rocblas_int M, rocblas_int N, rocblas_int K, @@ -91,7 +116,7 @@ rocblas_gemm(rocblas_handle handle, template<> rocblas_status -rocblas_gemm(rocblas_handle handle, +rocblas_gemm_template(rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, rocblas_int M, rocblas_int N, rocblas_int K, @@ -108,7 +133,7 @@ rocblas_gemm(rocblas_handle handle, template<> rocblas_status -rocblas_gemm(rocblas_handle handle, +rocblas_gemm_template(rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, rocblas_int M, rocblas_int N, rocblas_int K, @@ -124,7 +149,7 @@ rocblas_gemm(rocblas_handle handle, template<> rocblas_status 
-rocblas_gemm(rocblas_handle handle, +rocblas_gemm_template(rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, rocblas_int M, rocblas_int N, rocblas_int K, @@ -213,7 +238,7 @@ rocblas_gemm(rocblas_handle handle, template<> rocblas_status -rocblas_gemm_batched(rocblas_handle handle, +rocblas_gemm_batched_template(rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, rocblas_int M, rocblas_int N, rocblas_int K, @@ -229,7 +254,7 @@ rocblas_gemm_batched(rocblas_handle handle, template<> rocblas_status -rocblas_gemm_batched(rocblas_handle handle, +rocblas_gemm_batched_template(rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, rocblas_int M, rocblas_int N, rocblas_int K, @@ -247,7 +272,7 @@ rocblas_gemm_batched(rocblas_handle handle, template<> rocblas_status -rocblas_gemm_batched(rocblas_handle handle, +rocblas_gemm_batched_template(rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, rocblas_int M, rocblas_int N, rocblas_int K, @@ -263,7 +288,7 @@ rocblas_gemm_batched(rocblas_handle handle, template<> rocblas_status -rocblas_gemm_batched(rocblas_handle handle, +rocblas_gemm_batched_template(rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, rocblas_int M, rocblas_int N, rocblas_int K, @@ -279,3 +304,4 @@ rocblas_gemm_batched(rocblas_handle handle, #endif +#endif // _GEMM_HPP_ diff --git a/library/src/blas3/rocblas_trmm.cpp b/library/src/blas3/rocblas_trmm.cpp index 4d710a4be..39f8d1c8f 100644 --- a/library/src/blas3/rocblas_trmm.cpp +++ b/library/src/blas3/rocblas_trmm.cpp @@ -4,9 +4,8 @@ * ************************************************************************ */ #include - + #include "rocblas.h" -#include "rocblas.hpp" #include "definitions.h" diff --git a/library/src/blas3/rocblas_trsm.cpp b/library/src/blas3/rocblas_trsm.cpp index 23d7eb272..e91fce2c0 100644 --- a/library/src/blas3/rocblas_trsm.cpp +++ b/library/src/blas3/rocblas_trsm.cpp @@ -2,18 
+2,14 @@ * Copyright 2016 Advanced Micro Devices, Inc. * * ************************************************************************ */ +#include #include - - #include "rocblas.h" -#include "rocblas.hpp" #include "status.h" #include "definitions.h" -#include "trtri_device.h" - - - +#include "gemm.hpp" +#include "trtri_trsm.hpp" #define A(ii, jj) (A + (ii) + (jj)*lda) #define B(ii, jj) (B + (ii) + (jj)*ldb) @@ -21,6 +17,7 @@ #define invA(ii) (invA + (ii)*BLOCK) + /* ===============left==================================================== */ template @@ -34,9 +31,9 @@ rocblas_status rocblas_trsm_left(rocblas_handle handle, const T* invA, T* X) { - const T* negative_one = -1.0; - const T* one = 1.0; - const T* zero = 0.0; + const T negtive_one = -1.0; + const T one = 1.0; + const T zero = 0.0; rocblas_int i, jb; @@ -47,17 +44,19 @@ rocblas_status rocblas_trsm_left(rocblas_handle handle, if (uplo == rocblas_fill_lower) { // left, lower no-transpose jb = min(BLOCK, m); - rocblas_gemm(handle, transA, transB, jb, n, jb, alpha, invA, BLOCK, B, ldb, zero, X, ldb); + rocblas_gemm_template(handle, transA, transB, jb, n, jb, alpha, invA, BLOCK, B, ldb, &zero, X, ldb); if (BLOCK < m) { - rocblas_gemm(handle, transA, transB, m-BLOCK, n, BLOCK, negative_one, A(BLOCK,0), lda, X, ldb, alpha, B(BLOCK,0), ldb); + rocblas_gemm_template(handle, transA, transB, m-BLOCK, n, BLOCK, &negtive_one, A(BLOCK,0), lda, X, ldb, alpha, B(BLOCK,0), ldb); // remaining blocks for( i=BLOCK; i < m; i += BLOCK ) { jb = min(m-i, BLOCK); - rocblas_gemm(handle, transA, transB, jb, n, jb, one, invA(i), BLOCK, B(i,0), ldb, zero, X(i,0), ldb); - if (i+BLOCK >= m) + + rocblas_gemm_template(handle, transA, transB, jb, n, jb, &one, invA(i), BLOCK, B(i,0), ldb, &zero, X(i,0), ldb); + if (i+BLOCK >= m)// this condition is not necessary at all and can be changed as if (i+BLOCK(handle, transA, transB, m-i-BLOCK, n, BLOCK, negative_one, A(i+BLOCK,i), lda, X(i,0), ldb, one, B(i+BLOCK,0), ldb); + + 
rocblas_gemm_template(handle, transA, transB, m-i-BLOCK, n, BLOCK, &negtive_one, A(i+BLOCK,i), lda, X(i,0), ldb, &one, B(i+BLOCK,0), ldb); } } @@ -65,9 +64,9 @@ rocblas_status rocblas_trsm_left(rocblas_handle handle, for( i=0; i < m; i += BLOCK ) { jb = min(m-i, BLOCK); T *tmp = (i == 0) ? alpha : one; - rocblas_gemm(handle, transA, transB, jb, n, jb, tmp, invA(i), BLOCK, B(i,0), ldb, zero, X(i,0), ldb); + rocblas_gemm_template(handle, transA, transB, jb, n, jb, tmp, invA(i), BLOCK, B(i,0), ldb, &zero, X(i,0), ldb); if(i + BLOCK < m){ - rocblas_gemm(handle, transA, transB, m-i-BLOCK, n, BLOCK, negative_one, A(i+BLOCK,i), lda, X(i,0), ldb, tmp, B(i+BLOCK,0), ldb); + rocblas_gemm_template(handle, transA, transB, m-i-BLOCK, n, BLOCK, &negtive_one, A(i+BLOCK,i), lda, X(i,0), ldb, tmp, B(i+BLOCK,0), ldb); } } @@ -76,17 +75,21 @@ rocblas_status rocblas_trsm_left(rocblas_handle handle, else { // left, upper no-transpose jb = (m % BLOCK == 0) ? BLOCK : (m % BLOCK); - i = m-jb; - rocblas_gemm(handle, transA, transB, jb, n, jb, alpha, invA(i), BLOCK, B(i,0), ldb, zero, X(i,0), ldb); + i = m-jb; + + //if m=n=35=lda=ldb, BLOCK =32, then jb = 3, i = 32; {3, 35, 3, 32, 35, 35} + rocblas_gemm_template(handle, transA, transB, jb, n, jb, alpha, invA(i), BLOCK, B(i,0), ldb, &zero, X(i,0), ldb); if (i-BLOCK >= 0) { - rocblas_gemm(handle, transA, transB, i, n, jb, negative_one, A(0,i), lda, X(i,0), ldb, alpha, B, ldb); + + rocblas_gemm_template(handle, transA, transB, i, n, jb, &negtive_one, A(0,i), lda, X(i,0), ldb, alpha, B, ldb); // remaining blocks for( i=m-jb-BLOCK; i >= 0; i -= BLOCK ) { - rocblas_gemm(handle, transA, transB, BLOCK, n, BLOCK, one, invA(i), BLOCK, B(i,0), ldb, zero, X(i,0), ldb); + //{32, 35, 32, 32, 35, 35} + rocblas_gemm_template(handle, transA, transB, BLOCK, n, BLOCK, &one, invA(i), BLOCK, B(i,0), ldb, &zero, X(i,0), ldb); if (i-BLOCK < 0) break; - rocblas_gemm(handle, transA, transB, i, n, BLOCK, negative_one, A(0,i), lda, X(i,0), ldb, one, B, ldb); + 
rocblas_gemm_template(handle, transA, transB, i, n, BLOCK, &negtive_one, A(0,i), lda, X(i,0), ldb, &one, B, ldb); } } } @@ -96,33 +99,33 @@ rocblas_status rocblas_trsm_left(rocblas_handle handle, // left, lower transpose jb = (m % BLOCK == 0) ? BLOCK : (m % BLOCK); i = m-jb; - rocblas_gemm(handle, transA, transB, jb, n, jb, alpha, invA(i), BLOCK, B(i,0), ldb, zero, X(i,0), ldb); + rocblas_gemm_template(handle, transA, transB, jb, n, jb, alpha, invA(i), BLOCK, B(i,0), ldb, &zero, X(i,0), ldb); if (i-BLOCK >= 0) { - rocblas_gemm(handle, transA, transB, i, n, jb, negative_one, A(i,0), lda, X(i,0), ldb, alpha, B, ldb); + rocblas_gemm_template(handle, transA, transB, i, n, jb, &negtive_one, A(i,0), lda, X(i,0), ldb, alpha, B, ldb); // remaining blocks for( i=m-jb-BLOCK; i >= 0; i -= BLOCK ) { - rocblas_gemm(handle, transA, transB, BLOCK, n, BLOCK, one, invA(i), BLOCK, B(i,0), ldb, zero, X(i,0), ldb); + rocblas_gemm_template(handle, transA, transB, BLOCK, n, BLOCK, &one, invA(i), BLOCK, B(i,0), ldb, &zero, X(i,0), ldb); if (i-BLOCK < 0) break; - rocblas_gemm(handle, transA, transB, i, n, BLOCK, negative_one, A(i,0), lda, X(i,0), ldb, one, B, ldb); + rocblas_gemm_template(handle, transA, transB, i, n, BLOCK, &negtive_one, A(i,0), lda, X(i,0), ldb, &one, B, ldb); } } } else { // left, upper transpose jb = min(BLOCK, m); - rocblas_gemm(handle, transA, transB, jb, n, jb, alpha, invA, BLOCK, B, ldb, zero, X, ldb); + rocblas_gemm_template(handle, transA, transB, jb, n, jb, alpha, invA, BLOCK, B, ldb, &zero, X, ldb); if (BLOCK < m) { - rocblas_gemm(handle, transA, transB, m-BLOCK, n, BLOCK, negative_one, A(0,BLOCK), lda, X, ldb, alpha, B(BLOCK,0), ldb); + rocblas_gemm_template(handle, transA, transB, m-BLOCK, n, BLOCK, &negtive_one, A(0,BLOCK), lda, X, ldb, alpha, B(BLOCK,0), ldb); // remaining blocks for( i=BLOCK; i < m; i += BLOCK ) { jb = min(m-i, BLOCK); - rocblas_gemm(handle, transA, transB, jb, n, jb, one, invA(i), BLOCK, B(i,0), ldb, zero, X(i,0), ldb); + 
rocblas_gemm_template(handle, transA, transB, jb, n, jb, &one, invA(i), BLOCK, B(i,0), ldb, &zero, X(i,0), ldb); if (i+BLOCK >= m) break; - rocblas_gemm(handle, transA, transB, m-i-BLOCK, n, BLOCK, negative_one, A(i,i+BLOCK), lda, X(i,0), ldb, one, B(i+BLOCK,0), ldb); + rocblas_gemm_template(handle, transA, transB, m-i-BLOCK, n, BLOCK, &negtive_one, A(i,i+BLOCK), lda, X(i,0), ldb, &one, B(i+BLOCK,0), ldb); } } } @@ -146,9 +149,9 @@ rocblas_status rocblas_trsm_right(rocblas_handle handle, const T* invA, T* X) { - const T* negative_one = -1.0; - const T* one = 1.0; - const T* zero = 0.0; + const T negtive_one = -1.0; + const T one = 1.0; + const T zero = 0.0; rocblas_int i, jb; @@ -160,33 +163,33 @@ rocblas_status rocblas_trsm_right(rocblas_handle handle, // right, lower no-transpose jb = (n % BLOCK == 0) ? BLOCK : (n % BLOCK); i = n-jb; - rocblas_gemm(handle, transB, transA, m, jb, jb, alpha, B(0,i), ldb, invA(i), BLOCK, zero, X(0,i), ldb); + rocblas_gemm_template(handle, transB, transA, m, jb, jb, alpha, B(0,i), ldb, invA(i), BLOCK, &zero, X(0,i), ldb); if (i-BLOCK >= 0) { - rocblas_gemm(handle, transB, transA, m, i, jb, negative_one, X(0,i), ldb, A(i,0), lda, alpha, B, ldb); + rocblas_gemm_template(handle, transB, transA, m, i, jb, &negtive_one, X(0,i), ldb, A(i,0), lda, alpha, B, ldb); // remaining blocks for( i=n-jb-BLOCK; i >= 0; i -= BLOCK ) { - rocblas_gemm(handle, transB, transA, m, BLOCK, BLOCK, one, B(0,i), ldb, invA(i), BLOCK, zero, X(0,i), ldb); + rocblas_gemm_template(handle, transB, transA, m, BLOCK, BLOCK, &one, B(0,i), ldb, invA(i), BLOCK, &zero, X(0,i), ldb); if (i-BLOCK < 0) break; - rocblas_gemm(handle, transB, transA, m, i, BLOCK, negative_one, X(0,i), ldb, A(i,0), lda, one, B, ldb); + rocblas_gemm_template(handle, transB, transA, m, i, BLOCK, &negtive_one, X(0,i), ldb, A(i,0), lda, &one, B, ldb); } } } else { // right, upper no-transpose jb = min(BLOCK, n); - rocblas_gemm(handle, transB, transA, m, jb, jb, alpha, B, ldb, invA, BLOCK, zero, X, 
ldb); + rocblas_gemm_template(handle, transB, transA, m, jb, jb, alpha, B, ldb, invA, BLOCK, &zero, X, ldb); if (BLOCK < n) { - rocblas_gemm(handle, transB, transA, m, n-BLOCK, BLOCK, negative_one, X, ldb, A(0,BLOCK), lda, alpha, B(0,BLOCK), ldb); + rocblas_gemm_template(handle, transB, transA, m, n-BLOCK, BLOCK, &negtive_one, X, ldb, A(0,BLOCK), lda, alpha, B(0,BLOCK), ldb); // remaining blocks for( i=BLOCK; i < n; i += BLOCK ) { jb = min(BLOCK, n-i); - rocblas_gemm(handle, transB, transA, m, jb, jb, one, B(0,i), ldb, invA(i), BLOCK, zero, X(0,i), ldb); + rocblas_gemm_template(handle, transB, transA, m, jb, jb, &one, B(0,i), ldb, invA(i), BLOCK, &zero, X(0,i), ldb); if (i+BLOCK >= n) break; - rocblas_gemm(handle, transB, transA, m, n-i-BLOCK, BLOCK, negative_one, X(0,i), ldb, A(i,i+BLOCK), lda, one, B(0,i+BLOCK), ldb); + rocblas_gemm_template(handle, transB, transA, m, n-i-BLOCK, BLOCK, &negtive_one, X(0,i), ldb, A(i,i+BLOCK), lda, &one, B(0,i+BLOCK), ldb); } } } @@ -195,17 +198,17 @@ rocblas_status rocblas_trsm_right(rocblas_handle handle, if (uplo == rocblas_fill_lower) { // right, lower transpose jb = min(BLOCK, n); - rocblas_gemm(handle, transB, transA, m, jb, jb, alpha, B, ldb, invA, BLOCK, zero, X, ldb); + rocblas_gemm_template(handle, transB, transA, m, jb, jb, alpha, B, ldb, invA, BLOCK, &zero, X, ldb); if (BLOCK < n) { - rocblas_gemm(handle, transB, transA, m, n-BLOCK, BLOCK, negative_one, X, ldb, A(BLOCK,0), lda, alpha, B(0,BLOCK), ldb); + rocblas_gemm_template(handle, transB, transA, m, n-BLOCK, BLOCK, &negtive_one, X, ldb, A(BLOCK,0), lda, alpha, B(0,BLOCK), ldb); // remaining blocks for( i=BLOCK; i < n; i += BLOCK ) { jb = min(BLOCK, n-i); - rocblas_gemm(handle, transB, transA, m, jb, jb, one, B(0,i), ldb, invA(i), BLOCK, zero, X(0,i), ldb); + rocblas_gemm_template(handle, transB, transA, m, jb, jb, &one, B(0,i), ldb, invA(i), BLOCK, &zero, X(0,i), ldb); if (i+BLOCK >= n) break; - rocblas_gemm(handle, transB, transA, m, n-i-BLOCK, BLOCK, negative_one, 
X(0,i), ldb, A(BLOCK+i,i), lda, one, B(0,i+BLOCK), ldb); + rocblas_gemm_template(handle, transB, transA, m, n-i-BLOCK, BLOCK, &negtive_one, X(0,i), ldb, A(BLOCK+i,i), lda, &one, B(0,i+BLOCK), ldb); } } } @@ -213,16 +216,16 @@ rocblas_status rocblas_trsm_right(rocblas_handle handle, // right, upper transpose jb = (n % BLOCK == 0) ? BLOCK : (n % BLOCK); i = n-jb; - rocblas_gemm(handle, transB, transA, m, jb, jb, alpha, B(0,i), ldb, invA(i), BLOCK, zero, X(0,i), ldb); + rocblas_gemm_template(handle, transB, transA, m, jb, jb, alpha, B(0,i), ldb, invA(i), BLOCK, &zero, X(0,i), ldb); if (i-BLOCK >= 0) { - rocblas_gemm(handle, transB, transA, m, i, jb, negative_one, X(0,i), ldb, A(0,i), lda, alpha, B, ldb); + rocblas_gemm_template(handle, transB, transA, m, i, jb, &negtive_one, X(0,i), ldb, A(0,i), lda, alpha, B, ldb); // remaining blocks for( i=n-jb-BLOCK; i >= 0; i -= BLOCK ) { - rocblas_gemm(handle, transB, transA, m, BLOCK, BLOCK, one, B(0,i), ldb, invA(i), BLOCK, zero, X(0,i), ldb); + rocblas_gemm_template(handle, transB, transA, m, BLOCK, BLOCK, &one, B(0,i), ldb, invA(i), BLOCK, &zero, X(0,i), ldb); if (i-BLOCK < 0) break; - rocblas_gemm(handle, transB, transA, m, i, BLOCK, negative_one, X(0,i), ldb, A(0,i), lda, one, B, ldb); + rocblas_gemm_template(handle, transB, transA, m, i, BLOCK, &negtive_one, X(0,i), ldb, A(0,i), lda, &one, B, ldb); } } } @@ -285,7 +288,7 @@ rocblas_status rocblas_trsm_right(rocblas_handle handle, @param[in] alpha alpha specifies the scalar alpha. When alpha is - zero then A is not referenced and B need not be set before + &zero then A is not referenced and B need not be set before entry. 
@param[in] @@ -316,8 +319,8 @@ rocblas_status rocblas_trsm_template(rocblas_handle handle, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, const T* alpha, - const T* A, rocblas_int lda, - T* B, rocblas_int ldb) + T* A, rocblas_int lda, + T* B, rocblas_int ldb) { //A is of size lda*k rocblas_int k = (side == rocblas_side_left ? m : n); @@ -345,23 +348,23 @@ rocblas_status rocblas_trsm_template(rocblas_handle handle, if (m == 0 || n == 0) return rocblas_status_success; - T* invA, X; + T* invA; + T* X; //invA is of size BLOCK*k, BLOCK is the blocking size - RETURN_IF_HIP_ERROR(hipMalloc( &invA, BLOCK*k )); + PRINT_IF_HIP_ERROR(hipMalloc( &invA, BLOCK*k*sizeof(T) )); //X is the same size of B - RETURN_IF_HIP_ERROR(hipMalloc( &X, ldb*n )); + PRINT_IF_HIP_ERROR(hipMalloc( &X, ldb*n * sizeof(T) )); - //intialize invA and X to be zero - RETURN_IF_HIP_ERROR(hipMemset(invA, 0, BLOCK*k*sizeof(T))); + //intialize invA and X to be &zero + PRINT_IF_HIP_ERROR(hipMemset(invA, 0, BLOCK*k*sizeof(T))); //potential bug, may use hipMemcpy B to X - RETURN_IF_HIP_ERROR(hipMemset(X, 0, ldb*n*sizeof(T))); + PRINT_IF_HIP_ERROR(hipMemset(X, 0, ldb*n*sizeof(T))); //batched trtri invert diagonal part (BLOCK*BLOCK) of A into invA - rocblas_status status = rocblas_trtri_trsm(handle, uplo, diag, + rocblas_status status = rocblas_trtri_trsm_template(handle, uplo, diag, k, A, lda, invA); - if(status != rocblas_status_success) return status; if (side == rocblas_side_left) { status = rocblas_trsm_left(handle, uplo, transA, m, n, alpha, A, lda, B, ldb, invA, X); @@ -370,48 +373,16 @@ rocblas_status rocblas_trsm_template(rocblas_handle handle, status = rocblas_trsm_right(handle, uplo, transA, m, n, alpha, A, lda, B, ldb, invA, X); } - - RETURN_IF_HIP_ERROR(hipFree(invA)); - RETURN_IF_HIP_ERROR(hipFree(X)); + #ifndef NDEBUG + printf("copy x to b\n"); + #endif + PRINT_IF_HIP_ERROR(hipMemcpy(B, X, ldb*n*sizeof(T), hipMemcpyDeviceToDevice));//TODO: optimized it with copy 
kernel + PRINT_IF_HIP_ERROR(hipFree(invA)); + PRINT_IF_HIP_ERROR(hipFree(X)); return status; } -/* ============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * =========================================================================== - */ - -template<> -rocblas_status -rocblas_trsm(rocblas_handle handle, - rocblas_side side, rocblas_fill uplo, - rocblas_operation transA, rocblas_diagonal diag, - rocblas_int m, rocblas_int n, - const float* alpha, - const float* A, rocblas_int lda, - float* B, rocblas_int ldb){ - - //shared memory usuage is (192/2)^2 * sizeof(float) = 36K. LDS is 64K per CU. Theoretically u can use all 64K, but in practice no. - return rocblas_trsm_template(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb); -} - -template<> -rocblas_status -rocblas_trsm(rocblas_handle handle, - rocblas_side side, rocblas_fill uplo, - rocblas_operation transA, rocblas_diagonal diag, - rocblas_int m, rocblas_int n, - const double* alpha, - const double* A, rocblas_int lda, - double* B, rocblas_int ldb){ - //shared memory usuage is (128/2)^2 * sizeof(float) = 32K. LDS is 64K per CU. Theoretically u can use all 64K, but in practice no. - return rocblas_trsm_template(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb); -} /* ============================================================================================ */ @@ -429,12 +400,14 @@ rocblas_strsm(rocblas_handle handle, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, const float* alpha, - const float* A, rocblas_int lda, - float* B, rocblas_int ldb){ + float* A, rocblas_int lda, + float* B, rocblas_int ldb){ - return rocblas_trsm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb); + //shared memory usuage is (192/2)^2 * sizeof(float) = 36K. LDS is 64K per CU. 
Theoretically u can use all 64K, but in practice no. + return rocblas_trsm_template(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb); } + extern "C" rocblas_status rocblas_dtrsm(rocblas_handle handle, @@ -442,8 +415,9 @@ rocblas_dtrsm(rocblas_handle handle, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, const double* alpha, - const double* A, rocblas_int lda, - double* B, rocblas_int ldb){ + double* A, rocblas_int lda, + double* B, rocblas_int ldb){ - return rocblas_trsm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb); + //shared memory usuage is (128/2)^2 * sizeof(float) = 32K. LDS is 64K per CU. Theoretically u can use all 64K, but in practice no. + return rocblas_trsm_template(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb); } diff --git a/library/src/blas3/rocblas_trtri.cpp b/library/src/blas3/rocblas_trtri.cpp index 9a2aeeec4..5b4f8b954 100644 --- a/library/src/blas3/rocblas_trtri.cpp +++ b/library/src/blas3/rocblas_trtri.cpp @@ -4,357 +4,16 @@ * * ************************************************************************ */ #include - - - #include "rocblas.h" -#include "rocblas.hpp" -#include "trtri_device.h" -#include "definitions.h" - - - -/* ============================================================================================ */ - - -/* - when n <= IB -*/ - -template -__global__ void -trtri_small_kernel(hipLaunchParm lp, - rocblas_fill uplo, - rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, - T *invA, rocblas_int ldinvA) -{ - trtri_device(uplo, diag, n, A, lda, invA, ldinvA); -} - - -template -rocblas_status -rocblas_trtri_small(rocblas_handle handle, - rocblas_fill uplo, rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, - T *invA, rocblas_int ldinvA) -{ - - if(n > IB ){ - printf("n is %d must be less than %d, will exit\n", n, IB); - return rocblas_status_not_implemented; - } - - dim3 grid(1, 1, 1); - dim3 threads(IB, 1, 1); - - hipStream_t 
rocblas_stream; - RETURN_IF_ROCBLAS_ERROR(rocblas_get_stream(handle, &rocblas_stream)); - - hipLaunchKernel(HIP_KERNEL_NAME(trtri_small_kernel), dim3(grid), dim3(threads), 0, rocblas_stream, uplo, diag, n, A, lda, invA, ldinvA ); - - return rocblas_status_success; -} - - -/* ============================================================================================ */ - - -/* - Invert the IB by IB diagonal blocks of A of size n by n, - and stores the results in part of invA -*/ - -template -__global__ void -trtri_diagonal_kernel(hipLaunchParm lp, - rocblas_fill uplo, - rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, - T *invA, rocblas_int ldinvA) -{ - //get the individual matrix which is processed by device function - //device function only see one matrix - - // each hip thread Block compute a inverse of a IB * IB diagonal block of A - // notice the last digaonal block may be smaller than IB*IB - - T *individual_A = A + hipBlockIdx_x * IB * lda + hipBlockIdx_x * IB; - T *individual_invA = invA + hipBlockIdx_x * IB * ldinvA + hipBlockIdx_x * IB; - - trtri_device(uplo, diag, min(IB, n- hipBlockIdx_x * IB), individual_A, lda, individual_invA, ldinvA); - -} - - -/* - suppose nn is the orginal matrix AA' size (denoted as n in TOP API) - A, B, C, submatrices in AA in trtri - this special gemm performs D = -A*B*C after trtri - - if lower, - D = -A*B*C ==> invA21 = -invA22*A21*invA11, - let m = (nn-IB), n = IB, - - if upper, - D = -A*B*C ==> invA12 = -invA11*A12*invA22, - let m = IB, n = (nn-IB), - - Then, either case, - D is of m * n - A is of m * m - B if of m * n - C is of n * n - - since m <= IB, n <= IB, - create a shared memory space as the buffer to store the intermediate results of W=B*C, A*W - -*/ - -template -__global__ void -gemm_trsm_kernel(hipLaunchParm lp, - rocblas_int m, rocblas_int n, - T *A, rocblas_int lda, - T *B, rocblas_int ldb, - T *C, rocblas_int ldc, - T *D, rocblas_int ldd) -{ - - __shared__ T shared_tep[IB*IB]; - __shared__ T 
vec[IB]; - T reg[IB]; - - rocblas_int tx = hipThreadIdx_x; - - //read B into registers, B is of m * n - if(tx < m){ - for(int col=0;col invA11 = A11^{-1}, by trtri directly - A22*invA22 = I -> invA22 = A22^{-1}, by trtri directly - A21*invA11 + invA22*invA21 = 0 -> invA21 = -A22^{-1}*A21*invA11 = -invA22*A21*invA11, by gemm - - - If A is a upper triangular matrix, to compute the invA - all of Aii, invAii are of size IB by IB - - [ A11 A12 ] * [ invA11 invA12 ] = [ I 0 ] - [ 0 A22 ] [ 0 invA22 ] [ 0 I ] - - A11*invA11 = I -> invA11 = A11^{-1}, by trtri directly - A22*invA22 = I -> invA22 = A22^{-1}, by trtri directly - A11*invA12 + A12*invA22 = 0 -> invA12 = -A11^{-1}*A12*invA22 = -invA11*A12*invA22, by gemm - -*/ - - -template -rocblas_status -rocblas_trtri_large(rocblas_handle handle, - rocblas_fill uplo, rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, - T *invA, rocblas_int ldinvA) -{ - - if(n > 2 * IB ){ - printf("n is %d, n must be less than %d, will return\n", n, 2*IB); - return rocblas_status_not_implemented; - } - - hipStream_t rocblas_stream; - RETURN_IF_ROCBLAS_ERROR(rocblas_get_stream(handle, &rocblas_stream)); - - dim3 grid_trtri(2, 1, 1); - dim3 threads(IB, 1, 1); - - //first stage: invert IB * IB diagoanl blocks of A and write the result of invA11 and invA22 in invA - hipLaunchKernel(HIP_KERNEL_NAME(trtri_diagonal_kernel), dim3(grid_trtri), dim3(threads), 0, rocblas_stream, - uplo, diag, n, A, lda, invA, ldinvA); - - - if( n <= IB ){ - //if n is too small, no invA21 or invA12 exist, gemm is not required - return rocblas_status_success; - } - - //second stage: using a special gemm to compute invA21 (lower) or invA12 (upper) - dim3 grid_gemm(1, 1, 1); - - rocblas_int m_gemm; - rocblas_int n_gemm; - T *A_gemm; - T *B_gemm; - T *C_gemm; - T *D_gemm; - - if(uplo == rocblas_fill_lower){ - // perform D = -A*B*C ==> invA21 = -invA22*A21*invA11, - m_gemm = (n-IB); - n_gemm = IB; - A_gemm = invA + IB + IB * ldinvA; // invA22 - B_gemm = A + IB; 
//A21 - C_gemm = invA; //invA11 - D_gemm = invA + IB; //invA21 - } - else{ - // perform D = -A*B*C ==> invA12 = -invA11*A12*invA22, - m_gemm = IB; - n_gemm = (n-IB); - A_gemm = invA; // invA11 - B_gemm = A + lda * IB; //A12 - C_gemm = invA + IB + IB * ldinvA; //invA22 - D_gemm = invA + IB * ldinvA; // invA12 - } - - hipLaunchKernel(HIP_KERNEL_NAME(gemm_trsm_kernel), dim3(grid_gemm), dim3(threads), 0, rocblas_stream, - m_gemm, n_gemm, A_gemm, ldinvA, B_gemm, lda, C_gemm, ldinvA, D_gemm, ldinvA); - - return rocblas_status_success; - +namespace trtri{ //must use namespace to avoid multply definiton + #include "trtri.hpp" } - -/* ============================================================================================ */ - -/*! \brief BLAS Level 3 API - - \details - trtri compute the inverse of a matrix A, namely, invA - - and write the result into invA; - - @param[in] - handle rocblas_handle. - handle to the rocblas library context queue. - @param[in] - uplo rocblas_fill. - specifies whether the upper 'rocblas_fill_upper' or lower 'rocblas_fill_lower' - if rocblas_fill_upper, the lower part of A is not referenced - if rocblas_fill_lower, the upper part of A is not referenced - @param[in] - diag rocblas_diagonal. - = 'rocblas_diagonal_non_unit', A is non-unit triangular; - = 'rocblas_diagonal_unit', A is unit triangular; - @param[in] - n rocblas_int. - size of matrix A and invA - @param[in] - A pointer storing matrix A on the GPU. - @param[in] - lda rocblas_int - specifies the leading dimension of A. - @param[output] - invA pointer storing matrix invA on the GPU. - @param[in] - ldinvA rocblas_int - specifies the leading dimension of invA. 
- -********************************************************************/ - -template -rocblas_status -rocblas_trtri_template(rocblas_handle handle, - rocblas_fill uplo, rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, - T *invA, rocblas_int ldinvA) -{ - - if(handle == nullptr) - return rocblas_status_invalid_handle; - else if ( uplo != rocblas_fill_lower && uplo != rocblas_fill_upper) - return rocblas_status_not_implemented; - else if ( n < 0 ) - return rocblas_status_invalid_size; - else if ( A == nullptr ) - return rocblas_status_invalid_pointer; - else if ( lda < n ) - return rocblas_status_invalid_size; - else if ( invA == nullptr ) - return rocblas_status_invalid_pointer; - else if ( ldinvA < n ) - return rocblas_status_invalid_size; - - - if (n <= IB){ - return rocblas_trtri_small(handle, uplo, diag, n, A, lda, invA, ldinvA); - } - else if( n <= 2 * IB){ - return rocblas_trtri_large(handle, uplo, diag, n, A, lda, invA, ldinvA); - } - else{ - printf("n is %d, n must be less than %d, will return\n", n, 2*IB); - return rocblas_status_not_implemented; - } -} - - /* ============================================================================================ */ /* * =========================================================================== - * template interface - * template specialization + * C interface * =========================================================================== */ @@ -365,24 +24,24 @@ rocblas_trtri_template(rocblas_handle handle, //typically, only a small matrix A is inverted by trtri, so if n is too big, trtri is not implemented //trtri is usually called by trsm -template<> +extern "C" rocblas_status -rocblas_trtri(rocblas_handle handle, +rocblas_strtri(rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, float *A, rocblas_int lda, float *invA, rocblas_int ldinvA){ - return rocblas_trtri_template(handle, uplo, diag, n, A, lda, invA, ldinvA); + return trtri::rocblas_trtri_template(handle, uplo, diag, 
n, A, lda, invA, ldinvA); } -template<> +extern "C" rocblas_status -rocblas_trtri(rocblas_handle handle, +rocblas_dtrtri(rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, double *A, rocblas_int lda, double *invA, rocblas_int ldinvA){ - return rocblas_trtri_template(handle, uplo, diag, n, A, lda, invA, ldinvA); + return trtri::rocblas_trtri_template(handle, uplo, diag, n, A, lda, invA, ldinvA); } diff --git a/library/src/blas3/rocblas_trtri_batched.cpp b/library/src/blas3/rocblas_trtri_batched.cpp index 67cc8c069..d57b9fd21 100644 --- a/library/src/blas3/rocblas_trtri_batched.cpp +++ b/library/src/blas3/rocblas_trtri_batched.cpp @@ -1,139 +1,11 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. - * - * ************************************************************************ */ + * * Copyright 2016 Advanced Micro Devices, Inc. + * * + * * ************************************************************************ */ #include - - #include "rocblas.h" -#include "rocblas.hpp" -#include "definitions.h" -#include "trtri_device.h" - -//flag indicate whether write into A or invA -template -__global__ void -trtri_kernel_batched(hipLaunchParm lp, - rocblas_fill uplo, - rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, rocblas_int bsa, - T *invA, rocblas_int ldinvA, rocblas_int bsinvA) -{ - //get the individual matrix which is processed by device function - //device function only see one matrix - T *individual_A = A + hipBlockIdx_z * bsa; - T *individual_invA = invA + hipBlockIdx_z * bsinvA; - - trtri_device(uplo, diag, n, individual_A, lda, individual_invA, ldinvA); -} - - - - - -/* ============================================================================================ */ - -/*! \brief BLAS Level 3 API - - \details - trtri compute the inverse of a matrix A - - inv(A); - - @param[in] - handle rocblas_handle. - handle to the rocblas library context queue. 
- @param[in] - uplo rocblas_fill. - specifies whether the upper 'rocblas_fill_upper' or lower 'rocblas_fill_lower' - @param[in] - diag rocblas_diagonal. - = 'rocblas_diagonal_non_unit', A is non-unit triangular; - = 'rocblas_diagonal_unit', A is unit triangular; - @param[in] - n rocblas_int. - @param[in] - A pointer storing matrix A on the GPU. - @param[in] - lda rocblas_int - specifies the leading dimension of A. - @param[in] - bsa rocblas_int - "batch stride a": stride from the start of one "A" matrix to the next - @param[output] - invA pointer storing the inverse matrix A on the GPU. - @param[in] - ldinvA rocblas_int - specifies the leading dimension of invA. - @param[in] - bsinvA rocblas_int - "batch stride invA": stride from the start of one "invA" matrix to the next - @param[in] - batch_count rocblas_int - numbers of matrices in the batch - ********************************************************************/ - -//because of shared memory size, the NB_X must be <= 64 -#define NB_X 32 - -//assume invA has already been allocated, recommened for repeated calling of trtri product routine -template -rocblas_status -rocblas_trtri_batched_template(rocblas_handle handle, - rocblas_fill uplo, - rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, rocblas_int bsa, - T *invA, rocblas_int ldinvA, rocblas_int bsinvA, - rocblas_int batch_count) -{ - if(handle == nullptr) - return rocblas_status_invalid_handle; - else if ( uplo != rocblas_fill_lower && uplo != rocblas_fill_upper) - return rocblas_status_not_implemented; - else if ( n < 0 ) - return rocblas_status_invalid_size; - else if ( A == nullptr ) - return rocblas_status_invalid_pointer; - else if ( lda < n ) - return rocblas_status_invalid_size; - else if ( bsa < lda*n ) - return rocblas_status_invalid_size; - else if ( invA == nullptr ) - return rocblas_status_invalid_pointer; - else if ( ldinvA < n ) - return rocblas_status_invalid_size; - else if ( bsinvA < ldinvA*n ) - return 
rocblas_status_invalid_size; - else if ( batch_count < 0 ) - return rocblas_status_invalid_size; - - /* - * Quick return if possible. - */ - - if ( n == 0 || batch_count == 0) - return rocblas_status_success; - - if(n > NB_X ){ - printf("n is %d, n must be less than %d, will return\n", n, NB_X); - return rocblas_status_not_implemented; - } - - dim3 grid(1, 1, batch_count); - dim3 threads(NB_X, 1, 1); - - hipStream_t rocblas_stream; - RETURN_IF_ROCBLAS_ERROR(rocblas_get_stream(handle, &rocblas_stream)); - - hipLaunchKernel(HIP_KERNEL_NAME(trtri_kernel_batched), dim3(grid), dim3(threads), 0, rocblas_stream, - uplo, diag, n, A, lda, bsa, invA, ldinvA, bsinvA); - - return rocblas_status_success; - -} +#include "trtri_batched.hpp" @@ -141,16 +13,15 @@ rocblas_trtri_batched_template(rocblas_handle handle, /* * =========================================================================== - * template interface - * template specialization + * C interface * This function is called by trsm * =========================================================================== */ -template<> +extern "C" rocblas_status -rocblas_trtri_batched(rocblas_handle handle, +rocblas_strtri_batched(rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, @@ -162,9 +33,9 @@ rocblas_trtri_batched(rocblas_handle handle, } -template<> +extern "C" rocblas_status -rocblas_trtri_batched(rocblas_handle handle, +rocblas_dtrtri_batched(rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, @@ -175,4 +46,3 @@ rocblas_trtri_batched(rocblas_handle handle, return rocblas_trtri_batched_template(handle, uplo, diag, n, A, lda, bsa, invA, ldinvA, bsinvA, batch_count); } -/* ============================================================================================ */ diff --git a/library/src/blas3/rocblas_trtri_trsm.cpp b/library/src/blas3/rocblas_trtri_trsm.cpp deleted file mode 100644 index 66282536f..000000000 --- a/library/src/blas3/rocblas_trtri_trsm.cpp +++ 
/dev/null @@ -1,249 +0,0 @@ -/* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. - * - * ************************************************************************ */ -#include - - - -#include "rocblas.h" -#include "rocblas.hpp" -#include "definitions.h" -#include "status.h" -#include "trtri_device.h" - - -/* - Invert the IB by IB diagonal blocks of A of size n by n, where n is divisible by IB - and stores the results in part of invA of size NB by NB. - currently IB = NB/2; - flag indicate whether write into A or invA, invA: 1, A: 0 - - - [ IB ] NB = 2 * IB; - [ IB ] - -*/ -template -__global__ void -trtri_trsm_kernel(hipLaunchParm lp, - rocblas_fill uplo, - rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, - T *invA) -{ - //get the individual matrix which is processed by device function - //device function only see one matrix - - // each hip thread Block compute a inverse of a IB * IB diagonal block of A - T *individual_A = A + hipBlockIdx_x * IB * lda + hipBlockIdx_x * IB; - - T *individual_invA; - individual_invA = invA + hipBlockIdx_x/2 * NB * NB; - // the odd thread block makes a shift - if( hipBlockIdx_x % 2 == 1 ){ - individual_invA += NB * IB + IB; - } - - trtri_device(uplo, diag, IB, individual_A, lda, individual_invA, NB); - -} - - - -/* ============================================================================================ */ - -/*! \brief BLAS Level 3 API - - \details - - This routine is a special routine only called by trsm, it is a private API. 
- Internally, it calls batched trtri and batched gemm to - compute the inverse of the diagonal blocks of a matrix A - Each individual digaonal block invA is NB * NB - The last individual diagonal block will be pad 0 if n is not divisible by NB - - Specifically, it first calls trtri to invert a IB * IB digaonal in this NB * NB diagonal block - Second, it finishs the diagonal block by calling batched GEMM - - - @param[in] - handle rocblas_handle. - handle to the rocblas library context queue. - @param[in] - uplo rocblas_fill. - specifies whether the upper 'rocblas_fill_upper' or lower 'rocblas_fill_lower' - if rocblas_fill_upper, the lower part of A is not referenced - if rocblas_fill_lower, the upper part of A is not referenced - @param[in] - diag rocblas_diagonal. - = 'rocblas_diagonal_non_unit', A is non-unit triangular; - = 'rocblas_diagonal_unit', A is unit triangular; - @param[in] - n rocblas_int. - @param[in] - A pointer storing matrix A on the GPU. - @param[in] - lda rocblas_int - specifies the leading dimension of A. - @param[output] - invA - of dimension (NB, ceil(n/NB)*NB), - On exit, contains inverses of the NB by NB diagonal blocks of A. 
- - ********************************************************************/ - -//assume invA has already been allocated, and leading dimension of invA is NB -//assume IB is exactly half of NB -template -rocblas_status -rocblas_trtri_trsm_template(rocblas_handle handle, - rocblas_fill uplo, - rocblas_diagonal diag, - rocblas_int n, - T *A, rocblas_int lda, - T *invA) -{ - if(handle == nullptr) - return rocblas_status_invalid_handle; - else if ( uplo != rocblas_fill_lower && uplo != rocblas_fill_upper) - return rocblas_status_not_implemented; - else if ( n < 0 ) - return rocblas_status_invalid_size; - else if ( A == nullptr ) - return rocblas_status_invalid_pointer; - else if ( lda < n ) - return rocblas_status_invalid_size; - else if ( invA == nullptr ) - return rocblas_status_invalid_pointer; - - - /* - * Quick return if possible. - */ - - if ( n == 0 ) - return rocblas_status_success; - - hipStream_t rocblas_stream; - RETURN_IF_ROCBLAS_ERROR(rocblas_get_stream(handle, &rocblas_stream)); - - rocblas_int blocks = n / NB; // number of divisible NB*NB blocks, but 2 * blocks of IB*IB blocks - - dim3 grid(blocks * 2, 1, 1); - dim3 threads(IB, 1, 1 ); - - /* - Algorithm: - - If A is a lower triangular matrix, to compute the invA - all of Aii, invAii are of size IB by IB - - [ A11 0 ] * [ invA11 0 ] = [ I 0 ] - [ A21 A22 ] [ invA21 invA22 ] [ 0 I ] - - A11*invA11 = I -> invA11 = A11^{-1}, by trtri directly - A22*invA22 = I -> invA22 = A22^{-1}, by trtri directly - A21*invA11 + invA22*invA21 = 0 -> invA21 = -A22^{-1}*A21*invA11 = -invA22*A21*invA11, by gemm - - - If A is a upper triangular matrix, to compute the invA - all of Aii, invAii are of size IB by IB - - [ A11 A12 ] * [ invA11 invA12 ] = [ I 0 ] - [ 0 A22 ] [ 0 invA22 ] [ 0 I ] - - A11*invA11 = I -> invA11 = A11^{-1}, by trtri directly - A22*invA22 = I -> invA22 = A22^{-1}, by trtri directly - A11*invA12 + A12*invA22 = 0 -> invA12 = -A11^{-1}*A12*invA22 = -invA11*A12*invA22, by gemm - - */ - - //invert IB * IB 
diagoanl blocks of A and write the result of invA11 and invA22 in invA - hipLaunchKernel(HIP_KERNEL_NAME(trtri_trsm_kernel), dim3(grid), dim3(threads), 0, rocblas_stream, - uplo, diag, (blocks)*NB, A, lda, invA); - - T one = 1; T zero = 0; T negative_one = -1; - T* C; - RETURN_IF_HIP_ERROR(hipMalloc(&C, sizeof(T) * IB * IB * blocks)); - - rocblas_status status; - - rocblas_int stride_A = NB*lda + NB; - rocblas_int stride_invA = NB*NB; - rocblas_int stride_C = IB*IB; - - rocblas_int A12_A21_offset, invA11_invA22_offset, invA21_invA12_offset; - - if (uplo == rocblas_fill_lower){ - A12_A21_offset = IB; //A21 - invA11_invA22_offset = 0; //invA11 - invA21_invA12_offset = IB; //invA21 - } - else - { - A12_A21_offset = IB*NB; //A12 - invA11_invA22_offset = NB*IB+IB; //invA22 - invA21_invA12_offset = IB*NB; //invA12 - } - - // first batched gemm compute C = A21*invA11 (lower) or C = A12*invA22 (upper) - // distance between each invA11 or invA22 is stride_invA, stride_A for each A21 or A12, C of size IB * IB - status = rocblas_gemm_batched(handle, rocblas_operation_none, rocblas_operation_none, - IB, IB, IB, - &one, - (A + A12_A21_offset), lda, stride_A, - (invA + invA11_invA22_offset), NB, stride_invA, - &zero, - C, IB, stride_C, - blocks ); - - - // second batched gemm compute invA21 = -invA22 * C (lower) or invA12 = -invA11*C (upper) - // distance between each invA21 or invA12 is stride_invA, - status = rocblas_gemm_batched(handle, rocblas_operation_none, rocblas_operation_none, - IB, IB, IB, - &negative_one, - invA + invA11_invA22_offset, NB, stride_invA, - &zero, - C, IB, stride_C, - invA + invA21_invA12_offset, NB, stride_invA, - blocks ); - - - RETURN_IF_HIP_ERROR(hipFree(C)); - - //the last digaonal block is handled seperately if n is not divisible by NB, - if(n % NB != 0 ){ - status = rocblas_trtri(handle, uplo, diag, n-blocks*NB, A + blocks*NB*lda + blocks*NB, lda, invA + blocks*NB*NB, NB); - } - - return status; - -} - - -/* 
============================================================================================ */ - - /* - * =========================================================================== - * template interface - * template specialization - * This function is called by trsm - * =========================================================================== - */ - - - -template<> -rocblas_status -rocblas_trtri_trsm(rocblas_handle handle, - rocblas_fill uplo, - rocblas_diagonal diag, - rocblas_int n, - float *A, rocblas_int lda, - float *invA){ - - return rocblas_trtri_trsm_template(handle, uplo, diag, n, A, lda, invA); -} diff --git a/library/src/blas3/trtri.hpp b/library/src/blas3/trtri.hpp new file mode 100644 index 000000000..77c6b10b7 --- /dev/null +++ b/library/src/blas3/trtri.hpp @@ -0,0 +1,353 @@ +/* ************************************************************************ + * * Copyright 2016 Advanced Micro Devices, Inc. + * * + * * ************************************************************************ */ + +#pragma once +#ifndef _TRTRI_HPP_ +#define _TRTRI_HPP_ + +#include +#include "trtri_device.h" +#include "definitions.h" + + + +/* ============================================================================================ */ + + +/* + when n <= IB +*/ + +template +__global__ void +trtri_small_kernel(hipLaunchParm lp, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, + T *invA, rocblas_int ldinvA) +{ + trtri_device(uplo, diag, n, A, lda, invA, ldinvA); +} + + + +template +rocblas_status +rocblas_trtri_small(rocblas_handle handle, + rocblas_fill uplo, rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, + T *invA, rocblas_int ldinvA) +{ + + if(n > IB ){ + printf("n is %d must be less than %d, will exit\n", n, IB); + return rocblas_status_not_implemented; + } + + dim3 grid(1, 1, 1); + dim3 threads(IB, 1, 1); + + hipStream_t rocblas_stream; + RETURN_IF_ROCBLAS_ERROR(rocblas_get_stream(handle, &rocblas_stream)); + + 
hipLaunchKernel(HIP_KERNEL_NAME(trtri_small_kernel), dim3(grid), dim3(threads), 0, rocblas_stream, uplo, diag, n, A, lda, invA, ldinvA ); + + return rocblas_status_success; +} + + +/* ============================================================================================ */ + + +/* + Invert the IB by IB diagonal blocks of A of size n by n, + and stores the results in part of invA +*/ + +template +__global__ void +trtri_diagonal_kernel(hipLaunchParm lp, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, + T *invA, rocblas_int ldinvA) +{ + //get the individual matrix which is processed by device function + //device function only see one matrix + + // each hip thread Block compute a inverse of a IB * IB diagonal block of A + // notice the last digaonal block may be smaller than IB*IB + + T *individual_A = A + hipBlockIdx_x * IB * lda + hipBlockIdx_x * IB; + T *individual_invA = invA + hipBlockIdx_x * IB * ldinvA + hipBlockIdx_x * IB; + + trtri_device(uplo, diag, min(IB, n- hipBlockIdx_x * IB), individual_A, lda, individual_invA, ldinvA); + +} + + +/* + suppose nn is the orginal matrix AA' size (denoted as n in TOP API) + A, B, C, submatrices in AA in trtri + this special gemm performs D = -A*B*C after trtri + + if lower, + D = -A*B*C ==> invA21 = -invA22*A21*invA11, + let m = (nn-IB), n = IB, + + if upper, + D = -A*B*C ==> invA12 = -invA11*A12*invA22, + let m = IB, n = (nn-IB), + + Then, either case, + D is of m * n + A is of m * m + B if of m * n + C is of n * n + + since m <= IB, n <= IB, + create a shared memory space as the buffer to store the intermediate results of W=B*C, A*W + +*/ + +template +__global__ void +gemm_trsm_kernel(hipLaunchParm lp, + rocblas_int m, rocblas_int n, + T *A, rocblas_int lda, + T *B, rocblas_int ldb, + T *C, rocblas_int ldc, + T *D, rocblas_int ldd) +{ + + __shared__ T shared_tep[IB*IB]; + __shared__ T vec[IB]; + T reg[IB]; + + rocblas_int tx = hipThreadIdx_x; + + //read B into registers, B is 
of m * n + if(tx < m){ + for(int col=0;col invA11 = A11^{-1}, by trtri directly + A22*invA22 = I -> invA22 = A22^{-1}, by trtri directly + A21*invA11 + invA22*invA21 = 0 -> invA21 = -A22^{-1}*A21*invA11 = -invA22*A21*invA11, by gemm + + + If A is a upper triangular matrix, to compute the invA + all of Aii, invAii are of size IB by IB + + [ A11 A12 ] * [ invA11 invA12 ] = [ I 0 ] + [ 0 A22 ] [ 0 invA22 ] [ 0 I ] + + A11*invA11 = I -> invA11 = A11^{-1}, by trtri directly + A22*invA22 = I -> invA22 = A22^{-1}, by trtri directly + A11*invA12 + A12*invA22 = 0 -> invA12 = -A11^{-1}*A12*invA22 = -invA11*A12*invA22, by gemm + +*/ + + +template +rocblas_status +rocblas_trtri_large(rocblas_handle handle, + rocblas_fill uplo, rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, + T *invA, rocblas_int ldinvA) +{ + + if(n > 2 * IB ){ + printf("n is %d, n must be less than %d, will return\n", n, 2*IB); + return rocblas_status_not_implemented; + } + + hipStream_t rocblas_stream; + RETURN_IF_ROCBLAS_ERROR(rocblas_get_stream(handle, &rocblas_stream)); + + dim3 grid_trtri(2, 1, 1); + dim3 threads(IB, 1, 1); + + //first stage: invert IB * IB diagoanl blocks of A and write the result of invA11 and invA22 in invA + hipLaunchKernel(HIP_KERNEL_NAME(trtri_diagonal_kernel), dim3(grid_trtri), dim3(threads), 0, rocblas_stream, + uplo, diag, n, A, lda, invA, ldinvA); + + + if( n <= IB ){ + //if n is too small, no invA21 or invA12 exist, gemm is not required + return rocblas_status_success; + } + + //second stage: using a special gemm to compute invA21 (lower) or invA12 (upper) + dim3 grid_gemm(1, 1, 1); + + rocblas_int m_gemm; + rocblas_int n_gemm; + T *A_gemm; + T *B_gemm; + T *C_gemm; + T *D_gemm; + + if(uplo == rocblas_fill_lower){ + // perform D = -A*B*C ==> invA21 = -invA22*A21*invA11, + m_gemm = (n-IB); + n_gemm = IB; + A_gemm = invA + IB + IB * ldinvA; // invA22 + B_gemm = A + IB; //A21 + C_gemm = invA; //invA11 + D_gemm = invA + IB; //invA21 + } + else{ + // perform D = 
-A*B*C ==> invA12 = -invA11*A12*invA22, + m_gemm = IB; + n_gemm = (n-IB); + A_gemm = invA; // invA11 + B_gemm = A + lda * IB; //A12 + C_gemm = invA + IB + IB * ldinvA; //invA22 + D_gemm = invA + IB * ldinvA; // invA12 + } + + hipLaunchKernel(HIP_KERNEL_NAME(gemm_trsm_kernel), dim3(grid_gemm), dim3(threads), 0, rocblas_stream, + m_gemm, n_gemm, A_gemm, ldinvA, B_gemm, lda, C_gemm, ldinvA, D_gemm, ldinvA); + + return rocblas_status_success; + +} + + +/* ============================================================================================ */ + +/*! \brief BLAS Level 3 API + + \details + trtri compute the inverse of a matrix A, namely, invA + + and write the result into invA; + + @param[in] + handle rocblas_handle. + handle to the rocblas library context queue. + @param[in] + uplo rocblas_fill. + specifies whether the upper 'rocblas_fill_upper' or lower 'rocblas_fill_lower' + if rocblas_fill_upper, the lower part of A is not referenced + if rocblas_fill_lower, the upper part of A is not referenced + @param[in] + diag rocblas_diagonal. + = 'rocblas_diagonal_non_unit', A is non-unit triangular; + = 'rocblas_diagonal_unit', A is unit triangular; + @param[in] + n rocblas_int. + size of matrix A and invA + @param[in] + A pointer storing matrix A on the GPU. + @param[in] + lda rocblas_int + specifies the leading dimension of A. + @param[output] + invA pointer storing matrix invA on the GPU. + @param[in] + ldinvA rocblas_int + specifies the leading dimension of invA. 
+ +********************************************************************/ +/* IB must be <= 64 in order to fit shared (local) memory */ +template +rocblas_status +rocblas_trtri_template(rocblas_handle handle, + rocblas_fill uplo, rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, + T *invA, rocblas_int ldinvA) +{ + + if(handle == nullptr) + return rocblas_status_invalid_handle; + else if ( uplo != rocblas_fill_lower && uplo != rocblas_fill_upper) + return rocblas_status_not_implemented; + else if ( n < 0 ) + return rocblas_status_invalid_size; + else if ( A == nullptr ) + return rocblas_status_invalid_pointer; + else if ( lda < n ) + return rocblas_status_invalid_size; + else if ( invA == nullptr ) + return rocblas_status_invalid_pointer; + else if ( ldinvA < n ) + return rocblas_status_invalid_size; + + + if (n <= IB){ + return rocblas_trtri_small(handle, uplo, diag, n, A, lda, invA, ldinvA); + } + else if( n <= 2 * IB){ + return rocblas_trtri_large(handle, uplo, diag, n, A, lda, invA, ldinvA); + } + else{ + printf("n is %d, n must be less than %d, will return\n", n, 2*IB); + return rocblas_status_not_implemented; + } +} + + +#endif // _TRTRI_HPP_ + diff --git a/library/src/blas3/trtri_batched.hpp b/library/src/blas3/trtri_batched.hpp new file mode 100644 index 000000000..056686063 --- /dev/null +++ b/library/src/blas3/trtri_batched.hpp @@ -0,0 +1,141 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * + * ************************************************************************ */ + + +#pragma once +#ifndef _TRTRI_BATCHED_HPP_ +#define _TRTRI_BATCHED_HPP_ + + +#include +#include "definitions.h" +#include "trtri_device.h" + +//flag indicate whether write into A or invA +template +__global__ void +trtri_kernel_batched(hipLaunchParm lp, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, rocblas_int bsa, + T *invA, rocblas_int ldinvA, rocblas_int bsinvA) +{ + //get the individual matrix which is processed by device function + //device function only see one matrix + T *individual_A = A + hipBlockIdx_z * bsa; + T *individual_invA = invA + hipBlockIdx_z * bsinvA; + + trtri_device(uplo, diag, n, individual_A, lda, individual_invA, ldinvA); +} + + + + + +/* ============================================================================================ */ + +/*! \brief BLAS Level 3 API + + \details + trtri compute the inverse of a matrix A + + inv(A); + + @param[in] + handle rocblas_handle. + handle to the rocblas library context queue. + @param[in] + uplo rocblas_fill. + specifies whether the upper 'rocblas_fill_upper' or lower 'rocblas_fill_lower' + @param[in] + diag rocblas_diagonal. + = 'rocblas_diagonal_non_unit', A is non-unit triangular; + = 'rocblas_diagonal_unit', A is unit triangular; + @param[in] + n rocblas_int. + @param[in] + A pointer storing matrix A on the GPU. + @param[in] + lda rocblas_int + specifies the leading dimension of A. + @param[in] + bsa rocblas_int + "batch stride a": stride from the start of one "A" matrix to the next + @param[output] + invA pointer storing the inverse matrix A on the GPU. + @param[in] + ldinvA rocblas_int + specifies the leading dimension of invA. 
+ @param[in] + bsinvA rocblas_int + "batch stride invA": stride from the start of one "invA" matrix to the next + @param[in] + batch_count rocblas_int + numbers of matrices in the batch + ********************************************************************/ + +//because of shared memory size, the NB_X must be <= 64 +#define NB_X 32 + +//assume invA has already been allocated, recommened for repeated calling of trtri product routine +template +rocblas_status +rocblas_trtri_batched_template(rocblas_handle handle, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, rocblas_int bsa, + T *invA, rocblas_int ldinvA, rocblas_int bsinvA, + rocblas_int batch_count) +{ + if(handle == nullptr) + return rocblas_status_invalid_handle; + else if ( uplo != rocblas_fill_lower && uplo != rocblas_fill_upper) + return rocblas_status_not_implemented; + else if ( n < 0 ) + return rocblas_status_invalid_size; + else if ( A == nullptr ) + return rocblas_status_invalid_pointer; + else if ( lda < n ) + return rocblas_status_invalid_size; + else if ( bsa < lda*n ) + return rocblas_status_invalid_size; + else if ( invA == nullptr ) + return rocblas_status_invalid_pointer; + else if ( ldinvA < n ) + return rocblas_status_invalid_size; + else if ( bsinvA < ldinvA*n ) + return rocblas_status_invalid_size; + else if ( batch_count < 0 ) + return rocblas_status_invalid_size; + + /* + * Quick return if possible. 
+ */ + + if ( n == 0 || batch_count == 0) + return rocblas_status_success; + + if(n > NB_X ){ + printf("n is %d, n must be less than %d, will return\n", n, NB_X); + return rocblas_status_not_implemented; + } + + dim3 grid(1, 1, batch_count); + dim3 threads(NB_X, 1, 1); + + hipStream_t rocblas_stream; + RETURN_IF_ROCBLAS_ERROR(rocblas_get_stream(handle, &rocblas_stream)); + + hipLaunchKernel(HIP_KERNEL_NAME(trtri_kernel_batched), dim3(grid), dim3(threads), 0, rocblas_stream, + uplo, diag, n, A, lda, bsa, invA, ldinvA, bsinvA); + + return rocblas_status_success; + +} + + +#endif // _TRTRI_BATCHED_HPP_ diff --git a/library/src/blas3/trtri_device.h b/library/src/blas3/trtri_device.h index 895c2d5a8..f8ae24f57 100644 --- a/library/src/blas3/trtri_device.h +++ b/library/src/blas3/trtri_device.h @@ -1,3 +1,12 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + + +#pragma once +#ifndef _TRTRI_DEVICE_H_ +#define _TRTRI_DEVICE_H_ /* * =========================================================================== @@ -70,23 +79,24 @@ trtri_device(rocblas_fill uplo, } } } - //__syncthreads(); // since NB < 64, this synch can be avoided + __syncthreads(); // if NB < 64, this synch can be avoided //invert the diagonal element if (tx < n){ //compute only diagonal element if (diag == rocblas_diagonal_unit){ - sA[tx + tx * n] = 1; + sA[tx + tx * n] = 1.0; } - else{ - if(sA[tx + tx * n] == 0){ // notice this does not apply for complex - sA[tx + tx * n] = 1; // means the matrix is singular + else{//inverse the diagonal + if(sA[tx + tx * n] == 0.0){ // notice this does not apply for complex + sA[tx + tx * n] = 1.0; // means the matrix is singular } else{ - sA[tx + tx * n] = 1/sA[tx + tx * n]; + sA[tx + tx * n] = 1.0/sA[tx + tx * n]; } } } + __syncthreads(); // if NB < 64, this synch can be avoided on AMD Fiji // solve the inverse 
of A column by column, each inverse(A)' column will overwrite sA'column which store A // this operation is safe @@ -96,7 +106,7 @@ trtri_device(rocblas_fill uplo, //use the diagonal one to update current column if(tx > col) reg += sA[tx + col * n] * sA[col + col * n]; - //__syncthreads(); // since NB < 64, this synch can be avoided + __syncthreads(); // if NB < 64, this synch can be avoided on AMD Fiji // in each column, it solves step, each step solve an inverse(A)[step][col] for(int step=col+1;step step update with (tx = step)'s result if(tx > step){ reg += sA[tx + step * n] * sA[step + col * n]; } - //__syncthreads(); // since NB < 64, this synch can be avoided + __syncthreads(); // if NB < 64, this synch can be avoided on AMD Fiji } - //__syncthreads(); + __syncthreads(); } @@ -146,3 +156,5 @@ trtri_device(rocblas_fill uplo, #define STRSM_BLOCK 192 #define DTRSM_BLOCK 128 + +#endif // _TRTRI_DEVICE_H_ diff --git a/library/src/blas3/trtri_trsm.hpp b/library/src/blas3/trtri_trsm.hpp new file mode 100644 index 000000000..30a3be211 --- /dev/null +++ b/library/src/blas3/trtri_trsm.hpp @@ -0,0 +1,251 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + + +#pragma once +#ifndef __TRTRI_TRSM_HPP__ +#define __TRTRI_TRSM_HPP__ + +#include + +#include "definitions.h" +#include "status.h" +#include "trtri.hpp" +#include "gemm.hpp" + +/* + Invert the IB by IB diagonal blocks of A of size n by n, where n is divisible by IB + and stores the results in part of invA of size NB by NB. 
+ currently IB = NB/2; + flag indicate whether write into A or invA, invA: 1, A: 0 + + + [ IB ] NB = 2 * IB; + [ IB ] + +*/ +template +__global__ void +trtri_trsm_kernel(hipLaunchParm lp, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, + T *invA) +{ + //get the individual matrix which is processed by device function + //device function only see one matrix + + // each hip thread Block compute a inverse of a IB * IB diagonal block of A + + T *individual_invA; + individual_invA = invA + hipBlockIdx_x/2 * NB * NB; + // the odd thread block makes a shift + if( hipBlockIdx_x % 2 == 1 ){ + individual_invA += NB * (NB/2) + (NB/2); + } + + trtri_device(uplo, diag, (NB/2), A+hipBlockIdx_x * (NB/2) * lda + hipBlockIdx_x * (NB/2), lda, individual_invA, NB); + +} + + + +/* ============================================================================================ */ + +/*! \brief BLAS Level 3 API + + \details + + This routine is a special routine only called by trsm, it is a private API. + Internally, it calls batched trtri and batched gemm to + compute the inverse of the diagonal blocks of a matrix A. The result is in invA + Each individual digaonal block invA is NB * NB + The last individual diagonal block will be pad 0 if n is not divisible by NB + + Specifically, it first calls trtri to invert a IB * IB digaonal in this NB * NB diagonal block + Second, it finishs the diagonal block by calling batched GEMM + + + @param[in] + handle rocblas_handle. + handle to the rocblas library context queue. + @param[in] + uplo rocblas_fill. + specifies whether the upper 'rocblas_fill_upper' or lower 'rocblas_fill_lower' + if rocblas_fill_upper, the lower part of A is not referenced + if rocblas_fill_lower, the upper part of A is not referenced + @param[in] + diag rocblas_diagonal. + = 'rocblas_diagonal_non_unit', A is non-unit triangular; + = 'rocblas_diagonal_unit', A is unit triangular; + @param[in] + n rocblas_int. 
+ @param[in] + A pointer storing matrix A on the GPU. + @param[in] + lda rocblas_int + specifies the leading dimension of A. + @param[output] + invA + of dimension (NB, ceil(n/NB)*NB), + On exit, contains inverses of the NB by NB diagonal blocks of A. + + ********************************************************************/ + +//assume invA has already been allocated, and leading dimension of invA is NB +//assume IB is exactly half of NB +template +rocblas_status +rocblas_trtri_trsm_template(rocblas_handle handle, + rocblas_fill uplo, + rocblas_diagonal diag, + rocblas_int n, + T *A, rocblas_int lda, + T *invA) +{ + if(handle == nullptr) + return rocblas_status_invalid_handle; + else if ( uplo != rocblas_fill_lower && uplo != rocblas_fill_upper) + return rocblas_status_not_implemented; + else if ( n < 0 ) + return rocblas_status_invalid_size; + else if ( A == nullptr ) + return rocblas_status_invalid_pointer; + else if ( lda < n ) + return rocblas_status_invalid_size; + else if ( invA == nullptr ) + return rocblas_status_invalid_pointer; + + + /* + * Quick return if possible. + */ + + if ( n == 0 ) + return rocblas_status_success; + + rocblas_status status; + + hipStream_t rocblas_stream; + RETURN_IF_ROCBLAS_ERROR(rocblas_get_stream(handle, &rocblas_stream)); + + /* blocks is number of divisible NB*NB blocks, but 2 * blocks of IB*IB blocks. + if n < NB. 
Then blocks = 0; the trtri_trsm and batched gemm are diabled */ + + rocblas_int blocks = n / NB; + + if(blocks > 0){ + + rocblas_int IB = NB/2; + dim3 grid(blocks * 2, 1, 1); + dim3 threads(IB, 1, 1 ); + + /* + Algorithm: + + If A is a lower triangular matrix, to compute the invA + all of Aii, invAii are of size IB by IB + + [ A11 0 ] * [ invA11 0 ] = [ I 0 ] + [ A21 A22 ] [ invA21 invA22 ] [ 0 I ] + + A11*invA11 = I -> invA11 = A11^{-1}, by trtri directly + A22*invA22 = I -> invA22 = A22^{-1}, by trtri directly + A21*invA11 + A22*invA21 = 0 -> invA21 = -A22^{-1}*A21*invA11 = -invA22*A21*invA11, by gemm + + + If A is a upper triangular matrix, to compute the invA + all of Aii, invAii are of size IB by IB + + [ A11 A12 ] * [ invA11 invA12 ] = [ I 0 ] + [ 0 A22 ] [ 0 invA22 ] [ 0 I ] + + A11*invA11 = I -> invA11 = A11^{-1}, by trtri directly + A22*invA22 = I -> invA22 = A22^{-1}, by trtri directly + A11*invA12 + A12*invA22 = 0 -> invA12 = -A11^{-1}*A12*invA22 = -invA11*A12*invA22, by gemm + + */ + + //invert IB * IB diagoanl blocks of A and write the result of invA11 and invA22 in invA + + hipLaunchKernel(HIP_KERNEL_NAME(trtri_trsm_kernel), dim3(grid), dim3(threads), 0, rocblas_stream, + uplo, diag, (blocks)*NB, A, lda, invA); + + + + T one = 1; T zero = 0; T negative_one = -1; + T* C; + PRINT_IF_HIP_ERROR(hipMalloc(&C, sizeof(T) * IB * IB * blocks)); + + rocblas_int stride_A = NB*lda + NB; + rocblas_int stride_invA = NB*NB; + rocblas_int stride_C = IB*IB; + + rocblas_int A12_A21_offset, invA11_offset, invA22_offset, invA21_invA12_offset; + +// A21*invA11 + invA22*invA21 = 0 -> invA21 = -A22^{-1}*A21*invA11 = -invA22*A21*invA11, by gemm + + if (uplo == rocblas_fill_lower){ + A12_A21_offset = IB; //A21 + invA11_offset = 0; //invA11 in lower + invA21_invA12_offset = IB; //invA21 + invA22_offset = IB*NB+IB;//invA22 in lower + } + else + { + A12_A21_offset = IB*NB; //A12 + invA11_offset = NB*IB+IB; //invA22 in upper + invA21_invA12_offset = IB*NB; //invA12 + 
invA22_offset = 0; // A11 in upper + } + + + #ifndef NDEBUG + printf("first batched gemm\n"); + #endif + // first batched gemm compute C = A21*invA11 (lower) or C = A12*invA22 (upper) + // distance between each invA11 or invA22 is stride_invA, stride_A for each A21 or A12, C of size IB * IB + status = rocblas_gemm_batched_template(handle, rocblas_operation_none, rocblas_operation_none, + IB, IB, IB, + &one, + (const T*)(A + ((uplo == rocblas_fill_lower) ? IB : IB*lda) ), lda, stride_A, + (const T*)(invA + ((uplo == rocblas_fill_lower) ? 0 : IB*NB+IB) ) , NB, stride_invA, + &zero, + C, IB, stride_C, + blocks ); + + #ifndef NDEBUG + printf("second batched gemm\n"); + #endif + // second batched gemm compute invA21 = -invA22 * C (lower) or invA12 = -invA11*C (upper) + // distance between each invA21 or invA12 is stride_invA, + status = rocblas_gemm_batched_template(handle, rocblas_operation_none, rocblas_operation_none, + IB, IB, IB, + &negative_one, + (const T*)(invA + ((uplo == rocblas_fill_lower) ? IB*NB+IB : 0)), NB, stride_invA, + (const T*)C, IB, stride_C, + &zero, + (invA + ((uplo == rocblas_fill_lower) ? 
IB : IB*NB)), NB, stride_invA, + blocks ); + + + PRINT_IF_HIP_ERROR(hipFree(C)); + + }//end if + + //the last digaonal block is handled seperately if n is not divisible by NB, or if there is only one block + if(n % NB != 0 || blocks == 0){ + status = rocblas_trtri_template(handle, uplo, diag, n-blocks*NB, A + blocks*NB*lda + blocks*NB, lda, invA + blocks*NB*NB, NB); + } + + + return rocblas_status_success; + +} + + + +#endif // __TRTRI_TRSM_HPP__ diff --git a/library/src/handle.cpp b/library/src/handle.cpp index 2063a27a4..c3f3ed1b2 100644 --- a/library/src/handle.cpp +++ b/library/src/handle.cpp @@ -4,39 +4,16 @@ #include "handle.h" #include -#if BUILD_WITH_TENSILE - #include "Tensile.h" -#endif - /******************************************************************************* - * constructor + * constructor ******************************************************************************/ _rocblas_handle::_rocblas_handle() { - // default device is active device - THROW_IF_HIP_ERROR( hipGetDevice(&device) ); - THROW_IF_HIP_ERROR( hipGetDeviceProperties(&device_properties, device) ); - - // rocblas by default take the system default stream 0 users cannot create - -#if BUILD_WITH_TENSILE - // tensile device profile - tensile_device_profile = tensileCreateEmptyDeviceProfile(); - if ( strlen(device_properties.name) > tensile_device_profile.devices[0].maxNameLength) { - strncpy( tensile_device_profile.devices[0].name, - device_properties.name, tensile_device_profile.devices[0].maxNameLength); - tensile_device_profile.devices[0].name[tensile_device_profile.devices[0].maxNameLength-1] = '\0'; - } else { - strcpy( tensile_device_profile.devices[0].name, device_properties.name); - } - tensile_device_profile.numDevices = 1; - - // tensile control - tensile_control = tensileCreateEmptyControl(); - tensile_control.queues[0] = rocblas_stream; - tensile_control.numQueues = 1; + // default device is active device + THROW_IF_HIP_ERROR( hipGetDevice(&device) ); + THROW_IF_HIP_ERROR( 
hipGetDeviceProperties(&device_properties, device) ); -#endif + // rocblas by default take the system default stream 0 users cannot create } @@ -44,7 +21,7 @@ _rocblas_handle::_rocblas_handle() { * destructor ******************************************************************************/ _rocblas_handle::~_rocblas_handle() { - //rocblas by default take the system default stream which user cannot destory + //rocblas by default take the system default stream which user cannot destory } /******************************************************************************* @@ -53,20 +30,15 @@ _rocblas_handle::~_rocblas_handle() { /******************************************************************************* - * set stream: + * set stream: This API assumes user has already created a valid stream - Associate the following rocblas API call with this user provided stream + Associate the following rocblas API call with this user provided stream ******************************************************************************/ rocblas_status _rocblas_handle::set_stream( hipStream_t user_stream ) { - //TODO: check the user_stream valid or not - rocblas_stream = user_stream; -#if BUILD_WITH_TENSILE - tensile_control.queues[0] = user_stream; - tensile_control.numQueues = 1; - // It is impossible to switch stream to another device in rocblas without destroying the handle -#endif - return rocblas_status_success; + //TODO: check the user_stream valid or not + rocblas_stream = user_stream; + return rocblas_status_success; } @@ -74,6 +46,6 @@ rocblas_status _rocblas_handle::set_stream( hipStream_t user_stream ) { * get stream ******************************************************************************/ rocblas_status _rocblas_handle::get_stream( hipStream_t *stream ) const { - *stream = rocblas_stream; - return rocblas_status_success; + *stream = rocblas_stream; + return rocblas_status_success; } diff --git a/library/src/include/definitions.h b/library/src/include/definitions.h index 
eff6d0c2b..172ca7671 100644 --- a/library/src/include/definitions.h +++ b/library/src/include/definitions.h @@ -1,10 +1,11 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + #ifndef DEFINITIONS_H #define DEFINITIONS_H -#if BUILD_WITH_TENSILE - #include "Tensile_status.h" -#endif - #include "status.h" /******************************************************************************* @@ -13,44 +14,44 @@ * thereby it can include top-level definitions included by all ******************************************************************************/ -#define RETURN_IF_TENSILE_ERROR(INPUT_STATUS_FOR_CHECK) { \ - TensileStatus TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if (TMP_STATUS_FOR_CHECK != tensileStatusSuccess) { \ - tensileStatusCheck( TMP_STATUS_FOR_CHECK ); \ - return get_rocblas_status_for_tensile_status(TMP_STATUS_FOR_CHECK); \ - } } #define RETURN_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) { \ - hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if (TMP_STATUS_FOR_CHECK != hipSuccess ) { \ - return get_rocblas_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ - } } + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if (TMP_STATUS_FOR_CHECK != hipSuccess ) { \ + return get_rocblas_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ +} } #define RETURN_IF_ROCBLAS_ERROR(INPUT_STATUS_FOR_CHECK) { \ - rocblas_status TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if (TMP_STATUS_FOR_CHECK != rocblas_status_success) { \ - return TMP_STATUS_FOR_CHECK; \ - } } - -#if BUILD_WITH_TENSILE - #define THROW_IF_TENSILE_ERROR(INPUT_STATUS_FOR_CHECK) {\ - TensileStatus TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if (TMP_STATUS_FOR_CHECK != tensileStatusSuccess) { \ - tensileStatusCheck( TMP_STATUS_FOR_CHECK ); \ - throw get_rocblas_status_for_tensile_status(TMP_STATUS_FOR_CHECK); \ - } } 
-#endif + rocblas_status TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if (TMP_STATUS_FOR_CHECK != rocblas_status_success) { \ + return TMP_STATUS_FOR_CHECK; \ +} } #define THROW_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) { \ - hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if (TMP_STATUS_FOR_CHECK != hipSuccess ) { \ - throw get_rocblas_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ - } } + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if (TMP_STATUS_FOR_CHECK != hipSuccess ) { \ + throw get_rocblas_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ +} } #define THROW_IF_ROCBLAS_ERROR(INPUT_STATUS_FOR_CHECK) { \ - rocblas_status TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if (TMP_STATUS_FOR_CHECK != rocblas_status_success) { \ - throw TMP_STATUS_FOR_CHECK; \ - } } + rocblas_status TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if (TMP_STATUS_FOR_CHECK != rocblas_status_success) { \ + throw TMP_STATUS_FOR_CHECK; \ +} } + +#define PRINT_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) {\ + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if (TMP_STATUS_FOR_CHECK != hipSuccess) { \ + fprintf(stderr, "hip error code: %d at %s:%d\n", TMP_STATUS_FOR_CHECK,__FILE__, __LINE__); \ +} } + + +#define PRINT_IF_ROCBLAS_ERROR(INPUT_STATUS_FOR_CHECK) {\ + rocblas_status TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if (TMP_STATUS_FOR_CHECK != rocblas_status_success) { \ + fprintf(stderr, "rocblas error code: %d at %s:%d\n", TMP_STATUS_FOR_CHECK,__FILE__, __LINE__); \ +} } + + -#endif +#endif //DEFINITIONS_H diff --git a/library/src/include/handle.h b/library/src/include/handle.h index 629539270..6f84e6c85 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -1,3 +1,8 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * + * ************************************************************************ */ + #ifndef HANDLE_H #define HANDLE_H #include @@ -5,10 +10,6 @@ #include "rocblas.h" -#if BUILD_WITH_TENSILE - #include "Tensile.h" -#endif - /******************************************************************************* * \brief rocblas_handle is a structure holding the rocblas library context. * It must be initialized using rocblas_create_handle() and the returned handle mus @@ -29,21 +30,6 @@ struct _rocblas_handle{ // rocblas by default take the system default stream 0 users cannot create hipStream_t rocblas_stream = 0; -#if BUILD_WITH_TENSILE - /***************************************************************************** - * \brief Tensile Device Profile - * describes device to which this control is assigned so - * Tensile can lookup optimal solution - ****************************************************************************/ - TensileDeviceProfile tensile_device_profile; - - /***************************************************************************** - * \brief Tensile Control - * for passing control state (stream) to Tensile - ****************************************************************************/ - TensileControl tensile_control; -#endif - }; #endif diff --git a/library/src/include/status.h b/library/src/include/status.h index f5662a198..238e80315 100644 --- a/library/src/include/status.h +++ b/library/src/include/status.h @@ -1,3 +1,9 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * + * ************************************************************************ */ + + #ifndef STATUS_H #define STATUS_H diff --git a/library/src/rocblas_auxiliary.cpp b/library/src/rocblas_auxiliary.cpp index 353237817..c5e422480 100644 --- a/library/src/rocblas_auxiliary.cpp +++ b/library/src/rocblas_auxiliary.cpp @@ -18,13 +18,13 @@ * can not recoginize it is on host or not ******************************************************************************/ rocblas_mem_location rocblas_get_pointer_location(void *ptr){ - hipPointerAttribute_t attribute; - hipPointerGetAttributes(&attribute, ptr); - if (ptr == attribute.devicePointer) { - return rocblas_mem_location_device; - } else { - return rocblas_mem_location_host; - } + hipPointerAttribute_t attribute; + hipPointerGetAttributes(&attribute, ptr); + if (ptr == attribute.devicePointer) { + return rocblas_mem_location_device; + } else { + return rocblas_mem_location_host; + } } @@ -34,19 +34,19 @@ rocblas_mem_location rocblas_get_pointer_location(void *ptr){ extern "C" rocblas_status rocblas_create_handle(rocblas_handle *handle){ - // if handle not valid - if (handle == nullptr) { - return rocblas_status_invalid_pointer; - } + // if handle not valid + if (handle == nullptr) { + return rocblas_status_invalid_pointer; + } - // allocate on heap - try { - *handle = new _rocblas_handle(); - } catch (rocblas_status status) { - return status; - } + // allocate on heap + try { + *handle = new _rocblas_handle(); + } catch (rocblas_status status) { + return status; + } - return rocblas_status_success; + return rocblas_status_success; } @@ -55,13 +55,13 @@ rocblas_status rocblas_create_handle(rocblas_handle *handle){ ******************************************************************************/ extern "C" rocblas_status rocblas_destroy_handle(rocblas_handle handle){ - // call destructor - try { - delete handle; - } catch (rocblas_status status) { - return status; - } - return rocblas_status_success; + // call destructor + try 
{ + delete handle; + } catch (rocblas_status status) { + return status; + } + return rocblas_status_success; } @@ -73,7 +73,7 @@ rocblas_status rocblas_destroy_handle(rocblas_handle handle){ extern "C" rocblas_status rocblas_set_stream(rocblas_handle handle, hipStream_t stream_id){ - return handle->set_stream( stream_id ); + return handle->set_stream( stream_id ); } @@ -84,8 +84,5 @@ rocblas_set_stream(rocblas_handle handle, hipStream_t stream_id){ extern "C" rocblas_status rocblas_get_stream(rocblas_handle handle, hipStream_t *stream_id){ - return handle->get_stream( stream_id ); + return handle->get_stream( stream_id ); } - - - diff --git a/library/src/status.cpp b/library/src/status.cpp index 1cc0b1a9b..e8a6148fe 100644 --- a/library/src/status.cpp +++ b/library/src/status.cpp @@ -1,3 +1,7 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ #include @@ -10,34 +14,33 @@ ******************************************************************************/ rocblas_status get_rocblas_status_for_hip_status( hipError_t status ) { - switch(status) { - - // success - case hipSuccess: - return rocblas_status_success; - - // internal hip memory allocation - case hipErrorMemoryAllocation: - case hipErrorLaunchOutOfResources: - return rocblas_status_memory_error; - - // user-allocated hip memory - case hipErrorInvalidDevicePointer: // hip memory - return rocblas_status_invalid_pointer; - - // user-allocated device, stream, event - case hipErrorInvalidDevice: - case hipErrorInvalidResourceHandle: - return rocblas_status_invalid_handle; - - // library using hip incorrectly - case hipErrorInvalidValue: - return rocblas_status_internal_error; - - // hip runtime failing - case hipErrorNoDevice: // no hip devices - case hipErrorUnknown: - default: - return rocblas_status_internal_error; - } + switch(status) { + // success + case 
hipSuccess: + return rocblas_status_success; + + // internal hip memory allocation + case hipErrorMemoryAllocation: + case hipErrorLaunchOutOfResources: + return rocblas_status_memory_error; + + // user-allocated hip memory + case hipErrorInvalidDevicePointer: // hip memory + return rocblas_status_invalid_pointer; + + // user-allocated device, stream, event + case hipErrorInvalidDevice: + case hipErrorInvalidResourceHandle: + return rocblas_status_invalid_handle; + + // library using hip incorrectly + case hipErrorInvalidValue: + return rocblas_status_internal_error; + + // hip runtime failing + case hipErrorNoDevice: // no hip devices + case hipErrorUnknown: + default: + return rocblas_status_internal_error; + } }