Skip to content

Commit

Permalink
Merge pull request #24 from CNugteren/development
Browse files Browse the repository at this point in the history
Update to version 0.4.0
  • Loading branch information
CNugteren committed Aug 22, 2015
2 parents db6846b + 74f6017 commit a41744d
Show file tree
Hide file tree
Showing 640 changed files with 4,720 additions and 220,456 deletions.
25 changes: 25 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Travis CI configuration: configures, builds and installs the project with both GCC and Clang.
language: cpp
compiler:
- gcc
- clang
# Adds extra package repositories, then installs the compilers, the OpenCL packages and CMake.
before_install:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo add-apt-repository -y ppa:kalakris/cmake
- sudo apt-get update -qq
- sudo apt-get install -qq gcc-4.8 g++-4.8 clang
# The fglrx package is pinned to a specific version; presumably it supplies the OpenCL library
# to link against (TODO confirm).
- sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers
- sudo apt-get install -qq cmake
# Switches to the 4.8 versions of the GNU compilers when the selected C++ compiler is g++.
install:
- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
# Creates a local install folder, adds it to the binary and library search paths, and runs CMake
# from a separate build folder with that install folder as prefix.
before_script:
- mkdir install
- export PATH=`pwd`/install/bin:${PATH}
- export LD_LIBRARY_PATH=`pwd`/install/lib64:`pwd`/install/lib:${LD_LIBRARY_PATH}
- mkdir build
- cd build
- cmake -DCMAKE_INSTALL_PREFIX:PATH=../install ..
# Builds the project and installs it into the prefix configured above.
script:
- make
- make install
# Disables e-mail notifications.
notifications:
email: false
10 changes: 10 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@

Version 0.4.0
- Now using the Claduc C++11 interface to OpenCL
- Added plain C API for increased compatibility (clblast_c.h)
- Re-organized tuner infrastructure and added JSON output
- Removed clBLAS sources, it should now be installed separately for testing
- Added Travis continuous integration
- Added level-2 routines:
* CHEMV/ZHEMV
* SSYMV/DSYMV

Version 0.3.0
- Re-organized test/client infrastructure to avoid code duplication
- Added an optional bypass for pre/post-processing kernels in level-3 routines
Expand Down
110 changes: 75 additions & 35 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@

# CMake project details
cmake_minimum_required(VERSION 2.8.10)
project("clblast" CXX)
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 3)
set(clblast_VERSION_MINOR 4)
set(clblast_VERSION_PATCH 0)

# Options and their default values
Expand Down Expand Up @@ -85,6 +85,15 @@ if(TUNERS)
endif()
endif()

# Locates the clBLAS library in case the tests need to be compiled. "FindclBLAS.cmake" is included.
# The search is only performed when TESTS is enabled. If clBLAS cannot be found, a status message
# is printed and the tests are disabled rather than aborting the configuration.
if(TESTS)
find_package(clBLAS)
if(NOT CLBLAS_FOUND)
message(STATUS "Could NOT find clBLAS, disabling the compilation of the tests")
# Turns TESTS off so that the later 'if(TESTS)' section of this file is skipped
set(TESTS OFF)
endif()
endif()

# ==================================================================================================

# Includes directories: CLBlast and OpenCL
Expand All @@ -94,16 +103,18 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})

# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
set(SAMPLE_PROGRAMS sgemm)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sgemm)
set(LEVEL1_ROUTINES xaxpy)
set(LEVEL2_ROUTINES xgemv)
set(LEVEL2_ROUTINES xgemv xhemv xsymv)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
set(PRECISIONS 32 3232 64 6464)

# ==================================================================================================

# Gathers all source-files
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc)
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc src/clblast_c.cc)
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
endforeach()
Expand All @@ -121,17 +132,36 @@ target_link_libraries(clblast ${OPENCL_LIBRARIES})
# Installs the library
install(TARGETS clblast DESTINATION lib)
install(FILES include/clblast.h DESTINATION include)
install(FILES include/clblast_c.h DESTINATION include)

# ==================================================================================================

# Collects the command-line arguments ($DEVICEPLATFORM) that are passed to the tuners and tests to
# select a device and a platform. The values come from the DEFAULT_DEVICE and DEFAULT_PLATFORM
# environment variables, which are read when CMake is run. The list stays empty if neither is set.
set(DEVICEPLATFORM )
if(DEFINED ENV{DEFAULT_DEVICE})
  list(APPEND DEVICEPLATFORM -device $ENV{DEFAULT_DEVICE})
endif()
if(DEFINED ENV{DEFAULT_PLATFORM})
  list(APPEND DEVICEPLATFORM -platform $ENV{DEFAULT_PLATFORM})
endif()

# ==================================================================================================

# This section contains all the code related to the examples
if(SAMPLES)

# Adds sample programs
foreach(SAMPLE ${SAMPLE_PROGRAMS})
add_executable(sample_${SAMPLE} samples/${SAMPLE}.cc)
target_link_libraries(sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
install(TARGETS sample_${SAMPLE} DESTINATION bin)
# Adds sample programs (C++)
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc)
target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin)
endforeach()

# Adds sample programs (C)
foreach(SAMPLE ${SAMPLE_PROGRAMS_C})
add_executable(clblast_sample_${SAMPLE}_c samples/${SAMPLE}.c)
target_link_libraries(clblast_sample_${SAMPLE}_c clblast ${OPENCL_LIBRARIES})
install(TARGETS clblast_sample_${SAMPLE}_c DESTINATION bin)
endforeach()

endif()
Expand All @@ -145,75 +175,85 @@ if(TUNERS)
# Includes CLTune
include_directories(${CLTUNE_INCLUDE_DIRS})

# Creates the common tuner objects (requires CMake 2.8.8)
add_library(tuners_common OBJECT src/tuning/tuning.cc)

# Adds tuning executables
foreach(KERNEL ${KERNELS})
add_executable(tuner_${KERNEL} $<TARGET_OBJECTS:tuners_common> src/tuning/${KERNEL}.cc)
target_link_libraries(tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS tuner_${KERNEL} DESTINATION bin)
add_executable(clblast_tuner_${KERNEL} src/tuning/${KERNEL}.cc)
target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()

# Adds 'alltuners' target: runs all tuners for all precisions. Each tuner is invoked once per
# precision, with the optional device/platform arguments from $DEVICEPLATFORM appended.
set(ALLTUNERS )
set(ALLTUNERSDEPENDS )
foreach(KERNEL ${KERNELS})
  foreach(PRECISION ${PRECISIONS})
    list(APPEND ALLTUNERS COMMAND clblast_tuner_${KERNEL} -precision ${PRECISION} ${DEVICEPLATFORM})
  endforeach()
  # Accumulates every tuner executable: assigning with a plain set() here would overwrite the list
  # on each iteration and leave only the last tuner as a dependency of 'alltuners'
  list(APPEND ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
endforeach()
add_custom_target(alltuners ${ALLTUNERS} DEPENDS ${ALLTUNERSDEPENDS})

endif()

# ==================================================================================================

# Down from here is all test (performance and correctness) related. Note that these tests require
# the compilation of the clBLAS library to act as a reference.
# the presence of the clBLAS library to act as a reference.
if(TESTS)

# Enables clBLAS to serve as a reference for the tests (source-code is shipped with the project).
# This subproject uses specific flags to reduce the amount of warnings.
set(CMAKE_CXX_FLAGS_CLBLAST ${CMAKE_CXX_FLAGS})
set(CMAKE_CXX_FLAGS "-O3")
add_subdirectory(external/clBLAS/src)
set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS_CLBLAST})

# Adds new include directories for the reference clBLAS
include_directories(${clblast_SOURCE_DIR}/test ${clBLAS_SOURCE_DIR})
include_directories(${clblast_SOURCE_DIR}/test ${CLBLAS_INCLUDE_DIRS})

# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT
test/correctness/tester.cc test/correctness/testblas.cc)

# Compiles the correctness-tests
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level1/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level2/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
target_link_libraries(clblast_test_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
endforeach()

# Adds 'alltests' target: runs all correctness tests, one command per routine, with the optional
# device/platform arguments from $DEVICEPLATFORM appended.
set(ALLTESTS )
set(ALLTESTSDEPENDS )
foreach(ROUTINE ${ROUTINES})
  list(APPEND ALLTESTS COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM})
  # Accumulates every test executable: assigning with a plain set() here would overwrite the list
  # on each iteration and leave only the last test as a dependency of 'alltests'
  list(APPEND ALLTESTSDEPENDS clblast_test_${ROUTINE})
endforeach()
add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})

# Creates the common performance-tests objects (requires CMake 2.8.8)
add_library(test_performance_common OBJECT test/performance/client.cc)

# Compiles the performance-tests
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level1/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level2/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS client_${ROUTINE} DESTINATION bin)
target_link_libraries(clblast_client_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
endforeach()

endif()
Expand Down
32 changes: 18 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
CLBlast: The tuned OpenCL BLAS library
================

[![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast)

CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.

__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support all routines yet: others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
Expand Down Expand Up @@ -39,10 +41,10 @@ The pre-requisites for compilation of CLBlast are:
- Clang 3.3 or newer
- AppleClang 5.0 or newer
- ICC 14.0 or newer
* An OpenCL 1.1 or newer library. CLBlast has been tested on x86-64 Linux and OS X systems with:
* An OpenCL 1.1 or newer library, for example:
- Apple OpenCL
- NVIDIA CUDA SDK (5.5, 6.5, 7.0)
- AMD APP SDK (2.9)
- NVIDIA CUDA SDK
- AMD APP SDK

An example of an out-of-source build (starting from the root of the CLBlast folder):

Expand All @@ -60,11 +62,15 @@ A custom installation folder can be specified when calling CMake:
Using the library
-------------

Like clBLAS and cuBLAS, CLBlast also requires OpenCL device buffers as arguments to its routines. This means you'll have full control over the OpenCL buffers and the host-device memory transfers. CLBlast's API is designed to resemble clBLAS's C API as much as possible, requiring little integration effort in case clBLAS was previously used. Using CLBlast starts by including the header:
Like clBLAS and cuBLAS, CLBlast also requires OpenCL device buffers as arguments to its routines. This means you'll have full control over the OpenCL buffers and the host-device memory transfers. CLBlast's API is designed to resemble clBLAS's C API as much as possible, requiring little integration effort in case clBLAS was previously used. Using CLBlast starts by including the C++ header:

#include <clblast.h>

Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file. Additionally, a couple of stand-alone sample programs are included in `samples/`.
Or alternatively the plain C version:

#include <clblast_c.h>

Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file. Additionally, a couple of stand-alone example programs are included in `samples/`.


Using the tuners (optional)
Expand All @@ -85,11 +91,11 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s

cmake -DTUNERS=ON ..

Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately. CLTune is available from GitHub.
Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub.

Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance.
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance.

The tuner will output a C++ database compatible line with the results, which can be added to `include/internal/database/xxxxx.h` in the appropriate section. Or, if tuning parameters already exist for your device but you believe they can be improved, this is also the place where they can be modified. If you want the found parameters to be included in future releases of CLBlast, please post the results in the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
The tuner will output a C++ database compatible line with the results, which can be added to `include/internal/database/xxxxx.h` in the appropriate section. Or, if tuning parameters already exist for your device but you believe they can be improved, this is also the place where they can be modified. If you want the found parameters to be included in future releases of CLBlast, please post the JSON output in the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).


Compiling the tests (optional)
Expand All @@ -99,9 +105,9 @@ To make sure CLBlast is working correctly on your device (recommended), compile

cmake -DTESTS=ON ..

Afterwards, executables in the form of `test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. However, it is not required to install clBLAS separately on your system: it is included as part of the CLBlast source code in `external/clBLAS`.
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. The library clBLAS is therefore required to be installed on your system for the CLBlast tests.

With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test.
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test.


Performance remarks
Expand Down Expand Up @@ -149,10 +155,10 @@ CLBlast is in active development and currently does not support the full set of
| ---------|---|---|---|---|---------|
| xGEMV ||||| |
| xGBMV | | | | | |
| xHEMV | - | - | | | |
| xHEMV | - | - | | | |
| xHBMV | - | - | | | |
| xHPMV | - | - | | | |
| xSYMV | | | - | - | |
| xSYMV | | | - | - | |
| xSBMV | | | - | - | |
| xSPMV | | | - | - | |
| xTRMV | | | | | |
Expand Down Expand Up @@ -208,8 +214,6 @@ To-do list before release of version 1.0
- Increase the functionality:
* Support all routines supported by clBLAS
* Allow the user control over events and synchronization
* Add an interface with OpenCL C++ data-types
* Add an old-style C compatible interface
* Add half-precision routines (e.g. HGEMM)
- Improve host performance:
* Allow initialization to pre-compile kernels and store to disk
Expand Down
Loading

0 comments on commit a41744d

Please sign in to comment.