Skip to content

Commit

Permalink
Cutlass 1.3 Release (#42)
Browse files Browse the repository at this point in the history
CUTLASS 1.3 Release
- Efficient GEMM kernel targeting Volta Tensor Cores via mma.sync instruction added in CUDA 10.1.
  • Loading branch information
kerrmudgeon authored Mar 20, 2019
1 parent 19a9d64 commit 877bdca
Show file tree
Hide file tree
Showing 256 changed files with 16,930 additions and 802 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# NVIDIA CUTLASS Changelog

## [1.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.1) (2018-12-19)
* Resolved issue with sm50 and sm52 architectures
## [1.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.0) (2019-03-20)
* Efficient GEMM kernel targeting Volta Tensor Cores via `mma.sync` instruction added in CUDA 10.1.

## [1.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.0) (2018-10-26)
* Parallelized reductions across threadblocks ("Split-K")
Expand Down
98 changes: 82 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
Expand All @@ -20,7 +20,7 @@
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cmake_minimum_required(VERSION 3.3.0)
cmake_minimum_required(VERSION 3.3.0 FATAL_ERROR)

set(CUTLASS_LANGUAGES CXX)

Expand All @@ -36,7 +36,8 @@ else()
# FindCUDA fails to detect VS 2017 due to a changed directory format of the toolkits.
# For this configuration we need CMake >= 3.9.0 to use the native CUDA support.
if (WIN32 AND MSVC_VERSION GREATER 1800)
message(FATAL_ERROR "Please upgrade CMake to version >= 3.9.0 to support Visual Studio 2017 or higher")
message(SEND_ERROR "Please upgrade CMake to version >= 3.9.0 to support Visual Studio 2017 or higher")
cmake_minimum_required(VERSION 3.9.0 FATAL_ERROR)
endif()

# Fall back to the FindCUDA version to create an executable with CUDA files
Expand All @@ -52,7 +53,11 @@ if( NOT CMAKE_SIZEOF_VOID_P EQUAL 8 )
message(FATAL_ERROR "CUTLASS requires a 64-bit compiler!")
endif()

find_package(CUDA)
find_package(CUDA REQUIRED)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
# Some platforms (e.g. Visual Studio) don't add the CUDA include directories to the system include
# paths by default, so we add it explicitly here.

find_package(Doxygen QUIET)

###################################################################################################
Expand All @@ -61,9 +66,18 @@ find_package(Doxygen QUIET)
#
###################################################################################################

find_library(CUBLAS_LIBRARY cublas HINTS
#
# Conditionally enable cuBLAS
#
set(CUTLASS_ENABLE_CUBLAS ON CACHE BOOL "Enable CUTLASS Tests to build with cuBLAS library.")

if(CUTLASS_ENABLE_CUBLAS)

find_library(CUBLAS_LIBRARY cublas HINTS
${CUDA_TOOLKIT_ROOT_DIR}/lib64
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
endif()


# By default we want to build in Release mode to ensure that we're getting best performance
if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES))
Expand All @@ -78,26 +92,56 @@ if(WIN32)
endif()

if (WIN32)
# Enable more warnings and treat as errors
string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX")
# Enable more warnings and treat as errors
string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX")

# Disable warning on Unicode characters
string(APPEND NVCC_FLAGS " -Xcompiler /wd4819")
# Disable warning on Unicode characters
string(APPEND NVCC_FLAGS " -Xcompiler /wd4819")

# Disable excess x86 floating point precision that can lead to results being labeled incorrectly
string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict")
# Disable excess x86 floating point precision that can lead to results being labeled incorrectly
string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict")

# Verbose option
if (${CUTLASS_NVCC_VERBOSE})
string(APPEND NVCC_FLAGS " -v")
endif()
# Verbose option
if (${CUTLASS_NVCC_VERBOSE})
string(APPEND NVCC_FLAGS " -v")
endif()
endif(WIN32)

set(CUTLASS_NVCC_ARCHS "50;60;61;70;75" CACHE STRING "The SM architectures to build code for.")
set(CUTLASS_NVCC_ARCHS_DEFAULT "")
if(NOT CUDA_VERSION VERSION_LESS 7.5)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 50)
endif()
if(NOT CUDA_VERSION VERSION_LESS 8.0)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 60 61)
endif()
if(NOT CUDA_VERSION VERSION_LESS 9.0)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 70)
endif()
if(NOT CUDA_VERSION VERSION_LESS 9.2)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 72)
endif()
if(NOT CUDA_VERSION VERSION_LESS 10.0)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 75)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_DEFAULT} CACHE STRING "The SM architectures to build code for.")

set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.")
set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")

# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
if (CUDA_VERSION VERSION_LESS 10.1)
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT OFF)
else()
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT ON)
endif()

set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL
"Enable PTX mma instruction for collective matrix multiply operations.")

set(CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST ${CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST} CACHE BOOL
  "Enable more kernels instantiated in the perf suite. This might result in longer compile time.")

#
# NOTE: running with asan and CUDA requires the following environment variable:
#
Expand Down Expand Up @@ -131,6 +175,18 @@ foreach(ARCH ${CUTLASS_NVCC_ARCHS})
endif()
endforeach()

if (CUTLASS_ENABLE_TENSOR_CORE_MMA)
string(APPEND NVCC_FLAGS " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1")
endif()

if (CUTLASS_ENABLE_CUBLAS)
string(APPEND NVCC_FLAGS " -DCUTLASS_ENABLE_CUBLAS=1")
endif()

if (CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST)
add_definitions(-DEXHAUSTIVE_PROF)
endif()

if (CUTLASS_NVCC_KEEP)
string(APPEND NVCC_FLAGS " -keep")
endif()
Expand Down Expand Up @@ -174,6 +230,7 @@ file(GLOB CUTLASS_UTIL RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/util/*.h)
file(GLOB CUTLASS_DEVICE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/device/*.h)
file(GLOB CUTLASS_CORE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/*.h)
file(GLOB CUTLASS_REDUCTION RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/reduction/*.h )
file(GLOB CUTLASS_LAYOUT_THREAD RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/layout/thread/*.h)

###################################################################################################
#
Expand All @@ -185,16 +242,24 @@ source_group("cutlass\\gemm" FILES ${CUTLASS_GEMM})
source_group("cutlass\\util" FILES ${CUTLASS_UTIL})
source_group("cutlass\\device" FILES ${CUTLASS_DEVICE})
source_group("cutlass\\reduction" FILES ${CUTLASS_REDUCTION})
source_group("cutlass\\layout\\thread" FILES ${CUTLASS_LAYOUT_THREAD})
source_group("cutlass" FILES ${CUTLASS_CORE})

add_library(CUTLASS INTERFACE)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")

# Special policy introduced in CMake 3.13
if (POLICY CMP0076)
cmake_policy(SET CMP0076 NEW)
endif()

target_sources(CUTLASS INTERFACE
${CUTLASS_GEMM}
${CUTLASS_UTIL}
${CUTLASS_DEVICE}
${CUTLASS_CORE}
${CUTLASS_REDUCTION}
${CUTLASS_LAYOUT_THREAD}
)

target_include_directories(CUTLASS INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
Expand All @@ -206,6 +271,7 @@ add_custom_target(cutlass_ide SOURCES
${CUTLASS_DEVICE}
${CUTLASS_CORE}
${CUTLASS_REDUCTION}
${CUTLASS_LAYOUT_THREAD}
)
# Doxygen is available. Generate documentation
if (DOXYGEN_FOUND)
Expand Down
6 changes: 3 additions & 3 deletions CUTLASS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ CUTLASS core components, and to identify their role in implementing GEMM computa
# <a name="S-design-patterns"></a> 1. Design Patterns

CUTLASS strives to achieve the highest performance possible on NVIDIA GPUs while also offering a
flexible composition that an be easily applied to solve new problems related to Deep Learning and
flexible composition that can be easily applied to solve new problems related to Deep Learning and
linear algebra. Though we intend to make CUTLASS as simple and straightforward as possible, given
a tradeoff between simplicity and performance, CUTLASS chooses performance. Consequently, several
design patterns are necessary to yield a composable structure while also satisfying these performance
Expand All @@ -31,7 +31,7 @@ CUTLASS embodies a design paradigm exemplified by the [CUB library](https://nvla

## <a name="S-patterns-tiles-iterators"></a> Tiles and Iterators

Efficient dense linear algebra computations emphasize data movement to match the execution of mathemtical operators to the flow of data. Consequently, CUTLASS defines a rich set of primitives for partitioning a tile of data among participating threads, warps, and threadblocks. CUTLASS applies the familiar iterator design pattern to provide an abstraction layer to (1.) access these tile objects and (2.) traverse a sequence of objects embedded in a higher level data structure. These subpartitions are typically defined by compile-time constants
Efficient dense linear algebra computations emphasize data movement to match the execution of mathematical operators to the flow of data. Consequently, CUTLASS defines a rich set of primitives for partitioning a tile of data among participating threads, warps, and threadblocks. CUTLASS applies the familiar iterator design pattern to provide an abstraction layer to (1.) access these tile objects and (2.) traverse a sequence of objects embedded in a higher level data structure. These subpartitions are typically defined by compile-time constants
specifying element type, size, and data layout. CUTLASS refers to subpartitions as _tiles_.

_Iterators_ are familiar design patterns in C++ that provide an abstraction for accessing individual
Expand Down Expand Up @@ -353,7 +353,7 @@ An example of splitK usage can be found [here](examples/06_splitK_gemm/splitK_ge

# Copyright

Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.

```
Redistribution and use in source and binary forms, with or without modification, are permitted
Expand Down
25 changes: 16 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")

# CUTLASS 1.2
# CUTLASS 1.3

_CUTLASS 1.2 - October 2018_
_CUTLASS 1.3.0 - March 2019_

CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
Expand All @@ -20,13 +20,18 @@ multiply-accumulate abstractions for 8-bit integer, half-precision floating
point (FP16), single-precision floating point (FP32), and double-precision floating
point (FP64) types. Furthermore, CUTLASS demonstrates CUDA's WMMA API for targeting
the programmable, high-throughput _Tensor Cores_ provided by NVIDIA's Volta architecture
and beyond.
and beyond. Even faster performance on Volta is possible via direct access to
Volta Tensor Cores via `mma.sync` (added in CUDA 10.1).

CUTLASS 1.2 is described in the [CUTLASS Documentation](CUTLASS.md) and the accompanying
CUTLASS 1.3 is described in the [CUTLASS Documentation](CUTLASS.md) and the accompanying
[Doxygen documentation](https://nvidia.github.io/cutlass).
We describe the structure of an efficient GEMM in our talk at the
[GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).

# What's New in CUTLASS 1.3
_March 2019_
* CUTLASS 1.3 includes an efficient GEMM implementation with the `mma.sync` instruction added in CUDA 10.1.

# What's New in CUTLASS 1.2
_October 2018_
* [Parallelized Reductions](CUTLASS.md#parallel-reductions-across-gemm-k)
Expand Down Expand Up @@ -63,8 +68,8 @@ when compiled with CUDA 10.0.

# Compatibility

CUTLASS performs best when compiled with the [CUDA 10.0 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is compatible with CUDA 9.0, 9.1, and 9.2, but these versions of the CUDA Toolkit do not support new Turing WMMA features.
CUTLASS performs best when compiled with the [CUDA 10.1 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is also compatible with CUDA 9.0, 9.1, 9.2, and 10.0.

We have tested the following environments.

Expand All @@ -77,7 +82,7 @@ We have tested the following environments.
| Ubuntu 18.04 | GCC 7.3.0 |

CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
any Maxwell-, Pascal-, or Volta-architecture NVIDIA GPU.
any Maxwell-, Pascal-, Volta-, or Turing-architecture NVIDIA GPU.

|**GPU**|
|---|
Expand Down Expand Up @@ -220,6 +225,9 @@ Program usage:
# Varies GEMM K dimension for SGEMM and IGEMM with column-major multiplicands
$ ./tools/test/perf/cutlass_perf_test --m=10240 --n=4096 --k=1024:8192:128 --kernels=sgemm_nn,igemm_nn
# Executes GEMM kernel on Volta Tensor Cores
$ ./tools/test/perf/cutlass_perf_test --kernels=s884gemm_nt
```

# About
Expand All @@ -230,7 +238,7 @@ CUTLASS is released by NVIDIA Corporation as Open Source software under the

# Copyright

Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.

```
Redistribution and use in source and binary forms, with or without modification, are permitted
Expand All @@ -253,4 +261,3 @@ Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

Loading

0 comments on commit 877bdca

Please sign in to comment.