Skip to content

Commit

Permalink
Cutlass 1.3 Release (#42)
Browse files Browse the repository at this point in the history
CUTLASS 1.3 Release
- Efficient GEMM kernel targeting Volta Tensor Cores via mma.sync instruction added in CUDA 10.1.
  • Loading branch information
kerrmudgeon authored Mar 20, 2019
1 parent 19a9d64 commit 877bdca
Show file tree
Hide file tree
Showing 256 changed files with 16,930 additions and 802 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# NVIDIA CUTLASS Changelog

## [1.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.1) (2018-12-19)
* Resolved issue with sm50 and sm52 architectures
## [1.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.0) (2019-03-20)
* Efficient GEMM kernel targeting Volta Tensor Cores via `mma.sync` instruction added in CUDA 10.1.

## [1.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.0) (2018-10-26)
* Parallelized reductions across threadblocks ("Split-K")
Expand Down
98 changes: 82 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
Expand All @@ -20,7 +20,7 @@
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cmake_minimum_required(VERSION 3.3.0)
cmake_minimum_required(VERSION 3.3.0 FATAL_ERROR)

set(CUTLASS_LANGUAGES CXX)

Expand All @@ -36,7 +36,8 @@ else()
# FindCUDA fails to detect VS 2017 due to a changed directory format of the toolkits.
# For this configuration we need CMake >= 3.9.0 to use the native CUDA support.
if (WIN32 AND MSVC_VERSION GREATER 1800)
message(FATAL_ERROR "Please upgrade CMake to version >= 3.9.0 to support Visual Studio 2017 or higher")
message(SEND_ERROR "Please upgrade CMake to version >= 3.9.0 to support Visual Studio 2017 or higher")
cmake_minimum_required(VERSION 3.9.0 FATAL_ERROR)
endif()

# Fall back to the FindCUDA version to create an executable with CUDA files
Expand All @@ -52,7 +53,11 @@ if( NOT CMAKE_SIZEOF_VOID_P EQUAL 8 )
message(FATAL_ERROR "CUTLASS requires a 64-bit compiler!")
endif()

find_package(CUDA)
find_package(CUDA REQUIRED)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
# Some platforms (e.g. Visual Studio) don't add the CUDA include directories to the system include
# paths by default, so we add it explicitly here.

find_package(Doxygen QUIET)

###################################################################################################
Expand All @@ -61,9 +66,18 @@ find_package(Doxygen QUIET)
#
###################################################################################################

find_library(CUBLAS_LIBRARY cublas HINTS
#
# Conditionally enable cuBLAS
#
set(CUTLASS_ENABLE_CUBLAS ON CACHE BOOL "Enable CUTLASS Tests to build with cuBLAS library.")

if(CUTLASS_ENABLE_CUBLAS)

find_library(CUBLAS_LIBRARY cublas HINTS
${CUDA_TOOLKIT_ROOT_DIR}/lib64
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
endif()


# By default we want to build in Release mode to ensure that we're getting best performance
if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES))
Expand All @@ -78,26 +92,56 @@ if(WIN32)
endif()

if (WIN32)
# Enable more warnings and treat as errors
string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX")
# Enable more warnings and treat as errors
string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX")

# Disable warning on Unicode characters
string(APPEND NVCC_FLAGS " -Xcompiler /wd4819")
# Disable warning on Unicode characters
string(APPEND NVCC_FLAGS " -Xcompiler /wd4819")

# Disable excess x86 floating point precision that can lead to results being labeled incorrectly
string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict")
# Disable excess x86 floating point precision that can lead to results being labeled incorrectly
string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict")

# Verbose option
if (${CUTLASS_NVCC_VERBOSE})
string(APPEND NVCC_FLAGS " -v")
endif()
# Verbose option
if (${CUTLASS_NVCC_VERBOSE})
string(APPEND NVCC_FLAGS " -v")
endif()
endif(WIN32)

set(CUTLASS_NVCC_ARCHS "50;60;61;70;75" CACHE STRING "The SM architectures to build code for.")
set(CUTLASS_NVCC_ARCHS_DEFAULT "")
if(NOT CUDA_VERSION VERSION_LESS 7.5)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 50)
endif()
if(NOT CUDA_VERSION VERSION_LESS 8.0)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 60 61)
endif()
if(NOT CUDA_VERSION VERSION_LESS 9.0)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 70)
endif()
if(NOT CUDA_VERSION VERSION_LESS 9.2)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 72)
endif()
if(NOT CUDA_VERSION VERSION_LESS 10.0)
list(APPEND CUTLASS_NVCC_ARCHS_DEFAULT 75)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_DEFAULT} CACHE STRING "The SM architectures to build code for.")

set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.")
set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")

# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
if (CUDA_VERSION VERSION_LESS 10.1)
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT OFF)
else()
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT ON)
endif()

set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL
"Enable PTX mma instruction for collective matrix multiply operations.")

set(CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST ${CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST} CACHE BOOL
  "Enable more kernels instantiated in the perf suite. This might result in longer compile time.")

#
# NOTE: running with asan and CUDA requires the following environment variable:
#
Expand Down Expand Up @@ -131,6 +175,18 @@ foreach(ARCH ${CUTLASS_NVCC_ARCHS})
endif()
endforeach()

if (CUTLASS_ENABLE_TENSOR_CORE_MMA)
string(APPEND NVCC_FLAGS " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1")
endif()

if (CUTLASS_ENABLE_CUBLAS)
string(APPEND NVCC_FLAGS " -DCUTLASS_ENABLE_CUBLAS=1")
endif()

if (CUTLASS_EXHAUSTIVE_PERFORMANCE_TEST)
add_definitions(-DEXHAUSTIVE_PROF)
endif()

if (CUTLASS_NVCC_KEEP)
string(APPEND NVCC_FLAGS " -keep")
endif()
Expand Down Expand Up @@ -174,6 +230,7 @@ file(GLOB CUTLASS_UTIL RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/util/*.h)
file(GLOB CUTLASS_DEVICE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/device/*.h)
file(GLOB CUTLASS_CORE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/*.h)
file(GLOB CUTLASS_REDUCTION RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/reduction/*.h )
file(GLOB CUTLASS_LAYOUT_THREAD RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/layout/thread/*.h)

###################################################################################################
#
Expand All @@ -185,16 +242,24 @@ source_group("cutlass\\gemm" FILES ${CUTLASS_GEMM})
source_group("cutlass\\util" FILES ${CUTLASS_UTIL})
source_group("cutlass\\device" FILES ${CUTLASS_DEVICE})
source_group("cutlass\\reduction" FILES ${CUTLASS_REDUCTION})
source_group("cutlass\\layout\\thread" FILES ${CUTLASS_LAYOUT_THREAD})
source_group("cutlass" FILES ${CUTLASS_CORE})

add_library(CUTLASS INTERFACE)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")

# Special policy introduced in CMake 3.13
if (POLICY CMP0076)
cmake_policy(SET CMP0076 NEW)
endif()

target_sources(CUTLASS INTERFACE
${CUTLASS_GEMM}
${CUTLASS_UTIL}
${CUTLASS_DEVICE}
${CUTLASS_CORE}
${CUTLASS_REDUCTION}
${CUTLASS_LAYOUT_THREAD}
)

target_include_directories(CUTLASS INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
Expand All @@ -206,6 +271,7 @@ add_custom_target(cutlass_ide SOURCES
${CUTLASS_DEVICE}
${CUTLASS_CORE}
${CUTLASS_REDUCTION}
${CUTLASS_LAYOUT_THREAD}
)
# Doxygen is available. Generate documentation
if (DOXYGEN_FOUND)
Expand Down
6 changes: 3 additions & 3 deletions CUTLASS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ CUTLASS core components, and to identify their role in implementing GEMM computa
# <a name="S-design-patterns"></a> 1. Design Patterns

CUTLASS strives to achieve the highest performance possible on NVIDIA GPUs while also offering a
flexible composition that an be easily applied to solve new problems related to Deep Learning and
flexible composition that can be easily applied to solve new problems related to Deep Learning and
linear algebra. Though we intend to make CUTLASS as simple and straightforward as possible, given
a tradeoff between simplicity and performance, CUTLASS chooses performance. Consequently, several
design patterns are necessary to yield a composable structure while also satisfying these performance
Expand All @@ -31,7 +31,7 @@ CUTLASS embodies a design paradigm exemplified by the [CUB library](https://nvla

## <a name="S-patterns-tiles-iterators"></a> Tiles and Iterators

Efficient dense linear algebra computations emphasize data movement to match the execution of mathemtical operators to the flow of data. Consequently, CUTLASS defines a rich set of primitives for partitioning a tile of data among participating threads, warps, and threadblocks. CUTLASS applies the familiar iterator design pattern to provide an abstraction layer to (1.) access these tile objects and (2.) traverse a sequence of objects embedded in a higher level data structure. These subpartitions are typically defined by compile-time constants
Efficient dense linear algebra computations emphasize data movement to match the execution of mathematical operators to the flow of data. Consequently, CUTLASS defines a rich set of primitives for partitioning a tile of data among participating threads, warps, and threadblocks. CUTLASS applies the familiar iterator design pattern to provide an abstraction layer to (1.) access these tile objects and (2.) traverse a sequence of objects embedded in a higher level data structure. These subpartitions are typically defined by compile-time constants
specifying element type, size, and data layout. CUTLASS refers to subpartitions as _tiles_.

_Iterators_ are familiar design patterns in C++ that provide an abstraction for accessing individual
Expand Down Expand Up @@ -353,7 +353,7 @@ An example of splitK usage can be found [here](examples/06_splitK_gemm/splitK_ge

# Copyright

Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.

```
Redistribution and use in source and binary forms, with or without modification, are permitted
Expand Down
25 changes: 16 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")

# CUTLASS 1.2
# CUTLASS 1.3

_CUTLASS 1.2 - October 2018_
_CUTLASS 1.3.0 - March 2019_

CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
Expand All @@ -20,13 +20,18 @@ multiply-accumulate abstractions for 8-bit integer, half-precision floating
point (FP16), single-precision floating point (FP32), and double-precision floating
point (FP64) types. Furthermore, CUTLASS demonstrates CUDA's WMMA API for targeting
the programmable, high-throughput _Tensor Cores_ provided by NVIDIA's Volta architecture
and beyond.
and beyond. Even faster performance on Volta is possible via direct access to
Volta Tensor Cores via `mma.sync` (added in CUDA 10.1).

CUTLASS 1.2 is described in the [CUTLASS Documentation](CUTLASS.md) and the accompanying
CUTLASS 1.3 is described in the [CUTLASS Documentation](CUTLASS.md) and the accompanying
[Doxygen documentation](https://nvidia.github.io/cutlass).
We describe the structure of an efficient GEMM in our talk at the
[GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).

# What's New in CUTLASS 1.3
_March 2019_
* CUTLASS 1.3 includes an efficient GEMM implementation with the `mma.sync` instruction added in CUDA 10.1.

# What's New in CUTLASS 1.2
_October 2018_
* [Parallelized Reductions](CUTLASS.md#parallel-reductions-across-gemm-k)
Expand Down Expand Up @@ -63,8 +68,8 @@ when compiled with CUDA 10.0.

# Compatibility

CUTLASS performs best when compiled with the [CUDA 10.0 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is compatible with CUDA 9.0, 9.1, and 9.2, but these versions of the CUDA Toolkit do not support new Turing WMMA features.
CUTLASS performs best when compiled with the [CUDA 10.1 Toolkit](https://developer.nvidia.com/cuda-toolkit).
It is also compatible with CUDA 9.0, 9.1, 9.2, and 10.0.

We have tested the following environments.

Expand All @@ -77,7 +82,7 @@ We have tested the following environments.
| Ubuntu 18.04 | GCC 7.3.0 |

CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
any Maxwell-, Pascal-, or Volta-architecture NVIDIA GPU.
any Maxwell-, Pascal-, Volta-, or Turing-architecture NVIDIA GPU.

|**GPU**|
|---|
Expand Down Expand Up @@ -220,6 +225,9 @@ Program usage:
# Varies GEMM K dimension for SGEMM and IGEMM with column-major multiplicands
$ ./tools/test/perf/cutlass_perf_test --m=10240 --n=4096 --k=1024:8192:128 --kernels=sgemm_nn,igemm_nn
# Executes GEMM kernel on Volta Tensor Cores
$ ./tools/test/perf/cutlass_perf_test --kernels=s884gemm_nt
```

# About
Expand All @@ -230,7 +238,7 @@ CUTLASS is released by NVIDIA Corporation as Open Source software under the

# Copyright

Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.

```
Redistribution and use in source and binary forms, with or without modification, are permitted
Expand All @@ -253,4 +261,3 @@ Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

Loading

0 comments on commit 877bdca

Please sign in to comment.