Skip to content

Commit

Permalink
Merge pull request #104 from ecmwf-ifs/nams-cuda-beyond-k-caching
Browse files Browse the repository at this point in the history
CUDA variant optimised beyond k-caching
  • Loading branch information
reuterbal authored Dec 20, 2024
2 parents 842c1b8 + 6beb1d9 commit 8cec441
Show file tree
Hide file tree
Showing 8 changed files with 3,572 additions and 36 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ ecbuild_add_option( FEATURE CUDA
CONDITION CMAKE_CUDA_COMPILER )
if(HAVE_CUDA)
  # Turn on CUDA as a project language once the CUDA feature is enabled.
  enable_language(CUDA)

  # Look up the CUDA toolkit. The lookup is not REQUIRED, so configuration
  # continues even when the toolkit cannot be located.
  find_package(CUDAToolkit)
endif()

ecbuild_add_option( FEATURE HIP
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,11 @@ Balthasar Reuter (balthasar.reuter@ecmwf.int)
of CLOUDSC including host side hoisted temporary local variables.
- **dwarf-cloudsc-cuda-k-caching**: GPU-enabled, further optimized CUDA
C version of CLOUDSC including loop fusion and temporary local
array demotion.
array demotion.
- **dwarf-cloudsc-cuda-opt**: GPU-enabled CUDA C version of CLOUDSC,
optimized beyond the k-caching variant: it buffers some variables and
uses pipelined global-to-shared memory copies (TMA loads) that are
overlapped with compute.
- **dwarf-cloudsc-gpu-scc-field**: GPU-enabled and optimized version of
CLOUDSC that uses the SCC loop layout, and uses [FIELD API](https://github.com/ecmwf-ifs/field_api) (a Fortran library purpose-built for IFS data-structures that facilitates the
creation and management of field objects in scientific code) to perform device offload
Expand Down
4 changes: 4 additions & 0 deletions arch/toolchains/github-ubuntu-nvhpc.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ set( OpenACC_Fortran_FLAGS "-acc=gpu -mp" CACHE STRING "" )
# COMMON FLAGS
####################################################################

# Fall back to CUDA architecture 80 only when the caller has not already
# chosen one (e.g. with -DCMAKE_CUDA_ARCHITECTURES=... on the command line).
if( NOT DEFINED CMAKE_CUDA_ARCHITECTURES )
    set( CMAKE_CUDA_ARCHITECTURES 80 )
endif()

set(ECBUILD_Fortran_FLAGS "-fpic")
set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -Mframe")
set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -Mbyteswapio")
Expand Down
100 changes: 65 additions & 35 deletions src/cloudsc_cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,17 @@ if( HAVE_CLOUDSC_C_CUDA )
enable_language(CUDA)
enable_language(CXX)

# Flags shared by every CUDA variant in this file, kept as space-separated
# strings so they can be handed to target_compile_options via "SHELL:".
#
# CLOUDSC_CUDA_FLAGS always carries -lineinfo and -maxrregcount=128. When
# CMAKE_CUDA_ARCHITECTURES is defined, one "-gencode arch=compute_XX,code=sm_XX"
# pair is appended per list entry, so a multi-architecture list such as "70;80"
# yields well-formed flags instead of a string with an embedded ';'.
# The string must not end in '>': it is expanded inside a $<...> generator
# expression, and a trailing '>' would close that expression early and leave
# a literal '>' glued to the last flag.
# NOTE(review): entries are assumed to be plain numbers (e.g. 80); suffixed
# forms such as "80-real" or keywords such as "native" are not translated.
set( CLOUDSC_CUDA_FLAGS "-lineinfo -maxrregcount=128" )
if( DEFINED CMAKE_CUDA_ARCHITECTURES )
    foreach( _cloudsc_cuda_arch IN LISTS CMAKE_CUDA_ARCHITECTURES )
        string( APPEND CLOUDSC_CUDA_FLAGS
                " -gencode arch=compute_${_cloudsc_cuda_arch},code=sm_${_cloudsc_cuda_arch}" )
    endforeach()
endif()

# CLOUDSC_CUDA_OPT_FLAGS selects the optimisation level: unoptimised code with
# host and device debug information for Debug builds, full optimisation with
# fast math for everything else. CMAKE_BUILD_TYPE is empty under multi-config
# generators, in which case the optimised branch is taken.
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set( CLOUDSC_CUDA_OPT_FLAGS "-O0 -g -G" )
else()
    set( CLOUDSC_CUDA_OPT_FLAGS "--ptxas-options=-O3 -O3 -use_fast_math" )
endif()

###### SCC-CUDA ####
ecbuild_add_library(
TARGET dwarf-cloudsc-c-cuda-lib
Expand All @@ -41,22 +52,12 @@ if( HAVE_CLOUDSC_C_CUDA )
$<${HAVE_HDF5}:hdf5::hdf5>
$<${HAVE_SERIALBOX}:Serialbox::Serialbox_C>
$<${HAVE_OMP}:OpenMP::OpenMP_C>
CUDA::cudart
DEFINITIONS
${CLOUDSC_DEFINITIONS}
)

target_include_directories(
dwarf-cloudsc-c-cuda-lib
PUBLIC
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
)
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
target_compile_options(dwarf-cloudsc-c-cuda-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>>)
else()
target_compile_options(dwarf-cloudsc-c-cuda-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--ptxas-options=-O3 -use_fast_math -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
endif()

# Apply the shared CUDA flag strings to CUDA sources of this target only.
# The generator expression is quoted so that it reaches CMake as a single
# argument; unquoted, the space between the two variables would split it into
# two arguments and only the first would carry the SHELL: prefix. SHELL: then
# splits the space-separated string into individual compiler options.
target_compile_options(
    dwarf-cloudsc-c-cuda-lib
    PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:${CLOUDSC_CUDA_OPT_FLAGS} ${CLOUDSC_CUDA_FLAGS}>"
)
# Build the CUDA sources with separable compilation enabled.
set_target_properties( dwarf-cloudsc-c-cuda-lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

ecbuild_add_executable(
Expand Down Expand Up @@ -99,22 +100,12 @@ if( HAVE_CLOUDSC_C_CUDA )
$<${HAVE_HDF5}:hdf5::hdf5>
$<${HAVE_SERIALBOX}:Serialbox::Serialbox_C>
$<${HAVE_OMP}:OpenMP::OpenMP_C>
CUDA::cudart
DEFINITIONS
${CLOUDSC_DEFINITIONS}
)

target_include_directories(
dwarf-cloudsc-c-cuda-hoist-lib
PUBLIC
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
)
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
target_compile_options(dwarf-cloudsc-c-cuda-hoist-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>>)
else()
target_compile_options(dwarf-cloudsc-c-cuda-hoist-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--ptxas-options=-O3 -use_fast_math -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
endif()

# Apply the shared CUDA flag strings to CUDA sources of this target only.
# The generator expression is quoted so that it reaches CMake as a single
# argument; unquoted, the space between the two variables would split it into
# two arguments and only the first would carry the SHELL: prefix. SHELL: then
# splits the space-separated string into individual compiler options.
target_compile_options(
    dwarf-cloudsc-c-cuda-hoist-lib
    PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:${CLOUDSC_CUDA_OPT_FLAGS} ${CLOUDSC_CUDA_FLAGS}>"
)
# Build the CUDA sources with separable compilation enabled.
set_target_properties( dwarf-cloudsc-c-cuda-hoist-lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

ecbuild_add_executable(
Expand Down Expand Up @@ -157,21 +148,12 @@ if( HAVE_CLOUDSC_C_CUDA )
$<${HAVE_HDF5}:hdf5::hdf5>
$<${HAVE_SERIALBOX}:Serialbox::Serialbox_C>
$<${HAVE_OMP}:OpenMP::OpenMP_C>
CUDA::cudart
DEFINITIONS
${CLOUDSC_DEFINITIONS}
)

target_include_directories(
dwarf-cloudsc-c-cuda-k-caching-lib
PUBLIC
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
)
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
target_compile_options(dwarf-cloudsc-c-cuda-k-caching-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>>)
else()
target_compile_options(dwarf-cloudsc-c-cuda-k-caching-lib PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--ptxas-options=-O3 -use_fast_math -maxrregcount=128 -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES}>)
endif()
# Apply the shared CUDA flag strings to CUDA sources of this target only.
# The generator expression is quoted so that it reaches CMake as a single
# argument; unquoted, the space between the two variables would split it into
# two arguments and only the first would carry the SHELL: prefix. SHELL: then
# splits the space-separated string into individual compiler options.
target_compile_options(
    dwarf-cloudsc-c-cuda-k-caching-lib
    PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:${CLOUDSC_CUDA_OPT_FLAGS} ${CLOUDSC_CUDA_FLAGS}>"
)
# Build the CUDA sources with separable compilation enabled.
set_target_properties( dwarf-cloudsc-c-cuda-k-caching-lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

ecbuild_add_executable(
Expand All @@ -191,6 +173,54 @@ if( HAVE_CLOUDSC_C_CUDA )
)
###

###### SCC-CUDA-OPT ####
# Library holding the "opt" CUDA variant: state loading, the cloudsc_c_opt
# kernel, its driver, validation and the mycpu helper. The listed headers are
# installed with the library, and the cloudsc/ source directory is exported as
# a public include path for in-tree consumers.
ecbuild_add_library(
    TARGET dwarf-cloudsc-c-cuda-opt-lib
    INSTALL_HEADERS LISTED
    SOURCES
        cloudsc/yoecldp_c.h
        cloudsc/load_state.h
        cloudsc/load_state.cu
        cloudsc/cloudsc_c_opt.h
        cloudsc/cloudsc_c_opt.cu
        cloudsc/cloudsc_driver_opt.h
        cloudsc/cloudsc_driver_opt.cu
        cloudsc/cloudsc_validate.h
        cloudsc/cloudsc_validate.cu
        cloudsc/mycpu.h
        cloudsc/mycpu.cu
    PUBLIC_INCLUDES
        $<INSTALL_INTERFACE:include>
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cloudsc>
    PUBLIC_LIBS
        # Optional dependencies are linked only when their feature flag is on.
        $<${HAVE_HDF5}:hdf5::hdf5>
        $<${HAVE_SERIALBOX}:Serialbox::Serialbox_C>
        $<${HAVE_OMP}:OpenMP::OpenMP_C>
        CUDA::cudart
    DEFINITIONS
        ${CLOUDSC_DEFINITIONS}
)

# Apply the shared CUDA flag strings to CUDA sources of this target only.
# The generator expression is quoted so that it reaches CMake as a single
# argument; unquoted, the space between the two variables would split it into
# two arguments and only the first would carry the SHELL: prefix. SHELL: then
# splits the space-separated string into individual compiler options.
target_compile_options(
    dwarf-cloudsc-c-cuda-opt-lib
    PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:${CLOUDSC_CUDA_OPT_FLAGS} ${CLOUDSC_CUDA_FLAGS}>"
)
# Build the CUDA sources with separable compilation enabled.
set_target_properties( dwarf-cloudsc-c-cuda-opt-lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

# Driver executable for the "opt" CUDA variant, built from the C++ entry point
# and linked against the variant's library.
ecbuild_add_executable(
    TARGET  dwarf-cloudsc-c-cuda-opt
    SOURCES dwarf_cloudsc.cpp
    LIBS    dwarf-cloudsc-c-cuda-opt-lib
)

# Repeats the library dependency already passed through LIBS above.
target_link_libraries(dwarf-cloudsc-c-cuda-opt dwarf-cloudsc-c-cuda-opt-lib)

# Smoke test: run the executable from three directories above this build
# directory with the arguments "1 1000 128" and a single OpenMP thread.
# NOTE(review): the arguments are presumably thread count, number of columns
# and block size - confirm against dwarf_cloudsc.cpp.
ecbuild_add_test(
    TARGET  dwarf-cloudsc-c-cuda-opt-serial
    COMMAND bin/dwarf-cloudsc-c-cuda-opt
    ARGS    1 1000 128
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
    OMP     1
)
###

# Create symlink for the input data
if( HAVE_SERIALBOX )
execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink
Expand Down
Loading

0 comments on commit 8cec441

Please sign in to comment.