From aa504dd4b6afbdc505b0fa876d8927f85e5ce65d Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 14 Nov 2023 06:35:43 -0800 Subject: [PATCH 01/19] speed up multidimensional non-contiguous transfers --- cmake/field_api_compile_options.cmake | 1 + field_RANKSUFF_module.fypp | 91 ++++++++++++++++++++- tests/CMakeLists.txt | 1 + tests/init_wrapper_non_contiguous_multi.F90 | 78 ++++++++++++++++++ 4 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 tests/init_wrapper_non_contiguous_multi.F90 diff --git a/cmake/field_api_compile_options.cmake b/cmake/field_api_compile_options.cmake index 06c8d5a..22ad906 100644 --- a/cmake/field_api_compile_options.cmake +++ b/cmake/field_api_compile_options.cmake @@ -10,6 +10,7 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES PGI|NVIDIA|NVHPC) ecbuild_add_fortran_flags("-Mlarge_arrays") ecbuild_add_fortran_flags("-gopt") + ecbuild_add_fortran_flags("-cuda") ecbuild_add_fortran_flags("-Minfo=accel,all,ccff" BUILD DEBUG) diff --git a/field_RANKSUFF_module.fypp b/field_RANKSUFF_module.fypp index c89e25f..7180423 100644 --- a/field_RANKSUFF_module.fypp +++ b/field_RANKSUFF_module.fypp @@ -328,10 +328,12 @@ CONTAINS CALL CPU_TIME(START) SELECT CASE (SELF%LAST_CONTIGUOUS_DIMENSION) -#:for d in range (ft.rank + 1) +#:for d in range (ft.rank) CASE (${d}$) - CALL COPY_DIM${d}$_CONTIGUOUS (SELF%PTR, SELF%DEVPTR, KDIR, QUEUE) + CALL COPY_2D_DIM${d}$_${d+1}$_CONTIGUOUS (SELF%PTR, SELF%DEVPTR, KDIR, QUEUE) #:endfor + CASE (${ft.rank}$) + CALL COPY_DIM${ft.rank}$_CONTIGUOUS (SELF%PTR, SELF%DEVPTR, KDIR, QUEUE) END SELECT CALL CPU_TIME(FINISH) IF (KDIR == NH2D) THEN @@ -342,6 +344,85 @@ CONTAINS CONTAINS +#:for d1 in range (0, ft.rank) +#:for d2 in range (d1+1, ft.rank+1) + SUBROUTINE COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, KDIR, QUEUE) +#ifdef _OPENACC + USE OPENACC + USE CUDAFOR +#endif + ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) + INTEGER (KIND=JPIM), INTENT (IN) :: KDIR + INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER (KIND=JPIM) :: IHST_PITCH, IDEV_PITCH, IRET + INTEGER (KIND=JPIM) :: IWIDTH, IHEIGHT, ISHP(${ft.rank+1}$) + #:if d2 < ft.rank + INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d2, ft.rank))))}$ + #:endif + INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + +#ifdef _OPENACC + ISHP(1) = 1 + ISHP(2:) = SHAPE(HST) + IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) + IHEIGHT = PRODUCT(ISHP(${d1+2}$:${d2+1}$)) + +#:set lb = lambda arr, i: f'LBOUND({arr}, {i+1})' +#:set lbnds = lambda arr, start, end: [lb(arr, i) for i in range(start, end)] +#:set this_slice = lambda arr: ', '.join(lbnds(arr, 0, ft.rank)) +#:set next_slice = lambda arr: ', '.join(lbnds(arr, 0, d1) + [lb(arr, d1)+'+1'] + lbnds(arr, d1+1, ft.rank)) + IHST_PITCH = (LOC (HST(${next_slice('HST')}$)) - LOC (HST (${this_slice('HST')}$)))/KIND(HST) + IDEV_PITCH = (LOC (DEV(${next_slice('DEV')}$)) - LOC (DEV (${this_slice('DEV')}$)))/KIND(DEV) + + #:for e in range (ft.rank, d2, -1) + ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) + #:endfor + #:set indent = ' ' * (ft.rank - d2 - 1) + #:set ar = lambda arr: ', '.join(lbnds(arr, 0, d2) + [f'J{i+1}' for i in range(d2, ft.rank)]) + ${indent}$ IF (KDIR == NH2D) THEN + ${indent}$ !$acc host_data use_device (DEV) + ${indent}$ IF(PRESENT(QUEUE)) THEN + ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) + ${indent}$ IRET = CUDAMEMCPY2DASYNC (DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ & HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ & IWIDTH, IHEIGHT, CUDAMEMCPYHOSTTODEVICE, & + ${indent}$ & STREAM) + ${indent}$ ELSE + ${indent}$ IRET = CUDAMEMCPY2D (DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ & HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ & IWIDTH, IHEIGHT) + ${indent}$ ENDIF + ${indent}$ !$acc end host_data + ${indent}$ ELSEIF (KDIR == ND2H) THEN + ${indent}$ !$acc host_data use_device (DEV) + ${indent}$ IF(PRESENT(QUEUE)) THEN + ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) + ${indent}$ IRET = CUDAMEMCPY2DASYNC (HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ & DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ & IWIDTH, IHEIGHT, CUDAMEMCPYDEVICETOHOST, & + ${indent}$ & STREAM) + ${indent}$ ELSE + ${indent}$ IRET = CUDAMEMCPY2D (HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ & DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ & IWIDTH, IHEIGHT) + ${indent}$ ENDIF + ${indent}$ !$acc end host_data + ${indent}$ IF (IRET /= CUDASUCCESS) STOP 1 + ${indent}$ ENDIF + #:for e in range (d2, ft.rank) + ${' ' * (ft.rank - e - 1)}$ENDDO + #:endfor +#else + IF (KDIR == NH2D) THEN + HST = DEV + ELSE + DEV = HST + ENDIF +#endif + END SUBROUTINE COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS +#:endfor +#:endfor + #:for d in range (0, ft.rank+1) SUBROUTINE COPY_DIM${d}$_CONTIGUOUS (HST, DEV, KDIR, QUEUE) #ifdef _OPENACC @@ -351,13 +432,15 @@ CONTAINS INTEGER (KIND=JPIM), INTENT (IN) :: KDIR INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE INTEGER (KIND=JPIM) :: ISIZE - INTEGER :: ${', '.join (['J'] + list (map (lambda i: 'J' + str (i+1), range (d, ft.rank))))}$ + #:if d != ft.rank + INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d, ft.rank))))}$ + #:endif #:for e in range (ft.rank, d, -1) ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) #:endfor #:set ar = ', '.join ([':'] * d + list (map (lambda i: 'J' + str (i+1), range (d, ft.rank)))) - #:set indent = ' ' * (ft.rank - e) + #:set indent = ' ' * (ft.rank - d) #:if d == 0 ${indent}$ ISIZE = KIND (HST) #:else diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8a17503..9cf51b8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -53,6 +53,7 @@ list(APPEND TEST_FILES init_wrapper_gpu.F90 init_wrapper_lbounds.F90 init_wrapper_non_contiguous.F90 + init_wrapper_non_contiguous_multi.F90 no_transfer_get_device.F90 no_transfer_get_host.F90 pointer_to_owner_wrapper.F90 diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 new file mode 100644 index 0000000..962ddf6 --- /dev/null +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -0,0 +1,78 @@ +! (C) Copyright 2022- ECMWF. +! (C) Copyright 2022- Meteo-France. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. + +PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI + ! TEST IF WRAPPER WORKS WITH NON CONTIGUOUS ARRAY + USE FIELD_MODULE + USE PARKIND1 + USE FIELD_FACTORY_MODULE + use iso_c_binding + + IMPLICIT NONE + REAL(KIND=JPRB), ALLOCATABLE, TARGET :: D1(:,:,:,:), D2(:,:,:,:) + CLASS(FIELD_2RB), POINTER :: W2 => NULL() + REAL(KIND=JPRB), POINTER :: W2PTR(:,:) + CLASS(FIELD_3RB), POINTER :: W3 => NULL() + REAL(KIND=JPRB), POINTER :: W3PTR(:,:,:) + CLASS(FIELD_4RB), POINTER :: W4 => NULL() + REAL(KIND=JPRB), POINTER :: W4PTR(:,:,:,:) + integer(kind=8) :: ptr + + ALLOCATE(D1(18, 32, 19, 27)) + ALLOCATE(D2(18, 32, 19, 27)) + D1 = 0 + D2 = 0 + + CALL FIELD_NEW(W4, DATA=D1(1:1,:,:,:)) + CALL W4%GET_HOST_DATA_RDWR(W4PTR) + W4PTR=42 + CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W4PTR=92 + !$ACC END KERNELS + CALL W4%GET_HOST_DATA_RDONLY(W4PTR) + D2(1:1,:,:,:)=92 + CALL FIELD_DELETE(W4) + IF (ANY(D1/=D2)) ERROR STOP + + CALL FIELD_NEW(W3, DATA=D1(:,2,:,:)) + CALL W3%GET_HOST_DATA_RDWR(W3PTR) + W3PTR=51 + CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W3PTR=61 + !$ACC END KERNELS + D2(:,2,:,:)=61 + CALL FIELD_DELETE(W3) + IF (ANY(D1/=D2)) ERROR STOP + + CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,:)) + CALL W3%GET_HOST_DATA_RDWR(W3PTR) + W3PTR=91 + D2(:,2,4:8,:)=91 + CALL FIELD_DELETE(W3) + IF (ANY(D1/=D2)) ERROR STOP + + CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,3:5)) + CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W3PTR=91 + !$ACC END KERNELS + D2(:,2,4:8,:)=91 + CALL FIELD_DELETE(W3) + IF (ANY(D1/=D2)) ERROR STOP + + CALL FIELD_NEW(W2, DATA=D1(:,2,4:8,8)) + CALL W2%GET_HOST_DATA_RDWR(W2PTR) + W2PTR=12.1 + D2(:,2,4:8,8)=12.1 + CALL FIELD_DELETE(W2) + IF (ANY(D1/=D2)) ERROR STOP + +END PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI From 8c5f5f96082bd1231659e56300981feec9d76a43 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 14 Nov 2023 12:03:01 -0800 Subject: [PATCH 02/19] dispatch to correct 2d variant --- field_RANKSUFF_module.fypp | 46 ++++++++++++++----- tests/init_wrapper_non_contiguous_multi.F90 | 49 ++++++++++++++------- 2 files changed, 66 insertions(+), 29 deletions(-) diff --git a/field_RANKSUFF_module.fypp b/field_RANKSUFF_module.fypp index 7180423..e328c64 100644 --- a/field_RANKSUFF_module.fypp +++ b/field_RANKSUFF_module.fypp @@ -323,17 +323,28 @@ CONTAINS INTEGER (KIND=JPIM), INTENT(IN) :: KDIR INTEGER (KIND=JPIM), OPTIONAL, INTENT(IN) :: QUEUE REAL :: START, FINISH + INTEGER :: NEXT_LAST_CONTIGUOUS_DIMENSION SELF%LAST_CONTIGUOUS_DIMENSION = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (SELF%PTR) + NEXT_LAST_CONTIGUOUS_DIMENSION = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (SELF%PTR, SELF%LAST_CONTIGUOUS_DIMENSION+1) CALL CPU_TIME(START) SELECT CASE (SELF%LAST_CONTIGUOUS_DIMENSION) -#:for d in range (ft.rank) - CASE (${d}$) - CALL COPY_2D_DIM${d}$_${d+1}$_CONTIGUOUS (SELF%PTR, SELF%DEVPTR, KDIR, QUEUE) +#:for d1 in range (ft.rank) + CASE (${d1}$) + SELECT CASE (NEXT_LAST_CONTIGUOUS_DIMENSION) + #:for d2 in range (d1+1, ft.rank+1) + CASE (${d2}$) + CALL COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (SELF%PTR, SELF%DEVPTR, KDIR, QUEUE) + #:endfor + CASE DEFAULT + ERROR STOP + END SELECT #:endfor CASE (${ft.rank}$) CALL COPY_DIM${ft.rank}$_CONTIGUOUS (SELF%PTR, SELF%DEVPTR, KDIR, QUEUE) + CASE DEFAULT + ERROR STOP END SELECT CALL CPU_TIME(FINISH) IF (KDIR == NH2D) THEN @@ -423,7 +434,7 @@ CONTAINS #:endfor #:endfor -#:for d in range (0, ft.rank+1) +#:for d in range (ft.rank, ft.rank+1) SUBROUTINE COPY_DIM${d}$_CONTIGUOUS (HST, DEV, KDIR, QUEUE) #ifdef _OPENACC USE OPENACC @@ -688,16 +699,20 @@ CONTAINS #:for ft in fieldTypeList #:set ftn = ft.name - INTEGER (KIND=JPIM) FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (PTR) RESULT (JDIM) + INTEGER (KIND=JPIM) FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (PTR, AFTER) RESULT (JDIM) ${ft.type}$, POINTER :: PTR (${ft.shape}$) - INTEGER*8 :: ISTRIDE (${ft.rank}$) - INTEGER (KIND=JPIM) :: J, LB(${ft.rank}$) + INTEGER (KIND=JPIM), OPTIONAL :: AFTER + INTEGER*8 :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE + INTEGER (KIND=JPIM) :: J, LB(${ft.rank}$), IAFTER + + IF (PRESENT(AFTER)) THEN + IAFTER = AFTER + ELSE + IAFTER = 0 + ENDIF LB = LBOUND(PTR) - ISTRIDE (1) = KIND (PTR) - DO J = 2, ${ft.rank}$ - ISTRIDE (J) = ISTRIDE (J-1) * SIZE (PTR, J-1) - ENDDO + IPREVIOUS_STRIDE = KIND (PTR) JDIM = 0 #:for d in range (1, ft.rank+1) @@ -709,9 +724,16 @@ CONTAINS #:endfor #:set ind0 = ind0[:-2] #:set ind1 = ind1[:-2] - IF (LOC (PTR (${ind1}$)) - LOC (PTR (${ind0}$)) /= ISTRIDE (${d}$)) THEN + ITHIS_STRIDE = LOC (PTR (${ind1}$)) - LOC (PTR (${ind0}$)) + #:if d == 1 + ISIZE = 1 + #:else + ISIZE = SIZE(PTR, ${d-1}$) + #:endif + IF (IAFTER < ${d}$ .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN RETURN ENDIF + IPREVIOUS_STRIDE = ITHIS_STRIDE JDIM = ${d}$ diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 index 962ddf6..b2841b9 100644 --- a/tests/init_wrapper_non_contiguous_multi.F90 +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -15,21 +15,23 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI use iso_c_binding IMPLICIT NONE - REAL(KIND=JPRB), ALLOCATABLE, TARGET :: D1(:,:,:,:), D2(:,:,:,:) + REAL(KIND=JPRB), ALLOCATABLE, TARGET :: D1(:,:,:,:,:), D2(:,:,:,:,:) CLASS(FIELD_2RB), POINTER :: W2 => NULL() REAL(KIND=JPRB), POINTER :: W2PTR(:,:) CLASS(FIELD_3RB), POINTER :: W3 => NULL() REAL(KIND=JPRB), POINTER :: W3PTR(:,:,:) CLASS(FIELD_4RB), POINTER :: W4 => NULL() REAL(KIND=JPRB), POINTER :: W4PTR(:,:,:,:) + CLASS(FIELD_5RB), POINTER :: W5 => NULL() + REAL(KIND=JPRB), POINTER :: W5PTR(:,:,:,:,:) integer(kind=8) :: ptr - ALLOCATE(D1(18, 32, 19, 27)) - ALLOCATE(D2(18, 32, 19, 27)) + ALLOCATE(D1(18, 32, 19, 27, 12)) + ALLOCATE(D2(18, 32, 19, 27, 12)) D1 = 0 D2 = 0 - CALL FIELD_NEW(W4, DATA=D1(1:1,:,:,:)) + CALL FIELD_NEW(W4, DATA=D1(1:1,:,:,:,3)) CALL W4%GET_HOST_DATA_RDWR(W4PTR) W4PTR=42 CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) @@ -37,42 +39,55 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI W4PTR=92 !$ACC END KERNELS CALL W4%GET_HOST_DATA_RDONLY(W4PTR) - D2(1:1,:,:,:)=92 + D2(1:1,:,:,:,3)=92 CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP - CALL FIELD_NEW(W3, DATA=D1(:,2,:,:)) + CALL FIELD_NEW(W3, DATA=D1(:,2,:,:,3)) CALL W3%GET_HOST_DATA_RDWR(W3PTR) W3PTR=51 CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) !$ACC KERNELS DEFAULT(PRESENT) W3PTR=61 !$ACC END KERNELS - D2(:,2,:,:)=61 + D2(:,2,:,:,3)=61 CALL FIELD_DELETE(W3) IF (ANY(D1/=D2)) ERROR STOP - CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,:)) - CALL W3%GET_HOST_DATA_RDWR(W3PTR) - W3PTR=91 - D2(:,2,4:8,:)=91 - CALL FIELD_DELETE(W3) + CALL FIELD_NEW(W4, DATA=D1(:,:,4:8,:,3)) + CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W4PTR=31 + !$ACC END KERNELS + D2(:,:,4:8,:,3)=31 + CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP - CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,3:5)) + CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,3:5,3)) CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) !$ACC KERNELS DEFAULT(PRESENT) W3PTR=91 !$ACC END KERNELS - D2(:,2,4:8,:)=91 + D2(:,2,4:8,3:5,3)=91 CALL FIELD_DELETE(W3) IF (ANY(D1/=D2)) ERROR STOP - CALL FIELD_NEW(W2, DATA=D1(:,2,4:8,8)) - CALL W2%GET_HOST_DATA_RDWR(W2PTR) + CALL FIELD_NEW(W2, DATA=D1(:,2,4:8,8,3)) + CALL W2%GET_DEVICE_DATA_RDWR(W2PTR) + !$ACC KERNELS DEFAULT(PRESENT) W2PTR=12.1 - D2(:,2,4:8,8)=12.1 + !$ACC END KERNELS + D2(:,2,4:8,8,3)=12.1 CALL FIELD_DELETE(W2) IF (ANY(D1/=D2)) ERROR STOP + CALL FIELD_NEW(W4, DATA=D1(:,:,4,:,:)) + CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W4PTR=22.1 + !$ACC END KERNELS + D2(:,:,4,:,:)=22.1 + CALL FIELD_DELETE(W4) + IF (ANY(D1/=D2)) ERROR STOP + END PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI From 0e86739696c20c5a1ad6ce4f7034dd907a2024a7 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Wed, 15 Nov 2023 00:13:00 -0800 Subject: [PATCH 03/19] make cpu only working --- cmake/field_api_compile_options.cmake | 7 ++++--- field_RANKSUFF_module.fypp | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cmake/field_api_compile_options.cmake b/cmake/field_api_compile_options.cmake index 22ad906..c834143 100644 --- a/cmake/field_api_compile_options.cmake +++ b/cmake/field_api_compile_options.cmake @@ -10,9 +10,10 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES PGI|NVIDIA|NVHPC) ecbuild_add_fortran_flags("-Mlarge_arrays") ecbuild_add_fortran_flags("-gopt") - ecbuild_add_fortran_flags("-cuda") - - ecbuild_add_fortran_flags("-Minfo=accel,all,ccff" BUILD DEBUG) + if(HAVE_ACC) + ecbuild_add_fortran_flags("-cuda") + ecbuild_add_fortran_flags("-Minfo=accel,all,ccff" BUILD DEBUG) + endif() # These are architecture/compiler/offload-library specific options # that should really be coming from external input diff --git a/field_RANKSUFF_module.fypp b/field_RANKSUFF_module.fypp index e328c64..c56d7c5 100644 --- a/field_RANKSUFF_module.fypp +++ b/field_RANKSUFF_module.fypp @@ -365,6 +365,7 @@ CONTAINS ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) INTEGER (KIND=JPIM), INTENT (IN) :: KDIR INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE +#ifdef _OPENACC INTEGER (KIND=JPIM) :: IHST_PITCH, IDEV_PITCH, IRET INTEGER (KIND=JPIM) :: IWIDTH, IHEIGHT, ISHP(${ft.rank+1}$) #:if d2 < ft.rank @@ -372,7 +373,6 @@ CONTAINS #:endif INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM -#ifdef _OPENACC ISHP(1) = 1 ISHP(2:) = SHAPE(HST) IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) From cab2dacfb8094a49f930caba9de73af511fe210c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 16 Nov 2023 01:46:32 -0800 Subject: [PATCH 04/19] license --- LICENSE | 1 + field_RANKSUFF_module.fypp | 1 + tests/init_wrapper_non_contiguous_multi.F90 | 1 + 3 files changed, 3 insertions(+) diff --git a/LICENSE b/LICENSE index 7a1ad19..6e21471 100644 --- a/LICENSE +++ b/LICENSE @@ -176,6 +176,7 @@ END OF TERMS AND CONDITIONS Copyright 2018- ECMWF + Copyright 2023- NVIDIA Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/field_RANKSUFF_module.fypp b/field_RANKSUFF_module.fypp index c56d7c5..ab9df9a 100644 --- a/field_RANKSUFF_module.fypp +++ b/field_RANKSUFF_module.fypp @@ -1,5 +1,6 @@ #! (C) Copyright 2022- ECMWF. #! (C) Copyright 2022- Meteo-France. +#! (C) Copyright 2023- NVIDIA #! #! This software is licensed under the terms of the Apache Licence Version 2.0 #! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 index b2841b9..f635fe7 100644 --- a/tests/init_wrapper_non_contiguous_multi.F90 +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -1,5 +1,6 @@ ! (C) Copyright 2022- ECMWF. ! (C) Copyright 2022- Meteo-France. +! (C) Copyright 2023- NVIDIA ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. From 02ed2d8addeda0d2ebea218bca1c53404a393c69 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 16 Nov 2023 22:55:53 -0800 Subject: [PATCH 05/19] fix compilation --- cmake/field_api_compile_options.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/field_api_compile_options.cmake b/cmake/field_api_compile_options.cmake index c834143..4685539 100644 --- a/cmake/field_api_compile_options.cmake +++ b/cmake/field_api_compile_options.cmake @@ -11,6 +11,7 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES PGI|NVIDIA|NVHPC) ecbuild_add_fortran_flags("-Mlarge_arrays") ecbuild_add_fortran_flags("-gopt") if(HAVE_ACC) + list(APPEND CMAKE_REQUIRED_LINK_OPTIONS "-cuda") ecbuild_add_fortran_flags("-cuda") ecbuild_add_fortran_flags("-Minfo=accel,all,ccff" BUILD DEBUG) endif() From 19b073e49457165082d7e2a38cd0b92b7129f781 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Fri, 17 Nov 2023 02:55:31 -0800 Subject: [PATCH 06/19] use contiguous copies for size=1 --- field_RANKSUFF_module.fypp | 40 ++++-- tests/init_wrapper_non_contiguous_multi.F90 | 131 +++++++++++++++++++- 2 files changed, 158 insertions(+), 13 deletions(-) diff --git a/field_RANKSUFF_module.fypp b/field_RANKSUFF_module.fypp index ab9df9a..fea9806 100644 --- a/field_RANKSUFF_module.fypp +++ b/field_RANKSUFF_module.fypp @@ -379,6 +379,8 @@ CONTAINS IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) IHEIGHT = PRODUCT(ISHP(${d1+2}$:${d2+1}$)) + PRINT *, "CALLED COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS" + #:set lb = lambda arr, i: f'LBOUND({arr}, {i+1})' #:set lbnds = lambda arr, start, end: [lb(arr, i) for i in range(start, end)] #:set this_slice = lambda arr: ', '.join(lbnds(arr, 0, ft.rank)) @@ -448,6 +450,8 @@ CONTAINS INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d, ft.rank))))}$ #:endif + PRINT *, "CALLED COPY_DIM${d}$_CONTIGUOUS" + #:for e in range (ft.rank, d, -1) ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) #:endfor @@ -706,6 +710,7 @@ CONTAINS INTEGER*8 :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE INTEGER (KIND=JPIM) :: J, LB(${ft.rank}$), IAFTER + ! assume that dimension all dimensions before IAFTER are contiguous... IF (PRESENT(AFTER)) THEN IAFTER = AFTER ELSE @@ -713,9 +718,12 @@ CONTAINS ENDIF LB = LBOUND(PTR) - IPREVIOUS_STRIDE = KIND (PTR) + IF (IAFTER == 0) THEN + IPREVIOUS_STRIDE = KIND (PTR) + ENDIF + + PRINT *, "GET_LAST_CONTIGUOUS_DIMENSION" - JDIM = 0 #:for d in range (1, ft.rank+1) #:set ind0 = "" #:set ind1 = "" @@ -726,19 +734,27 @@ CONTAINS #:set ind0 = ind0[:-2] #:set ind1 = ind1[:-2] ITHIS_STRIDE = LOC (PTR (${ind1}$)) - LOC (PTR (${ind0}$)) - #:if d == 1 - ISIZE = 1 - #:else - ISIZE = SIZE(PTR, ${d-1}$) - #:endif - IF (IAFTER < ${d}$ .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN - RETURN + IF (IAFTER < ${d}$) THEN + #:if d == 1 + ISIZE = 1 + #:else + ISIZE = SIZE(PTR, ${d-1}$) + #:endif + PRINT *, "ITER", ${d}$ + PRINT *, "SIZE(PTR, ${d}$) /= 1", SIZE(PTR, ${d}$), 1 + PRINT *, "IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE", IPREVIOUS_STRIDE * ISIZE, ITHIS_STRIDE + IF (SIZE(PTR, ${d}$) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = ${d-1}$ + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (IAFTER == ${d}$) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE ENDIF - IPREVIOUS_STRIDE = ITHIS_STRIDE - - JDIM = ${d}$ #:endfor + JDIM = ${ft.rank}$ + PRINT *, "" END FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION #:endfor diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 index f635fe7..7a31ed0 100644 --- a/tests/init_wrapper_non_contiguous_multi.F90 +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -32,6 +32,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D1 = 0 D2 = 0 + PRINT *, "begin 1" CALL FIELD_NEW(W4, DATA=D1(1:1,:,:,:,3)) CALL W4%GET_HOST_DATA_RDWR(W4PTR) W4PTR=42 @@ -39,11 +40,16 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI !$ACC KERNELS DEFAULT(PRESENT) W4PTR=92 !$ACC END KERNELS - CALL W4%GET_HOST_DATA_RDONLY(W4PTR) + D1 = -1 + D2 = -1 D2(1:1,:,:,:,3)=92 + CALL W4%GET_HOST_DATA_RDONLY(W4PTR) CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 1" + PRINT *, "" + PRINT *, "begin 2" CALL FIELD_NEW(W3, DATA=D1(:,2,:,:,3)) CALL W3%GET_HOST_DATA_RDWR(W3PTR) W3PTR=51 @@ -51,44 +57,167 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI !$ACC KERNELS DEFAULT(PRESENT) W3PTR=61 !$ACC END KERNELS + D1 = -1 + D2 = -1 D2(:,2,:,:,3)=61 CALL FIELD_DELETE(W3) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 2" + PRINT *, "" + PRINT *, "begin 3" CALL FIELD_NEW(W4, DATA=D1(:,:,4:8,:,3)) CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) !$ACC KERNELS DEFAULT(PRESENT) W4PTR=31 !$ACC END KERNELS + D1 = -1 + D2 = -1 D2(:,:,4:8,:,3)=31 CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 3" + PRINT *, "" + PRINT *, "begin 4" CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,3:5,3)) CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) !$ACC KERNELS DEFAULT(PRESENT) W3PTR=91 !$ACC END KERNELS + D1 = -1 + D2 = -1 D2(:,2,4:8,3:5,3)=91 CALL FIELD_DELETE(W3) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 4" + PRINT *, "" + PRINT *, "begin 5" CALL FIELD_NEW(W2, DATA=D1(:,2,4:8,8,3)) CALL W2%GET_DEVICE_DATA_RDWR(W2PTR) !$ACC KERNELS DEFAULT(PRESENT) W2PTR=12.1 !$ACC END KERNELS + D1 = -1 + D2 = -1 D2(:,2,4:8,8,3)=12.1 CALL FIELD_DELETE(W2) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 5" + PRINT *, "" + PRINT *, "begin 6" CALL FIELD_NEW(W4, DATA=D1(:,:,4,:,:)) CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) !$ACC KERNELS DEFAULT(PRESENT) W4PTR=22.1 !$ACC END KERNELS + D1 = -1 + D2 = -1 D2(:,:,4,:,:)=22.1 CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 6" + PRINT *, "" + + + PRINT *, "begin 7" + CALL FIELD_NEW(W5, DATA=D1(:,1:1,1:1,1:1,1:1)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=1.1 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:1,1:1,1:1,1:1)=1.1 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 7" + PRINT *, "" + + PRINT *, "begin 8" + CALL FIELD_NEW(W5, DATA=D1(:3,1:1,3:3,:,2:4)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=1.2 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:3,1:1,3:3,:,2:4)=1.2 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 8" + PRINT *, "" + + PRINT *, "begin 9" + CALL FIELD_NEW(W5, DATA=D1(:,1:1,3:3,:,2:4)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=2.5 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:1,3:3,:,2:4)=2.5 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 9" + PRINT *, "" + + PRINT *, "begin 10" + CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,1:5,2:4)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=9.1 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:1,:,1:5,2:4)=9.1 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 10" + PRINT *, "" + + PRINT *, "begin 11" + CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,8:12,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=8.1 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:1,:,8:12,:)=8.1 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 11" + PRINT *, "" + + PRINT *, "begin 12" + CALL FIELD_NEW(W5, DATA=D1(3:7,:,:,:,3:3)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=8.4 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(3:7,:,:,:,3:3)=8.4 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 12" + PRINT *, "" + + PRINT *, "begin 13" + CALL FIELD_NEW(W5, DATA=D1(3:3,:,:,:,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=12 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(3:3,:,:,:,:)=12 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 13" + PRINT *, "" END PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI From 09cd076b2740f2888bd2fa6ea81f11d784ff3876 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Fri, 17 Nov 2023 02:56:41 -0800 Subject: [PATCH 07/19] remove prints --- field_RANKSUFF_module.fypp | 10 ------ tests/init_wrapper_non_contiguous_multi.F90 | 39 --------------------- 2 files changed, 49 deletions(-) diff --git a/field_RANKSUFF_module.fypp b/field_RANKSUFF_module.fypp index fea9806..abc5249 100644 --- a/field_RANKSUFF_module.fypp +++ b/field_RANKSUFF_module.fypp @@ -379,8 +379,6 @@ CONTAINS IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) IHEIGHT = PRODUCT(ISHP(${d1+2}$:${d2+1}$)) - PRINT *, "CALLED COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS" - #:set lb = lambda arr, i: f'LBOUND({arr}, {i+1})' #:set lbnds = lambda arr, start, end: [lb(arr, i) for i in range(start, end)] #:set this_slice = lambda arr: ', '.join(lbnds(arr, 0, ft.rank)) @@ -450,8 +448,6 @@ CONTAINS INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d, ft.rank))))}$ #:endif - PRINT *, "CALLED COPY_DIM${d}$_CONTIGUOUS" - #:for e in range (ft.rank, d, -1) ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) #:endfor @@ -722,8 +718,6 @@ CONTAINS IPREVIOUS_STRIDE = KIND (PTR) ENDIF - PRINT *, "GET_LAST_CONTIGUOUS_DIMENSION" - #:for d in range (1, ft.rank+1) #:set ind0 = "" #:set ind1 = "" @@ -740,9 +734,6 @@ CONTAINS #:else ISIZE = SIZE(PTR, ${d-1}$) #:endif - PRINT *, "ITER", ${d}$ - PRINT *, "SIZE(PTR, ${d}$) /= 1", SIZE(PTR, ${d}$), 1 - PRINT *, "IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE", IPREVIOUS_STRIDE * ISIZE, ITHIS_STRIDE IF (SIZE(PTR, ${d}$) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN JDIM = ${d-1}$ RETURN @@ -754,7 +745,6 @@ CONTAINS #:endfor JDIM = ${ft.rank}$ - PRINT *, "" END FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION #:endfor diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 index 7a31ed0..d24b334 100644 --- a/tests/init_wrapper_non_contiguous_multi.F90 +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -32,7 +32,6 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D1 = 0 D2 = 0 - PRINT *, "begin 1" CALL FIELD_NEW(W4, DATA=D1(1:1,:,:,:,3)) CALL W4%GET_HOST_DATA_RDWR(W4PTR) W4PTR=42 @@ -46,10 +45,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI CALL W4%GET_HOST_DATA_RDONLY(W4PTR) CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 1" - PRINT *, "" - PRINT *, "begin 2" CALL FIELD_NEW(W3, DATA=D1(:,2,:,:,3)) CALL W3%GET_HOST_DATA_RDWR(W3PTR) W3PTR=51 @@ -62,10 +58,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,2,:,:,3)=61 CALL FIELD_DELETE(W3) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 2" - PRINT *, "" - PRINT *, "begin 3" CALL FIELD_NEW(W4, DATA=D1(:,:,4:8,:,3)) CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -76,10 +69,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,:,4:8,:,3)=31 CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 3" - PRINT *, "" - PRINT *, "begin 4" CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,3:5,3)) CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -90,10 +80,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,2,4:8,3:5,3)=91 CALL FIELD_DELETE(W3) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 4" - PRINT *, "" - PRINT *, "begin 5" CALL FIELD_NEW(W2, DATA=D1(:,2,4:8,8,3)) CALL W2%GET_DEVICE_DATA_RDWR(W2PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -104,10 +91,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,2,4:8,8,3)=12.1 CALL FIELD_DELETE(W2) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 5" - PRINT *, "" - PRINT *, "begin 6" CALL FIELD_NEW(W4, DATA=D1(:,:,4,:,:)) CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -118,11 +102,8 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,:,4,:,:)=22.1 CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 6" - PRINT *, "" - PRINT *, "begin 7" CALL FIELD_NEW(W5, DATA=D1(:,1:1,1:1,1:1,1:1)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -133,10 +114,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,1:1,1:1,1:1,1:1)=1.1 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 7" - PRINT *, "" - PRINT *, "begin 8" CALL FIELD_NEW(W5, DATA=D1(:3,1:1,3:3,:,2:4)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -147,10 +125,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:3,1:1,3:3,:,2:4)=1.2 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 8" - PRINT *, "" - PRINT *, "begin 9" CALL FIELD_NEW(W5, DATA=D1(:,1:1,3:3,:,2:4)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -161,10 +136,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,1:1,3:3,:,2:4)=2.5 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 9" - PRINT *, "" - PRINT *, "begin 10" CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,1:5,2:4)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -175,10 +147,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,1:1,:,1:5,2:4)=9.1 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 10" - PRINT *, "" - PRINT *, "begin 11" CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,8:12,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -189,10 +158,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,1:1,:,8:12,:)=8.1 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 11" - PRINT *, "" - PRINT *, "begin 12" CALL FIELD_NEW(W5, DATA=D1(3:7,:,:,:,3:3)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -203,10 +169,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(3:7,:,:,:,3:3)=8.4 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 12" - PRINT *, "" - PRINT *, "begin 13" CALL FIELD_NEW(W5, DATA=D1(3:3,:,:,:,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -217,7 +180,5 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(3:3,:,:,:,:)=12 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP - PRINT *, "end 13" - PRINT *, "" END PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI From e327d07bdb4a5946b1afc61862fe8045eb7d9cc0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Fri, 17 Nov 2023 09:37:51 -0800 Subject: [PATCH 08/19] stupid mistake in cpu code --- field_RANKSUFF_module.fypp | 14 +++- tests/init_wrapper_non_contiguous_multi.F90 | 81 +++++++++++++++++++++ 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/field_RANKSUFF_module.fypp b/field_RANKSUFF_module.fypp index abc5249..ee7e7d7 100644 --- a/field_RANKSUFF_module.fypp +++ b/field_RANKSUFF_module.fypp @@ -379,6 +379,8 @@ CONTAINS IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) IHEIGHT = PRODUCT(ISHP(${d1+2}$:${d2+1}$)) + PRINT *, "CALLED COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS" + #:set lb = lambda arr, i: f'LBOUND({arr}, {i+1})' #:set lbnds = lambda arr, start, end: [lb(arr, i) for i in range(start, end)] #:set this_slice = lambda arr: ', '.join(lbnds(arr, 0, ft.rank)) @@ -426,9 +428,9 @@ CONTAINS #:endfor #else IF (KDIR == NH2D) THEN - HST = DEV - ELSE DEV = HST + ELSE + HST = DEV ENDIF #endif END SUBROUTINE COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS @@ -448,6 +450,8 @@ CONTAINS INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d, ft.rank))))}$ #:endif + PRINT *, "CALLED COPY_DIM${d}$_CONTIGUOUS" + #:for e in range (ft.rank, d, -1) ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) #:endfor @@ -718,6 +722,8 @@ CONTAINS IPREVIOUS_STRIDE = KIND (PTR) ENDIF + PRINT *, "GET_LAST_CONTIGUOUS_DIMENSION" + #:for d in range (1, ft.rank+1) #:set ind0 = "" #:set ind1 = "" @@ -734,6 +740,9 @@ CONTAINS #:else ISIZE = SIZE(PTR, ${d-1}$) #:endif + PRINT *, "ITER", ${d}$ + PRINT *, "SIZE(PTR, ${d}$) /= 1", SIZE(PTR, ${d}$), 1 + PRINT *, "IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE", IPREVIOUS_STRIDE * ISIZE, ITHIS_STRIDE IF (SIZE(PTR, ${d}$) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN JDIM = ${d-1}$ RETURN @@ -745,6 +754,7 @@ CONTAINS #:endfor JDIM = ${ft.rank}$ + PRINT *, "" END FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION #:endfor diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 index d24b334..64e9864 100644 --- a/tests/init_wrapper_non_contiguous_multi.F90 +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -32,6 +32,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D1 = 0 D2 = 0 + PRINT *, "begin 1" CALL FIELD_NEW(W4, DATA=D1(1:1,:,:,:,3)) CALL W4%GET_HOST_DATA_RDWR(W4PTR) W4PTR=42 @@ -45,7 +46,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI CALL W4%GET_HOST_DATA_RDONLY(W4PTR) CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 1" + PRINT *, "" + PRINT *, "begin 2" CALL FIELD_NEW(W3, DATA=D1(:,2,:,:,3)) CALL W3%GET_HOST_DATA_RDWR(W3PTR) W3PTR=51 @@ -58,7 +62,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,2,:,:,3)=61 CALL FIELD_DELETE(W3) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 2" + PRINT *, "" + PRINT *, "begin 3" CALL FIELD_NEW(W4, DATA=D1(:,:,4:8,:,3)) CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -69,7 +76,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,:,4:8,:,3)=31 CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 3" + PRINT *, "" + PRINT *, "begin 4" CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,3:5,3)) CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -80,7 +90,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,2,4:8,3:5,3)=91 CALL FIELD_DELETE(W3) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 4" + PRINT *, "" + PRINT *, "begin 5" CALL FIELD_NEW(W2, DATA=D1(:,2,4:8,8,3)) CALL W2%GET_DEVICE_DATA_RDWR(W2PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -91,7 +104,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,2,4:8,8,3)=12.1 CALL FIELD_DELETE(W2) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 5" + PRINT *, "" + PRINT *, "begin 6" CALL FIELD_NEW(W4, DATA=D1(:,:,4,:,:)) CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -102,8 +118,11 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,:,4,:,:)=22.1 CALL FIELD_DELETE(W4) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 6" + PRINT *, "" + PRINT *, "begin 7" CALL FIELD_NEW(W5, DATA=D1(:,1:1,1:1,1:1,1:1)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -114,7 +133,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,1:1,1:1,1:1,1:1)=1.1 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 7" + PRINT *, "" + PRINT *, "begin 8" CALL FIELD_NEW(W5, DATA=D1(:3,1:1,3:3,:,2:4)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -125,7 +147,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:3,1:1,3:3,:,2:4)=1.2 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 8" + PRINT *, "" + PRINT *, "begin 9" CALL FIELD_NEW(W5, DATA=D1(:,1:1,3:3,:,2:4)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -136,7 +161,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,1:1,3:3,:,2:4)=2.5 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 9" + PRINT *, "" + PRINT *, "begin 10" CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,1:5,2:4)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -147,7 +175,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,1:1,:,1:5,2:4)=9.1 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 10" + PRINT *, "" + PRINT *, "begin 11" CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,8:12,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -158,7 +189,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(:,1:1,:,8:12,:)=8.1 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 11" + PRINT *, "" + PRINT *, "begin 12" CALL FIELD_NEW(W5, DATA=D1(3:7,:,:,:,3:3)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -169,7 +203,10 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(3:7,:,:,:,3:3)=8.4 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 12" + PRINT *, "" + PRINT *, "begin 13" CALL FIELD_NEW(W5, DATA=D1(3:3,:,:,:,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -180,5 +217,49 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D2(3:3,:,:,:,:)=12 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 13" + PRINT *, "" + + PRINT *, "begin 14" + CALL FIELD_NEW(W5, DATA=D1(1:9,1:9:2,:,3:12:3,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=18 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(1:9,1:9:2,:,3:12:3,:)=18 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 14" + PRINT *, "" + + PRINT *, "begin 15" + CALL FIELD_NEW(W5, DATA=D1(:,1:9:2,:,3:12:3,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=19 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:9:2,:,3:12:3,:)=19 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 15" + PRINT *, "" + + PRINT *, "begin 16" + CALL FIELD_NEW(W5, DATA=D1(:,:,:,3:12:3,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=19 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,:,:,3:12:3,:)=19 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 16" + PRINT *, "" END PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI From 0c79835c97cd37b3048ec89fae83fef942e37deb Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Mon, 20 Nov 2023 06:18:20 -0800 Subject: [PATCH 09/19] remove prints --- field_RANKSUFF_module.fypp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/field_RANKSUFF_module.fypp b/field_RANKSUFF_module.fypp index ee7e7d7..46c41b0 100644 --- a/field_RANKSUFF_module.fypp +++ b/field_RANKSUFF_module.fypp @@ -379,8 +379,6 @@ CONTAINS IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) IHEIGHT = PRODUCT(ISHP(${d1+2}$:${d2+1}$)) - PRINT *, "CALLED COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS" - #:set lb = lambda arr, i: f'LBOUND({arr}, {i+1})' #:set lbnds = lambda arr, start, end: [lb(arr, i) for i in range(start, end)] #:set this_slice = lambda arr: ', '.join(lbnds(arr, 0, ft.rank)) @@ -450,8 +448,6 @@ CONTAINS INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d, ft.rank))))}$ #:endif - PRINT *, "CALLED COPY_DIM${d}$_CONTIGUOUS" - #:for e in range (ft.rank, d, -1) ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) #:endfor @@ -722,8 +718,6 @@ CONTAINS IPREVIOUS_STRIDE = KIND (PTR) ENDIF - PRINT *, "GET_LAST_CONTIGUOUS_DIMENSION" - #:for d in range (1, ft.rank+1) #:set ind0 = "" #:set ind1 = "" @@ -740,9 +734,6 @@ CONTAINS #:else ISIZE = SIZE(PTR, ${d-1}$) #:endif - PRINT *, "ITER", ${d}$ - PRINT *, "SIZE(PTR, ${d}$) /= 1", SIZE(PTR, ${d}$), 1 - PRINT *, "IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE", IPREVIOUS_STRIDE * ISIZE, ITHIS_STRIDE IF (SIZE(PTR, ${d}$) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN JDIM = ${d-1}$ RETURN @@ -754,7 +745,6 @@ CONTAINS #:endfor JDIM = ${ft.rank}$ - PRINT *, "" END FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION #:endfor From 68726f6f55e72056509114f31f1db2c4c273a6e3 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Mon, 20 Nov 2023 07:13:24 -0800 Subject: [PATCH 10/19] make tests smaller --- tests/init_wrapper_non_contiguous_multi.F90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 index 64e9864..063703d 100644 --- a/tests/init_wrapper_non_contiguous_multi.F90 +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -27,8 +27,8 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI REAL(KIND=JPRB), POINTER :: W5PTR(:,:,:,:,:) integer(kind=8) :: ptr - ALLOCATE(D1(18, 32, 19, 27, 12)) - ALLOCATE(D2(18, 32, 19, 27, 12)) + ALLOCATE(D1(7, 9, 11, 13, 15)) + ALLOCATE(D2(7, 9, 11, 13, 15)) D1 = 0 D2 = 0 @@ -221,14 +221,14 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "" PRINT *, "begin 14" - CALL FIELD_NEW(W5, DATA=D1(1:9,1:9:2,:,3:12:3,:)) + CALL FIELD_NEW(W5, DATA=D1(1:4,1:9:2,:,3:12:3,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) W5PTR=18 !$ACC END KERNELS D1 = -1 D2 = -1 - D2(1:9,1:9:2,:,3:12:3,:)=18 + D2(1:4,1:9:2,:,3:12:3,:)=18 CALL FIELD_DELETE(W5) IF (ANY(D1/=D2)) ERROR STOP PRINT *, "end 14" From 60746696e639ede172a0f561a4495d33882be1b9 Mon Sep 17 00:00:00 2001 From: Philippe Marguinaud Date: Wed, 17 Jan 2024 10:46:46 +0000 Subject: [PATCH 11/19] Compile & run tests with Intel & nvhpc --- LICENSE | 1 + cmake/field_api_compile_options.cmake | 7 +- field_RANKSUFF_data_module.fypp | 173 ++++++++++++- field_RANKSUFF_module.fypp | 12 +- field_basic_module.F90 | 1 - tests/CMakeLists.txt | 1 + tests/init_wrapper_non_contiguous_multi.F90 | 265 ++++++++++++++++++++ 7 files changed, 436 insertions(+), 24 deletions(-) create mode 100644 tests/init_wrapper_non_contiguous_multi.F90 diff --git a/LICENSE b/LICENSE index 7a1ad19..6e21471 100644 --- a/LICENSE +++ b/LICENSE @@ -176,6 +176,7 @@ END OF TERMS AND CONDITIONS Copyright 2018- ECMWF + Copyright 2023- NVIDIA Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/cmake/field_api_compile_options.cmake b/cmake/field_api_compile_options.cmake index 06c8d5a..4685539 100644 --- a/cmake/field_api_compile_options.cmake +++ b/cmake/field_api_compile_options.cmake @@ -10,8 +10,11 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES PGI|NVIDIA|NVHPC) ecbuild_add_fortran_flags("-Mlarge_arrays") ecbuild_add_fortran_flags("-gopt") - - ecbuild_add_fortran_flags("-Minfo=accel,all,ccff" BUILD DEBUG) + if(HAVE_ACC) + list(APPEND CMAKE_REQUIRED_LINK_OPTIONS "-cuda") + ecbuild_add_fortran_flags("-cuda") + ecbuild_add_fortran_flags("-Minfo=accel,all,ccff" BUILD DEBUG) + endif() # These are architecture/compiler/offload-library specific options # that should really be coming from external input diff --git a/field_RANKSUFF_data_module.fypp b/field_RANKSUFF_data_module.fypp index cdb16b2..06bdba7 100644 --- a/field_RANKSUFF_data_module.fypp +++ b/field_RANKSUFF_data_module.fypp @@ -16,11 +16,69 @@ ${fieldType.useParkind1 ()}$ IMPLICIT NONE +PRIVATE + +#:for ft in fieldTypeList +#:set ftn = ft.name + +PUBLIC :: ${ftn}$_COPY + +#:endfor + CONTAINS #:for ft in fieldTypeList #:set ftn = ft.name + SUBROUTINE ${ftn}$_COPY (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) + + USE FIELD_ABORT_MODULE + + ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) + LOGICAL, INTENT (IN) :: MAP_DEVPTR + INTEGER (KIND=JPIM), INTENT (IN) :: KDIR + INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM + INTEGER :: NEXT_CONTIG_DIM + + LAST_CONTIG_DIM = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (HST, 0) + NEXT_CONTIG_DIM = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (HST, LAST_CONTIG_DIM+1) + + IF (MAP_DEVPTR) THEN + + SELECT CASE (LAST_CONTIG_DIM) +#:for d1 in range (ft.rank) + CASE (${d1}$) + SELECT CASE (NEXT_CONTIG_DIM) + #:for d2 in range (d1+1, ft.rank+1) + CASE (${d2}$) + CALL ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, KDIR, QUEUE) + #:endfor + CASE DEFAULT + CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED NEXT_CONTIG_DIM') + END SELECT +#:endfor + CASE (${ft.rank}$) + CALL ${ftn}$_COPY_DIM${ft.rank}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) + CASE DEFAULT + CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED LAST_CONTIG_DIM') + END SELECT + + ELSE + + SELECT CASE (LAST_CONTIG_DIM) +#:for d in range (ft.rank + 1) + CASE (${d}$) + CALL ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) +#:endfor + CASE DEFAULT + CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED LAST_CONTIG_DIM') + END SELECT + + ENDIF + + END SUBROUTINE + #:for d in range (0, ft.rank+1) SUBROUTINE ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) #ifdef _OPENACC @@ -85,22 +143,103 @@ CONTAINS #:endfor +#:for d1 in range (0, ft.rank) +#:for d2 in range (d1+1, ft.rank+1) + SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, KDIR, QUEUE) +#ifdef _OPENACC + USE OPENACC + USE CUDAFOR +#endif + ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) + INTEGER (KIND=JPIM), INTENT (IN) :: KDIR + INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE +#ifdef _OPENACC + INTEGER (KIND=JPIM) :: IHST_PITCH, IDEV_PITCH, IRET + INTEGER (KIND=JPIM) :: IWIDTH, IHEIGHT, ISHP(${ft.rank+1}$) + #:if d2 < ft.rank + INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d2, ft.rank))))}$ + #:endif + INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + + ISHP(1) = 1 + ISHP(2:) = SHAPE(HST) + IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) + IHEIGHT = PRODUCT(ISHP(${d1+2}$:${d2+1}$)) + +#:set lb = lambda arr, i: f'LBOUND({arr}, {i+1})' +#:set lbnds = lambda arr, start, end: [lb(arr, i) for i in range(start, end)] +#:set this_slice = lambda arr: ', '.join(lbnds(arr, 0, ft.rank)) +#:set next_slice = lambda arr: ', '.join(lbnds(arr, 0, d1) + [lb(arr, d1)+'+1'] + lbnds(arr, d1+1, ft.rank)) + IHST_PITCH = (LOC (HST(${next_slice('HST')}$)) - LOC (HST (${this_slice('HST')}$)))/KIND(HST) + IDEV_PITCH = (LOC (DEV(${next_slice('DEV')}$)) - LOC (DEV (${this_slice('DEV')}$)))/KIND(DEV) + + #:for e in range (ft.rank, d2, -1) + ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) + #:endfor + #:set indent = ' ' * (ft.rank - d2 - 1) + #:set ar = lambda arr: ', '.join(lbnds(arr, 0, d2) + [f'J{i+1}' for i in range(d2, ft.rank)]) + ${indent}$ IF (KDIR == NH2D) THEN + ${indent}$ !$acc host_data use_device (DEV) + ${indent}$ IF(PRESENT(QUEUE)) THEN + ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) + ${indent}$ IRET = CUDAMEMCPY2DASYNC (DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ & HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ & IWIDTH, IHEIGHT, CUDAMEMCPYHOSTTODEVICE, & + ${indent}$ & STREAM) + ${indent}$ ELSE + ${indent}$ IRET = CUDAMEMCPY2D (DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ & HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ & IWIDTH, IHEIGHT) + ${indent}$ ENDIF + ${indent}$ !$acc end host_data + ${indent}$ ELSEIF (KDIR == ND2H) THEN + ${indent}$ !$acc host_data use_device (DEV) + ${indent}$ IF(PRESENT(QUEUE)) THEN + ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) + ${indent}$ IRET = CUDAMEMCPY2DASYNC (HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ & DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ & IWIDTH, IHEIGHT, CUDAMEMCPYDEVICETOHOST, & + ${indent}$ & STREAM) + ${indent}$ ELSE + ${indent}$ IRET = CUDAMEMCPY2D (HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ & DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ & IWIDTH, IHEIGHT) + ${indent}$ ENDIF + ${indent}$ !$acc end host_data + ${indent}$ IF (IRET /= CUDASUCCESS) STOP 1 + ${indent}$ ENDIF + #:for e in range (d2, ft.rank) + ${' ' * (ft.rank - e - 1)}$ENDDO + #:endfor +#else + IF (KDIR == NH2D) THEN + DEV = HST + ELSE + HST = DEV + ENDIF +#endif + END SUBROUTINE +#:endfor +#:endfor + #:endfor #:for ft in fieldTypeList #:set ftn = ft.name - INTEGER (KIND=JPIM) FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (PTR) RESULT (JDIM) + + INTEGER (KIND=JPIM) FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (PTR, AFTER) RESULT (JDIM) ${ft.type}$, POINTER :: PTR (${ft.shape}$) - INTEGER*8 :: ISTRIDE (${ft.rank}$) + INTEGER (KIND=JPIM) :: AFTER + INTEGER*8 :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE INTEGER (KIND=JPIM) :: J, LB(${ft.rank}$) + ! assume that dimension all dimensions before AFTER are contiguous... + LB = LBOUND(PTR) - ISTRIDE (1) = KIND (PTR) - DO J = 2, ${ft.rank}$ - ISTRIDE (J) = ISTRIDE (J-1) * SIZE (PTR, J-1) - ENDDO + IF (AFTER == 0) THEN + IPREVIOUS_STRIDE = KIND (PTR) + ENDIF - JDIM = 0 #:for d in range (1, ft.rank+1) #:set ind0 = "" #:set ind1 = "" @@ -110,14 +249,26 @@ CONTAINS #:endfor #:set ind0 = ind0[:-2] #:set ind1 = ind1[:-2] - IF (LOC (PTR (${ind1}$)) - LOC (PTR (${ind0}$)) /= ISTRIDE (${d}$)) THEN - RETURN + ITHIS_STRIDE = LOC (PTR (${ind1}$)) - LOC (PTR (${ind0}$)) + IF (AFTER < ${d}$) THEN + #:if d == 1 + ISIZE = 1 + #:else + ISIZE = SIZE(PTR, ${d-1}$) + #:endif + IF (SIZE(PTR, ${d}$) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = ${d-1}$ + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == ${d}$) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE ENDIF - JDIM = ${d}$ - #:endfor + JDIM = ${ft.rank}$ END FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION + #:endfor END MODULE FIELD_${RANK}$${SUFF}$_DATA_MODULE diff --git a/field_RANKSUFF_module.fypp b/field_RANKSUFF_module.fypp index b681069..767137a 100644 --- a/field_RANKSUFF_module.fypp +++ b/field_RANKSUFF_module.fypp @@ -1,5 +1,6 @@ #! (C) Copyright 2022- ECMWF. #! (C) Copyright 2022- Meteo-France. +#! (C) Copyright 2023- NVIDIA #! #! This software is licensed under the terms of the Apache Licence Version 2.0 #! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -217,8 +218,6 @@ CONTAINS ! By default we allocate thread-local temporaries SELF%THREAD_BUFFER = .TRUE. - SELF%LAST_CONTIGUOUS_DIMENSION = ${ft.rank}$ - IF (PRESENT(PERSISTENT)) THEN IF (PERSISTENT) THEN SELF%THREAD_BUFFER = .FALSE. @@ -372,15 +371,8 @@ CONTAINS INTEGER (KIND=JPIM), OPTIONAL, INTENT(IN) :: QUEUE REAL :: START, FINISH - SELF%LAST_CONTIGUOUS_DIMENSION = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (SELF%PTR) - CALL CPU_TIME(START) - SELECT CASE (SELF%LAST_CONTIGUOUS_DIMENSION) -#:for d in range (ft.rank + 1) - CASE (${d}$) - CALL ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (SELF%PTR, SELF%DEVPTR, SELF%MAP_DEVPTR, KDIR, QUEUE) -#:endfor - END SELECT + CALL ${ftn}$_COPY (SELF%PTR, SELF%DEVPTR, SELF%MAP_DEVPTR, KDIR, QUEUE) CALL CPU_TIME(FINISH) IF (KDIR == NH2D) THEN CALL SELF%STATS%INC_CPU_TO_GPU_TRANSFER(START, FINISH) diff --git a/field_basic_module.F90 b/field_basic_module.F90 index 4589542..97de3cf 100644 --- a/field_basic_module.F90 +++ b/field_basic_module.F90 @@ -31,7 +31,6 @@ MODULE FIELD_BASIC_MODULE LOGICAL :: THREAD_BUFFER = .FALSE. INTEGER(KIND=JPIM) :: ISTATUS = 0 - INTEGER(KIND=JPIM) :: LAST_CONTIGUOUS_DIMENSION = 0 TYPE(GPU_STATS) :: STATS diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 94302eb..dff9dec 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -58,6 +58,7 @@ list(APPEND TEST_FILES init_wrapper_gpu.F90 init_wrapper_lbounds.F90 init_wrapper_non_contiguous.F90 + init_wrapper_non_contiguous_multi.F90 no_transfer_get_device.F90 no_transfer_get_host.F90 pointer_to_owner_wrapper.F90 diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 new file mode 100644 index 0000000..063703d --- /dev/null +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -0,0 +1,265 @@ +! (C) Copyright 2022- ECMWF. +! (C) Copyright 2022- Meteo-France. +! (C) Copyright 2023- NVIDIA +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. + +PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI + ! TEST IF WRAPPER WORKS WITH NON CONTIGUOUS ARRAY + USE FIELD_MODULE + USE PARKIND1 + USE FIELD_FACTORY_MODULE + use iso_c_binding + + IMPLICIT NONE + REAL(KIND=JPRB), ALLOCATABLE, TARGET :: D1(:,:,:,:,:), D2(:,:,:,:,:) + CLASS(FIELD_2RB), POINTER :: W2 => NULL() + REAL(KIND=JPRB), POINTER :: W2PTR(:,:) + CLASS(FIELD_3RB), POINTER :: W3 => NULL() + REAL(KIND=JPRB), POINTER :: W3PTR(:,:,:) + CLASS(FIELD_4RB), POINTER :: W4 => NULL() + REAL(KIND=JPRB), POINTER :: W4PTR(:,:,:,:) + CLASS(FIELD_5RB), POINTER :: W5 => NULL() + REAL(KIND=JPRB), POINTER :: W5PTR(:,:,:,:,:) + integer(kind=8) :: ptr + + ALLOCATE(D1(7, 9, 11, 13, 15)) + ALLOCATE(D2(7, 9, 11, 13, 15)) + D1 = 0 + D2 = 0 + + PRINT *, "begin 1" + CALL FIELD_NEW(W4, DATA=D1(1:1,:,:,:,3)) + CALL W4%GET_HOST_DATA_RDWR(W4PTR) + W4PTR=42 + CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W4PTR=92 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(1:1,:,:,:,3)=92 + CALL W4%GET_HOST_DATA_RDONLY(W4PTR) + CALL FIELD_DELETE(W4) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 1" + PRINT *, "" + + PRINT *, "begin 2" + CALL FIELD_NEW(W3, DATA=D1(:,2,:,:,3)) + CALL W3%GET_HOST_DATA_RDWR(W3PTR) + W3PTR=51 + CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W3PTR=61 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,2,:,:,3)=61 + CALL FIELD_DELETE(W3) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 2" + PRINT *, "" + + PRINT *, "begin 3" + CALL FIELD_NEW(W4, DATA=D1(:,:,4:8,:,3)) + CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W4PTR=31 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,:,4:8,:,3)=31 + CALL FIELD_DELETE(W4) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 3" + PRINT *, "" + + PRINT *, "begin 4" + CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,3:5,3)) + CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W3PTR=91 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,2,4:8,3:5,3)=91 + CALL FIELD_DELETE(W3) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 4" + PRINT *, "" + + PRINT *, "begin 5" + CALL FIELD_NEW(W2, DATA=D1(:,2,4:8,8,3)) + CALL W2%GET_DEVICE_DATA_RDWR(W2PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W2PTR=12.1 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,2,4:8,8,3)=12.1 + CALL FIELD_DELETE(W2) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 5" + PRINT *, "" + + PRINT *, "begin 6" + CALL FIELD_NEW(W4, DATA=D1(:,:,4,:,:)) + CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W4PTR=22.1 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,:,4,:,:)=22.1 + CALL FIELD_DELETE(W4) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 6" + PRINT *, "" + + + PRINT *, "begin 7" + CALL FIELD_NEW(W5, DATA=D1(:,1:1,1:1,1:1,1:1)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=1.1 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:1,1:1,1:1,1:1)=1.1 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 7" + PRINT *, "" + + PRINT *, "begin 8" + CALL FIELD_NEW(W5, DATA=D1(:3,1:1,3:3,:,2:4)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=1.2 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:3,1:1,3:3,:,2:4)=1.2 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 8" + PRINT *, "" + + PRINT *, "begin 9" + CALL FIELD_NEW(W5, DATA=D1(:,1:1,3:3,:,2:4)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=2.5 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:1,3:3,:,2:4)=2.5 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 9" + PRINT *, "" + + PRINT *, "begin 10" + CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,1:5,2:4)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=9.1 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:1,:,1:5,2:4)=9.1 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 10" + PRINT *, "" + + PRINT *, "begin 11" + CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,8:12,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=8.1 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:1,:,8:12,:)=8.1 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 11" + PRINT *, "" + + PRINT *, "begin 12" + CALL FIELD_NEW(W5, DATA=D1(3:7,:,:,:,3:3)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=8.4 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(3:7,:,:,:,3:3)=8.4 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 12" + PRINT *, "" + + PRINT *, "begin 13" + CALL FIELD_NEW(W5, DATA=D1(3:3,:,:,:,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=12 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(3:3,:,:,:,:)=12 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 13" + PRINT *, "" + + PRINT *, "begin 14" + CALL FIELD_NEW(W5, DATA=D1(1:4,1:9:2,:,3:12:3,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=18 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(1:4,1:9:2,:,3:12:3,:)=18 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 14" + PRINT *, "" + + PRINT *, "begin 15" + CALL FIELD_NEW(W5, DATA=D1(:,1:9:2,:,3:12:3,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=19 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,1:9:2,:,3:12:3,:)=19 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 15" + PRINT *, "" + + PRINT *, "begin 16" + CALL FIELD_NEW(W5, DATA=D1(:,:,:,3:12:3,:)) + CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) + !$ACC KERNELS DEFAULT(PRESENT) + W5PTR=19 + !$ACC END KERNELS + D1 = -1 + D2 = -1 + D2(:,:,:,3:12:3,:)=19 + CALL FIELD_DELETE(W5) + IF (ANY(D1/=D2)) ERROR STOP + PRINT *, "end 16" + PRINT *, "" + +END PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI From 229a3a1f4ae51721a5717f4fc3e89650d10e4be3 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 18 Jan 2024 22:46:09 -0800 Subject: [PATCH 12/19] some minor fixes after merge --- field_RANKSUFF_data_module.fypp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/field_RANKSUFF_data_module.fypp b/field_RANKSUFF_data_module.fypp index 06bdba7..5ceb2f4 100644 --- a/field_RANKSUFF_data_module.fypp +++ b/field_RANKSUFF_data_module.fypp @@ -1,5 +1,6 @@ #! (C) Copyright 2022- ECMWF. #! (C) Copyright 2022- Meteo-France. +#! (C) Copyright 2023- NVIDIA #! #! This software is licensed under the terms of the Apache Licence Version 2.0 #! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -218,7 +219,7 @@ CONTAINS HST = DEV ENDIF #endif - END SUBROUTINE + END SUBROUTINE #:endfor #:endfor From 7a23cfab5572ca8c47fcd8e448a2b35988c09868 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 18 Jan 2024 23:28:02 -0800 Subject: [PATCH 13/19] remove cudamemcpydevicetohost argument and add better comments --- field_RANKSUFF_data_module.fypp | 8 ++--- tests/init_wrapper_non_contiguous_multi.F90 | 33 +++++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/field_RANKSUFF_data_module.fypp b/field_RANKSUFF_data_module.fypp index 5ceb2f4..b50881d 100644 --- a/field_RANKSUFF_data_module.fypp +++ b/field_RANKSUFF_data_module.fypp @@ -185,8 +185,8 @@ CONTAINS ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) ${indent}$ IRET = CUDAMEMCPY2DASYNC (DEV (${ar('DEV')}$), IDEV_PITCH, & ${indent}$ & HST (${ar('HST')}$), IHST_PITCH, & - ${indent}$ & IWIDTH, IHEIGHT, CUDAMEMCPYHOSTTODEVICE, & - ${indent}$ & STREAM) + ${indent}$ & IWIDTH, IHEIGHT, & + ${indent}$ & STREAM=STREAM) ${indent}$ ELSE ${indent}$ IRET = CUDAMEMCPY2D (DEV (${ar('DEV')}$), IDEV_PITCH, & ${indent}$ & HST (${ar('HST')}$), IHST_PITCH, & @@ -199,8 +199,8 @@ CONTAINS ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) ${indent}$ IRET = CUDAMEMCPY2DASYNC (HST (${ar('HST')}$), IHST_PITCH, & ${indent}$ & DEV (${ar('DEV')}$), IDEV_PITCH, & - ${indent}$ & IWIDTH, IHEIGHT, CUDAMEMCPYDEVICETOHOST, & - ${indent}$ & STREAM) + ${indent}$ & IWIDTH, IHEIGHT, & + ${indent}$ & STREAM=STREAM) ${indent}$ ELSE ${indent}$ IRET = CUDAMEMCPY2D (HST (${ar('HST')}$), IHST_PITCH, & ${indent}$ & DEV (${ar('DEV')}$), IDEV_PITCH, & diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 index 063703d..e2b8238 100644 --- a/tests/init_wrapper_non_contiguous_multi.F90 +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -32,7 +32,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI D1 = 0 D2 = 0 - PRINT *, "begin 1" + PRINT *, "begin 1 (should call FIELD_4RB_COPY_2D_DIM1_4_CONTIGUOUS)" CALL FIELD_NEW(W4, DATA=D1(1:1,:,:,:,3)) CALL W4%GET_HOST_DATA_RDWR(W4PTR) W4PTR=42 @@ -49,7 +49,8 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 1" PRINT *, "" - PRINT *, "begin 2" + PRINT *, "begin 2 (should call FIELD_4RB_COPY_2D_DIM1_3_CONTIGUOUS)" + ! Should call DIM1_3 CALL FIELD_NEW(W3, DATA=D1(:,2,:,:,3)) CALL W3%GET_HOST_DATA_RDWR(W3PTR) W3PTR=51 @@ -65,7 +66,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 2" PRINT *, "" - PRINT *, "begin 3" + PRINT *, "begin 3 (should call FIELD_4RB_COPY_2D_DIM3_4_CONTIGUOUS)" CALL FIELD_NEW(W4, DATA=D1(:,:,4:8,:,3)) CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -79,7 +80,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 3" PRINT *, "" - PRINT *, "begin 4" + PRINT *, "begin 4 (should call FIELD_3RB_COPY_2D_DIM1_2_CONTIGUOUS)" CALL FIELD_NEW(W3, DATA=D1(:,2,4:8,3:5,3)) CALL W3%GET_DEVICE_DATA_RDWR(W3PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -93,7 +94,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 4" PRINT *, "" - PRINT *, "begin 5" + PRINT *, "begin 5 (should call FIELD_2RB_COPY_2D_DIM1_2_CONTIGUOUS)" CALL FIELD_NEW(W2, DATA=D1(:,2,4:8,8,3)) CALL W2%GET_DEVICE_DATA_RDWR(W2PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -107,7 +108,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 5" PRINT *, "" - PRINT *, "begin 6" + PRINT *, "begin 6 (should call FIELD_4RB_COPY_2D_DIM2_4_CONTIGUOUS)" CALL FIELD_NEW(W4, DATA=D1(:,:,4,:,:)) CALL W4%GET_DEVICE_DATA_RDWR(W4PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -122,7 +123,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "" - PRINT *, "begin 7" + PRINT *, "begin 7 (should call FIELD_5RB_COPY_DIM5_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(:,1:1,1:1,1:1,1:1)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -136,7 +137,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 7" PRINT *, "" - PRINT *, "begin 8" + PRINT *, "begin 8 (should call FIELD_5RB_COPY_2D_DIM3_5_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(:3,1:1,3:3,:,2:4)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -150,7 +151,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 8" PRINT *, "" - PRINT *, "begin 9" + PRINT *, "begin 9 (should call FIELD_5RB_COPY_2D_DIM3_5_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(:,1:1,3:3,:,2:4)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -164,7 +165,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 9" PRINT *, "" - PRINT *, "begin 10" + PRINT *, "begin 10 (should call FIELD_5RB_COPY_2D_DIM2_4_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,1:5,2:4)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -178,7 +179,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 10" PRINT *, "" - PRINT *, "begin 11" + PRINT *, "begin 11 (should call FIELD_5RB_COPY_2D_DIM2_4_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(:,1:1,:,8:12,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -192,7 +193,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 11" PRINT *, "" - PRINT *, "begin 12" + PRINT *, "begin 12 (should call FIELD_5RB_COPY_2D_DIM1_5_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(3:7,:,:,:,3:3)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -206,7 +207,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 12" PRINT *, "" - PRINT *, "begin 13" + PRINT *, "begin 13 (should call FIELD_5RB_COPY_2D_DIM1_5_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(3:3,:,:,:,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -220,7 +221,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 13" PRINT *, "" - PRINT *, "begin 14" + PRINT *, "begin 14 (should call FIELD_5RB_COPY_2D_DIM1_2_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(1:4,1:9:2,:,3:12:3,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -234,7 +235,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 14" PRINT *, "" - PRINT *, "begin 15" + PRINT *, "begin 15 (should call FIELD_5RB_COPY_2D_DIM1_2_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(:,1:9:2,:,3:12:3,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) @@ -248,7 +249,7 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "end 15" PRINT *, "" - PRINT *, "begin 16" + PRINT *, "begin 16 (should call FIELD_5RB_COPY_2D_DIM3_4_CONTIGUOUS)" CALL FIELD_NEW(W5, DATA=D1(:,:,:,3:12:3,:)) CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) !$ACC KERNELS DEFAULT(PRESENT) From 2a127d729a9ed3e2300e06510a823faacde44d52 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 19 Jan 2024 13:01:45 +0000 Subject: [PATCH 14/19] Remove redundant specification of cuda compiler flags --- cmake/field_api_compile_options.cmake | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cmake/field_api_compile_options.cmake b/cmake/field_api_compile_options.cmake index 4685539..06c8d5a 100644 --- a/cmake/field_api_compile_options.cmake +++ b/cmake/field_api_compile_options.cmake @@ -10,11 +10,8 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES PGI|NVIDIA|NVHPC) ecbuild_add_fortran_flags("-Mlarge_arrays") ecbuild_add_fortran_flags("-gopt") - if(HAVE_ACC) - list(APPEND CMAKE_REQUIRED_LINK_OPTIONS "-cuda") - ecbuild_add_fortran_flags("-cuda") - ecbuild_add_fortran_flags("-Minfo=accel,all,ccff" BUILD DEBUG) - endif() + + ecbuild_add_fortran_flags("-Minfo=accel,all,ccff" BUILD DEBUG) # These are architecture/compiler/offload-library specific options # that should really be coming from external input From 35c8aee8bf20b0465e8d9140d4ae200ba9257629 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 19 Jan 2024 22:43:07 +0000 Subject: [PATCH 15/19] Add pure device-pointer support to COPY_2D --- field_RANKSUFF_data_module.fypp | 86 ++++++++++----------- tests/init_wrapper_non_contiguous_multi.F90 | 12 ++- 2 files changed, 50 insertions(+), 48 deletions(-) diff --git a/field_RANKSUFF_data_module.fypp b/field_RANKSUFF_data_module.fypp index b50881d..bfe1c73 100644 --- a/field_RANKSUFF_data_module.fypp +++ b/field_RANKSUFF_data_module.fypp @@ -45,38 +45,23 @@ CONTAINS LAST_CONTIG_DIM = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (HST, 0) NEXT_CONTIG_DIM = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (HST, LAST_CONTIG_DIM+1) - IF (MAP_DEVPTR) THEN - - SELECT CASE (LAST_CONTIG_DIM) + SELECT CASE (LAST_CONTIG_DIM) #:for d1 in range (ft.rank) - CASE (${d1}$) - SELECT CASE (NEXT_CONTIG_DIM) - #:for d2 in range (d1+1, ft.rank+1) - CASE (${d2}$) - CALL ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, KDIR, QUEUE) - #:endfor - CASE DEFAULT - CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED NEXT_CONTIG_DIM') - END SELECT -#:endfor - CASE (${ft.rank}$) - CALL ${ftn}$_COPY_DIM${ft.rank}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) - CASE DEFAULT - CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED LAST_CONTIG_DIM') - END SELECT - - ELSE - - SELECT CASE (LAST_CONTIG_DIM) -#:for d in range (ft.rank + 1) - CASE (${d}$) - CALL ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) + CASE (${d1}$) + SELECT CASE (NEXT_CONTIG_DIM) + #:for d2 in range (d1+1, ft.rank+1) + CASE (${d2}$) + CALL ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) + #:endfor + CASE DEFAULT + CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED NEXT_CONTIG_DIM') + END SELECT #:endfor - CASE DEFAULT - CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED LAST_CONTIG_DIM') - END SELECT - - ENDIF + CASE (${ft.rank}$) + CALL ${ftn}$_COPY_DIM${ft.rank}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) + CASE DEFAULT + CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED LAST_CONTIG_DIM') + END SELECT END SUBROUTINE @@ -146,13 +131,14 @@ CONTAINS #:for d1 in range (0, ft.rank) #:for d2 in range (d1+1, ft.rank+1) - SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, KDIR, QUEUE) + SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) #ifdef _OPENACC USE OPENACC USE CUDAFOR #endif ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) INTEGER (KIND=JPIM), INTENT (IN) :: KDIR + LOGICAL, INTENT (IN) :: MAP_DEVPTR INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE #ifdef _OPENACC INTEGER (KIND=JPIM) :: IHST_PITCH, IDEV_PITCH, IRET @@ -161,52 +147,60 @@ CONTAINS INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d2, ft.rank))))}$ #:endif INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + TYPE(C_PTR) :: HSTPTR + TYPE(C_DEVPTR) :: DEVPTR ISHP(1) = 1 ISHP(2:) = SHAPE(HST) - IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) + IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) * KIND(HST) IHEIGHT = PRODUCT(ISHP(${d1+2}$:${d2+1}$)) #:set lb = lambda arr, i: f'LBOUND({arr}, {i+1})' #:set lbnds = lambda arr, start, end: [lb(arr, i) for i in range(start, end)] #:set this_slice = lambda arr: ', '.join(lbnds(arr, 0, ft.rank)) #:set next_slice = lambda arr: ', '.join(lbnds(arr, 0, d1) + [lb(arr, d1)+'+1'] + lbnds(arr, d1+1, ft.rank)) - IHST_PITCH = (LOC (HST(${next_slice('HST')}$)) - LOC (HST (${this_slice('HST')}$)))/KIND(HST) - IDEV_PITCH = (LOC (DEV(${next_slice('DEV')}$)) - LOC (DEV (${this_slice('DEV')}$)))/KIND(DEV) + IHST_PITCH = LOC (HST(${next_slice('HST')}$)) - LOC (HST (${this_slice('HST')}$)) + IDEV_PITCH = LOC (DEV(${next_slice('DEV')}$)) - LOC (DEV (${this_slice('DEV')}$)) #:for e in range (ft.rank, d2, -1) ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) #:endfor #:set indent = ' ' * (ft.rank - d2 - 1) #:set ar = lambda arr: ', '.join(lbnds(arr, 0, d2) + [f'J{i+1}' for i in range(d2, ft.rank)]) + ${indent}$ HSTPTR = C_LOC(HST (${ar('HST')}$)) + ${indent}$ IF (MAP_DEVPTR) THEN + ${indent}$ !$acc host_data use_device(DEV) + ${indent}$ DEVPTR = C_DEVLOC(DEV (${ar('DEV')}$)) + ${indent}$ !$acc end host_data + ${indent}$ ELSE + ${indent}$ !$acc data deviceptr(DEVPTR,DEV) + ${indent}$ DEVPTR = C_DEVLOC(DEV (${ar('DEV')}$)) + ${indent}$ !$acc end data + ${indent}$ ENDIF ${indent}$ IF (KDIR == NH2D) THEN - ${indent}$ !$acc host_data use_device (DEV) ${indent}$ IF(PRESENT(QUEUE)) THEN ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) - ${indent}$ IRET = CUDAMEMCPY2DASYNC (DEV (${ar('DEV')}$), IDEV_PITCH, & - ${indent}$ & HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ IRET = CUDAMEMCPY2DASYNC (DEVPTR, IDEV_PITCH, & + ${indent}$ & HSTPTR, IHST_PITCH, & ${indent}$ & IWIDTH, IHEIGHT, & ${indent}$ & STREAM=STREAM) ${indent}$ ELSE - ${indent}$ IRET = CUDAMEMCPY2D (DEV (${ar('DEV')}$), IDEV_PITCH, & - ${indent}$ & HST (${ar('HST')}$), IHST_PITCH, & + ${indent}$ IRET = CUDAMEMCPY2D (DEVPTR, IDEV_PITCH, & + ${indent}$ & HSTPTR, IHST_PITCH, & ${indent}$ & IWIDTH, IHEIGHT) ${indent}$ ENDIF - ${indent}$ !$acc end host_data ${indent}$ ELSEIF (KDIR == ND2H) THEN - ${indent}$ !$acc host_data use_device (DEV) ${indent}$ IF(PRESENT(QUEUE)) THEN ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) - ${indent}$ IRET = CUDAMEMCPY2DASYNC (HST (${ar('HST')}$), IHST_PITCH, & - ${indent}$ & DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ IRET = CUDAMEMCPY2DASYNC (HSTPTR, IHST_PITCH, & + ${indent}$ & DEVPTR, IDEV_PITCH, & ${indent}$ & IWIDTH, IHEIGHT, & ${indent}$ & STREAM=STREAM) ${indent}$ ELSE - ${indent}$ IRET = CUDAMEMCPY2D (HST (${ar('HST')}$), IHST_PITCH, & - ${indent}$ & DEV (${ar('DEV')}$), IDEV_PITCH, & + ${indent}$ IRET = CUDAMEMCPY2D (HSTPTR, IHST_PITCH, & + ${indent}$ & DEVPTR, IDEV_PITCH, & ${indent}$ & IWIDTH, IHEIGHT) ${indent}$ ENDIF - ${indent}$ !$acc end host_data ${indent}$ IF (IRET /= CUDASUCCESS) STOP 1 ${indent}$ ENDIF #:for e in range (d2, ft.rank) diff --git a/tests/init_wrapper_non_contiguous_multi.F90 b/tests/init_wrapper_non_contiguous_multi.F90 index e2b8238..6c108a6 100644 --- a/tests/init_wrapper_non_contiguous_multi.F90 +++ b/tests/init_wrapper_non_contiguous_multi.F90 @@ -250,9 +250,17 @@ PROGRAM INIT_WRAPPER_NON_CONTIGUOUS_MULTI PRINT *, "" PRINT *, "begin 16 (should call FIELD_5RB_COPY_2D_DIM3_4_CONTIGUOUS)" - CALL FIELD_NEW(W5, DATA=D1(:,:,:,3:12:3,:)) +#ifdef _CUDA + CALL FIELD_NEW(W5, DATA=D1(:,:,:,3:12:3,:), MAP_DEVPTR=.FALSE.) +#else + CALL FIELD_NEW(W5, DATA=D1(:,:,:,3:12:3,:), MAP_DEVPTR=.TRUE.) +#endif CALL W5%GET_DEVICE_DATA_RDWR(W5PTR) - !$ACC KERNELS DEFAULT(PRESENT) +#ifdef _CUDA + !$ACC KERNELS DEVICEPTR(W5PTR) +#else + !$ACC KERNELS PRESENT(W5PTR) +#endif W5PTR=19 !$ACC END KERNELS D1 = -1 From 4d1a3c86dff54625f3be8158fa45d2e495ce51bb Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 19 Jan 2024 23:01:49 +0000 Subject: [PATCH 16/19] FIELD_XX_COPY now works again if ACC is enabled but CUDA is disabled --- field_RANKSUFF_data_module.fypp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/field_RANKSUFF_data_module.fypp b/field_RANKSUFF_data_module.fypp index bfe1c73..82a3804 100644 --- a/field_RANKSUFF_data_module.fypp +++ b/field_RANKSUFF_data_module.fypp @@ -46,6 +46,7 @@ CONTAINS NEXT_CONTIG_DIM = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (HST, LAST_CONTIG_DIM+1) SELECT CASE (LAST_CONTIG_DIM) +#:if defined('CUDA') #:for d1 in range (ft.rank) CASE (${d1}$) SELECT CASE (NEXT_CONTIG_DIM) @@ -59,6 +60,12 @@ CONTAINS #:endfor CASE (${ft.rank}$) CALL ${ftn}$_COPY_DIM${ft.rank}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) +#:else +#:for d in range (ft.rank + 1) + CASE (${d}$) + CALL ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) +#:endfor +#:endif CASE DEFAULT CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED LAST_CONTIG_DIM') END SELECT @@ -129,18 +136,17 @@ CONTAINS #:endfor +#:if defined('CUDA') #:for d1 in range (0, ft.rank) #:for d2 in range (d1+1, ft.rank+1) SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) -#ifdef _OPENACC USE OPENACC USE CUDAFOR -#endif + ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) INTEGER (KIND=JPIM), INTENT (IN) :: KDIR LOGICAL, INTENT (IN) :: MAP_DEVPTR INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE -#ifdef _OPENACC INTEGER (KIND=JPIM) :: IHST_PITCH, IDEV_PITCH, IRET INTEGER (KIND=JPIM) :: IWIDTH, IHEIGHT, ISHP(${ft.rank+1}$) #:if d2 < ft.rank @@ -206,16 +212,10 @@ CONTAINS #:for e in range (d2, ft.rank) ${' ' * (ft.rank - e - 1)}$ENDDO #:endfor -#else - IF (KDIR == NH2D) THEN - DEV = HST - ELSE - HST = DEV - ENDIF -#endif END SUBROUTINE #:endfor #:endfor +#:endif #:endfor From 3bb735f2707a6fc83a3e15510a4194b59b0643cb Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 19 Jan 2024 23:09:27 +0000 Subject: [PATCH 17/19] Only perform CUDASUCCESS checks for DEBUG builds --- CMakeLists.txt | 3 +++ field_RANKSUFF_data_module.fypp | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 72417f4..0007ba2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,6 +112,9 @@ endif() if(HAVE_CUDA) list( APPEND fypp_defines "-DCUDA") endif() +if(CMAKE_BUILD_TYPE MATCHES "Debug") + list( APPEND fypp_defines "-DDEBUG") +endif() ## preprocess fypp files foreach (SUFF IN ITEMS IM RM RB RD LM) diff --git a/field_RANKSUFF_data_module.fypp b/field_RANKSUFF_data_module.fypp index 82a3804..194d714 100644 --- a/field_RANKSUFF_data_module.fypp +++ b/field_RANKSUFF_data_module.fypp @@ -142,6 +142,9 @@ CONTAINS SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) USE OPENACC USE CUDAFOR +#:if defined('DEBUG') + USE FIELD_ABORT_MODULE +#:endif ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) INTEGER (KIND=JPIM), INTENT (IN) :: KDIR @@ -195,6 +198,11 @@ CONTAINS ${indent}$ & HSTPTR, IHST_PITCH, & ${indent}$ & IWIDTH, IHEIGHT) ${indent}$ ENDIF +#:if defined('DEBUG') + ${indent}$ IF (IRET /= CUDASUCCESS) THEN + ${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: HOST-TO-DEVICE TRANSFER FAILED") + ${indent}$ ENDIF +#:endif ${indent}$ ELSEIF (KDIR == ND2H) THEN ${indent}$ IF(PRESENT(QUEUE)) THEN ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) @@ -207,7 +215,11 @@ CONTAINS ${indent}$ & DEVPTR, IDEV_PITCH, & ${indent}$ & IWIDTH, IHEIGHT) ${indent}$ ENDIF - ${indent}$ IF (IRET /= CUDASUCCESS) STOP 1 +#:if defined('DEBUG') + ${indent}$ IF (IRET /= CUDASUCCESS) THEN + ${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: DEVICE-TO-HOST TRANSFER FAILED") + ${indent}$ ENDIF +#:endif ${indent}$ ENDIF #:for e in range (d2, ft.rank) ${' ' * (ft.rank - e - 1)}$ENDDO From b129adf666c0e157fba93207630c22d5753bf880 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Mon, 22 Jan 2024 09:37:25 +0000 Subject: [PATCH 18/19] Restore CUDASUCCESS checks for optimized builds --- field_RANKSUFF_data_module.fypp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/field_RANKSUFF_data_module.fypp b/field_RANKSUFF_data_module.fypp index 194d714..c859c7e 100644 --- a/field_RANKSUFF_data_module.fypp +++ b/field_RANKSUFF_data_module.fypp @@ -198,11 +198,9 @@ CONTAINS ${indent}$ & HSTPTR, IHST_PITCH, & ${indent}$ & IWIDTH, IHEIGHT) ${indent}$ ENDIF -#:if defined('DEBUG') ${indent}$ IF (IRET /= CUDASUCCESS) THEN ${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: HOST-TO-DEVICE TRANSFER FAILED") ${indent}$ ENDIF -#:endif ${indent}$ ELSEIF (KDIR == ND2H) THEN ${indent}$ IF(PRESENT(QUEUE)) THEN ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) @@ -215,11 +213,9 @@ CONTAINS ${indent}$ & DEVPTR, IDEV_PITCH, & ${indent}$ & IWIDTH, IHEIGHT) ${indent}$ ENDIF -#:if defined('DEBUG') ${indent}$ IF (IRET /= CUDASUCCESS) THEN ${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: DEVICE-TO-HOST TRANSFER FAILED") ${indent}$ ENDIF -#:endif ${indent}$ ENDIF #:for e in range (d2, ft.rank) ${' ' * (ft.rank - e - 1)}$ENDDO From ca16c192fc42125a3447882303858fdb9f8e72b5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Mon, 22 Jan 2024 01:50:18 -0800 Subject: [PATCH 19/19] DEBUG is not needed anymore --- CMakeLists.txt | 3 --- field_RANKSUFF_data_module.fypp | 2 -- 2 files changed, 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0007ba2..72417f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,9 +112,6 @@ endif() if(HAVE_CUDA) list( APPEND fypp_defines "-DCUDA") endif() -if(CMAKE_BUILD_TYPE MATCHES "Debug") - list( APPEND fypp_defines "-DDEBUG") -endif() ## preprocess fypp files foreach (SUFF IN ITEMS IM RM RB RD LM) diff --git a/field_RANKSUFF_data_module.fypp b/field_RANKSUFF_data_module.fypp index c859c7e..679d185 100644 --- a/field_RANKSUFF_data_module.fypp +++ b/field_RANKSUFF_data_module.fypp @@ -142,9 +142,7 @@ CONTAINS SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) USE OPENACC USE CUDAFOR -#:if defined('DEBUG') USE FIELD_ABORT_MODULE -#:endif ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) INTEGER (KIND=JPIM), INTENT (IN) :: KDIR