From 5b0418a9f6d181d668ddebdc2c540566529e4125 Mon Sep 17 00:00:00 2001
From: Tony Craig
Date: Wed, 5 Apr 2023 13:29:21 -0700
Subject: [PATCH] Update global reduction implementation to improve
 performance, fix VP bug (#824)

* Update global reduction implementation to improve performance, fix VP bug

This was mainly done for situations like VP that need a fast global sum.
The VP global sum is still slightly faster than the one computed in the
infrastructure, so that implementation is kept. Fixed a bug in the
workspace_y calculation in VP. Also found that the haloupdate call in the
precondition step generally improves VP performance, so the option to skip
that haloupdate was removed.

Separately, fixed a bug in the tripoleT global sum implementation, added a
tripoleT global sum unit test, and resynced ice_exit.F90, ice_reprosum.F90,
and ice_global_reductions.F90 between the serial and mpi versions.

- Refactor global sums to improve performance, move if checks outside do loops
- Fix bug in tripoleT global sums, tripole seam masking
- Update VP solver, use local global sum more often
- Update VP solver, fix bug in workspace_y calculation
- Update VP solver, always call haloupdate during precondition
- Refactor ice_exit.F90 and sync serial and mpi versions
- Sync ice_reprosum.F90 between serial and mpi versions
- Update sumchk unit test to handle grids better
- Add tripoleT sumchk test

* Update VP global sum to exclude local implementation with tripole grids
---
 cicecore/cicedyn/analysis/ice_diagnostics.F90 |   6 +-
 cicecore/cicedyn/dynamics/ice_dyn_vp.F90      |  41 ++--
 .../infrastructure/comm/mpi/ice_exit.F90      |  60 ++---
 .../comm/mpi/ice_global_reductions.F90        | 232 ++++++++++++------
 .../infrastructure/comm/mpi/ice_reprosum.F90  |  34 +--
 .../infrastructure/comm/serial/ice_exit.F90   |  69 ++++--
 .../comm/serial/ice_global_reductions.F90     | 232 ++++++++++++------
 cicecore/drivers/unittest/sumchk/sumchk.F90   |  59 +++--
 configuration/scripts/tests/unittest_suite.ts |   1 +
 9 files changed, 466 insertions(+), 268 deletions(-)

diff --git a/cicecore/cicedyn/analysis/ice_diagnostics.F90 b/cicecore/cicedyn/analysis/ice_diagnostics.F90
index 53631b2d4..b14dff4e3 100644
--- a/cicecore/cicedyn/analysis/ice_diagnostics.F90
+++ b/cicecore/cicedyn/analysis/ice_diagnostics.F90
@@ -261,10 +261,8 @@ subroutine runtime_diags (dt)
       !$OMP END PARALLEL DO
       extentn = c0
       extents = c0
-      extentn = global_sum(work1, distrb_info, field_loc_center, &
-                           tarean)
-      extents = global_sum(work1, distrb_info, field_loc_center, &
-                           tareas)
+      extentn = global_sum(work1, distrb_info, field_loc_center, tarean)
+      extents = global_sum(work1, distrb_info, field_loc_center, tareas)
       extentn = extentn * m2_to_km2
       extents = extents * m2_to_km2
 
diff --git a/cicecore/cicedyn/dynamics/ice_dyn_vp.F90 b/cicecore/cicedyn/dynamics/ice_dyn_vp.F90
index 3915004b4..32971c5b6 100644
--- a/cicecore/cicedyn/dynamics/ice_dyn_vp.F90
+++ b/cicecore/cicedyn/dynamics/ice_dyn_vp.F90
@@ -2502,7 +2502,7 @@ function global_dot_product (nx_block , ny_block , &
                               vector2_x , vector2_y) &
              result(dot_product)
 
-      use ice_domain, only: distrb_info
+      use ice_domain, only: distrb_info, ns_boundary_type
      use ice_domain_size, only: max_blocks
      use ice_fileunits, only: bfbflag
 
@@ -2552,8 +2552,14 @@ function global_dot_product (nx_block , ny_block , &
       enddo
       !$OMP END PARALLEL DO
 
-      ! Use local summation result unless bfbflag is active
-      if (bfbflag == 'off') then
+      ! Use the faster local summation for several bfbflag settings.
+      ! The local implementation sums within each block, then over the local
+      ! blocks, and calls global_sum on a scalar. For bfbflag 'off', 'lsum8', and
+      ! 'lsum4' it should be just as accurate as the general array global_sum,
+      ! without the extra copies and overhead. The array global_sum is still used
+      ! for stricter bfbflag settings and for tripole grids (special seam masking).
+      if (ns_boundary_type /= 'tripole' .and. ns_boundary_type /= 'tripoleT' .and. &
+          (bfbflag == 'off' .or. bfbflag == 'lsum8' .or. bfbflag == 'lsum4')) then
          dot_product = global_sum(sum(dot), distrb_info)
       else
          dot_product = global_sum(prod, distrb_info, field_loc_NEcorner)
@@ -3120,7 +3126,7 @@ subroutine fgmres (zetax2 , etax2 , &
                j = indxUj(ij, iblk)
 
                workspace_x(i, j, iblk) = workspace_x(i, j, iblk) + rhs_hess(it) * arnoldi_basis_x(i, j, iblk, it)
-               workspace_y(i, j, iblk) = workspace_x(i, j, iblk) + rhs_hess(it) * arnoldi_basis_y(i, j, iblk, it)
+               workspace_y(i, j, iblk) = workspace_y(i, j, iblk) + rhs_hess(it) * arnoldi_basis_y(i, j, iblk, it)
             enddo ! ij
          enddo
          !$OMP END PARALLEL DO
@@ -3151,7 +3157,6 @@ subroutine pgmres (zetax2 , etax2 , &
 
       use ice_boundary, only: ice_HaloUpdate
       use ice_domain, only: maskhalo_dyn, halo_info
-      use ice_fileunits, only: bfbflag
       use ice_timers, only: ice_timer_start, ice_timer_stop, timer_bound
 
       real (kind=dbl_kind), dimension(nx_block,ny_block,max_blocks,4), intent(in) :: &
@@ -3343,21 +3348,17 @@ subroutine pgmres (zetax2 , etax2 , &
                            workspace_x , workspace_y)
 
          ! Update workspace with boundary values
-         ! NOTE: skipped for efficiency since this is just a preconditioner
-         ! unless bfbflag is active
-         if (bfbflag /= 'off') then
-            call stack_fields(workspace_x, workspace_y, fld2)
-            call ice_timer_start(timer_bound)
-            if (maskhalo_dyn) then
-               call ice_HaloUpdate (fld2, halo_info_mask, &
-                                    field_loc_NEcorner, field_type_vector)
-            else
-               call ice_HaloUpdate (fld2, halo_info, &
-                                    field_loc_NEcorner, field_type_vector)
-            endif
-            call ice_timer_stop(timer_bound)
-            call unstack_fields(fld2, workspace_x, workspace_y)
+         call stack_fields(workspace_x, workspace_y, fld2)
+         call ice_timer_start(timer_bound)
+         if (maskhalo_dyn) then
+            call ice_HaloUpdate (fld2, halo_info_mask, &
+                                 field_loc_NEcorner, field_type_vector)
+         else
+            call ice_HaloUpdate (fld2, halo_info, &
+                                 field_loc_NEcorner, field_type_vector)
          endif
+         call ice_timer_stop(timer_bound)
+         call unstack_fields(fld2, workspace_x, workspace_y)
 
       !$OMP PARALLEL DO PRIVATE(iblk)
       do iblk = 1, nblocks
@@ -3528,7 +3529,7 @@ subroutine pgmres (zetax2 , etax2 , &
                j = indxUj(ij, iblk)
 
                workspace_x(i, j, iblk) = workspace_x(i, j, iblk) + rhs_hess(it) * arnoldi_basis_x(i, j, iblk, it)
-               workspace_y(i, j, iblk) = workspace_x(i, j, iblk) + rhs_hess(it) * arnoldi_basis_y(i, j, iblk, it)
+               workspace_y(i, j, iblk) = workspace_y(i, j, iblk) + rhs_hess(it) * arnoldi_basis_y(i, j, iblk, it)
             enddo ! ij
          enddo
          !$OMP END PARALLEL DO
diff --git a/cicecore/cicedyn/infrastructure/comm/mpi/ice_exit.F90 b/cicecore/cicedyn/infrastructure/comm/mpi/ice_exit.F90
index eafb3228f..5351a5336 100644
--- a/cicecore/cicedyn/infrastructure/comm/mpi/ice_exit.F90
+++ b/cicecore/cicedyn/infrastructure/comm/mpi/ice_exit.F90
@@ -1,3 +1,4 @@
+
 !=======================================================================
 !
 ! Exit the model.
@@ -8,7 +9,15 @@ module ice_exit use ice_kinds_mod + use ice_fileunits, only: nu_diag, ice_stderr, flush_fileunit use icepack_intfc, only: icepack_warnings_flush, icepack_warnings_aborted +#if (defined CESMCOUPLED) + use shr_sys_mod +#else +#ifndef SERIAL_REMOVE_MPI + use mpi ! MPI Fortran module +#endif +#endif implicit none public @@ -23,14 +32,6 @@ subroutine abort_ice(error_message, file, line, doabort) ! This routine aborts the ice model and prints an error message. -#if (defined CESMCOUPLED) - use ice_fileunits, only: nu_diag, flush_fileunit - use shr_sys_mod -#else - use ice_fileunits, only: nu_diag, ice_stderr, flush_fileunit - use mpi ! MPI Fortran module -#endif - character (len=*), intent(in),optional :: error_message ! error message character (len=*), intent(in),optional :: file ! file integer (kind=int_kind), intent(in), optional :: line ! line number @@ -38,11 +39,10 @@ subroutine abort_ice(error_message, file, line, doabort) ! local variables -#ifndef CESMCOUPLED integer (int_kind) :: & ierr, & ! MPI error flag + outunit, & ! output unit error_code ! return code -#endif logical (log_kind) :: ldoabort ! local doabort flag character(len=*), parameter :: subname='(abort_ice)' @@ -50,30 +50,31 @@ subroutine abort_ice(error_message, file, line, doabort) if (present(doabort)) ldoabort = doabort #if (defined CESMCOUPLED) - call flush_fileunit(nu_diag) - call icepack_warnings_flush(nu_diag) - write(nu_diag,*) ' ' - write(nu_diag,*) subname, 'ABORTED: ' - if (present(file)) write (nu_diag,*) subname,' called from ',trim(file) - if (present(line)) write (nu_diag,*) subname,' line number ',line - if (present(error_message)) write (nu_diag,*) subname,' error = ',trim(error_message) - call flush_fileunit(nu_diag) - if (ldoabort) call shr_sys_abort(subname//trim(error_message)) + outunit = nu_diag #else + outunit = ice_stderr +#endif + call flush_fileunit(nu_diag) call icepack_warnings_flush(nu_diag) - write(ice_stderr,*) ' ' - write(ice_stderr,*) subname, 'ABORTED: ' - if (present(file)) write (ice_stderr,*) subname,' called from ',trim(file) - if (present(line)) write (ice_stderr,*) subname,' line number ',line - if (present(error_message)) write (ice_stderr,*) subname,' error = ',trim(error_message) - call flush_fileunit(ice_stderr) - error_code = 128 + write(outunit,*) ' ' + write(outunit,*) subname, 'ABORTED: ' + if (present(file)) write (outunit,*) subname,' called from ',trim(file) + if (present(line)) write (outunit,*) subname,' line number ',line + if (present(error_message)) write (outunit,*) subname,' error = ',trim(error_message) + call flush_fileunit(outunit) + if (ldoabort) then +#if (defined CESMCOUPLED) + call shr_sys_abort(subname//trim(error_message)) +#else +#ifndef SERIAL_REMOVE_MPI + error_code = 128 call MPI_ABORT(MPI_COMM_WORLD, error_code, ierr) +#endif stop - endif #endif + endif end subroutine abort_ice @@ -81,12 +82,15 @@ end subroutine abort_ice subroutine end_run -! Ends run by calling MPI_FINALIZE. +! Ends run by calling MPI_FINALIZE +! Does nothing in serial runs integer (int_kind) :: ierr ! 
MPI error flag character(len=*), parameter :: subname = '(end_run)' +#ifndef SERIAL_REMOVE_MPI call MPI_FINALIZE(ierr) +#endif end subroutine end_run diff --git a/cicecore/cicedyn/infrastructure/comm/mpi/ice_global_reductions.F90 b/cicecore/cicedyn/infrastructure/comm/mpi/ice_global_reductions.F90 index 4b94389f7..91daf53a8 100644 --- a/cicecore/cicedyn/infrastructure/comm/mpi/ice_global_reductions.F90 +++ b/cicecore/cicedyn/infrastructure/comm/mpi/ice_global_reductions.F90 @@ -181,7 +181,7 @@ function global_sum_dbl(array, dist, field_loc, mMask, lMask) & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -189,25 +189,45 @@ function global_sum_dbl(array, dist, field_loc, mMask, lMask) & n = (iblock-1)*nx_block*ny_block - do j=jb,je - do i=ib,ie - n = n + 1 - ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then - work(n,1) = 0._dbl_kind - else - if (present(mMask)) then - work(n,1) = array(i,j,iblock)*mMask(i,j,iblock) - else if (present(lMask)) then - if (lMask(i,j,iblock)) then - work(n,1) = array(i,j,iblock) - endif - else + if (present(mMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = array(i,j,iblock)*mMask(i,j,iblock) + end do + end do + elseif (present(lMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + if (lMask(i,j,iblock)) then work(n,1) = array(i,j,iblock) endif - endif - end do - end do + end do + end do + else + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = array(i,j,iblock) + enddo + enddo + endif + + if (maxiglob >= 0) then + ! eliminate redundant points at je + ! set n to (ib,je) index + n = (iblock-1)*nx_block*ny_block + n = n + (je-1-jb+1)*(ie-ib+1) + j=je + do i=ib,ie + n = n + 1 + if (this_block%i_glob(i) > maxiglob) then + work(n,1) = 0._dbl_kind + endif + end do + endif + end do call compute_sums_dbl(work,sums,communicator,numProcs) @@ -317,7 +337,7 @@ function global_sum_real(array, dist, field_loc, mMask, lMask) & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -325,25 +345,45 @@ function global_sum_real(array, dist, field_loc, mMask, lMask) & n = (iblock-1)*nx_block*ny_block - do j=jb,je - do i=ib,ie - n = n + 1 - ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. 
this_block%i_glob(i) > maxiglob) then - work(n,1) = 0._dbl_kind - else - if (present(mMask)) then - work(n,1) = real(array(i,j,iblock)*mMask(i,j,iblock),dbl_kind) - else if (present(lMask)) then - if (lMask(i,j,iblock)) then - work(n,1) = real(array(i,j,iblock),dbl_kind) - endif - else + if (present(mMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = real(array(i,j,iblock)*mMask(i,j,iblock),dbl_kind) + end do + end do + elseif (present(lMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + if (lMask(i,j,iblock)) then work(n,1) = real(array(i,j,iblock),dbl_kind) endif - endif - end do - end do + end do + end do + else + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = real(array(i,j,iblock),dbl_kind) + enddo + enddo + endif + + if (maxiglob >= 0) then + ! eliminate redundant points at je + ! set n to (ib,je) index + n = (iblock-1)*nx_block*ny_block + n = n + (je-1-jb+1)*(ie-ib+1) + j=je + do i=ib,ie + n = n + 1 + if (this_block%i_glob(i) > maxiglob) then + work(n,1) = 0._dbl_kind + endif + end do + endif + end do call compute_sums_dbl(work,sums,communicator,numProcs) @@ -445,7 +485,7 @@ function global_sum_int(array, dist, field_loc, mMask, lMask) & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -456,7 +496,7 @@ function global_sum_int(array, dist, field_loc, mMask, lMask) & do j=jb,je do i=ib,ie ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then + if (maxiglob >= 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then ! blockSum = blockSum + 0_int_kind else if (present(mMask)) then @@ -798,7 +838,7 @@ function global_sum_prod_dbl (array1, array2, dist, field_loc, & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -806,25 +846,45 @@ function global_sum_prod_dbl (array1, array2, dist, field_loc, & n = (iblock-1)*nx_block*ny_block - do j=jb,je - do i=ib,ie - n = n + 1 - ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then - work(n,1) = 0._dbl_kind - else - if (present(mMask)) then - work(n,1) = array1(i,j,iblock)*array2(i,j,iblock)*mMask(i,j,iblock) - else if (present(lMask)) then - if (lMask(i,j,iblock)) then - work(n,1) = array1(i,j,iblock)*array2(i,j,iblock) - endif - else + if (present(mMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = array1(i,j,iblock)*array2(i,j,iblock)*mMask(i,j,iblock) + end do + end do + elseif (present(lMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + if (lMask(i,j,iblock)) then work(n,1) = array1(i,j,iblock)*array2(i,j,iblock) endif - endif - end do - end do + end do + end do + else + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = array1(i,j,iblock)*array2(i,j,iblock) + enddo + enddo + endif + + if (maxiglob >= 0) then + ! eliminate redundant points at je + ! 
set n to (ib,je) index + n = (iblock-1)*nx_block*ny_block + n = n + (je-1-jb+1)*(ie-ib+1) + j=je + do i=ib,ie + n = n + 1 + if (this_block%i_glob(i) > maxiglob) then + work(n,1) = 0._dbl_kind + endif + end do + endif + end do call compute_sums_dbl(work,sums,communicator,numProcs) @@ -936,7 +996,7 @@ function global_sum_prod_real (array1, array2, dist, field_loc, & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -944,25 +1004,45 @@ function global_sum_prod_real (array1, array2, dist, field_loc, & n = (iblock-1)*nx_block*ny_block - do j=jb,je - do i=ib,ie - n = n + 1 - ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then - work(n,1) = 0._dbl_kind - else - if (present(mMask)) then - work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock)*mMask(i,j,iblock),dbl_kind) - else if (present(lMask)) then - if (lMask(i,j,iblock)) then - work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock),dbl_kind) - endif - else + if (present(mMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock)*mMask(i,j,iblock),dbl_kind) + end do + end do + elseif (present(lMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + if (lMask(i,j,iblock)) then work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock),dbl_kind) endif - endif - end do - end do + end do + end do + else + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock),dbl_kind) + enddo + enddo + endif + + if (maxiglob >= 0) then + ! eliminate redundant points at je + ! set n to (ib,je) index + n = (iblock-1)*nx_block*ny_block + n = n + (je-1-jb+1)*(ie-ib+1) + j=je + do i=ib,ie + n = n + 1 + if (this_block%i_glob(i) > maxiglob) then + work(n,1) = 0._dbl_kind + endif + end do + endif + end do call compute_sums_dbl(work,sums,communicator,numProcs) @@ -1066,7 +1146,7 @@ function global_sum_prod_int (array1, array2, dist, field_loc, & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -1077,7 +1157,7 @@ function global_sum_prod_int (array1, array2, dist, field_loc, & do j=jb,je do i=ib,ie ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then + if (maxiglob >= 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then ! blockSum = blockSum + 0_int_kind else if (present(mMask)) then diff --git a/cicecore/cicedyn/infrastructure/comm/mpi/ice_reprosum.F90 b/cicecore/cicedyn/infrastructure/comm/mpi/ice_reprosum.F90 index 8c6f90363..7c6c0eb77 100644 --- a/cicecore/cicedyn/infrastructure/comm/mpi/ice_reprosum.F90 +++ b/cicecore/cicedyn/infrastructure/comm/mpi/ice_reprosum.F90 @@ -87,7 +87,7 @@ MODULE ice_reprosum !----------------------------------------------------------------------- logical :: repro_sum_use_ddpdd = .false. -! logical :: detailed_timing = .false. + logical :: detailed_timing = .false. 
character(len=char_len_long) :: tmpstr CONTAINS @@ -100,10 +100,10 @@ MODULE ice_reprosum !----------------------------------------------------------------------- subroutine ice_reprosum_setopts(repro_sum_use_ddpdd_in, & - repro_sum_rel_diff_max_in, & - repro_sum_recompute_in, & - repro_sum_master, & - repro_sum_logunit ) + repro_sum_rel_diff_max_in, & + repro_sum_recompute_in, & + repro_sum_master, & + repro_sum_logunit ) !------------------------------Arguments-------------------------------- logical, intent(in), optional :: repro_sum_use_ddpdd_in @@ -260,12 +260,12 @@ end subroutine ice_reprosum_setopts !---------------------------------------------------------------------- subroutine ice_reprosum_calc (arr, arr_gsum, nsummands, dsummands, & - nflds, ddpdd_sum, & - arr_gbl_max, arr_gbl_max_out, & - arr_max_levels, arr_max_levels_out, & - gbl_max_nsummands, gbl_max_nsummands_out,& - gbl_count, repro_sum_validate, & - repro_sum_stats, rel_diff, commid ) + nflds, ddpdd_sum, & + arr_gbl_max, arr_gbl_max_out, & + arr_max_levels, arr_max_levels_out, & + gbl_max_nsummands, gbl_max_nsummands_out,& + gbl_count, repro_sum_validate, & + repro_sum_stats, rel_diff, commid ) !---------------------------------------------------------------------- ! Arguments @@ -434,7 +434,7 @@ subroutine ice_reprosum_calc (arr, arr_gsum, nsummands, dsummands, & ! if (detailed_timing) call xicex_timer_start('ice_reprosum_ddpdd') call ice_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & - nflds, mpi_comm) + nflds, mpi_comm) repro_sum_fast = 1 ! if (detailed_timing) call xicex_timer_stop('ice_reprosum_ddpdd') @@ -774,9 +774,9 @@ end subroutine ice_reprosum_calc !---------------------------------------------------------------------- subroutine ice_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & - arr_max_shift, arr_gmax_exp, max_levels, & - max_level, validate, recompute, & - omp_nthreads, mpi_comm ) + arr_max_shift, arr_gmax_exp, max_levels, & + max_level, validate, recompute, & + omp_nthreads, mpi_comm ) !---------------------------------------------------------------------- @@ -1224,7 +1224,7 @@ end subroutine ice_reprosum_int !---------------------------------------------------------------------- logical function ice_reprosum_tolExceeded (name, nflds, master, & - logunit, rel_diff ) + logunit, rel_diff ) !---------------------------------------------------------------------- ! Arguments @@ -1310,7 +1310,7 @@ end function ice_reprosum_tolExceeded !---------------------------------------------------------------------- subroutine ice_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & - nflds, mpi_comm ) + nflds, mpi_comm ) !---------------------------------------------------------------------- ! Arguments diff --git a/cicecore/cicedyn/infrastructure/comm/serial/ice_exit.F90 b/cicecore/cicedyn/infrastructure/comm/serial/ice_exit.F90 index 2daadc0e6..39f2b6702 100644 --- a/cicecore/cicedyn/infrastructure/comm/serial/ice_exit.F90 +++ b/cicecore/cicedyn/infrastructure/comm/serial/ice_exit.F90 @@ -1,7 +1,9 @@ + +#define SERIAL_REMOVE_MPI + !======================================================================= ! ! Exit the model. -! ! authors William H. Lipscomb (LANL) ! Elizabeth C. Hunke (LANL) ! 
2006 ECH: separated serial and mpi functionality @@ -9,10 +11,14 @@ module ice_exit use ice_kinds_mod - use ice_fileunits, only: nu_diag, flush_fileunit + use ice_fileunits, only: nu_diag, ice_stderr, flush_fileunit use icepack_intfc, only: icepack_warnings_flush, icepack_warnings_aborted -#ifdef CESMCOUPLED +#if (defined CESMCOUPLED) use shr_sys_mod +#else +#ifndef SERIAL_REMOVE_MPI + use mpi ! MPI Fortran module +#endif #endif implicit none @@ -24,7 +30,7 @@ module ice_exit !======================================================================= - subroutine abort_ice(error_message,file,line,doabort) + subroutine abort_ice(error_message, file, line, doabort) ! This routine aborts the ice model and prints an error message. @@ -33,30 +39,44 @@ subroutine abort_ice(error_message,file,line,doabort) integer (kind=int_kind), intent(in), optional :: line ! line number logical (kind=log_kind), intent(in), optional :: doabort ! abort flag - logical (kind=log_kind) :: ldoabort ! local doabort + ! local variables + + integer (int_kind) :: & + ierr, & ! MPI error flag + outunit, & ! output unit + error_code ! return code + logical (log_kind) :: ldoabort ! local doabort flag character(len=*), parameter :: subname='(abort_ice)' ldoabort = .true. if (present(doabort)) ldoabort = doabort -#ifdef CESMCOUPLED - call icepack_warnings_flush(nu_diag) - write(nu_diag,*) ' ' - write(nu_diag,*) subname, 'ABORTED: ' - if (present(file)) write (nu_diag,*) subname,' called from ',trim(file) - if (present(line)) write (nu_diag,*) subname,' line number ',line - if (present(error_message)) write (nu_diag,*) subname,' error = ',trim(error_message) - if (ldoabort) call shr_sys_abort(subname//trim(error_message)) +#if (defined CESMCOUPLED) + outunit = nu_diag #else - call icepack_warnings_flush(nu_diag) - write(nu_diag,*) ' ' - write(nu_diag,*) subname, 'ABORTED: ' - if (present(file)) write (nu_diag,*) subname,' called from ',trim(file) - if (present(line)) write (nu_diag,*) subname,' line number ',line - if (present(error_message)) write (nu_diag,*) subname,' error = ',trim(error_message) + outunit = ice_stderr +#endif + call flush_fileunit(nu_diag) - if (ldoabort) stop + call icepack_warnings_flush(nu_diag) + write(outunit,*) ' ' + write(outunit,*) subname, 'ABORTED: ' + if (present(file)) write (outunit,*) subname,' called from ',trim(file) + if (present(line)) write (outunit,*) subname,' line number ',line + if (present(error_message)) write (outunit,*) subname,' error = ',trim(error_message) + call flush_fileunit(outunit) + + if (ldoabort) then +#if (defined CESMCOUPLED) + call shr_sys_abort(subname//trim(error_message)) +#else +#ifndef SERIAL_REMOVE_MPI + error_code = 128 + call MPI_ABORT(MPI_COMM_WORLD, error_code, ierr) #endif + stop +#endif + endif end subroutine abort_ice @@ -64,10 +84,15 @@ end subroutine abort_ice subroutine end_run +! Ends run by calling MPI_FINALIZE +! Does nothing in serial runs + + integer (int_kind) :: ierr ! MPI error flag character(len=*), parameter :: subname = '(end_run)' -! Ends parallel run by calling MPI_FINALIZE. -! Does nothing in serial runs. 
+#ifndef SERIAL_REMOVE_MPI + call MPI_FINALIZE(ierr) +#endif end subroutine end_run diff --git a/cicecore/cicedyn/infrastructure/comm/serial/ice_global_reductions.F90 b/cicecore/cicedyn/infrastructure/comm/serial/ice_global_reductions.F90 index 5fcd45876..ed36cc6c0 100644 --- a/cicecore/cicedyn/infrastructure/comm/serial/ice_global_reductions.F90 +++ b/cicecore/cicedyn/infrastructure/comm/serial/ice_global_reductions.F90 @@ -182,7 +182,7 @@ function global_sum_dbl(array, dist, field_loc, mMask, lMask) & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -190,25 +190,45 @@ function global_sum_dbl(array, dist, field_loc, mMask, lMask) & n = (iblock-1)*nx_block*ny_block - do j=jb,je - do i=ib,ie - n = n + 1 - ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then - work(n,1) = 0._dbl_kind - else - if (present(mMask)) then - work(n,1) = array(i,j,iblock)*mMask(i,j,iblock) - else if (present(lMask)) then - if (lMask(i,j,iblock)) then - work(n,1) = array(i,j,iblock) - endif - else + if (present(mMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = array(i,j,iblock)*mMask(i,j,iblock) + end do + end do + elseif (present(lMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + if (lMask(i,j,iblock)) then work(n,1) = array(i,j,iblock) endif - endif - end do - end do + end do + end do + else + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = array(i,j,iblock) + enddo + enddo + endif + + if (maxiglob >= 0) then + ! eliminate redundant points at je + ! set n to (ib,je) index + n = (iblock-1)*nx_block*ny_block + n = n + (je-1-jb+1)*(ie-ib+1) + j=je + do i=ib,ie + n = n + 1 + if (this_block%i_glob(i) > maxiglob) then + work(n,1) = 0._dbl_kind + endif + end do + endif + end do call compute_sums_dbl(work,sums,communicator,numProcs) @@ -318,7 +338,7 @@ function global_sum_real(array, dist, field_loc, mMask, lMask) & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -326,25 +346,45 @@ function global_sum_real(array, dist, field_loc, mMask, lMask) & n = (iblock-1)*nx_block*ny_block - do j=jb,je - do i=ib,ie - n = n + 1 - ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then - work(n,1) = 0._dbl_kind - else - if (present(mMask)) then - work(n,1) = real(array(i,j,iblock)*mMask(i,j,iblock),dbl_kind) - else if (present(lMask)) then - if (lMask(i,j,iblock)) then - work(n,1) = real(array(i,j,iblock),dbl_kind) - endif - else + if (present(mMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = real(array(i,j,iblock)*mMask(i,j,iblock),dbl_kind) + end do + end do + elseif (present(lMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + if (lMask(i,j,iblock)) then work(n,1) = real(array(i,j,iblock),dbl_kind) endif - endif - end do - end do + end do + end do + else + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = real(array(i,j,iblock),dbl_kind) + enddo + enddo + endif + + if (maxiglob >= 0) then + ! 
eliminate redundant points at je + ! set n to (ib,je) index + n = (iblock-1)*nx_block*ny_block + n = n + (je-1-jb+1)*(ie-ib+1) + j=je + do i=ib,ie + n = n + 1 + if (this_block%i_glob(i) > maxiglob) then + work(n,1) = 0._dbl_kind + endif + end do + endif + end do call compute_sums_dbl(work,sums,communicator,numProcs) @@ -446,7 +486,7 @@ function global_sum_int(array, dist, field_loc, mMask, lMask) & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -457,7 +497,7 @@ function global_sum_int(array, dist, field_loc, mMask, lMask) & do j=jb,je do i=ib,ie ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then + if (maxiglob >= 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then ! blockSum = blockSum + 0_int_kind else if (present(mMask)) then @@ -799,7 +839,7 @@ function global_sum_prod_dbl (array1, array2, dist, field_loc, & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -807,25 +847,45 @@ function global_sum_prod_dbl (array1, array2, dist, field_loc, & n = (iblock-1)*nx_block*ny_block - do j=jb,je - do i=ib,ie - n = n + 1 - ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then - work(n,1) = 0._dbl_kind - else - if (present(mMask)) then - work(n,1) = array1(i,j,iblock)*array2(i,j,iblock)*mMask(i,j,iblock) - else if (present(lMask)) then - if (lMask(i,j,iblock)) then - work(n,1) = array1(i,j,iblock)*array2(i,j,iblock) - endif - else + if (present(mMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = array1(i,j,iblock)*array2(i,j,iblock)*mMask(i,j,iblock) + end do + end do + elseif (present(lMask)) then + do j=jb,je + do i=ib,ie + n = n + 1 + if (lMask(i,j,iblock)) then work(n,1) = array1(i,j,iblock)*array2(i,j,iblock) endif - endif - end do - end do + end do + end do + else + do j=jb,je + do i=ib,ie + n = n + 1 + work(n,1) = array1(i,j,iblock)*array2(i,j,iblock) + enddo + enddo + endif + + if (maxiglob >= 0) then + ! eliminate redundant points at je + ! set n to (ib,je) index + n = (iblock-1)*nx_block*ny_block + n = n + (je-1-jb+1)*(ie-ib+1) + j=je + do i=ib,ie + n = n + 1 + if (this_block%i_glob(i) > maxiglob) then + work(n,1) = 0._dbl_kind + endif + end do + endif + end do call compute_sums_dbl(work,sums,communicator,numProcs) @@ -937,7 +997,7 @@ function global_sum_prod_real (array1, array2, dist, field_loc, & if (Nrow .and. this_block%tripoleTFlag) then maxiglob = 0 ! entire u-row on T-fold grid elseif (Nrow .or. this_block%tripoleTFlag) then - maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold + maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold else maxiglob = -1 ! nothing to do for T-row on u-fold endif @@ -945,25 +1005,45 @@ function global_sum_prod_real (array1, array2, dist, field_loc, & n = (iblock-1)*nx_block*ny_block - do j=jb,je - do i=ib,ie - n = n + 1 - ! eliminate redundant points - if (maxiglob > 0 .and. j == je .and. 
this_block%i_glob(i) > maxiglob) then
-            work(n,1) = 0._dbl_kind
-         else
-            if (present(mMask)) then
-               work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock)*mMask(i,j,iblock),dbl_kind)
-            else if (present(lMask)) then
-               if (lMask(i,j,iblock)) then
-                  work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock),dbl_kind)
-               endif
-            else
+      if (present(mMask)) then
+         do j=jb,je
+         do i=ib,ie
+            n = n + 1
+            work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock)*mMask(i,j,iblock),dbl_kind)
+         end do
+         end do
+      elseif (present(lMask)) then
+         do j=jb,je
+         do i=ib,ie
+            n = n + 1
+            if (lMask(i,j,iblock)) then
                work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock),dbl_kind)
             endif
-         endif
-      end do
-      end do
+         end do
+         end do
+      else
+         do j=jb,je
+         do i=ib,ie
+            n = n + 1
+            work(n,1) = real(array1(i,j,iblock)*array2(i,j,iblock),dbl_kind)
+         enddo
+         enddo
+      endif
+
+      if (maxiglob >= 0) then
+         ! eliminate redundant points at je
+         ! set n to (ib,je) index
+         n = (iblock-1)*nx_block*ny_block
+         n = n + (je-1-jb+1)*(ie-ib+1)
+         j=je
+         do i=ib,ie
+            n = n + 1
+            if (this_block%i_glob(i) > maxiglob) then
+               work(n,1) = 0._dbl_kind
+            endif
+         end do
+      endif
+
    end do
 
    call compute_sums_dbl(work,sums,communicator,numProcs)
 
@@ -1067,7 +1147,7 @@ function global_sum_prod_int (array1, array2, dist, field_loc, &
       if (Nrow .and. this_block%tripoleTFlag) then
          maxiglob = 0 ! entire u-row on T-fold grid
       elseif (Nrow .or. this_block%tripoleTFlag) then
-         maxiglob = nx_global/2 ! half T-row on T-fold and u-row on u-fold
+         maxiglob = nx_global/2 ! half T-row on T-fold or half u-row on u-fold
       else
          maxiglob = -1 ! nothing to do for T-row on u-fold
       endif
@@ -1078,7 +1158,7 @@ function global_sum_prod_int (array1, array2, dist, field_loc, &
       do j=jb,je
       do i=ib,ie
          ! eliminate redundant points
-         if (maxiglob > 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then
+         if (maxiglob >= 0 .and. j == je .and. this_block%i_glob(i) > maxiglob) then
             ! blockSum = blockSum + 0_int_kind
          else
             if (present(mMask)) then
diff --git a/cicecore/drivers/unittest/sumchk/sumchk.F90 b/cicecore/drivers/unittest/sumchk/sumchk.F90
index d9ea72d8c..1a2745aea 100644
--- a/cicecore/drivers/unittest/sumchk/sumchk.F90
+++ b/cicecore/drivers/unittest/sumchk/sumchk.F90
@@ -16,10 +16,10 @@ program sumchk
   use ice_communicate, only: my_task, master_task, get_num_procs
   use ice_domain_size, only: nx_global, ny_global
   use ice_domain_size, only: block_size_x, block_size_y, max_blocks
-  use ice_domain, only: distrb_info
+  use ice_domain, only: distrb_info, ns_boundary_type
   use ice_blocks, only: block, get_block, nx_block, ny_block, nblocks_tot
   use ice_distribution, only: ice_distributionGetBlockID, ice_distributionGet
-  use ice_constants, only: field_loc_center, field_loc_Nface
+  use ice_constants, only: field_loc_center, field_loc_Nface, field_loc_Eface, field_loc_NEcorner
   use ice_fileunits, only: bfbflag
   use ice_global_reductions
   use ice_exit, only: abort_ice, end_run
@@ -113,6 +113,13 @@ program sumchk
      write(6,*) ' block_size_y = ',block_size_y
      write(6,*) ' nblocks_tot  = ',nblocks_tot
      write(6,*) ' '
+     write(6,*) ' Values are generally O(1.), lscale is the relative size of'
+     write(6,*) '   values set in the array to test precision. A pair of equal'
+     write(6,*) '   and opposite values of O(lscale) are placed in the array.'
+     write(6,*) '   "easy" sets the lscaled values at the start of the array so'
+     write(6,*) '   they are added to the sum first. Otherwise, the lscaled values'
+     write(6,*) '   are set near the end of the array to create precision'
+     write(6,*) '   challenges in the global sums.'
      endif
 
 ! ---------------------------
 
@@ -165,7 +172,7 @@ program sumchk
      reldigchk(4,4) = 0.
      reldigchk(5,4) = 15.
      if (nx_global == 360 .and. ny_global == 240) then
-        reldigchk(1:3,1) = 13.
+        reldigchk(1:3,1) = 12.5
         reldigchk(5,4) = 14.
      endif
 #else
@@ -181,7 +188,7 @@ program sumchk
      reldigchk(4,4) = 0.
      reldigchk(5,4) = 15.
      if (nx_global == 360 .and. ny_global == 240) then
-        reldigchk(1:2,1) = 13.
+        reldigchk(1:2,1) = 12.5
         reldigchk(5,4) = 14.
      endif
 #endif
@@ -212,20 +219,22 @@ program sumchk
 ! set corval to something a little interesting (not 1.0 for instance which gives atypical results)
       corval = 4.0_dbl_kind/3.0_dbl_kind
       iocval = 8
-      ! tuned for gx3 and tx1 only
-      if ((nx_global == 100 .and. ny_global == 116) .or. &
-          (nx_global == 360 .and. ny_global == 240)) then
-         if (field_loc(m) == field_loc_Nface .and. nx_global == 360 .and. ny_global == 240) then
-            ! tx1 tripole face, need to adjust local value to remove half of row at ny_global
-            ! or modify corval to account for different sum
-            locval = corval / real((nblocks_tot*(block_size_x*block_size_y-2)-nx_global/2),dbl_kind)
-            corvali = (nblocks_tot*(block_size_x*block_size_y-2)-nx_global/2)*iocval
-         else
-            locval = corval / real(nblocks_tot*(block_size_x*block_size_y-2),dbl_kind)
-            corvali = nblocks_tot*(block_size_x*block_size_y-2)*iocval
-         endif
+      if ((ns_boundary_type == 'tripoleT' .and. field_loc(m) == field_loc_Nface   ) .or. &
+          (ns_boundary_type == 'tripoleT' .and. field_loc(m) == field_loc_NEcorner)) then
+         ! remove full row at ny_global
+         locval = corval / real((nblocks_tot*(block_size_x*block_size_y-2)-nx_global),dbl_kind)
+         corvali = (nblocks_tot*(block_size_x*block_size_y-2)-nx_global)*iocval
+      elseif ((ns_boundary_type == 'tripoleT' .and. field_loc(m) == field_loc_center  ) .or. &
+              (ns_boundary_type == 'tripoleT' .and. field_loc(m) == field_loc_Eface   ) .or. &
+              (ns_boundary_type == 'tripole'  .and. field_loc(m) == field_loc_NEcorner) .or. &
+              (ns_boundary_type == 'tripole'  .and. field_loc(m) == field_loc_Nface   )) then
+         ! remove half of row at ny_global
+         locval = corval / real((nblocks_tot*(block_size_x*block_size_y-2)-nx_global/2),dbl_kind)
+         corvali = (nblocks_tot*(block_size_x*block_size_y-2)-nx_global/2)*iocval
       else
-         call abort_ice(subname//' ERROR not set for this grid ')
+         ! all gridcells
+         locval = corval / real(nblocks_tot*(block_size_x*block_size_y-2),dbl_kind)
+         corvali = nblocks_tot*(block_size_x*block_size_y-2)*iocval
      endif
 
      do l = 1, nscale
@@ -253,18 +262,18 @@ program sumchk
          jb = this_block%jlo
          je = this_block%jhi
 
-         lmask(ie,je-1,iblock) = .false.
-         lmask(ie,je-2,iblock) = .false.
-         arrayA(ie,je-1,iblock) = locval * lscale(l)
+         lmask(ie,je-1,iblock)  = .false.
+         lmask(ie,je-2,iblock)  = .false.
+ arrayA(ie,je-1,iblock) = locval * lscale(l) arrayA(ie,je-2,iblock) = -arrayA(ie,je-1,iblock) - arrayB(ie,je-1,iblock) = locval * lscale(l) + arrayB(ie,je-1,iblock) = locval * lscale(l) arrayB(ie,je-2,iblock) = arrayB(ie,je-1,iblock) arrayC(ib,jb,iblock) = locval * lscale(l) arrayC(ib+1,jb,iblock) = -arrayC(ib,jb,iblock) - arrayiA(:,:,iblock) = iocval - arrayiB(:,:,iblock) = iocval - arrayiA(ie,je-1,iblock) = 13 * iocval - arrayiA(ie,je-2,iblock) = -arrayiA(ie,je-1,iblock) + arrayiA(:,:,iblock) = iocval + arrayiB(:,:,iblock) = iocval + arrayiA(ie,je-1,iblock)= 13 * iocval + arrayiA(ie,je-2,iblock)= -arrayiA(ie,je-1,iblock) enddo do k = 1,ntests1 diff --git a/configuration/scripts/tests/unittest_suite.ts b/configuration/scripts/tests/unittest_suite.ts index 7486e87aa..e64bea2f7 100644 --- a/configuration/scripts/tests/unittest_suite.ts +++ b/configuration/scripts/tests/unittest_suite.ts @@ -5,6 +5,7 @@ unittest gx3 1x1 calchk,short unittest gx3 4x1x25x29x4 sumchk unittest gx3 1x1x25x29x16 sumchk unittest tx1 8x1 sumchk +unittest tx1 8x1 sumchk,tripolet unittest gx3 4x1 bcstchk unittest gx3 1x1 bcstchk unittest gx3 8x2 gridavgchk,dwblockall
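
Illustrative note (appended; not part of the upstream patch). The
global_dot_product change above takes the fast path by summing within each
block, summing the per-block partials locally, and reducing a single scalar.
A minimal standalone sketch of that pattern, assuming a working MPI
environment; names like prod, blksum, and lsum are illustrative, not the
CICE variables:

   program local_sum_sketch
      use mpi
      implicit none
      integer, parameter :: dbl = selected_real_kind(13)   ! 8-byte real
      integer, parameter :: nx = 4, ny = 4, nblocks = 2
      real (kind=dbl) :: prod(nx,ny,nblocks)   ! per-gridpoint products
      real (kind=dbl) :: blksum(nblocks)       ! per-block partial sums
      real (kind=dbl) :: lsum, gsum
      integer :: iblk, ierr

      call MPI_Init(ierr)
      prod = 1.0_dbl                  ! stand-in for the vector1*vector2 terms

      do iblk = 1, nblocks            ! sum within each block first
         blksum(iblk) = sum(prod(:,:,iblk))
      enddo
      lsum = sum(blksum)              ! then over the local blocks

      ! one scalar reduction instead of reducing the full array
      call MPI_Allreduce(lsum, gsum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
                         MPI_COMM_WORLD, ierr)
      print *, 'global dot product =', gsum

      call MPI_Finalize(ierr)
   end program local_sum_sketch

For stricter bfbflag settings or tripole grids, the patch still routes
through the array global_sum, which applies the seam masking shown in
ice_global_reductions.F90.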
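Illustrative note (appended; not part of the upstream patch). The tripoleT
bug fixed above is an off-by-one in the seam masking test: maxiglob = 0
means the entire je row is redundant (u-row on a T-fold grid), nx_global/2
means half of it, and -1 means none. The old test "maxiglob > 0" skipped
the masking entirely when maxiglob == 0, double counting a full row. A
self-contained sketch of the corrected logic, with the loop index i
standing in for this_block%i_glob(i):

   program tripole_mask_sketch
      implicit none
      integer, parameter :: dbl = selected_real_kind(13)
      integer, parameter :: nx_global = 8
      real (kind=dbl) :: row(nx_global)
      integer :: i, maxiglob

      row = 1.0_dbl
      maxiglob = 0      ! tripoleT u-row: every point duplicates the fold

      ! with "maxiglob > 0" this block was skipped for maxiglob == 0;
      ! ">= 0" zeroes the whole redundant row as intended
      if (maxiglob >= 0) then
         do i = 1, nx_global
            if (i > maxiglob) row(i) = 0._dbl
         enddo
      endif

      print *, 'masked row sum =', sum(row)   ! 0.0, as required
   end program tripole_mask_sketch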
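Illustrative note (appended; not part of the upstream patch). The new
sumchk help text describes how the test stresses precision: a pair of
equal-and-opposite values of size lscale either leads the array ("easy")
or sits near its end. A small sketch with illustrative magnitudes showing
why the ordering matters to a naive left-to-right sum:

   program sum_order_sketch
      implicit none
      integer, parameter :: dbl = selected_real_kind(13)
      integer, parameter :: n = 1000
      real (kind=dbl), parameter :: big = 1.0e17_dbl
      real (kind=dbl) :: a(n), b(n)

      a = 1.0_dbl
      b = 1.0_dbl
      a(1)   =  big     ! "easy": the pair cancels exactly at the start
      a(2)   = -big
      b(n-1) =  big     ! "hard": the pair lands after the running sum
      b(n)   = -big

      print *, 'exact         ', real(n-2, dbl)   ! 998.0
      print *, 'easy ordering ', naive_sum(a)     ! 998.0
      print *, 'hard ordering ', naive_sum(b)     ! 992.0 in IEEE double

   contains

      function naive_sum(x) result(s)
         real (kind=dbl), intent(in) :: x(:)
         real (kind=dbl) :: s
         integer :: k
         s = 0.0_dbl
         do k = 1, size(x)
            s = s + x(k)   ! fixed left-to-right order, no compensation
         enddo
      end function naive_sum

   end program sum_order_sketch

Rounding losses of this kind are what the reldigchk digit thresholds in
sumchk are tuned against; the relaxation to 12.5 digits above loosens the
check for the tx1 grid in the same spirit.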