From 02eeb42fef124d530c461fde2e333b75465b39b8 Mon Sep 17 00:00:00 2001 From: Philippe Blain Date: Tue, 12 Jul 2022 10:16:56 -0400 Subject: [PATCH] ice_dyn_vp: use global_{norm,dot_product} for bit-for-bit output reproducibility Make the results of the VP solver reproducible if desired by refactoring the code to use the subroutines 'global_norm' and 'global_dot_product' added in the previous commit. The same pattern appears in the FGMRES solver (subroutine 'fgmres'), the preconditioner 'pgmres' which uses the same algorithm, and the Classical and Modified Gram-Schmidt algorithms in 'orthogonalize'. These modifications do not change the number of global sums in the fgmres, pgmres and the MGS algorithm. For the CGS algorithm, there is (in theory) a slight performance impact as 'global_dot_product' is called inside the loop, whereas previously we called 'global_allreduce_sum' after the loop to compute all 'initer' sums at the same time. To keep that optimization, we would have to implement a new interface 'global_allreduce_sum' which would take an array of shape (nx_block,ny_block,max_blocks,k) and sum over their first three dimensions before performing the global reduction over the k dimension. We choose to not go that route for now mostly because anyway the CGS algorithm is (by default) only used for the PGMRES preconditioner, and so the cost should be relatively low as 'initer' corresponds to 'dim_pgmres' in the namelist, which should be kept low for efficiency (default 5). These changes lead to bit-for-bit reproducibility (the decomp_suite passes) when using 'precond=ident' and 'precond=diag' along with 'bfbflag=reprosum'. 'precond=pgmres' is still not bit-for-bit because some halo updates are skipped for efficiency. This will be addressed in a following commit. [1] https://github.com/CICE-Consortium/CICE/pull/491#discussion_r460147629 --- cicecore/cicedynB/dynamics/ice_dyn_vp.F90 | 140 ++++++---------------- 1 file changed, 39 insertions(+), 101 deletions(-) diff --git a/cicecore/cicedynB/dynamics/ice_dyn_vp.F90 b/cicecore/cicedynB/dynamics/ice_dyn_vp.F90 index 6a135991f..4e0740e15 100644 --- a/cicecore/cicedynB/dynamics/ice_dyn_vp.F90 +++ b/cicecore/cicedynB/dynamics/ice_dyn_vp.F90 @@ -51,7 +51,7 @@ module ice_dyn_vp seabed_stress, Ktens, stack_fields, unstack_fields use ice_fileunits, only: nu_diag use ice_flux, only: fmU - use ice_global_reductions, only: global_sum, global_allreduce_sum + use ice_global_reductions, only: global_sum use ice_grid, only: dxT, dyT, dxhy, dyhx, cxp, cyp, cxm, cym, uarear use ice_exit, only: abort_ice use icepack_intfc, only: icepack_warnings_flush, icepack_warnings_aborted @@ -2925,18 +2925,10 @@ subroutine fgmres (zetax2 , etax2 , & arnoldi_basis_y = c0 ! solution is zero if RHS is zero - !$OMP PARALLEL DO PRIVATE(iblk) - do iblk = 1, nblocks - call calc_L2norm_squared(nx_block , ny_block , & - icellu (iblk), & - indxui (:,iblk), indxuj(:,iblk), & - bx(:,:,iblk) , & - by(:,:,iblk) , & - norm_squared(iblk)) - - enddo - !$OMP END PARALLEL DO - norm_rhs = sqrt(global_sum(sum(norm_squared), distrb_info)) + norm_rhs = global_norm(nx_block, ny_block, & + icellU , & + indxUi , indxUj , & + bx , by ) if (norm_rhs == c0) then solx = bx soly = by @@ -2974,18 +2966,11 @@ subroutine fgmres (zetax2 , etax2 , & ! Start outer (restarts) loop do ! Compute norm of initial residual - !$OMP PARALLEL DO PRIVATE(iblk) - do iblk = 1, nblocks - call calc_L2norm_squared(nx_block , ny_block , & - icellU (iblk), & - indxUi (:,iblk), indxUj(:,iblk), & - arnoldi_basis_x(:,:,iblk, 1) , & - arnoldi_basis_y(:,:,iblk, 1) , & - norm_squared(iblk)) - - enddo - !$OMP END PARALLEL DO - norm_residual = sqrt(global_sum(sum(norm_squared), distrb_info)) + norm_residual = global_norm(nx_block, ny_block, & + icellU , & + indxUi , indxUj , & + arnoldi_basis_x(:,:,:, 1), & + arnoldi_basis_y(:,:,:, 1)) if (my_task == master_task .and. monitor_fgmres) then write(nu_diag, '(a,i4,a,d26.16)') "monitor_fgmres: iter_fgmres= ", nbiter, & @@ -3079,17 +3064,11 @@ subroutine fgmres (zetax2 , etax2 , & hessenberg) ! Compute norm of new Arnoldi vector and update Hessenberg matrix - !$OMP PARALLEL DO PRIVATE(iblk) - do iblk = 1, nblocks - call calc_L2norm_squared(nx_block , ny_block , & - icellU (iblk), & - indxUi (:,iblk), indxUj(:, iblk) , & - arnoldi_basis_x(:,:,iblk, nextit), & - arnoldi_basis_y(:,:,iblk, nextit), & - norm_squared(iblk)) - enddo - !$OMP END PARALLEL DO - hessenberg(nextit,initer) = sqrt(global_sum(sum(norm_squared), distrb_info)) + hessenberg(nextit,initer) = global_norm(nx_block, ny_block, & + icellU , & + indxUi , indxUj , & + arnoldi_basis_x(:,:,:, nextit), & + arnoldi_basis_y(:,:,:, nextit)) ! Watch out for happy breakdown if (.not. almost_zero( hessenberg(nextit,initer) ) ) then @@ -3367,18 +3346,11 @@ subroutine pgmres (zetax2 , etax2 , & ! Start outer (restarts) loop do ! Compute norm of initial residual - !$OMP PARALLEL DO PRIVATE(iblk) - do iblk = 1, nblocks - call calc_L2norm_squared(nx_block , ny_block , & - icellU (iblk), & - indxUi (:,iblk), indxUj(:, iblk), & - arnoldi_basis_x(:,:,iblk, 1), & - arnoldi_basis_y(:,:,iblk, 1), & - norm_squared(iblk)) - - enddo - !$OMP END PARALLEL DO - norm_residual = sqrt(global_sum(sum(norm_squared), distrb_info)) + norm_residual = global_norm(nx_block, ny_block, & + icellU , & + indxUi , indxUj , & + arnoldi_basis_x(:,:,:, 1), & + arnoldi_basis_y(:,:,:, 1)) if (my_task == master_task .and. monitor_pgmres) then write(nu_diag, '(a,i4,a,d26.16)') "monitor_pgmres: iter_pgmres= ", nbiter, & @@ -3461,17 +3433,11 @@ subroutine pgmres (zetax2 , etax2 , & hessenberg) ! Compute norm of new Arnoldi vector and update Hessenberg matrix - !$OMP PARALLEL DO PRIVATE(iblk) - do iblk = 1, nblocks - call calc_L2norm_squared(nx_block , ny_block , & - icellU (iblk), & - indxUi (:,iblk), indxUj(:, iblk) , & - arnoldi_basis_x(:,:,iblk, nextit), & - arnoldi_basis_y(:,:,iblk, nextit), & - norm_squared(iblk)) - enddo - !$OMP END PARALLEL DO - hessenberg(nextit,initer) = sqrt(global_sum(sum(norm_squared), distrb_info)) + hessenberg(nextit,initer) = global_norm(nx_block, ny_block, & + icellU , & + indxUi , indxUj , & + arnoldi_basis_x(:,:,:, nextit), & + arnoldi_basis_y(:,:,:, nextit)) ! Watch out for happy breakdown if (.not. almost_zero( hessenberg(nextit,initer) ) ) then @@ -3750,39 +3716,20 @@ subroutine orthogonalize(ortho_type , initer , & ij , & ! compressed index i, j ! grid indices - real (kind=dbl_kind), dimension (max_blocks) :: & - local_dot ! local array value to accumulate dot product of grid function over blocks - - real (kind=dbl_kind), dimension(maxinner) :: & - dotprod_local ! local array to accumulate several dot product computations - character(len=*), parameter :: subname = '(orthogonalize)' if (trim(ortho_type) == 'cgs') then ! Classical Gram-Schmidt ! Classical Gram-Schmidt orthogonalisation process ! First loop of Gram-Schmidt (compute coefficients) - dotprod_local = c0 do it = 1, initer - local_dot = c0 - - !$OMP PARALLEL DO PRIVATE(iblk,ij,i,j) - do iblk = 1, nblocks - do ij = 1, icellU(iblk) - i = indxUi(ij, iblk) - j = indxUj(ij, iblk) - - local_dot(iblk) = local_dot(iblk) + & - (arnoldi_basis_x(i, j, iblk, it) * arnoldi_basis_x(i, j, iblk, nextit)) + & - (arnoldi_basis_y(i, j, iblk, it) * arnoldi_basis_y(i, j, iblk, nextit)) - enddo ! ij - enddo - !$OMP END PARALLEL DO - - dotprod_local(it) = sum(local_dot) + hessenberg(it, initer) = global_dot_product(nx_block, ny_block, & + icellU , & + indxUi , indxUj , & + arnoldi_basis_x(:,:,:, it) , & + arnoldi_basis_y(:,:,:, it) , & + arnoldi_basis_x(:,:,:, nextit), & + arnoldi_basis_y(:,:,:, nextit)) end do - - hessenberg(1:initer, initer) = global_allreduce_sum(dotprod_local(1:initer), distrb_info) - ! Second loop of Gram-Schmidt (orthonormalize) do it = 1, initer !$OMP PARALLEL DO PRIVATE(iblk,ij,i,j) @@ -3802,22 +3749,13 @@ subroutine orthogonalize(ortho_type , initer , & elseif (trim(ortho_type) == 'mgs') then ! Modified Gram-Schmidt ! Modified Gram-Schmidt orthogonalisation process do it = 1, initer - local_dot = c0 - - !$OMP PARALLEL DO PRIVATE(iblk,ij,i,j) - do iblk = 1, nblocks - do ij = 1, icellU(iblk) - i = indxUi(ij, iblk) - j = indxUj(ij, iblk) - - local_dot(iblk) = local_dot(iblk) + & - (arnoldi_basis_x(i, j, iblk, it) * arnoldi_basis_x(i, j, iblk, nextit)) + & - (arnoldi_basis_y(i, j, iblk, it) * arnoldi_basis_y(i, j, iblk, nextit)) - enddo ! ij - enddo - !$OMP END PARALLEL DO - - hessenberg(it,initer) = global_sum(sum(local_dot), distrb_info) + hessenberg(it, initer) = global_dot_product(nx_block, ny_block, & + icellU , & + indxUi , indxUj , & + arnoldi_basis_x(:,:,:, it) , & + arnoldi_basis_y(:,:,:, it) , & + arnoldi_basis_x(:,:,:, nextit), & + arnoldi_basis_y(:,:,:, nextit)) !$OMP PARALLEL DO PRIVATE(iblk,ij,i,j) do iblk = 1, nblocks