diff --git a/Registry/registry.io_boilerplate b/Registry/registry.io_boilerplate index 3b1c522b72..8f20c4ddf6 100644 --- a/Registry/registry.io_boilerplate +++ b/Registry/registry.io_boilerplate @@ -37,6 +37,7 @@ rconfig logical self_test_domain namelist,time_control 1 rconfig character history_outname namelist,time_control 1 "wrfout_d_" - "name of history outfile" "" "" rconfig character history_inname namelist,time_control 1 "wrfhist_d_" - "name of history infile" "" "" rconfig logical use_netcdf_classic namelist,time_control 1 .false. - "use_netcdf_classic" "" "" +rconfig logical enable_pnetcdf_bput namelist,time_control 1 .false. - "enable_pnetcdf_bput" "" "" rconfig integer history_interval_d namelist,time_control max_domains 0 h "history_interval_d" "" "DAYS" rconfig integer history_interval_h namelist,time_control max_domains 0 h "history_interval_h" "" "HOURS" diff --git a/external/io_pnetcdf/ext_pnc_put_dom_ti.code b/external/io_pnetcdf/ext_pnc_put_dom_ti.code index a86f5c23dd..e4125c191f 100644 --- a/external/io_pnetcdf/ext_pnc_put_dom_ti.code +++ b/external/io_pnetcdf/ext_pnc_put_dom_ti.code @@ -103,7 +103,7 @@ IF ( ncd_ok_to_put_dom_ti( DataHandle ) ) THEN return endif elseif (DH%FileStatus == WRF_FILE_OPENED_FOR_WRITE) then - stat = NFMPI_REDEF(DH%NCID) + call try_redef(DH, stat) call netcdf_err(stat,Status) if(Status /= WRF_NO_ERR) then write(msg,*) & diff --git a/external/io_pnetcdf/field_routines.F90 b/external/io_pnetcdf/field_routines.F90 index 14fb35489c..488cd511b3 100644 --- a/external/io_pnetcdf/field_routines.F90 +++ b/external/io_pnetcdf/field_routines.F90 @@ -33,7 +33,7 @@ !* Date: October 6, 2000 !* !*---------------------------------------------------------------------------- -subroutine ext_pnc_RealFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) +subroutine ext_pnc_RealFieldIO(Coll,IO,NCID,VarID,VStart,VCount,EnableBput,Data,Status) use wrf_data_pnc use ext_pnc_support_routines implicit none @@ -45,16 +45,21 @@ subroutine ext_pnc_RealFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) integer ,intent(in) :: VarID integer ,dimension(NVarDims),intent(in) :: VStart integer ,dimension(NVarDims),intent(in) :: VCount + logical ,intent(in) :: EnableBput real, dimension(*) ,intent(inout) :: Data integer ,intent(out) :: Status integer :: stat !local integer(KIND=MPI_OFFSET_KIND), dimension(NVarDims) :: VStart_mpi, VCount_mpi + integer :: BputReqID VStart_mpi = VStart VCount_mpi = VCount if(IO == 'write') then - if(Coll)then + if(EnableBput)then + ! Calling non-blocking buffered-version API + stat = NFMPI_BPUT_VARA_REAL(NCID,VarID,VStart_mpi,VCount_mpi,Data,BputReqID) + else if(Coll)then stat = NFMPI_PUT_VARA_REAL_ALL(NCID,VarID,VStart_mpi,VCount_mpi,Data) else stat = NFMPI_PUT_VARA_REAL(NCID,VarID,VStart_mpi,VCount_mpi,Data) @@ -74,7 +79,7 @@ subroutine ext_pnc_RealFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) return end subroutine ext_pnc_RealFieldIO -subroutine ext_pnc_DoubleFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) +subroutine ext_pnc_DoubleFieldIO(Coll,IO,NCID,VarID,VStart,VCount,EnableBput,Data,Status) use wrf_data_pnc use ext_pnc_support_routines implicit none @@ -86,16 +91,21 @@ subroutine ext_pnc_DoubleFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) integer ,intent(in) :: VarID integer ,dimension(NVarDims),intent(in) :: VStart integer ,dimension(NVarDims),intent(in) :: VCount + logical ,intent(in) :: EnableBput real*8 ,intent(inout) :: Data integer ,intent(out) :: Status integer :: stat !local integer(KIND=MPI_OFFSET_KIND), dimension(NVarDims) :: VStart_mpi, VCount_mpi + integer :: BputReqID VStart_mpi = VStart VCount_mpi = VCount if(IO == 'write') then - if(Coll)then + if(EnableBput)then + ! Calling non-blocking buffered-version API + stat = NFMPI_BPUT_VARA_DOUBLE(NCID,VarID,VStart_mpi,VCount_mpi,Data,BputReqID) + else if(Coll)then stat = NFMPI_PUT_VARA_DOUBLE_ALL(NCID,VarID,VStart_mpi,VCount_mpi,Data) else stat = NFMPI_PUT_VARA_DOUBLE(NCID,VarID,VStart_mpi,VCount_mpi,Data) @@ -115,7 +125,7 @@ subroutine ext_pnc_DoubleFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) return end subroutine ext_pnc_DoubleFieldIO -subroutine ext_pnc_IntFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) +subroutine ext_pnc_IntFieldIO(Coll,IO,NCID,VarID,VStart,VCount,EnableBput,Data,Status) use wrf_data_pnc use ext_pnc_support_routines implicit none @@ -127,16 +137,21 @@ subroutine ext_pnc_IntFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) integer ,intent(in) :: VarID integer ,dimension(NVarDims),intent(in) :: VStart integer ,dimension(NVarDims),intent(in) :: VCount + logical ,intent(in) :: EnableBput integer ,intent(inout) :: Data integer ,intent(out) :: Status integer :: stat !local integer(KIND=MPI_OFFSET_KIND), dimension(NVarDims) :: VStart_mpi, VCount_mpi + integer :: BputReqID VStart_mpi = VStart VCount_mpi = VCount if(IO == 'write') then - if(Coll)then + if(EnableBput)then + ! Calling non-blocking buffered-version API + stat = NFMPI_BPUT_VARA_INT(NCID,VarID,VStart_mpi,VCount_mpi,Data,BputReqID) + else if(Coll)then stat = NFMPI_PUT_VARA_INT_ALL(NCID,VarID,VStart_mpi,VCount_mpi,Data) else stat = NFMPI_PUT_VARA_INT(NCID,VarID,VStart_mpi,VCount_mpi,Data) @@ -156,7 +171,7 @@ subroutine ext_pnc_IntFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) return end subroutine ext_pnc_IntFieldIO -subroutine ext_pnc_LogicalFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) +subroutine ext_pnc_LogicalFieldIO(Coll,IO,NCID,VarID,VStart,VCount,EnableBput,Data,Status) use wrf_data_pnc use ext_pnc_support_routines implicit none @@ -168,6 +183,7 @@ subroutine ext_pnc_LogicalFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) integer ,intent(in) :: VarID integer,dimension(NVarDims) ,intent(in) :: VStart integer,dimension(NVarDims) ,intent(in) :: VCount + logical ,intent(in) :: EnableBput logical,dimension(VCount(1),VCount(2),VCount(3)),intent(inout) :: Data integer ,intent(out) :: Status integer,dimension(:,:,:),allocatable :: Buffer @@ -175,6 +191,7 @@ subroutine ext_pnc_LogicalFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) integer :: i,j,k !local integer(KIND=MPI_OFFSET_KIND), dimension(NVarDims) :: VStart_mpi, VCount_mpi + integer :: BputReqID VStart_mpi = VStart VCount_mpi = VCount @@ -197,7 +214,10 @@ subroutine ext_pnc_LogicalFieldIO(Coll,IO,NCID,VarID,VStart,VCount,Data,Status) enddo enddo enddo - if(Coll)then + if(EnableBput)then + ! Calling non-blocking buffered-version API + stat = NFMPI_BPUT_VARA_INT(NCID,VarID,VStart_mpi,VCount_mpi,Buffer,BputReqID) + else if(Coll)then stat = NFMPI_PUT_VARA_INT_ALL(NCID,VarID,VStart_mpi,VCount_mpi,Buffer) else stat = NFMPI_PUT_VARA_INT(NCID,VarID,VStart_mpi,VCount_mpi,Buffer) diff --git a/external/io_pnetcdf/wrf_io.F90 b/external/io_pnetcdf/wrf_io.F90 index 9d9c3733b4..66fc09965b 100644 --- a/external/io_pnetcdf/wrf_io.F90 +++ b/external/io_pnetcdf/wrf_io.F90 @@ -91,6 +91,16 @@ module wrf_data_pnc ! Whether pnetcdf file is in collective (.true.) or independent mode ! Collective mode is the default. logical :: Collective + + ! If BputEnabled is set to .true. then PnetCDF bput calls should be used instead + ! of PnetCDF put calls. + ! It is also (always) true that: BputEnabled is set to .true. if + ! and only if a buffer is correctly attached to the file. (invariant) + logical :: BputEnabled = .false. + + ! If isDefineMode is set to .true. then the file is in define mode. + logical :: isDefineMode = .true. + end type wrf_data_handle type(wrf_data_handle),target :: WrfDataHandles(WrfDataHandleMax) end module wrf_data_pnc @@ -208,6 +218,7 @@ subroutine allocHandle(DataHandle,DH,Comm,Status) DH%Write =.false. DH%first_operation = .TRUE. DH%Collective = .TRUE. + DH%isDefineMode = .TRUE. Status = WRF_NO_ERR end subroutine allocHandle @@ -370,6 +381,7 @@ subroutine GetTimeIndex(IO,DataHandle,DateStr,TimeIndex,Status) integer(KIND=MPI_OFFSET_KIND) :: VCount(2) integer :: stat integer :: i + integer :: BputReqID, MPIRank DH => WrfDataHandles(DataHandle) call DateCheck(DateStr,Status) @@ -401,7 +413,15 @@ subroutine GetTimeIndex(IO,DataHandle,DateStr,TimeIndex,Status) VStart(2) = TimeIndex VCount(1) = DateStrLen VCount(2) = 1 - stat = NFMPI_PUT_VARA_TEXT_ALL(DH%NCID,DH%TimesVarID,VStart,VCount,DateStr) + CALL MPI_COMM_RANK(DH%Comm, MPIRank, stat) + if (DH%BputEnabled) then + if (MPIRank == 0) then ! only rank 0 calls BPUT_VARA_TEXT, since this var is not partitioned. + ! Calling non-blocking buffered-version API + stat = NFMPI_BPUT_VARA_TEXT(DH%NCID, DH%TimesVarID, VStart, VCount, DateStr, BputReqID) + endif + else + stat = NFMPI_PUT_VARA_TEXT_ALL(DH%NCID,DH%TimesVarID,VStart,VCount,DateStr) + endif call netcdf_err(stat,Status) if(Status /= WRF_NO_ERR) then write(msg,*) 'NetCDF error in ',__FILE__,', line', __LINE__ @@ -661,7 +681,7 @@ subroutine netcdf_err(err,Status) end subroutine netcdf_err subroutine FieldIO(IO,DataHandle,DateStr,Starts,Length,MemoryOrder & - ,FieldType,NCID,VarID,XField,Status) + ,FieldType,NCID,VarID,IsPartitioned,XField,Status) use wrf_data_pnc include 'wrf_status_codes.h' # include "pnetcdf.inc" @@ -674,12 +694,15 @@ subroutine FieldIO(IO,DataHandle,DateStr,Starts,Length,MemoryOrder & integer ,intent(in) :: FieldType integer ,intent(in) :: NCID integer ,intent(in) :: VarID + logical ,intent(in) :: IsPartitioned integer,dimension(*) ,intent(inout) :: XField integer ,intent(out) :: Status integer :: TimeIndex integer :: NDim integer,dimension(NVarDims) :: VStart integer,dimension(NVarDims) :: VCount + integer :: MPIRank + type(wrf_data_handle) ,pointer :: DH call GetTimeIndex(IO,DataHandle,DateStr,TimeIndex,Status) if(Status /= WRF_NO_ERR) then @@ -697,19 +720,33 @@ subroutine FieldIO(IO,DataHandle,DateStr,Starts,Length,MemoryOrder & VCount(1:NDim) = Length(1:NDim) VStart(NDim+1) = TimeIndex VCount(NDim+1) = 1 + DH => WrfDataHandles(DataHandle) + + ! when io mode is "write", and the variable is not + ! partitioned (meaning every process writes the same contents + ! to the same region), then only rank 0 writes. + IF (IO=="write" .AND. .NOT.IsPartitioned) THEN + CALL MPI_COMM_RANK(DH%Comm, MPIRank, Status) + IF ( DH%BputEnabled .AND. MPIRank /= 0) THEN + RETURN + ELSE IF (MPIRank /= 0) THEN + VCount(:) = 0 + ENDIF + ENDIF + select case (FieldType) case (WRF_REAL) - call ext_pnc_RealFieldIO (WrfDataHandles(DataHandle)%Collective, & - IO,NCID,VarID,VStart,VCount,XField,Status) + call ext_pnc_RealFieldIO (DH%Collective,IO,NCID,VarID,& + VStart,VCount,DH%BputEnabled,XField,Status) case (WRF_DOUBLE) - call ext_pnc_DoubleFieldIO (WrfDataHandles(DataHandle)%Collective, & - IO,NCID,VarID,VStart,VCount,XField,Status) + call ext_pnc_DoubleFieldIO (DH%Collective,IO,NCID,VarID,& + VStart,VCount,DH%BputEnabled,XField,Status) case (WRF_INTEGER) - call ext_pnc_IntFieldIO (WrfDataHandles(DataHandle)%Collective, & - IO,NCID,VarID,VStart,VCount,XField,Status) + call ext_pnc_IntFieldIO (DH%Collective,IO,NCID,VarID,& + VStart,VCount,DH%BputEnabled,XField,Status) case (WRF_LOGICAL) - call ext_pnc_LogicalFieldIO (WrfDataHandles(DataHandle)%Collective, & - IO,NCID,VarID,VStart,VCount,XField,Status) + call ext_pnc_LogicalFieldIO (DH%Collective,IO,NCID,VarID,& + VStart,VCount,DH%BputEnabled,XField,Status) if(Status /= WRF_NO_ERR) return case default !for wrf_complex, double_complex @@ -888,8 +925,97 @@ LOGICAL FUNCTION ncd_is_first_operation( DataHandle ) RETURN END FUNCTION ncd_is_first_operation +! enter define mode if not already in define mode +! do nothing if already in define mode +subroutine try_redef(DH, stat) + use wrf_data_pnc + use pnetcdf + type(wrf_data_handle),pointer :: DH + integer ,intent(out) :: stat + stat = 0 + if (.NOT. DH%isDefineMode) then + ! return value (stat) is checked outside of this routine + stat = NFMPI_REDEF(DH%NCID) + endif + DH%isDefineMode = .true. +end subroutine try_redef + +! exit define mode if in define mode +! do nothing if not in define mode +subroutine try_enddef(DH, stat) + use wrf_data_pnc + use pnetcdf + type(wrf_data_handle),pointer :: DH + integer ,intent(out) :: stat + stat = 0 + if (DH%isDefineMode) then + ! return value (stat) is checked outside of this routine + stat = NFMPI_ENDDEF(DH%NCID) + endif + DH%isDefineMode = .false. +end subroutine try_enddef + end module ext_pnc_support_routines +! ext_pnc_bput_set_buffer_size: +! Tell PnetCDF the size of buffer to be used by PnetCDF bput calls internally. +subroutine ext_pnc_bput_set_buffer_size(hndl, bput_buffer_size) + use wrf_data_pnc + use ext_pnc_support_routines + use pnetcdf + implicit none + include 'wrf_status_codes.h' + integer, INTENT(IN) :: hndl + integer(kind=8), INTENT(IN) :: bput_buffer_size + type(wrf_data_handle), pointer :: DH + integer :: ierr=0, status=0 + + call GetDH(hndl,DH,ierr) + + if (bput_buffer_size > 0) then + if(.NOT. DH%BputEnabled) then ! if buffer is not yet attached + ierr = NFMPI_BUFFER_ATTACH(DH%NCID, bput_buffer_size) + DH%BputEnabled = .true. + endif + + ! check error + call netcdf_err(ierr,status) + if(status /= WRF_NO_ERR) then + write(msg,*) 'NetCDF error in ',__FILE__,', line', __LINE__ + call wrf_debug ( WARN , TRIM(msg)) + return + endif + else + DH%BputEnabled = .false. + endif +end subroutine ext_pnc_bput_set_buffer_size + +! ext_pnc_bput_wait: +! Flush all cached reqs to the file. Wait/block till finished. +subroutine ext_pnc_bput_wait(hndl) + use wrf_data_pnc + use ext_pnc_support_routines + use pnetcdf + implicit none + include 'wrf_status_codes.h' + integer, INTENT(IN) :: hndl + type(wrf_data_handle), pointer :: DH + integer :: ierr, status, dummy(0) + + call GetDH(hndl,DH,ierr) + if (DH%BputEnabled) then + ierr = NFMPI_WAIT_ALL(DH%NCID, NF_REQ_ALL, dummy, dummy) + + ! check error + call netcdf_err(ierr,status) + if(status /= WRF_NO_ERR) then + write(msg,*) 'NetCDF error in ',__FILE__,', line', __LINE__ + call wrf_debug(WARN, TRIM(msg)) + return + endif + endif +end subroutine ext_pnc_bput_wait + subroutine ext_pnc_open_for_read(DatasetName, Comm1, Comm2, SysDepInfo, DataHandle, Status) use wrf_data_pnc use ext_pnc_support_routines @@ -1363,13 +1489,6 @@ SUBROUTINE ext_pnc_open_for_write_commit(DataHandle, Status) call wrf_debug ( WARN , TRIM(msg)) return endif - stat = NFMPI_ENDDEF(DH%NCID) - call netcdf_err(stat,Status) - if(Status /= WRF_NO_ERR) then - write(msg,*) 'NetCDF error (',stat,') from NFMPI_ENDDEF in ext_pnc_open_for_write_commit ',__FILE__,', line', __LINE__ - call wrf_debug ( WARN , TRIM(msg)) - return - endif DH%FileStatus = WRF_FILE_OPENED_FOR_WRITE DH%first_operation = .TRUE. return @@ -1413,6 +1532,20 @@ subroutine ext_pnc_ioclose(DataHandle, Status) return endif + ! Detach bput buffer before file close + if (DH%BputEnabled) then + stat = NFMPI_BUFFER_DETACH(DH%NCID) + + ! check error + call netcdf_err(stat,Status) + if(Status /= WRF_NO_ERR) then + write(msg,*) 'NetCDF error in ext_pnc_ioclose: buffer detach ',__FILE__,', line', __LINE__ + call wrf_debug ( WARN , TRIM(msg)) + return + endif + DH%BputEnabled = .false. + endif + stat = NFMPI_CLOSE(DH%NCID) call netcdf_err(stat,Status) if(Status /= WRF_NO_ERR) then @@ -1420,6 +1553,7 @@ subroutine ext_pnc_ioclose(DataHandle, Status) call wrf_debug ( WARN , TRIM(msg)) return endif + DH%isDefineMode = .true. CALL deallocHandle( DataHandle, Status ) DH%Free=.true. return @@ -1509,7 +1643,7 @@ subroutine ext_pnc_redef( DataHandle, Status) call wrf_debug ( FATAL , TRIM(msg)) return endif - stat = NFMPI_REDEF(DH%NCID) + call try_redef(DH, stat) call netcdf_err(stat,Status) if(Status /= WRF_NO_ERR) then write(msg,*) 'NetCDF error in ',__FILE__,', line', __LINE__ @@ -1557,7 +1691,7 @@ subroutine ext_pnc_enddef( DataHandle, Status) call wrf_debug ( FATAL , TRIM(msg)) return endif - stat = NFMPI_ENDDEF(DH%NCID) + call try_enddef(DH, stat) call netcdf_err(stat,Status) if(Status /= WRF_NO_ERR) then write(msg,*) 'NetCDF error in ',__FILE__,', line', __LINE__ @@ -2351,6 +2485,8 @@ subroutine ext_pnc_write_field(DataHandle,DateStr,Var,Field,FieldType,Comm, & logical :: quilting ! Local, possibly adjusted, copies of MemoryStart and MemoryEnd integer ,dimension(NVarDims) :: lMemoryStart, lMemoryEnd + ! IsPartitioned is set to .true. if the variable is partitioned among process. + logical :: IsPartitioned = .false. MemoryOrder = trim(adjustl(MemoryOrdIn)) NullName=char(0) call GetDim(MemoryOrder,NDim,Status) @@ -2597,10 +2733,18 @@ subroutine ext_pnc_write_field(DataHandle,DateStr,Var,Field,FieldType,Comm, & ,i1,i2,j1,j2,k1,k2 ) END IF + IsPartitioned = .false. + DO i=1, NDim + IF (DomainStart(i) .NE. PatchStart(i) .OR. DomainEnd(i) .NE. PatchEnd(i)) THEN + IsPartitioned = .true. + EXIT + ENDIF + ENDDO + StoredStart(1:NDim) = PatchStart(1:NDim) call ExtOrder(MemoryOrder,StoredStart,Status) call FieldIO('write',DataHandle,DateStr,StoredStart,Length,MemoryOrder, & - FieldType,NCID,VarID,XField,Status) + FieldType,NCID,VarID,IsPartitioned,XField,Status) if(Status /= WRF_NO_ERR) then write(msg,*) 'Warning Status = ',Status,' in ',__FILE__,', line', __LINE__ call wrf_debug ( WARN , TRIM(msg)) @@ -2850,8 +2994,10 @@ subroutine ext_pnc_read_field(DataHandle,DateStr,Var,Field,FieldType,Comm, & call wrf_debug ( FATAL , msg) return endif + ! IsPartitioned will only take effect for write reqs. + ! In case of read, fix IsPartitioned to .true. (insensitive to the actual value) call FieldIO('read',DataHandle,DateStr,StoredStart,Length,MemoryOrder, & - FieldType,NCID,VarID,XField,Status) + FieldType,NCID,VarID,.true.,XField,Status) if(Status /= WRF_NO_ERR) then write(msg,*) 'Warning Status = ',Status,' in ',__FILE__,', line', __LINE__ call wrf_debug ( WARN , TRIM(msg)) diff --git a/frame/module_io.F b/frame/module_io.F index fc450c1d83..4d732667d6 100644 --- a/frame/module_io.F +++ b/frame/module_io.F @@ -39,6 +39,8 @@ MODULE module_io ! are_bdys_distributed, bdys_are_distributed and ! bdys_not_distributed routines access this flag CHARACTER*256 extradims + INTEGER(kind=8) :: bput_buffer_size_history = -1 + INTEGER(kind=8) :: bput_buffer_size_restart = -1 ! !
@@ -71,6 +73,52 @@ INTEGER FUNCTION io_form_for_stream ( stream )
     RETURN
   END FUNCTION io_form_for_stream
 
+! BputSetBufferSize:
+! Set the buffer size used by PnetCDF bput calls.
+SUBROUTINE BputSetBufferSize(fid, bput_buffer_size)
+  INTEGER, INTENT(IN) :: fid
+  INTEGER(KIND=8), INTENT(IN) :: bput_buffer_size
+  INTEGER :: Hndl, Hopened
+  LOGICAL :: for_out
+#ifdef PNETCDF
+  call get_handle(Hndl, Hopened, for_out, fid)
+  call ext_pnc_bput_set_buffer_size(Hndl, bput_buffer_size)
+#endif
+END SUBROUTINE BputSetBufferSize
+
+! BputWait:
+! Flush all cached reqs to the file. Wait/block till finished.
+SUBROUTINE BputWait(fid)
+  INTEGER, INTENT(IN) :: fid
+  INTEGER :: Hndl, Hopened
+  LOGICAL :: for_out
+#ifdef PNETCDF
+  call get_handle(Hndl, Hopened, for_out, fid)
+  call ext_pnc_bput_wait(Hndl)
+#endif
+END SUBROUTINE BputWait
+
+! An helper routine to call enddef for PnetCDF and NetCDF(PAR).
+! So that define mode is only entered once. (This is an optimization)
+! Currently the optimization is only implemented for PnetCDF.
+SUBROUTINE wrf_enddef(fid)
+  INTEGER, INTENT(IN) :: fid
+  INTEGER :: Hndl, io_form, status
+  LOGICAL :: for_out
+  INTEGER, EXTERNAL           :: use_package
+
+  call get_handle(Hndl, io_form, for_out, fid)
+
+  SELECT CASE (use_package(io_form))
+
+#ifdef PNETCDF
+    CASE(io_pnetcdf)
+      call ext_pnc_enddef(Hndl, status)
+#endif
+
+  ENDSELECT
+END SUBROUTINE wrf_enddef
+
 !--- ioinit
 
 SUBROUTINE wrf_ioinit( Status )
diff --git a/share/output_wrf.F b/share/output_wrf.F
index 3cec620bc7..6924c982f3 100644
--- a/share/output_wrf.F
+++ b/share/output_wrf.F
@@ -103,6 +103,13 @@ SUBROUTINE output_wrf ( fid , grid , config_flags, switch , ierr )
 
     LOGICAL, EXTERNAL :: wrf_dm_on_monitor
 
+    ! bput_enabled is used to determine if non-blocking pnetcdf (bput) should be used for PnetCDF.
+    ! to use non-blocking bput:
+    !       1) enable_pnetcdf_bput = .true. in namelist.input
+    !       2) io_form_history=11 or io_form_restart=11 (or both=11) in namelist.input
+    !       3) not a dryrun
+    LOGICAL :: bput_enabled
+
     WRITE(wrf_err_message,*)'output_wrf: begin, fid = ',fid
     CALL wrf_debug( 300 , wrf_err_message )
 
@@ -1002,9 +1009,31 @@ SUBROUTINE output_wrf ( fid , grid , config_flags, switch , ierr )
     ENDIF
 #endif
 
+    IF (.NOT. dryrun) THEN
+      call wrf_enddef(fid)
+    ENDIF
+
     IF ( (first_input   .LE. switch .AND. switch .LE. last_input) .OR. &
          (first_history .LE. switch .AND. switch .LE. last_history ) .OR. &
           switch .EQ. restart_only    ) THEN
+
+      bput_enabled = grid%enable_pnetcdf_bput .AND. (.NOT.dryrun)
+      ! Calculate and set bput buffer size for history files.
+      IF (switch .EQ. history_only) THEN
+        bput_enabled = bput_enabled .AND. (grid%io_form_history == 11)
+        IF (bput_enabled) THEN
+          CALL BputCalcBufferSize(grid, switch, bput_buffer_size_history)
+          call BputSetBufferSize(fid, bput_buffer_size_history)
+        ENDIF
+      ! Calculate and set bput buffer size for restart files.
+      ELSE IF (switch .EQ. restart_only) THEN
+        bput_enabled = bput_enabled .AND. (grid%io_form_restart == 11)
+        IF (bput_enabled) THEN
+          CALL BputCalcBufferSize(grid, switch, bput_buffer_size_restart)
+          call BputSetBufferSize(fid, bput_buffer_size_restart)
+        ENDIF
+      ENDIF
+
       newswitch = switch
       p => grid%head_statevars%next
       CALL wrf_start_io_timestep(fid, ierr)
@@ -1536,6 +1565,10 @@ SUBROUTINE output_wrf ( fid , grid , config_flags, switch , ierr )
       ENDIF
     ENDIF
 
+    IF (bput_enabled) THEN
+      call BputWait(fid) ! wait till all PnetCDF bput calls (on this file) are finished.
+    ENDIF
+
 #if ( (EM_CORE == 1) && (DA_CORE != 1) )
       grid%save_topo_from_real = save_topo_orig
 #endif
@@ -1596,3 +1629,210 @@ SUBROUTINE traverse_statevars_debug (s,l)
     RETURN
   END SUBROUTINE traverse_statevars_debug
 
+  ! BputCalcBufferSize:
+  ! Calculate the buffer size needed by PnetCDF bput calls.
+  ! The buffer size should equal the total amount of writes (excluding headers)
+  SUBROUTINE BputCalcBufferSize(grid, switch, totalSize)
+    USE module_domain_type, ONLY : fieldlist
+    USE module_domain
+    USE module_io
+    IMPLICIT NONE
+    INTEGER, PARAMETER :: DateStrLen = 19
+    TYPE(domain), INTENT(IN) :: grid
+    INTEGER, INTENT(IN) :: switch ! history files or restart files
+    INTEGER(kind=8), INTENT(INOUT):: totalSize ! the buffer size to return
+    ! temp variables
+    INTEGER :: gridSize = 0, typeSize = 4
+    INTEGER :: newSwitch, itrace
+    TYPE(fieldlist), POINTER :: p
+#ifdef PNETCDF
+    IF (totalSize > -1) THEN ! size already calculated, no need to repeat.
+      RETURN
+    ENDIF
+    p => grid%head_statevars%next
+
+    ! init variables
+    newSwitch = switch
+    totalSize = 0
+    ! The following codes are borrowed from share/output_wrf.F: subroutine "output_wrf"
+    !
+    ! The subroutine "output_wrf" is to write each variable to file. The implementation
+    ! logic of "output_wrf" is that:
+    !
+    !   Iterate over all variables (do while loop):
+    !       Some "if" conditions:
+    !           write/output the variable (of size amoutX) to file
+    !       End IF
+    !   End loop.
+    !
+    ! This subroutine "BputCalcBufferSize" is to return the buffer size (i.e. the total
+    ! amount of writes). The logic here is to replace "write/output the variable to file"
+    ! in the original logic with "increment total size":
+    !
+    !   totalSize = 0
+    !   Iterate over all variables (do while loop):
+    !       Some "if" conditions:
+    !           totalSize = totalSize + amoutX
+    !       End IF
+    !   End loop.
+    DO WHILE ( ASSOCIATED( p ) )
+      IF ( p%ProcOrient .NE. 'X' .AND. p%ProcOrient .NE. 'Y' ) THEN
+        IF (p%Ndim .EQ. 0)  THEN
+          IF ((p%Restart.AND.switch.EQ.restart_only).OR.on_stream(p%streams,newSwitch)) THEN
+            IF ( in_use_for_config(grid%id,TRIM(p%VarName)) ) THEN
+              ! increment totalSize
+              CALL BputInqSizeOfType(p%Type, typeSize)
+              totalSize = totalSize + typeSize ! grid szie=1 when p%Ndim == 0
+            ENDIF
+          ENDIF
+        ELSE IF (p%Ndim .EQ. 1) THEN
+          IF ((p%Restart.AND.switch.EQ.restart_only).OR.on_stream(p%streams,newSwitch)) THEN
+            IF ( in_use_for_config(grid%id,TRIM(p%VarName)) ) THEN
+              IF (switch.EQ.restart_only.OR.p%Ntl/100.EQ.mod(p%Ntl,100)) THEN
+                ! increment totalSize
+                CALL BputCalcGridSize(p, gridSize) ! calculate grid size
+                IF (gridSize >= 0) THEN
+                  CALL BputInqSizeOfType(p%Type, typeSize)
+                  totalSize = totalSize + gridSize * typeSize
+                ENDIF
+              ENDIF
+            ENDIF
+          ENDIF
+        ELSE IF (p%Ndim .EQ. 2) THEN
+          IF ((p%Restart.AND.switch.EQ.restart_only).OR.on_stream(p%streams,newSwitch)) THEN
+            IF ( in_use_for_config(grid%id,TRIM(p%VarName)) .AND.  &
+              ( .NOT. p%subgrid_x .OR. (p%subgrid_x .AND. grid%sr_x .GT. 0) ) .AND. &
+              ( .NOT. p%subgrid_y .OR. (p%subgrid_y .AND. grid%sr_y .GT. 0) )       &
+            ) THEN
+              IF (switch.EQ.restart_only.OR.p%Ntl/100.EQ.mod(p%Ntl,100)) THEN
+                ! increment totalSize
+                CALL BputCalcGridSize(p, gridSize) ! calculate grid size
+                IF (gridSize >= 0) THEN
+                  CALL BputInqSizeOfType(p%Type, typeSize)
+                  totalSize = totalSize + gridSize * typeSize
+                ENDIF
+              ENDIF
+            ENDIF
+          ENDIF
+        ELSE IF (p%Ndim .EQ. 3) THEN
+          IF ((p%Restart.AND.switch.EQ.restart_only).OR.on_stream(p%streams,newSwitch)) THEN
+            IF ( in_use_for_config(grid%id,TRIM(p%VarName)) .AND.  &
+              ( .NOT. p%subgrid_x .OR. (p%subgrid_x .AND. grid%sr_x .GT. 0) ) .AND. &
+              ( .NOT. p%subgrid_y .OR. (p%subgrid_y .AND. grid%sr_y .GT. 0) )       &
+            ) THEN
+              IF (switch.EQ.restart_only.OR.p%Ntl/100.EQ.mod(p%Ntl,100)) THEN
+                ! increment totalSize
+                CALL BputCalcGridSize(p, gridSize) ! calculate grid size
+                IF (gridSize >= 0) THEN
+                  CALL BputInqSizeOfType(p%Type, typeSize)
+                  totalSize = totalSize + gridSize * typeSize
+                ENDIF
+              ENDIF
+            ENDIF
+          ENDIF
+        ELSE IF (p%Ndim .EQ. 4 .AND. p%scalar_array ) THEN
+          IF (switch.EQ.restart_only.OR.p%Ntl/100.EQ.mod(p%Ntl,100)) THEN
+            DO itrace = PARAM_FIRST_SCALAR , p%num_table(grid%id)
+              IF ((p%Restart.AND.switch.EQ.restart_only).OR.on_stream(p%streams_table(grid%id,itrace)%stream,newSwitch)) THEN
+                ! increment totalSize
+                CALL BputCalcGridSize(p, gridSize) ! calculate grid size
+                IF (gridSize >= 0) THEN
+                  CALL BputInqSizeOfType(p%Type, typeSize)
+                  totalSize = totalSize + gridSize * typeSize
+                ENDIF
+              ENDIF
+            ENDDO
+          ENDIF
+        ENDIF
+      ENDIF
+      p => p%next
+    ENDDO
+    ! MPI Process 0 also needs to write to a variable called TimesVariable.
+    ! This variable has fix length 19 (DateStrLen). (external/io_pnetcdf/wrf_io.F90: GetTimeIndex)
+    ! Here we give 19 (DateStrLen) extra bytes to all MPI processes instead
+    ! of just giving to process 0, so that we can skip MPI rank check here to
+    ! simplify the implementation.
+    totalSize = totalSize + DateStrLen
+    contains
+    ! BputCalcGridSize:
+    ! Returns the size of the grid.
+    ! For example,  if it is a 2D 30 x 40 grid, then return size=1200 (=30x40).
+    ! if it is a 3D 10 x 30 x 40 grid, then return size=12000 (=10x30x40).
+    SUBROUTINE BputCalcGridSize(ptr, sizeOut)
+      USE module_domain_type, ONLY : fieldlist
+      TYPE(fieldlist), INTENT(INOUT), POINTER :: ptr
+      INTEGER, INTENT(OUT) :: sizeOut
+      INTEGER :: ndim, ierr=0
+      ! Whether it is a 0D/1D/2D/3D grid.
+      CALL BputInqDim(TRIM(ptr%MemoryOrder), ndim, ierr)
+      IF (ierr < 0) THEN
+        sizeOut = -1
+        return
+      ENDIF
+      ! ptr%sp[i] and prt%ep[i] store the start and end position of
+      ! grid along the i^th dimension, both sides inclusive.
+      sizeOut = 0
+      IF (ndim .EQ. 0) THEN
+        sizeOut = 1
+      ELSE IF (ndim .EQ. 1) THEN
+        IF (ptr%ep1 - ptr%sp1 >= 0) &
+        sizeOut = (ptr%ep1 - ptr%sp1 + 1)
+      ELSE IF (ndim .EQ. 2) THEN
+        IF (ptr%ep1 - ptr%sp1 >= 0 .AND. ptr%ep2 - ptr%sp2 >= 0) &
+        sizeOut = (ptr%ep1 - ptr%sp1 + 1) * (ptr%ep2 - ptr%sp2 + 1)
+      ELSE IF (ndim .EQ. 3) THEN
+        IF (ptr%ep1 - ptr%sp1 >= 0 .AND. ptr%ep2 - ptr%sp2 >= 0 .AND. ptr%ep3 - ptr%sp3 >= 0) &
+        sizeOut = (ptr%ep1 - ptr%sp1 + 1) * (ptr%ep2 - ptr%sp2 + 1) * (ptr%ep3 - ptr%sp3 + 1)
+      ENDIF
+      return
+    END SUBROUTINE BputCalcGridSize
+    ! BputInqSizeOfType:
+    ! Returns: how many bytes needed for one variable having this type.
+    SUBROUTINE BputInqSizeOfType(Type, Size)
+      CHARACTER*1, INTENT(IN) :: Type
+      INTEGER, INTENT(OUT) :: Size
+      ! LOGICAL :: temp_logical
+      REAL :: temp_real
+      REAL*8 :: temp_double
+
+      Size = 1
+      IF (Type .EQ. 'r') THEN
+        Size = SIZEOF(temp_real)
+      ELSE IF (Type .EQ. 'd') THEN
+        Size = SIZEOF(temp_double)
+      ELSE IF (Type .EQ. 'i') THEN
+        Size = SIZEOF(Size)
+      ELSE IF (Type .EQ. 'l') THEN
+        Size = SIZEOF(Size) ! integer is used for logic type
+      ENDIF
+      RETURN
+    END SUBROUTINE BputInqSizeOfType
+    ! BputInqDim:
+    ! Returns the number of dimension of the grid
+    ! i.e. whether it is a 0D/1D/2D/3D grid.
+    ! implementation is borrowed from external/io_pnetcdf/wrf_io.F90: GetDim
+    SUBROUTINE BputInqDim(MemoryOrder,NDim,Status)
+      CHARACTER*(*) ,INTENT(IN)  :: MemoryOrder
+      INTEGER       ,INTENT(OUT) :: NDim
+      INTEGER       ,INTENT(OUT) :: Status
+      CHARACTER*3                :: MemOrd
+      CALL lower_case(MemoryOrder,MemOrd) ! Convert to lower case
+      SELECT CASE (MemOrd)
+        CASE ('xyz','xzy','yxz','yzx','zxy','zyx','xsz','xez','ysz','yez')
+          NDim = 3
+        CASE ('xy','yx','xs','xe','ys','ye', 'cc')
+          NDim = 2
+        CASE ('z','c')
+          NDim = 1
+        CASE ('0')  ! NDim=0 for scalars.  TBH:  20060502
+          NDim = 0
+        CASE default
+          print *, 'memory order = ',MemOrd,'  ',MemoryOrder
+          Status = -1
+          return
+        END SELECT
+      Status = 0
+      RETURN
+    END SUBROUTINE BputInqDim
+#endif
+  END SUBROUTINE BputCalcBufferSize
diff --git a/var/build/da_name_space.pl b/var/build/da_name_space.pl
index 170fe0315a..d0c1a94a56 100755
--- a/var/build/da_name_space.pl
+++ b/var/build/da_name_space.pl
@@ -196,4 +196,5 @@
 is_this_data_ok_to_use
 check_which_switch
 med_read_qna_emissions
+BputCalcBufferSize
 ###########################################################################################