Remove GPUClock Cost Function
Removing the GPUClock cost function for the following reasons.

Incomplete implementation:
The instrumentation was only added to selected kernels, and even if it
were added to all kernels, it is not generalized to handle the varying
occupancy of different kernels. The implementation is also verbose.

Unused:
Over the last years, our host-side timer implementation was extended to
synchronize kernels at minimal overhead. That timer approach and the
heuristic are what is actually used.

Research scope shifted:
In recent years we realized that we do not need more precise scalar
cost functions, but rather vector cost functions from which to build
better load-balance performance models.

Costly when used:
The implementation performs an atomic add per thread in each
instrumented kernel, instead of, e.g., one per warp. This adds severe
memory-bandwidth strain (see the sketch after this list).

Costly, even if not used:
Once compiled in (the default), the implementation adds about 4
unnecessary registers to every instrumented GPU kernel.
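
As a rough illustration of the pattern criticized above, here is a minimal, hypothetical CUDA sketch (not the removed WarpX/ABLASTR code itself): every thread reads the GPU clock at kernel start and end and atomically accumulates its elapsed cycles into a single counter.

// gpuclock_sketch.cu -- hypothetical illustration, not the removed WarpX code
#include <cuda_runtime.h>
#include <cstdio>

__global__ void instrumented_scale (float* data, int n, unsigned long long* cost)
{
    const long long t0 = clock64();                      // per-thread start timestamp
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { data[i] *= 2.0f; }                      // stand-in for the real kernel work
    // one global atomicAdd per thread -- the memory-bandwidth strain noted above
    atomicAdd(cost, static_cast<unsigned long long>(clock64() - t0));
}

int main ()
{
    const int n = 1 << 20;
    float* data = nullptr;
    unsigned long long* cost = nullptr;
    cudaMallocManaged(&data, n * sizeof(float));
    cudaMallocManaged(&cost, sizeof(unsigned long long));
    for (int i = 0; i < n; ++i) { data[i] = 1.0f; }
    *cost = 0;

    instrumented_scale<<<(n + 255) / 256, 256>>>(data, n, cost);
    cudaDeviceSynchronize();

    std::printf("accumulated GPU cycles over all threads: %llu\n", *cost);
    cudaFree(data);
    cudaFree(cost);
    return 0;
}

Even this toy version keeps extra clock values live in every thread and issues one global atomic per thread; a warp- or block-level reduction before the atomic would cut that traffic, but this commit instead drops the device-side instrumentation entirely in favor of the host-side timers and the heuristic.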
ax3l committed Mar 25, 2024
1 parent d7b1aba commit 6958be7
Showing 18 changed files with 65 additions and 376 deletions.
8 changes: 0 additions & 8 deletions CMakeLists.txt
@@ -69,10 +69,6 @@ include(CMakeDependentOption)
option(WarpX_APP "Build the WarpX executable application" ON)
option(WarpX_ASCENT "Ascent in situ diagnostics" OFF)
option(WarpX_EB "Embedded boundary support" OFF)
cmake_dependent_option(WarpX_GPUCLOCK
"Add GPU kernel timers (cost function)" ON
"WarpX_COMPUTE STREQUAL CUDA OR WarpX_COMPUTE STREQUAL HIP"
OFF)
option(WarpX_LIB "Build WarpX as a library" OFF)
option(WarpX_MPI "Multi-node support (message-passing)" ON)
option(WarpX_OPENPMD "openPMD I/O (HDF5, ADIOS)" ON)
@@ -510,10 +506,6 @@ foreach(D IN LISTS WarpX_DIMS)
target_compile_definitions(ablastr_${SD} PUBLIC WARPX_DIM_RZ WARPX_ZINDEX=1)
endif()

if(WarpX_GPUCLOCK)
target_compile_definitions(ablastr_${SD} PUBLIC WARPX_USE_GPUCLOCK)
endif()

if(WarpX_OPENPMD)
target_compile_definitions(ablastr_${SD} PUBLIC WARPX_USE_OPENPMD)
endif()
2 changes: 1 addition & 1 deletion Docs/source/developers/testing.rst
@@ -90,7 +90,7 @@ The content of this directory will look like the following (possibly including b
$ ls ./test_dir/rt-WarpX/WarpX-tests/2021-04-30/pml_x_yee/
analysis_pml_yee.py # Python analysis script
inputs_2d # input file
main2d.gnu.TEST.TPROF.MTMPI.OMP.QED.GPUCLOCK.ex # executable
main2d.gnu.TEST.TPROF.MTMPI.OMP.QED.ex # executable
pml_x_yee.analysis.out # Python analysis output
pml_x_yee.err.out # error output
pml_x_yee.make.out # build output
1 change: 0 additions & 1 deletion Docs/source/install/cmake.rst
@@ -89,7 +89,6 @@ CMake Option Default & Values Descr
``WarpX_COMPUTE`` NOACC/**OMP**/CUDA/SYCL/HIP On-node, accelerated computing backend
``WarpX_DIMS`` **3**/2/1/RZ Simulation dimensionality. Use ``"1;2;RZ;3"`` for all.
``WarpX_EB`` ON/**OFF** Embedded boundary support (not supported in RZ yet)
``WarpX_GPUCLOCK`` **ON**/OFF Add GPU kernel timers (cost function, +4 registers/kernel)
``WarpX_IPO`` ON/**OFF** Compile WarpX with interprocedural optimization (aka LTO)
``WarpX_LIB`` ON/**OFF** Build WarpX as a library, e.g., for PICMI Python
``WarpX_MPI`` **ON**/OFF Multi-node support (message-passing)
6 changes: 1 addition & 5 deletions Docs/source/usage/parameters.rst
@@ -557,7 +557,7 @@ Distribution across MPI ranks and parallelization
For example, if there are 4 boxes per rank and `load_balance_knapsack_factor=2`,
no more than 8 boxes can be assigned to any rank.

* ``algo.load_balance_costs_update`` (`heuristic` or `timers` or `gpuclock`) optional (default `timers`)
* ``algo.load_balance_costs_update`` (``heuristic`` or ``timers``) optional (default ``timers``)
If this is `heuristic`: load balance costs are updated according to a measure of
particles and cells assigned to each box of the domain. The cost :math:`c` is
computed as
@@ -574,10 +574,6 @@ Distribution across MPI ranks and parallelization

If this is `timers`: costs are updated according to in-code timers.

If this is `gpuclock`: [**requires to compile with option** ``-DWarpX_GPUCLOCK=ON``]
costs are measured as (max-over-threads) time spent in current deposition
routine (only applies when running on GPUs).

* ``algo.costs_heuristic_particles_wt`` (`float`) optional
Particle weight factor used in `Heuristic` strategy for costs update; if running on GPU,
the particle weight is set to a value determined from single-GPU tests on Summit,
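
For reference, a hypothetical WarpX inputs snippet using only the remaining, documented options (parameter names taken from the parameters.rst diff above; values are illustrative):

# cost-update strategy: "timers" (default) or "heuristic"; "gpuclock" is removed
algo.load_balance_costs_update = timers
# with e.g. 4 boxes per rank on average, no rank is assigned more than 8 boxes
algo.load_balance_knapsack_factor = 2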
1 change: 0 additions & 1 deletion GNUmakefile
@@ -29,7 +29,6 @@ USE_GPU = FALSE

EBASE = main

USE_GPUCLOCK = TRUE
USE_PYTHON_MAIN = FALSE

USE_SENSEI_INSITU = FALSE
2 changes: 1 addition & 1 deletion Python/pywarpx/picmi.py
@@ -1825,7 +1825,7 @@ class Simulation(picmistandard.PICMI_Simulation):
warpx_load_balance_knapsack_factor: float, default=1.24
(See documentation)
warpx_load_balance_costs_update: {'heuristic' or 'timers' or 'gpuclock'}, optional
warpx_load_balance_costs_update: {'heuristic' or 'timers'}, optional
(See documentation)
warpx_costs_heuristic_particles_wt: float, optional
5 changes: 0 additions & 5 deletions Source/Make.WarpX
@@ -214,11 +214,6 @@ ifeq ($(USE_HDF5),TRUE)
DEFINES += -DWARPX_USE_HDF5
endif

ifeq ($(USE_GPUCLOCK),TRUE)
USERSuffix := $(USERSuffix).GPUCLOCK
DEFINES += -DWARPX_USE_GPUCLOCK
endif

# job_info support
CEXE_sources += AMReX_buildInfo.cpp
INCLUDE_LOCATIONS += $(AMREX_HOME)/Tools/C_scripts
59 changes: 4 additions & 55 deletions Source/Particles/Deposition/ChargeDeposition.H
@@ -33,29 +33,21 @@
* \param lo Index lower bounds of domain.
* \param q species charge.
* \param n_rz_azimuthal_modes Number of azimuthal modes when using RZ geometry.
* \param cost: Pointer to (load balancing) cost corresponding to box where present particles deposit current.
* \param load_balance_costs_update_algo Selected method for updating load balance costs.
*/
template <int depos_order>
void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
const amrex::ParticleReal * const wp,
const int* ion_lev,
amrex::FArrayBox& rho_fab,
long np_to_deposit,
const std::array<amrex::Real,3>& dx,
const std::array<amrex::Real, 3>& dx,
const std::array<amrex::Real, 3> xyzmin,
amrex::Dim3 lo,
amrex::Real q,
int n_rz_azimuthal_modes,
amrex::Real* cost,
long load_balance_costs_update_algo)
int n_rz_azimuthal_modes)
{
using namespace amrex;

#if !defined(AMREX_USE_GPU)
amrex::ignore_unused(cost, load_balance_costs_update_algo);
#endif

// Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
// (do_ionization=1)
const bool do_ionization = ion_lev;
@@ -87,21 +79,9 @@ void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
constexpr int CELL = amrex::IndexType::CELL;

// Loop over particles and deposit into rho_fab
#if defined(WARPX_USE_GPUCLOCK)
amrex::Real* cost_real = nullptr;
if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
*cost_real = 0.;
}
#endif
amrex::ParallelFor(
np_to_deposit,
[=] AMREX_GPU_DEVICE (long ip) {
#if defined(WARPX_USE_GPUCLOCK)
const auto KernelTimer = ablastr::parallelization::KernelTimer(
cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
cost_real);
#endif
// --- Get particle quantities
amrex::Real wq = q*wp[ip]*invvol;
if (do_ionization){
@@ -202,13 +182,6 @@ void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
#endif
}
);
#if defined(WARPX_USE_GPUCLOCK)
if (cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
amrex::Gpu::streamSynchronize();
*cost += *cost_real;
amrex::The_Managed_Arena()->free(cost_real);
}
#endif

#ifndef WARPX_DIM_RZ
amrex::ignore_unused(n_rz_azimuthal_modes);
@@ -230,8 +203,6 @@ void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
* \param lo Index lower bounds of domain.
* \param q species charge.
* \param n_rz_azimuthal_modes Number of azimuthal modes when using RZ geometry.
* \param cost Pointer to (load balancing) cost corresponding to box where present particles deposit current.
* \param load_balance_costs_update_algo Selected method for updating load balance costs.
* \param a_bins
* \param box
* \param geom
@@ -245,13 +216,11 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPositio
amrex::FArrayBox& rho_fab,
const amrex::IntVect& ix_type,
const long np_to_deposit,
const std::array<amrex::Real,3>& dx,
const std::array<amrex::Real, 3>& dx,
const std::array<amrex::Real, 3> xyzmin,
const amrex::Dim3 lo,
const amrex::Real q,
const int n_rz_azimuthal_modes,
amrex::Real* cost,
const long load_balance_costs_update_algo,
const amrex::DenseBins<WarpXParticleContainer::ParticleTileType::ParticleTileDataType>& a_bins,
const amrex::Box& box,
const amrex::Geometry& geom,
@@ -264,7 +233,7 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPositio
const auto *permutation = a_bins.permutationPtr();

#if !defined(AMREX_USE_GPU)
amrex::ignore_unused(ix_type, cost, load_balance_costs_update_algo, a_bins, box, geom, a_tbox_max_size, bin_size);
amrex::ignore_unused(ix_type, a_bins, box, geom, a_tbox_max_size, bin_size);
#endif

// Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
@@ -299,14 +268,6 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPositio
constexpr int CELL = amrex::IndexType::CELL;

// Loop over particles and deposit into rho_fab
#if defined(WARPX_USE_GPUCLOCK)
amrex::Real* cost_real = nullptr;
if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
*cost_real = 0.;
}
#endif

#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
const auto dxiarr = geom.InvCellSizeArray();
const auto plo = geom.ProbLoArray();
@@ -394,11 +355,6 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPositio
{
const unsigned int ip = permutation[ip_orig];

#if defined(WARPX_USE_GPUCLOCK)
const auto KernelTimer = ablastr::parallelization::KernelTimer(
cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
cost_real);
#endif
// --- Get particle quantities
amrex::Real wq = q*wp[ip]*invvol;
if (do_ionization){
@@ -506,13 +462,6 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPositio
#endif // defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
}
);
#if defined(WARPX_USE_GPUCLOCK)
if(cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
amrex::Gpu::streamSynchronize();
*cost += *cost_real;
amrex::The_Managed_Arena()->free(cost_real);
}
#endif

#ifndef WARPX_DIM_RZ
amrex::ignore_unused(n_rz_azimuthal_modes);
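
For contrast with the device-side pattern removed in the diffs above, here is a minimal, hypothetical CUDA sketch of host-side timing of the kind the remaining "timers" strategy relies on (plain CUDA, not WarpX's actual load-balancing code): the cost is measured on the CPU around a synchronized launch, so the kernel itself stays uninstrumented.

// host_timer_sketch.cu -- hypothetical illustration of host-side kernel timing
#include <cuda_runtime.h>
#include <chrono>
#include <cstdio>

__global__ void deposit_stub (float* grid, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { grid[i] += 1.0f; }   // stand-in for a deposition kernel
}

int main ()
{
    const int n = 1 << 20;
    float* grid = nullptr;
    cudaMalloc(&grid, n * sizeof(float));
    cudaMemset(grid, 0, n * sizeof(float));

    const auto t0 = std::chrono::steady_clock::now();
    deposit_stub<<<(n + 255) / 256, 256>>>(grid, n);
    cudaDeviceSynchronize();          // wait for the kernel before stopping the clock
    const auto t1 = std::chrono::steady_clock::now();

    std::printf("host-measured kernel cost: %g s\n",
                std::chrono::duration<double>(t1 - t0).count());

    cudaFree(grid);
    return 0;
}

The synchronization happens once per launch on the host, so the kernel carries no extra registers or atomics, which matches the rationale given in the commit message for keeping the timers and heuristic strategies.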