Remove GPUClock Cost Function #4802

Merged: 1 commit, Mar 25, 2024
8 changes: 0 additions & 8 deletions CMakeLists.txt
@@ -69,10 +69,6 @@ include(CMakeDependentOption)
option(WarpX_APP "Build the WarpX executable application" ON)
option(WarpX_ASCENT "Ascent in situ diagnostics" OFF)
option(WarpX_EB "Embedded boundary support" OFF)
- cmake_dependent_option(WarpX_GPUCLOCK
- "Add GPU kernel timers (cost function)" ON
- "WarpX_COMPUTE STREQUAL CUDA OR WarpX_COMPUTE STREQUAL HIP"
- OFF)
option(WarpX_LIB "Build WarpX as a library" OFF)
option(WarpX_MPI "Multi-node support (message-passing)" ON)
option(WarpX_OPENPMD "openPMD I/O (HDF5, ADIOS)" ON)
@@ -510,10 +506,6 @@ foreach(D IN LISTS WarpX_DIMS)
target_compile_definitions(ablastr_${SD} PUBLIC WARPX_DIM_RZ WARPX_ZINDEX=1)
endif()

- if(WarpX_GPUCLOCK)
- target_compile_definitions(ablastr_${SD} PUBLIC WARPX_USE_GPUCLOCK)
- endif()
-
if(WarpX_OPENPMD)
target_compile_definitions(ablastr_${SD} PUBLIC WARPX_USE_OPENPMD)
endif()
2 changes: 1 addition & 1 deletion Docs/source/developers/testing.rst
@@ -90,7 +90,7 @@ The content of this directory will look like the following (possibly including b
$ ls ./test_dir/rt-WarpX/WarpX-tests/2021-04-30/pml_x_yee/
analysis_pml_yee.py # Python analysis script
inputs_2d # input file
- main2d.gnu.TEST.TPROF.MTMPI.OMP.QED.GPUCLOCK.ex # executable
+ main2d.gnu.TEST.TPROF.MTMPI.OMP.QED.ex # executable
pml_x_yee.analysis.out # Python analysis output
pml_x_yee.err.out # error output
pml_x_yee.make.out # build output
1 change: 0 additions & 1 deletion Docs/source/install/cmake.rst
@@ -89,7 +89,6 @@ CMake Option Default & Values Description
``WarpX_COMPUTE`` NOACC/**OMP**/CUDA/SYCL/HIP On-node, accelerated computing backend
``WarpX_DIMS`` **3**/2/1/RZ Simulation dimensionality. Use ``"1;2;RZ;3"`` for all.
``WarpX_EB`` ON/**OFF** Embedded boundary support (not supported in RZ yet)
- ``WarpX_GPUCLOCK`` **ON**/OFF Add GPU kernel timers (cost function, +4 registers/kernel)
``WarpX_IPO`` ON/**OFF** Compile WarpX with interprocedural optimization (aka LTO)
``WarpX_LIB`` ON/**OFF** Build WarpX as a library, e.g., for PICMI Python
``WarpX_MPI`` **ON**/OFF Multi-node support (message-passing)
6 changes: 1 addition & 5 deletions Docs/source/usage/parameters.rst
@@ -557,7 +557,7 @@ Distribution across MPI ranks and parallelization
For example, if there are 4 boxes per rank and `load_balance_knapsack_factor=2`,
no more than 8 boxes can be assigned to any rank.

- * ``algo.load_balance_costs_update`` (`heuristic` or `timers` or `gpuclock`) optional (default `timers`)
+ * ``algo.load_balance_costs_update`` (``heuristic`` or ``timers``) optional (default ``timers``)
If this is `heuristic`: load balance costs are updated according to a measure of
particles and cells assigned to each box of the domain. The cost :math:`c` is
computed as
@@ -574,10 +574,6 @@ Distribution across MPI ranks and parallelization

If this is `timers`: costs are updated according to in-code timers.

- If this is `gpuclock`: [**requires to compile with option** ``-DWarpX_GPUCLOCK=ON``]
- costs are measured as (max-over-threads) time spent in current deposition
- routine (only applies when running on GPUs).
-
* ``algo.costs_heuristic_particles_wt`` (`float`) optional
Particle weight factor used in `Heuristic` strategy for costs update; if running on GPU,
the particle weight is set to a value determined from single-GPU tests on Summit,
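With `gpuclock` removed, `algo.load_balance_costs_update` accepts only `heuristic` or `timers`. As a minimal sketch of the resulting validation (an assumption, not the actual WarpX code: the helper `read_costs_update_algo` is hypothetical), such an input parameter is typically read through AMReX's `ParmParse`:

```cpp
#include <AMReX_ParmParse.H>

#include <stdexcept>
#include <string>

// Hypothetical helper: read algo.load_balance_costs_update and reject the
// removed "gpuclock" value; "timers" is the documented default.
std::string read_costs_update_algo ()
{
    amrex::ParmParse pp_algo("algo");
    std::string algo = "timers";
    pp_algo.query("load_balance_costs_update", algo);
    if (algo != "heuristic" && algo != "timers") {
        throw std::runtime_error(
            "algo.load_balance_costs_update must be 'heuristic' or 'timers'");
    }
    return algo;
}
```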
1 change: 0 additions & 1 deletion GNUmakefile
@@ -29,7 +29,6 @@ USE_GPU = FALSE

EBASE = main

- USE_GPUCLOCK = TRUE
USE_PYTHON_MAIN = FALSE

USE_SENSEI_INSITU = FALSE
2 changes: 1 addition & 1 deletion Python/pywarpx/picmi.py
@@ -1825,7 +1825,7 @@ class Simulation(picmistandard.PICMI_Simulation):
warpx_load_balance_knapsack_factor: float, default=1.24
(See documentation)

- warpx_load_balance_costs_update: {'heuristic' or 'timers' or 'gpuclock'}, optional
+ warpx_load_balance_costs_update: {'heuristic' or 'timers'}, optional
(See documentation)

warpx_costs_heuristic_particles_wt: float, optional
5 changes: 0 additions & 5 deletions Source/Make.WarpX
@@ -214,11 +214,6 @@ ifeq ($(USE_HDF5),TRUE)
DEFINES += -DWARPX_USE_HDF5
endif

- ifeq ($(USE_GPUCLOCK),TRUE)
- USERSuffix := $(USERSuffix).GPUCLOCK
- DEFINES += -DWARPX_USE_GPUCLOCK
- endif
-
# job_info support
CEXE_sources += AMReX_buildInfo.cpp
INCLUDE_LOCATIONS += $(AMREX_HOME)/Tools/C_scripts
59 changes: 4 additions & 55 deletions Source/Particles/Deposition/ChargeDeposition.H
@@ -33,29 +33,21 @@
* \param lo Index lower bounds of domain.
* \param q species charge.
* \param n_rz_azimuthal_modes Number of azimuthal modes when using RZ geometry.
- * \param cost: Pointer to (load balancing) cost corresponding to box where present particles deposit current.
- * \param load_balance_costs_update_algo Selected method for updating load balance costs.
*/
template <int depos_order>
void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
const amrex::ParticleReal * const wp,
const int* ion_lev,
amrex::FArrayBox& rho_fab,
long np_to_deposit,
- const std::array<amrex::Real,3>& dx,
+ const std::array<amrex::Real, 3>& dx,
const std::array<amrex::Real, 3> xyzmin,
amrex::Dim3 lo,
amrex::Real q,
- int n_rz_azimuthal_modes,
- amrex::Real* cost,
- long load_balance_costs_update_algo)
+ int n_rz_azimuthal_modes)
{
using namespace amrex;

- #if !defined(AMREX_USE_GPU)
- amrex::ignore_unused(cost, load_balance_costs_update_algo);
- #endif
-
// Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
// (do_ionization=1)
const bool do_ionization = ion_lev;
@@ -87,21 +79,9 @@ void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
constexpr int CELL = amrex::IndexType::CELL;

// Loop over particles and deposit into rho_fab
- #if defined(WARPX_USE_GPUCLOCK)
- amrex::Real* cost_real = nullptr;
- if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
- cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
- *cost_real = 0.;
- }
- #endif
amrex::ParallelFor(
np_to_deposit,
[=] AMREX_GPU_DEVICE (long ip) {
- #if defined(WARPX_USE_GPUCLOCK)
- const auto KernelTimer = ablastr::parallelization::KernelTimer(
- cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
- cost_real);
- #endif
// --- Get particle quantities
amrex::Real wq = q*wp[ip]*invvol;
if (do_ionization){
@@ -202,13 +182,6 @@ void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
#endif
}
);
- #if defined(WARPX_USE_GPUCLOCK)
- if (cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
- amrex::Gpu::streamSynchronize();
- *cost += *cost_real;
- amrex::The_Managed_Arena()->free(cost_real);
- }
- #endif

#ifndef WARPX_DIM_RZ
amrex::ignore_unused(n_rz_azimuthal_modes);
@@ -230,8 +203,6 @@ void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
* \param lo Index lower bounds of domain.
* \param q species charge.
* \param n_rz_azimuthal_modes Number of azimuthal modes when using RZ geometry.
- * \param cost Pointer to (load balancing) cost corresponding to box where present particles deposit current.
- * \param load_balance_costs_update_algo Selected method for updating load balance costs.
* \param a_bins
* \param box
* \param geom
@@ -245,13 +216,11 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
amrex::FArrayBox& rho_fab,
const amrex::IntVect& ix_type,
const long np_to_deposit,
- const std::array<amrex::Real,3>& dx,
+ const std::array<amrex::Real, 3>& dx,
const std::array<amrex::Real, 3> xyzmin,
const amrex::Dim3 lo,
const amrex::Real q,
const int n_rz_azimuthal_modes,
- amrex::Real* cost,
- const long load_balance_costs_update_algo,
const amrex::DenseBins<WarpXParticleContainer::ParticleTileType::ParticleTileDataType>& a_bins,
const amrex::Box& box,
const amrex::Geometry& geom,
@@ -264,7 +233,7 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
const auto *permutation = a_bins.permutationPtr();

#if !defined(AMREX_USE_GPU)
- amrex::ignore_unused(ix_type, cost, load_balance_costs_update_algo, a_bins, box, geom, a_tbox_max_size, bin_size);
+ amrex::ignore_unused(ix_type, a_bins, box, geom, a_tbox_max_size, bin_size);
#endif

// Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
@@ -299,14 +268,6 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
constexpr int CELL = amrex::IndexType::CELL;

// Loop over particles and deposit into rho_fab
- #if defined(WARPX_USE_GPUCLOCK)
- amrex::Real* cost_real = nullptr;
- if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
- cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
- *cost_real = 0.;
- }
- #endif
-
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
const auto dxiarr = geom.InvCellSizeArray();
const auto plo = geom.ProbLoArray();
@@ -394,11 +355,6 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
{
const unsigned int ip = permutation[ip_orig];

- #if defined(WARPX_USE_GPUCLOCK)
- const auto KernelTimer = ablastr::parallelization::KernelTimer(
- cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
- cost_real);
- #endif
// --- Get particle quantities
amrex::Real wq = q*wp[ip]*invvol;
if (do_ionization){
@@ -506,13 +462,6 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
#endif // defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
}
);
- #if defined(WARPX_USE_GPUCLOCK)
- if(cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
- amrex::Gpu::streamSynchronize();
- *cost += *cost_real;
- amrex::The_Managed_Arena()->free(cost_real);
- }
- #endif

#ifndef WARPX_DIM_RZ
amrex::ignore_unused(n_rz_azimuthal_modes);
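For context, the pattern deleted throughout this header worked as follows: when the `gpuclock` algorithm was selected, the deposition routine allocated a managed-memory accumulator, wrapped each particle's work in a `KernelTimer` that counted elapsed GPU clock cycles (the "+4 registers/kernel" noted in the removed cmake.rst row), then synchronized the stream and folded the total into the per-box cost. Below is a simplified sketch of that mechanism, assuming AMReX and a CUDA build; `GpuClockTimer` and `timed_deposition_loop` are hypothetical stand-ins for `ablastr::parallelization::KernelTimer` and the deposition kernels above, not the exact removed code.

```cpp
#include <AMReX.H>
#include <AMReX_Arena.H>
#include <AMReX_Gpu.H>

// Hypothetical RAII timer: when enabled, it records the per-thread GPU clock
// on construction and atomically adds the elapsed cycles on destruction.
struct GpuClockTimer
{
    amrex::Real* m_cost = nullptr;
    long long m_t0 = 0;

    AMREX_GPU_DEVICE GpuClockTimer (bool do_timing, amrex::Real* cost)
    {
#if defined(AMREX_USE_CUDA)
        if (do_timing && cost) { m_cost = cost; m_t0 = clock64(); }
#else
        amrex::ignore_unused(do_timing, cost);
#endif
    }

    AMREX_GPU_DEVICE ~GpuClockTimer ()
    {
#if defined(AMREX_USE_CUDA)
        if (m_cost) {
            amrex::Gpu::Atomic::Add(m_cost, amrex::Real(clock64() - m_t0));
        }
#endif
    }
};

// Sketch of the removed call pattern around a deposition loop.
void timed_deposition_loop (long np_to_deposit, amrex::Real* cost)
{
    // Managed-memory accumulator, zeroed on the host (as in the removed code).
    auto* cost_real = (amrex::Real*) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
    *cost_real = 0.;

    amrex::ParallelFor(np_to_deposit,
        [=] AMREX_GPU_DEVICE (long ip) {
            const auto timer = GpuClockTimer(cost != nullptr, cost_real);
            amrex::ignore_unused(ip); // ... per-particle deposition work ...
        });

    // Wait for the kernel to finish, then fold the cycles into the box cost.
    amrex::Gpu::streamSynchronize();
    *cost += *cost_real;
    amrex::The_Managed_Arena()->free(cost_real);
}
```

Removing this machinery drops the extra registers and the managed-memory round trip from every deposition kernel; the remaining `heuristic` and `timers` strategies measure cost outside the kernels.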