Remove GPUClock Cost Function #4802

Merged: 1 commit, Mar 25, 2024
8 changes: 0 additions & 8 deletions CMakeLists.txt
@@ -69,10 +69,6 @@ include(CMakeDependentOption)
option(WarpX_APP "Build the WarpX executable application" ON)
option(WarpX_ASCENT "Ascent in situ diagnostics" OFF)
option(WarpX_EB "Embedded boundary support" OFF)
- cmake_dependent_option(WarpX_GPUCLOCK
- "Add GPU kernel timers (cost function)" ON
- "WarpX_COMPUTE STREQUAL CUDA OR WarpX_COMPUTE STREQUAL HIP"
- OFF)
option(WarpX_LIB "Build WarpX as a library" OFF)
option(WarpX_MPI "Multi-node support (message-passing)" ON)
option(WarpX_OPENPMD "openPMD I/O (HDF5, ADIOS)" ON)
@@ -510,10 +506,6 @@ foreach(D IN LISTS WarpX_DIMS)
target_compile_definitions(ablastr_${SD} PUBLIC WARPX_DIM_RZ WARPX_ZINDEX=1)
endif()

- if(WarpX_GPUCLOCK)
- target_compile_definitions(ablastr_${SD} PUBLIC WARPX_USE_GPUCLOCK)
- endif()
-
if(WarpX_OPENPMD)
target_compile_definitions(ablastr_${SD} PUBLIC WARPX_USE_OPENPMD)
endif()
2 changes: 1 addition & 1 deletion Docs/source/developers/testing.rst
@@ -90,7 +90,7 @@ The content of this directory will look like the following (possibly including b
$ ls ./test_dir/rt-WarpX/WarpX-tests/2021-04-30/pml_x_yee/
analysis_pml_yee.py # Python analysis script
inputs_2d # input file
- main2d.gnu.TEST.TPROF.MTMPI.OMP.QED.GPUCLOCK.ex # executable
+ main2d.gnu.TEST.TPROF.MTMPI.OMP.QED.ex # executable
pml_x_yee.analysis.out # Python analysis output
pml_x_yee.err.out # error output
pml_x_yee.make.out # build output
1 change: 0 additions & 1 deletion Docs/source/install/cmake.rst
@@ -89,7 +89,6 @@ CMake Option Default & Values Description
``WarpX_COMPUTE`` NOACC/**OMP**/CUDA/SYCL/HIP On-node, accelerated computing backend
``WarpX_DIMS`` **3**/2/1/RZ Simulation dimensionality. Use ``"1;2;RZ;3"`` for all.
``WarpX_EB`` ON/**OFF** Embedded boundary support (not supported in RZ yet)
- ``WarpX_GPUCLOCK`` **ON**/OFF Add GPU kernel timers (cost function, +4 registers/kernel)
``WarpX_IPO`` ON/**OFF** Compile WarpX with interprocedural optimization (aka LTO)
``WarpX_LIB`` ON/**OFF** Build WarpX as a library, e.g., for PICMI Python
``WarpX_MPI`` **ON**/OFF Multi-node support (message-passing)
6 changes: 1 addition & 5 deletions Docs/source/usage/parameters.rst
@@ -557,7 +557,7 @@ Distribution across MPI ranks and parallelization
For example, if there are 4 boxes per rank and `load_balance_knapsack_factor=2`,
no more than 8 boxes can be assigned to any rank.

- * ``algo.load_balance_costs_update`` (`heuristic` or `timers` or `gpuclock`) optional (default `timers`)
+ * ``algo.load_balance_costs_update`` (``heuristic`` or ``timers``) optional (default ``timers``)
If this is `heuristic`: load balance costs are updated according to a measure of
particles and cells assigned to each box of the domain. The cost :math:`c` is
computed as
@@ -574,10 +574,6 @@ Distribution across MPI ranks and parallelization

If this is `timers`: costs are updated according to in-code timers.

- If this is `gpuclock`: [**requires to compile with option** ``-DWarpX_GPUCLOCK=ON``]
- costs are measured as (max-over-threads) time spent in current deposition
- routine (only applies when running on GPUs).
-
* ``algo.costs_heuristic_particles_wt`` (`float`) optional
Particle weight factor used in `Heuristic` strategy for costs update; if running on GPU,
the particle weight is set to a value determined from single-GPU tests on Summit,
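With `gpuclock` removed, `algo.load_balance_costs_update` accepts only `heuristic` or `timers`. As a minimal sketch of the resulting validation (an assumption, not the actual WarpX code: the helper `read_costs_update_algo` is hypothetical), such an input parameter is typically read through AMReX's `ParmParse`:

```cpp
#include <AMReX_ParmParse.H>

#include <stdexcept>
#include <string>

// Hypothetical helper: read algo.load_balance_costs_update and reject the
// removed "gpuclock" value; "timers" is the documented default.
std::string read_costs_update_algo ()
{
    amrex::ParmParse pp_algo("algo");
    std::string algo = "timers";
    pp_algo.query("load_balance_costs_update", algo);
    if (algo != "heuristic" && algo != "timers") {
        throw std::runtime_error(
            "algo.load_balance_costs_update must be 'heuristic' or 'timers'");
    }
    return algo;
}
```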
1 change: 0 additions & 1 deletion GNUmakefile
@@ -29,7 +29,6 @@ USE_GPU = FALSE

EBASE = main

- USE_GPUCLOCK = TRUE
USE_PYTHON_MAIN = FALSE

USE_SENSEI_INSITU = FALSE
2 changes: 1 addition & 1 deletion Python/pywarpx/picmi.py
@@ -1825,7 +1825,7 @@ class Simulation(picmistandard.PICMI_Simulation):
warpx_load_balance_knapsack_factor: float, default=1.24
(See documentation)

- warpx_load_balance_costs_update: {'heuristic' or 'timers' or 'gpuclock'}, optional
+ warpx_load_balance_costs_update: {'heuristic' or 'timers'}, optional
(See documentation)

warpx_costs_heuristic_particles_wt: float, optional
5 changes: 0 additions & 5 deletions Source/Make.WarpX
@@ -214,11 +214,6 @@ ifeq ($(USE_HDF5),TRUE)
DEFINES += -DWARPX_USE_HDF5
endif

- ifeq ($(USE_GPUCLOCK),TRUE)
- USERSuffix := $(USERSuffix).GPUCLOCK
- DEFINES += -DWARPX_USE_GPUCLOCK
- endif
-
# job_info support
CEXE_sources += AMReX_buildInfo.cpp
INCLUDE_LOCATIONS += $(AMREX_HOME)/Tools/C_scripts
59 changes: 4 additions & 55 deletions Source/Particles/Deposition/ChargeDeposition.H
@@ -33,29 +33,21 @@
* \param lo Index lower bounds of domain.
* \param q species charge.
* \param n_rz_azimuthal_modes Number of azimuthal modes when using RZ geometry.
- * \param cost: Pointer to (load balancing) cost corresponding to box where present particles deposit current.
- * \param load_balance_costs_update_algo Selected method for updating load balance costs.
*/
template <int depos_order>
void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
const amrex::ParticleReal * const wp,
const int* ion_lev,
amrex::FArrayBox& rho_fab,
long np_to_deposit,
- const std::array<amrex::Real,3>& dx,
+ const std::array<amrex::Real, 3>& dx,
const std::array<amrex::Real, 3> xyzmin,
amrex::Dim3 lo,
amrex::Real q,
- int n_rz_azimuthal_modes,
- amrex::Real* cost,
- long load_balance_costs_update_algo)
+ int n_rz_azimuthal_modes)
{
using namespace amrex;

- #if !defined(AMREX_USE_GPU)
- amrex::ignore_unused(cost, load_balance_costs_update_algo);
- #endif
-
// Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
// (do_ionization=1)
const bool do_ionization = ion_lev;
@@ -87,21 +79,9 @@ void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
constexpr int CELL = amrex::IndexType::CELL;

// Loop over particles and deposit into rho_fab
- #if defined(WARPX_USE_GPUCLOCK)
- amrex::Real* cost_real = nullptr;
- if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
- cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
- *cost_real = 0.;
- }
- #endif
amrex::ParallelFor(
np_to_deposit,
[=] AMREX_GPU_DEVICE (long ip) {
- #if defined(WARPX_USE_GPUCLOCK)
- const auto KernelTimer = ablastr::parallelization::KernelTimer(
- cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
- cost_real);
- #endif
// --- Get particle quantities
amrex::Real wq = q*wp[ip]*invvol;
if (do_ionization){
@@ -202,13 +182,6 @@ void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
#endif
}
);
- #if defined(WARPX_USE_GPUCLOCK)
- if (cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
- amrex::Gpu::streamSynchronize();
- *cost += *cost_real;
- amrex::The_Managed_Arena()->free(cost_real);
- }
- #endif

#ifndef WARPX_DIM_RZ
amrex::ignore_unused(n_rz_azimuthal_modes);
@@ -230,8 +203,6 @@ void doChargeDepositionShapeN (const GetParticlePosition<PIdx>& GetPosition,
* \param lo Index lower bounds of domain.
* \param q species charge.
* \param n_rz_azimuthal_modes Number of azimuthal modes when using RZ geometry.
- * \param cost Pointer to (load balancing) cost corresponding to box where present particles deposit current.
- * \param load_balance_costs_update_algo Selected method for updating load balance costs.
* \param a_bins
* \param box
* \param geom
@@ -245,13 +216,11 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
amrex::FArrayBox& rho_fab,
const amrex::IntVect& ix_type,
const long np_to_deposit,
- const std::array<amrex::Real,3>& dx,
+ const std::array<amrex::Real, 3>& dx,
const std::array<amrex::Real, 3> xyzmin,
const amrex::Dim3 lo,
const amrex::Real q,
const int n_rz_azimuthal_modes,
- amrex::Real* cost,
- const long load_balance_costs_update_algo,
const amrex::DenseBins<WarpXParticleContainer::ParticleTileType::ParticleTileDataType>& a_bins,
const amrex::Box& box,
const amrex::Geometry& geom,
@@ -264,7 +233,7 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
const auto *permutation = a_bins.permutationPtr();

#if !defined(AMREX_USE_GPU)
- amrex::ignore_unused(ix_type, cost, load_balance_costs_update_algo, a_bins, box, geom, a_tbox_max_size, bin_size);
+ amrex::ignore_unused(ix_type, a_bins, box, geom, a_tbox_max_size, bin_size);
#endif

// Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
@@ -299,14 +268,6 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
constexpr int CELL = amrex::IndexType::CELL;

// Loop over particles and deposit into rho_fab
- #if defined(WARPX_USE_GPUCLOCK)
- amrex::Real* cost_real = nullptr;
- if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
- cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
- *cost_real = 0.;
- }
- #endif
-
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
const auto dxiarr = geom.InvCellSizeArray();
const auto plo = geom.ProbLoArray();
@@ -394,11 +355,6 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
{
const unsigned int ip = permutation[ip_orig];

- #if defined(WARPX_USE_GPUCLOCK)
- const auto KernelTimer = ablastr::parallelization::KernelTimer(
- cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
- cost_real);
- #endif
// --- Get particle quantities
amrex::Real wq = q*wp[ip]*invvol;
if (do_ionization){
@@ -506,13 +462,6 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
#endif // defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
}
);
- #if defined(WARPX_USE_GPUCLOCK)
- if(cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
- amrex::Gpu::streamSynchronize();
- *cost += *cost_real;
- amrex::The_Managed_Arena()->free(cost_real);
- }
- #endif

#ifndef WARPX_DIM_RZ
amrex::ignore_unused(n_rz_azimuthal_modes);
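For context, the pattern deleted throughout this header worked as follows: when the `gpuclock` algorithm was selected, the deposition routine allocated a managed-memory accumulator, wrapped each particle's work in a `KernelTimer` that counted elapsed GPU clock cycles (the "+4 registers/kernel" noted in the removed cmake.rst row), then synchronized the stream and folded the total into the per-box cost. Below is a simplified sketch of that mechanism, assuming AMReX and a CUDA build; `GpuClockTimer` and `timed_deposition_loop` are hypothetical stand-ins for `ablastr::parallelization::KernelTimer` and the deposition kernels above, not the exact removed code.

```cpp
#include <AMReX.H>
#include <AMReX_Arena.H>
#include <AMReX_Gpu.H>

// Hypothetical RAII timer: when enabled, it records the per-thread GPU clock
// on construction and atomically adds the elapsed cycles on destruction.
struct GpuClockTimer
{
    amrex::Real* m_cost = nullptr;
    long long m_t0 = 0;

    AMREX_GPU_DEVICE GpuClockTimer (bool do_timing, amrex::Real* cost)
    {
#if defined(AMREX_USE_CUDA)
        if (do_timing && cost) { m_cost = cost; m_t0 = clock64(); }
#else
        amrex::ignore_unused(do_timing, cost);
#endif
    }

    AMREX_GPU_DEVICE ~GpuClockTimer ()
    {
#if defined(AMREX_USE_CUDA)
        if (m_cost) {
            amrex::Gpu::Atomic::Add(m_cost, amrex::Real(clock64() - m_t0));
        }
#endif
    }
};

// Sketch of the removed call pattern around a deposition loop.
void timed_deposition_loop (long np_to_deposit, amrex::Real* cost)
{
    // Managed-memory accumulator, zeroed on the host (as in the removed code).
    auto* cost_real = (amrex::Real*) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
    *cost_real = 0.;

    amrex::ParallelFor(np_to_deposit,
        [=] AMREX_GPU_DEVICE (long ip) {
            const auto timer = GpuClockTimer(cost != nullptr, cost_real);
            amrex::ignore_unused(ip); // ... per-particle deposition work ...
        });

    // Wait for the kernel to finish, then fold the cycles into the box cost.
    amrex::Gpu::streamSynchronize();
    *cost += *cost_real;
    amrex::The_Managed_Arena()->free(cost_real);
}
```

Removing this machinery drops the extra registers and the managed-memory round trip from every deposition kernel; the remaining `heuristic` and `timers` strategies measure cost outside the kernels.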