From 60b4f0ebd6fd5f569f2c8e3f14a8dbc391f28506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 1 Dec 2021 18:24:51 +0100 Subject: [PATCH 01/21] core: Cleanup globals and remove unnecessary macros --- src/core/EspressoSystemInterface.hpp | 5 ++-- src/core/actor/DipolarBarnesHut.cpp | 6 ++-- src/core/actor/DipolarBarnesHut.hpp | 2 -- src/core/actor/DipolarBarnesHut_cuda.cu | 5 +--- src/core/actor/DipolarDirectSum.cpp | 4 +-- src/core/actor/DipolarDirectSum.hpp | 2 -- src/core/actor/DipolarDirectSum_cuda.cu | 22 +++++++------- src/core/cuda_interface.cpp | 2 +- .../p3m_gpu_cuda.cu | 29 +++++++++---------- src/core/energy.cpp | 5 ++-- src/core/forces.cpp | 7 +++-- 11 files changed, 41 insertions(+), 48 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 4f00171452a..cdcad3db7f8 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -24,9 +24,6 @@ #include -/* Syntactic sugar */ -#define espressoSystemInterface EspressoSystemInterface::Instance() - class EspressoSystemInterface : public SystemInterface { public: static EspressoSystemInterface &Instance() { @@ -83,6 +80,7 @@ class EspressoSystemInterface : public SystemInterface { return m_needsVGpu; }; +#ifdef ELECTROSTATICS float *qGpuBegin() override { return m_q_gpu_begin; }; float *qGpuEnd() override { return m_q_gpu_end; }; bool hasQGpu() override { return true; }; @@ -94,6 +92,7 @@ class EspressoSystemInterface : public SystemInterface { enableParticleCommunication(); return m_needsQGpu; }; +#endif float *directorGpuBegin() override { return m_director_gpu_begin; }; float *directorGpuEnd() override { return m_director_gpu_end; }; diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 9040cf02705..ce5fd05b6f6 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -31,15 +31,15 @@ #include -std::unique_ptr dipolarBarnesHut; +static std::unique_ptr dipolarBarnesHut; void activate_dipolar_barnes_hut(float epssq, float itolsq) { // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = DIPOLAR_BH_GPU; mpi_bcast_coulomb_params(); - dipolarBarnesHut = std::make_unique(espressoSystemInterface, - epssq, itolsq); + dipolarBarnesHut = std::make_unique( + EspressoSystemInterface::Instance(), epssq, itolsq); forceActors.push_back(dipolarBarnesHut.get()); energyActors.push_back(dipolarBarnesHut.get()); } diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 8631e8c9eb6..98364fde674 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -100,8 +100,6 @@ class DipolarBarnesHut : public Actor { void activate_dipolar_barnes_hut(float epssq, float itolsq); void deactivate_dipolar_barnes_hut(); -extern std::unique_ptr dipolarBarnesHut; - #endif #endif // DIPOLAR_BARNES_HUT diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index f2e2185b896..8f0f11ba330 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -36,8 +36,6 @@ #include -#define IND (blockDim.x * blockIdx.x + threadIdx.x) - // Method performance/accuracy parameters __constant__ float epssqd[1], itolsqd[1]; // blkcntd is a factual blocks' count. 
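The `#define IND` removal above is the pattern this patch applies throughout: a function-like macro becomes a typed `auto const` local (or a `__device__` helper), so the index obeys normal scoping and type checking instead of textual substitution. A minimal sketch of the idiom; the kernel name and body are illustrative only, not part of the patch:

// Before: untyped textual substitution, visible in every translation unit.
#define IND (blockDim.x * blockIdx.x + threadIdx.x)

// After: a typed local computed once per thread (unsigned int here).
__global__ void scale_kernel(float *data, unsigned int n) {
  auto const ind = blockDim.x * blockIdx.x + threadIdx.x;
  if (ind < n)
    data[ind] *= 2.0f;
}
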
@@ -73,8 +71,7 @@ __device__ void dds_sumReduction_BH(float *input, float *sum) { /******************************************************************************/ __global__ void initializationKernel() { - int ind; - ind = IND; + auto const ind = blockDim.x * blockIdx.x + threadIdx.x; if (ind == 0) { *bhpara->err = 0; *bhpara->max_lps = 0; diff --git a/src/core/actor/DipolarDirectSum.cpp b/src/core/actor/DipolarDirectSum.cpp index 55cb6858819..2ca0de24bb6 100644 --- a/src/core/actor/DipolarDirectSum.cpp +++ b/src/core/actor/DipolarDirectSum.cpp @@ -30,7 +30,7 @@ #include -std::unique_ptr dipolarDirectSum; +static std::unique_ptr dipolarDirectSum; void activate_dipolar_direct_sum_gpu() { // also necessary on 1 CPU or GPU, does more than just broadcasting @@ -38,7 +38,7 @@ void activate_dipolar_direct_sum_gpu() { mpi_bcast_coulomb_params(); dipolarDirectSum = - std::make_unique(espressoSystemInterface); + std::make_unique(EspressoSystemInterface::Instance()); forceActors.push_back(dipolarDirectSum.get()); energyActors.push_back(dipolarDirectSum.get()); } diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 0e9dd8d505b..aa41cdcde58 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -84,8 +84,6 @@ class DipolarDirectSum : public Actor { void activate_dipolar_direct_sum_gpu(); void deactivate_dipolar_direct_sum_gpu(); -extern std::unique_ptr dipolarDirectSum; - #endif #endif diff --git a/src/core/actor/DipolarDirectSum_cuda.cu b/src/core/actor/DipolarDirectSum_cuda.cu index 02df0d4e9e7..0182c140460 100644 --- a/src/core/actor/DipolarDirectSum_cuda.cu +++ b/src/core/actor/DipolarDirectSum_cuda.cu @@ -32,6 +32,10 @@ #error CU-file includes mpi.h! This should not happen! #endif +__device__ inline float scalar_product(float const *a, float const *b) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; +} + __device__ inline void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3]) { @@ -42,8 +46,6 @@ __device__ inline void get_mi_vector_dds(float res[3], float const a[3], } } -#define scalar(a, b) ((a)[0] * (b)[0] + (a)[1] * (b)[1] + (a)[2] * (b)[2]) - __device__ void dipole_ia_force(int id, float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float *f1, float *torque1, @@ -69,7 +71,7 @@ __device__ void dipole_ia_force(int id, float pf, float const *r1, get_mi_vector_dds(dr, _r1, _r2, box_l, periodic); // Powers of distance - r_sq = scalar(dr, dr); + r_sq = scalar_product(dr, dr); r_sq_inv = 1 / r_sq; r_inv = rsqrtf(r_sq); r3_inv = 1 / r_sq * r_inv; @@ -77,9 +79,9 @@ __device__ void dipole_ia_force(int id, float pf, float const *r1, r7_inv = r5_inv * r_sq_inv; // Dot products - pe1 = scalar(_dip1, _dip2); - pe2 = scalar(_dip1, dr); - pe3 = scalar(_dip2, dr); + pe1 = scalar_product(_dip1, _dip2); + pe2 = scalar_product(_dip1, dr); + pe3 = scalar_product(_dip2, dr); pe4 = 3.0f * r5_inv; // Force @@ -136,16 +138,16 @@ __device__ float dipole_ia_energy(int id, float pf, float const *r1, get_mi_vector_dds(dr, _r1, _r2, box_l, periodic); // Powers of distance - r_sq = scalar(dr, dr); + r_sq = scalar_product(dr, dr); r_sq_inv = 1 / r_sq; r_inv = rsqrtf(r_sq); r3_inv = 1 / r_sq * r_inv; r5_inv = r3_inv * r_sq_inv; // Dot products - pe1 = scalar(_dip1, _dip2); - pe2 = scalar(_dip1, dr); - pe3 = scalar(_dip2, dr); + pe1 = scalar_product(_dip1, _dip2); + pe2 = scalar_product(_dip1, dr); + pe3 = scalar_product(_dip2, dr); pe4 = 3.0f * r5_inv; 
// Energy diff --git a/src/core/cuda_interface.cpp b/src/core/cuda_interface.cpp index 7d8353efe68..8b5b87d9927 100644 --- a/src/core/cuda_interface.cpp +++ b/src/core/cuda_interface.cpp @@ -166,7 +166,7 @@ void cuda_mpi_send_forces(const ParticleRange &particles, static void cuda_bcast_global_part_params_local() { MPI_Bcast(gpu_get_global_particle_vars_pointer_host(), sizeof(CUDA_global_part_vars), MPI_BYTE, 0, comm_cart); - espressoSystemInterface.requestParticleStructGpu(); + EspressoSystemInterface::Instance().requestParticleStructGpu(); } REGISTER_CALLBACK(cuda_bcast_global_part_params_local) diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu index 1df4f265478..5b3c3c1988f 100644 --- a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu @@ -51,7 +51,6 @@ #include "electrostatics_magnetostatics/p3m_gpu.hpp" -#include "BoxGeometry.hpp" #include "EspressoSystemInterface.hpp" #include "cuda_interface.hpp" #include "cuda_utils.cuh" @@ -76,8 +75,6 @@ using Utils::int_pow; using Utils::sqr; -extern BoxGeometry box_geo; - struct P3MGpuData { /** Charge mesh */ FFT_TYPE_COMPLEX *charge_mesh; @@ -115,7 +112,7 @@ struct p3m_gpu_fft_plans_t { cufftHandle back_plan; } p3m_gpu_fft_plans; -static char p3m_gpu_data_initialized = 0; +static bool p3m_gpu_data_initialized = false; template __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, @@ -562,33 +559,34 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { if (mesh[0] == -1 && mesh[1] == -1 && mesh[2] == -1) throw std::runtime_error("P3M: invalid mesh size"); - espressoSystemInterface.requestParticleStructGpu(); + auto &espresso_system = EspressoSystemInterface::Instance(); + espresso_system.requestParticleStructGpu(); bool reinit_if = false, mesh_changed = false; p3m_gpu_data.n_part = gpu_get_particle_pointer().size(); - if ((p3m_gpu_data_initialized == 0) || (p3m_gpu_data.alpha != alpha)) { + if (!p3m_gpu_data_initialized || p3m_gpu_data.alpha != alpha) { p3m_gpu_data.alpha = static_cast(alpha); reinit_if = true; } - if ((p3m_gpu_data_initialized == 0) || (p3m_gpu_data.cao != cao)) { + if (!p3m_gpu_data_initialized || p3m_gpu_data.cao != cao) { p3m_gpu_data.cao = cao; // NOLINTNEXTLINE(bugprone-integer-division) p3m_gpu_data.pos_shift = static_cast((p3m_gpu_data.cao - 1) / 2); reinit_if = true; } - if ((p3m_gpu_data_initialized == 0) || (p3m_gpu_data.mesh[0] != mesh[0]) || + if (!p3m_gpu_data_initialized || (p3m_gpu_data.mesh[0] != mesh[0]) || (p3m_gpu_data.mesh[1] != mesh[1]) || (p3m_gpu_data.mesh[2] != mesh[2])) { std::copy(mesh, mesh + 3, p3m_gpu_data.mesh); mesh_changed = true; reinit_if = true; } - auto const box_l = box_geo.length(); + auto const box_l = espresso_system.box(); - if ((p3m_gpu_data_initialized == 0) || (p3m_gpu_data.box[0] != box_l[0]) || + if (!p3m_gpu_data_initialized || (p3m_gpu_data.box[0] != box_l[0]) || (p3m_gpu_data.box[1] != box_l[1]) || (p3m_gpu_data.box[2] != box_l[2])) { std::copy(box_l.begin(), box_l.end(), p3m_gpu_data.box); reinit_if = true; @@ -602,7 +600,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { static_cast(p3m_gpu_data.mesh[i]) / p3m_gpu_data.box[i]; } - if ((p3m_gpu_data_initialized == 1) && mesh_changed) { + if (p3m_gpu_data_initialized && mesh_changed) { cuda_safe_mem(cudaFree(p3m_gpu_data.charge_mesh)); p3m_gpu_data.charge_mesh = nullptr; cuda_safe_mem(cudaFree(p3m_gpu_data.force_mesh_x)); @@ -617,10 +615,10 
@@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { cufftDestroy(p3m_gpu_fft_plans.forw_plan); cufftDestroy(p3m_gpu_fft_plans.back_plan); - p3m_gpu_data_initialized = 0; + p3m_gpu_data_initialized = false; } - if ((p3m_gpu_data_initialized == 0) && (p3m_gpu_data.mesh_size > 0)) { + if (!p3m_gpu_data_initialized && p3m_gpu_data.mesh_size > 0) { /* Size of the complex mesh Nx * Ny * ( Nz / 2 + 1 ) */ const int cmesh_size = p3m_gpu_data.mesh[0] * p3m_gpu_data.mesh[1] * (p3m_gpu_data.mesh[2] / 2 + 1); @@ -644,8 +642,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { } } - if ((reinit_if || (p3m_gpu_data_initialized == 0)) && - (p3m_gpu_data.mesh_size > 0)) { + if ((reinit_if or !p3m_gpu_data_initialized) && p3m_gpu_data.mesh_size > 0) { dim3 grid(1, 1, 1); dim3 block(1, 1, 1); block.x = 512 / mesh[0] + 1; @@ -686,7 +683,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { } } if (p3m_gpu_data.mesh_size > 0) - p3m_gpu_data_initialized = 1; + p3m_gpu_data_initialized = true; } /** diff --git a/src/core/energy.cpp b/src/core/energy.cpp index ffce964d177..b9f9a6b5e26 100644 --- a/src/core/energy.cpp +++ b/src/core/energy.cpp @@ -57,11 +57,12 @@ static std::shared_ptr calculate_energy_local() { clear_energy_on_GPU(); #endif - EspressoSystemInterface::Instance().update(); + auto &espresso_system = EspressoSystemInterface::Instance(); + espresso_system.update(); // Compute the energies from the energyActors for (auto &energyActor : energyActors) - energyActor->computeEnergy(espressoSystemInterface); + energyActor->computeEnergy(espresso_system); on_observable_calc(); diff --git a/src/core/forces.cpp b/src/core/forces.cpp index f054ba60b7a..8dfefee3dfe 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -88,7 +88,8 @@ void init_forces_ghosts(const ParticleRange &particles) { void force_calc(CellStructure &cell_structure, double time_step, double kT) { ESPRESSO_PROFILER_CXX_MARK_FUNCTION; - espressoSystemInterface.update(); + auto &espresso_system = EspressoSystemInterface::Instance(); + espresso_system.update(); #ifdef COLLISION_DETECTION prepare_local_collision_queue(); @@ -102,9 +103,9 @@ void force_calc(CellStructure &cell_structure, double time_step, double kT) { init_forces(particles, time_step, kT); for (auto &forceActor : forceActors) { - forceActor->computeForces(espressoSystemInterface); + forceActor->computeForces(espresso_system); #ifdef ROTATION - forceActor->computeTorques(espressoSystemInterface); + forceActor->computeTorques(espresso_system); #endif } From 7104cb2fa40be29505d6530cd1d1d7ca2f572a4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 1 Dec 2021 21:46:18 +0100 Subject: [PATCH 02/21] core: Modernize CUDA code Use `auto const`, reduce scope of local variables, reduce code duplication, simplify expressions (e.g. vector products), fix conversion warnings. 
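The "simplify expressions (e.g. vector products)" item replaces hand-unrolled cross products with a reusable `__device__` helper, shown in context in the `DipolarDirectSum_cuda.cu` hunks below. A condensed before/after sketch (component names abbreviated from the actual diff):

// Before: each cross product spelled out into six per-component locals.
float ax = dip1[1] * dip2[2] - dip2[1] * dip1[2];
float ay = dip2[0] * dip1[2] - dip1[0] * dip2[2];
float az = dip1[0] * dip2[1] - dip2[0] * dip1[1];

// After: one helper shared by all torque terms.
__device__ inline void vector_product(float const *a, float const *b,
                                      float *out) {
  out[0] = a[1] * b[2] - a[2] * b[1];
  out[1] = a[2] * b[0] - a[0] * b[2];
  out[2] = a[0] * b[1] - a[1] * b[0];
}

float a[3];
vector_product(dip1, dip2, a); // replaces ax, ay, az
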
--- src/core/actor/DipolarBarnesHut_cuda.cu | 228 ++++++++---------- src/core/actor/DipolarDirectSum.hpp | 14 +- src/core/actor/DipolarDirectSum_cuda.cu | 201 +++++++-------- .../p3m-dipolar.hpp | 6 - .../p3m_gpu_cuda.cu | 205 ++++++++-------- .../p3m_gpu_error_cuda.cu | 119 ++++----- 6 files changed, 348 insertions(+), 425 deletions(-) diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index 8f0f11ba330..baadc95522e 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -85,13 +85,11 @@ __global__ void initializationKernel() { /******************************************************************************/ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { - int i, j, k, l, t, inc, n_blocks; - float val; // min/max positions per the thread: float minp[3], maxp[3]; // min/max positions per block: __shared__ float smin[3 * THREADS1], smax[3 * THREADS1]; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { minp[l] = maxp[l] = bhpara->r[l]; } @@ -101,24 +99,24 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { // inc = [number of blocks: gridDim.x] * [THREADS1 per block within given // kernel]. Hence, this approach could handle an infinite number of bodies // (particles) - i = static_cast(threadIdx.x); - inc = THREADS1 * gridDim.x; + auto i = static_cast(threadIdx.x); + int const inc = THREADS1 * static_cast(gridDim.x); // j is an absolute index of the particle. // It is shifted over a count of the passed block threads behind: blockIdx.x * // THREADS1. NOTE: this loop is extrema search among all particles of the // given thread in the present block. However, one is not among all threads of // this block. - for (j = i + static_cast(blockIdx.x) * THREADS1; j < bhpara->nbodies; - j += inc) - for (l = 0; l < 3; l++) { - val = bhpara->r[3 * j + l]; + for (auto j = i + static_cast(blockIdx.x) * THREADS1; + j < bhpara->nbodies; j += inc) + for (int l = 0; l < 3; l++) { + auto const val = bhpara->r[3 * j + l]; minp[l] = min(minp[l], val); maxp[l] = max(maxp[l], val); } // For a start point of a reduction in the given block shared memory // of the i-th thread extrema: - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { smin[3 * i + l] = minp[l]; smax[3 * i + l] = maxp[l]; } @@ -130,11 +128,11 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { // the thread with a lower index will operate longer and // final result (here: the shared memory extrema) // is flowing down towards zero thread. - for (t = THREADS1 / 2; t > 0; t /= 2) { + for (int t = THREADS1 / 2; t > 0; t /= 2) { __syncthreads(); if (i < t) { - k = i + t; - for (l = 0; l < 3; l++) { + auto k = i + t; + for (int l = 0; l < 3; l++) { smin[3 * i + l] = minp[l] = min(minp[l], smin[3 * k + l]); smax[3 * i + l] = maxp[l] = max(maxp[l], smax[3 * k + l]); // very last minp/maxp assignment will be made by zero thread (i == 0) @@ -147,8 +145,8 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { // and other per-block operations. 
if (i == 0) { // per k-th block - k = static_cast(blockIdx.x); - for (l = 0; l < 3; l++) { + auto k = static_cast(blockIdx.x); + for (int l = 0; l < 3; l++) { // global memory storage of the per-block extrema bhpara->minp[3 * k + l] = minp[l]; bhpara->maxp[3 * k + l] = maxp[l]; @@ -156,19 +154,19 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { // contain de facto already reduced (see above) shared extrema smin/smax } - n_blocks = static_cast(gridDim.x) - 1; + auto const n_blocks = static_cast(gridDim.x) - 1; // The block increment is performing by its zero thread. if (n_blocks == atomicInc((unsigned int *)&blkcntd, n_blocks)) { // I'm the (randomly) last block, so combine all other blocks' results // over the index j: - for (j = 0; j <= n_blocks; j++) - for (l = 0; l < 3; l++) { - minp[l] = min(minp[l], bhpara->minp[3 * j + l]); - maxp[l] = max(maxp[l], bhpara->maxp[3 * j + l]); + for (int im = 0; im <= n_blocks; im++) + for (int l = 0; l < 3; l++) { + minp[l] = min(minp[l], bhpara->minp[3 * im + l]); + maxp[l] = max(maxp[l], bhpara->maxp[3 * im + l]); } // Compute 'radius': - val = max(maxp[0] - minp[0], maxp[1] - minp[1]); + auto const val = max(maxp[0] - minp[0], maxp[1] - minp[1]); radiusd = max(val, maxp[2] - minp[2]) * 0.5f; // NOTE: now the extrema are global. @@ -187,7 +185,7 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { bhpara->start[k] = 0; // Position of the root node should be in the center of just defined BH // box: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) bhpara->r[3 * k + l] = (minp[l] + maxp[l]) * 0.5f; // Init further tree building octo- meaning their absence at the // beginning: @@ -202,32 +200,31 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { /******************************************************************************/ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { - int i, j, k, l, depth, localmaxdepth, skip, inc, lps; + int j, depth; float r; float pos[3]; float p[3]; - int ch, n, cell, locked, patch; - float radius; + int n; float root[3]; // Radius is determined in boundingBoxKernel - radius = radiusd; + auto const radius = radiusd; // The root node has been created at the end of the boundingBoxKernel. // Cache the root data: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) root[l] = bhpara->r[3 * bhpara->nnodes + l]; // Maximum tree depth within the given thread. - localmaxdepth = 1; + int localmaxdepth = 1; // Skip the branch following and start from the root. - skip = 1; + int skip = 1; // Number of loops for the threads sync algorithm - lps = 0; + int lps = 0; // Increment to move among the bodies assigned to the given thread. // Hence, one should step over all other threads in GPU with // a quantity of blockDim.x * gridDim.x. - inc = static_cast(blockDim.x * gridDim.x); + auto const inc = static_cast(blockDim.x * gridDim.x); // Just a regular 1D GPU index - i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + auto i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); // Iterate over all bodies assigned to thread. while (i < bhpara->nbodies) { @@ -235,7 +232,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // New body, so start traversing at root. Skip it further. 
skip = 0; // Particle position corresponding to the given thread and block: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) p[l] = bhpara->r[3 * i + l]; // Let's start a moving via the tree from the root node 8 * nbodiesd: n = bhpara->nnodes; @@ -244,14 +241,14 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // Determine which child to follow. // j=0..7 determines the octant in a binary representations j = 0; - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) if (root[l] < p[l]) j += static_cast(pow(2, l)); } // Follow path to leaf cell. Should not happen at the first iteration of // this loop. - ch = bhpara->child[n * 8 + j]; + int ch = bhpara->child[n * 8 + j]; // Global memory writing related threads sync if (lps++ > THREADS2) { @@ -277,7 +274,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { j = 0; // Determine which child octant to follow based on body coordinates. // j=0..7 determines the octant in a binary representations. - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) if (bhpara->r[3 * n + l] < p[l]) j += static_cast(pow(2, l)); ch = bhpara->child[n * 8 + j]; @@ -285,7 +282,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // Now we are deep enough in the tree, passed all levels of cells and // reached the body (particle). if (ch != -2) { // Skip if child pointer is locked (-2) and try again later. - locked = n * 8 + j; + int const locked = n * 8 + j; // Try to lock and iterate towards next body: if (ch == atomicCAS((int *)&bhpara->child[locked], ch, -2)) { // If we are here then child[locked] was equal to "ch" and now is @@ -306,7 +303,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // If -1 (i.e. no child index) then just insert a new body bhpara->child[locked] = i; } else { - patch = -1; + int patch = -1; // There already is a body and/or cell(s) in this position. // We should start from a loop of the deepest child cell // determination. Then, we need to create new cells and to distribute @@ -318,7 +315,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // nbodiesd nodes. These lists will be correctly handled by the // sortKernel later. depth++; - cell = atomicSub((int *)&bottomd, 1) - 1; + int const cell = atomicSub((int *)&bottomd, 1) - 1; if (cell <= bhpara->nbodies) { // This should not happen. A cell cannot have such index. Error. *bhpara->err = 1; @@ -333,7 +330,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // within above "while (ch >= nbodiesd)" loop. // The new cell will be placed below relatively the center of // corresponding j-th octant: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) pos[l] = static_cast((j >> l) & 1) * r; // Note, that negative octants correspond to pos[l] == 0 and // positive octants correspond to pos[l] == r. @@ -366,12 +363,12 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // need to add -r to obtain -r and r for negative and positive // octants. 
Now, the child (cell) octants centers are deriving from // the parent (n) octant center: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) pos[l] = bhpara->r[3 * cell + l] = bhpara->r[3 * n + l] - r + pos[l]; // By default, the new cell has no children in all k-th octants: - for (k = 0; k < 8; k++) + for (int k = 0; k < 8; k++) bhpara->child[cell * 8 + k] = -1; // This condition should always be true cause "patch" is -1 at the @@ -384,7 +381,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // pos[l] already contains the child cell coordinates. // Let's assign "child" then. First the octant should be selected: j = 0; - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) if (pos[l] < bhpara->r[3 * ch + l]) j += static_cast(pow(2, l)); // New element just appeared in the chain of cells. Hence, that what @@ -400,7 +397,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { __threadfence(); // Let's handle the particle position (p[l]) corresponding to the // given thread and block against new octant cell (pos[l]): - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) if (pos[l] < p[l]) j += static_cast(pow(2, l)); @@ -449,8 +446,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { /******************************************************************************/ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { - int i, j, k, l, im, ch, inc, missing, missing_max, cnt, bottom, lps; - int iteration, repeat_flag; + int i, j, cnt; // the node "mass" and its count respectively: float m, cm; // position of equivalent total dipole and its magnitude: @@ -462,25 +458,26 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { // no children by default: for (i = 0; i < 8; i++) child[i * THREADS3 + threadIdx.x] = -1; - bottom = bottomd; + auto const bottom = bottomd; // Increment towards other particles assigned to the given thread: - inc = static_cast(blockDim.x * gridDim.x); + auto const inc = static_cast(blockDim.x * gridDim.x); // Nodes iteration "k" should start from the "bottomd" level of the cells, // which is a minimal index of the last created cell. // Starting "k" value should be aligned using the warp size // according to the designed threads performance. // k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x; - k = bottom + static_cast(threadIdx.x + blockIdx.x * blockDim.x); + auto k = bottom + static_cast(threadIdx.x + blockIdx.x * blockDim.x); // Threads below the bottom line could proceed to their next cells. // if (k < bottom) k += inc; // Assume no missing children: - missing = 0; - iteration = 0; - repeat_flag = 0; + int missing = 0; + int missing_max = 0; + int iteration = 0; + int repeat_flag = 0; __syncthreads(); // throttle // threads sync related - lps = 0; + int lps = 0; // Iterate over all cells (not particles) assigned to the thread: while (k <= bhpara->nnodes) { if (lps++ > THREADS3) { @@ -492,14 +489,14 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { if (missing == 0) { // New cell, so initialize: cm = 0.0f; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { p[l] = 0.0f; u[l] = 0.0f; } cnt = 0; j = 0; for (i = 0; i < 8; i++) { - ch = bhpara->child[k * 8 + i]; + auto const ch = bhpara->child[k * 8 + i]; if (ch >= 0) { if (i != j) { // Move child to front (needed later for a speed only). 
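The child-octant selection that recurs in the tree-building hunks above encodes one bit per axis: bit l of j is set when the body lies on the positive side of the cell center along axis l, so j enumerates the eight octants 0..7. A standalone sketch of that computation; the helper name `octant_index` is hypothetical, and a shift stands in for the patch's `pow(2, l)` purely for clarity:

__device__ inline int octant_index(float const *center, float const *pos) {
  int j = 0;
  for (int l = 0; l < 3; l++)
    if (center[l] < pos[l])
      j += 1 << l; // same value as static_cast<int>(pow(2, l))
  return j;
}
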
@@ -532,7 +529,7 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { } // add child's contribution cm += m; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { p[l] += bhpara->r[3 * ch + l] * m; u[l] += bhpara->u[3 * ch + l]; } @@ -548,9 +545,9 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { //__syncthreads(); // throttle if (missing != 0) { - for (im = 0; im < missing_max; im++) { + for (int im = 0; im < missing_max; im++) { // poll missing child - ch = child[im * THREADS3 + threadIdx.x]; + auto const ch = child[im * THREADS3 + threadIdx.x]; if (ch >= 0) { m = bhpara->mass[ch]; // Is a child the particle? Only particles have non-negative mass @@ -567,7 +564,7 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { } // add child's contribution cm += m; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { p[l] += bhpara->r[3 * ch + l] * m; u[l] += bhpara->u[3 * ch + l]; } @@ -588,7 +585,7 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { // all children are ready, so store computed information bhpara->count[k] = cnt; m = 1.0f / cm; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { bhpara->r[3 * k + l] = p[l] * m; bhpara->u[3 * k + l] = u[l]; } @@ -627,23 +624,21 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { // same octant cells) together, and these grouped bodies are crucial to speed up // forceCalculationKernel and energyCalculationKernel __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() { - int i, k, ch, dec, start, bottom, lps; - - bottom = bottomd; - dec = static_cast(blockDim.x * gridDim.x); + auto const bottom = bottomd; + auto const dec = static_cast(blockDim.x * gridDim.x); // Start from the end of the nnodesd == 8 * nbodiesd. // Reverse order is required now cause octant cells which are more close // to the root have a larger count of entities inside (countd[k]). // Particles should be sorted over all entities count in the tree array // representation made by treeBuildingKernel. - k = bhpara->nnodes + 1 - dec + - static_cast(threadIdx.x + blockIdx.x * blockDim.x); + int k = bhpara->nnodes + 1 - dec + + static_cast(threadIdx.x + blockIdx.x * blockDim.x); // threads sync related - lps = 0; + int lps = 0; // iterate over all cells assigned to thread while (k >= bottom) { - start = bhpara->start[k]; + int start = bhpara->start[k]; // Threads sync related if (lps++ > THREADS4) { *bhpara->max_lps = lps; @@ -652,8 +647,8 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() { // Let's start from the root which has only startd=0 defined // in boundingBoxKernel. All other bodies and cells have -1. if (start >= 0) { - for (i = 0; i < 8; i++) { - ch = bhpara->child[k * 8 + i]; + for (int i = 0; i < 8; i++) { + int const ch = bhpara->child[k * 8 + i]; if (ch >= bhpara->nbodies) { // child is a cell bhpara->start[ch] = start; // set start ID of child @@ -684,8 +679,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() { __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( float pf, float *force, float *torque) { - int i, j, k, l, n, depth, base, sbase, diff, t; - float tmp; + int i, t; // dr is a distance between particles. // f,h, and N are a target force, field, and torque respectively. // u and uc are dipole moments of two particles. 
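The energy kernels in these patches end with a shared-memory reduction (`dds_sumReduction_BH` here, `dds_sumReduction` in `DipolarDirectSum_cuda.cu`): every thread deposits a partial sum in shared memory, then the lower half of the block repeatedly folds the upper half onto itself until thread 0 holds the block total. A simplified sketch of that halving pattern, assuming a power-of-two block size (the real helpers also handle odd sizes); `block_sum` is an illustrative name:

__device__ void block_sum(float *res, float *sum) {
  auto const tid = threadIdx.x;
  for (auto stride = blockDim.x / 2u; stride > 0u; stride /= 2u) {
    __syncthreads();
    if (tid < stride)
      res[tid] += res[tid + stride]; // fold upper half onto lower half
  }
  if (tid == 0)
    sum[0] = res[0]; // e.g. one entry of energySum per block
}
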
@@ -698,11 +692,9 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE], node[MAXDEPTH * THREADS5 / WARPSIZE]; __shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE]; - float b, b2, d1, dd5; - float bb2d7, umd5; // Zero thread of the block initialize shared data for all warps. - if (0 == threadIdx.x) { + if (threadIdx.x == 0) { // Precompute values that depend only on tree level. // The method's parameters (a trade-off accuracy/performance) // which determine that the @@ -715,8 +707,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // and the cell depending on a cell level. // Original tree box edge (2*radiusd) should be divided *0.5 // as much as the tree depth takes place. - tmp = radiusd; - dq[0] = tmp * tmp * *itolsqd; + dq[0] = radiusd * radiusd * *itolsqd; for (i = 1; i < maxdepthd; i++) { dq[i] = dq[i - 1] * 0.25f; dq[i - 1] += *epssqd; @@ -735,15 +726,15 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // Only maximal Barnes-Hut tree depth is allowed. if (maxdepthd <= MAXDEPTH) { // How many warps are behind the current thread (?): - base = static_cast(threadIdx.x) / WARPSIZE; + auto const base = static_cast(threadIdx.x) / WARPSIZE; // Figure out first thread in each warp (lane 0): - sbase = base * WARPSIZE; + auto const sbase = base * WARPSIZE; // Initial stack index is its MAXDEPTH portion start for the given warp // count base: - j = base * MAXDEPTH; + auto const j = base * MAXDEPTH; // How far the thread is from the warp beginning (?): - diff = static_cast(threadIdx.x) - sbase; + auto const diff = static_cast(threadIdx.x) - sbase; // Make multiple copies to avoid index calculations later: if (diff < MAXDEPTH) { // Each thread copies its own dq[] element to a part of @@ -753,12 +744,12 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( __syncthreads(); // Iterate over all bodies assigned to thread: - for (k = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + for (auto k = static_cast(threadIdx.x + blockIdx.x * blockDim.x); k < bhpara->nbodies; k += static_cast(blockDim.x * gridDim.x)) { // Sorted body indexes assigned to me: i = bhpara->sort[k]; // get permuted/sorted index // Cache the particle position info: - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { u[l] = bhpara->u[3 * i + l]; h[l] = 0.0f; f[l] = 0.0f; @@ -769,7 +760,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // Hence, we should start from the root node (whole Barnes-Hut cube). // Initialize iteration stack, i.e., push root node onto stack. // Let's start from zero octant. - depth = j; + auto depth = j; if (sbase == threadIdx.x) { node[j] = bhpara->nnodes; pos[j] = 0; @@ -780,7 +771,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // Hence, there are still some children to consider. while ((t = pos[depth]) < 8) { // Node on top of stack has more children to process: - n = bhpara->child[node[depth] * 8 + t]; // load child pointer + auto const n = bhpara->child[node[depth] * 8 + t]; // child pointer if (sbase == threadIdx.x) { // I'm the first thread in the warp. 
// Let me check for the current depth level of the tree @@ -792,8 +783,8 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // There is a child (octant cell) with a dipole moment uxd[3 * n + l] // and the center position bhpara->r[3 * n + l]: if (n >= 0) { - tmp = 0.0f; // compute distance squared - for (l = 0; l < 3; l++) { + auto tmp = 0.0f; // compute distance squared + for (int l = 0; l < 3; l++) { dr[l] = -bhpara->r[3 * n + l] + bhpara->r[3 * i + l]; tmp += dr[l] * dr[l]; } @@ -825,12 +816,12 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( #endif if (n != i) { - d1 = sqrtf(tmp /*, 0.5f*/); - dd5 = __fdividef(1.0f, tmp * tmp * d1); - b = 0.0f; - b2 = 0.0f; - umd5 = 0.0f; - for (l = 0; l < 3; l++) { + auto const d1 = sqrtf(tmp /*, 0.5f*/); + auto const dd5 = __fdividef(1.0f, tmp * tmp * d1); + auto b = 0.0f; + auto b2 = 0.0f; + auto umd5 = 0.0f; + for (int l = 0; l < 3; l++) { uc[l] = bhpara->u[3 * n + l]; b += uc[l] * dr[l]; b2 += u[l] * dr[l]; @@ -838,9 +829,9 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( } umd5 *= -3.0f * dd5; - bb2d7 = 15.0f * b * b2 * __fdividef(dd5, tmp); + auto const bb2d7 = 15.0f * b * b2 * __fdividef(dd5, tmp); - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5; f[l] += -dr[l] * (umd5 + bb2d7) + 3.0f * (b * u[l] + b2 * uc[l]) * dd5; @@ -874,7 +865,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( N[1] = u[2] * h[0] - u[0] * h[2]; N[2] = u[0] * h[1] - u[1] * h[0]; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { if (f[l] != f[l] || h[l] != h[l]) { // nan printf("Force Kernel: NAN in particle[%d]\n", i); printf("x = %f, y = %f, z = %f,\n", bhpara->u[3 * i + 0], @@ -890,8 +881,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( } /******************************************************************************/ -/*** compute energy - * ************************************************************/ +/*** compute energy ***********************************************************/ /******************************************************************************/ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( @@ -899,8 +889,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( // NOTE: the algorithm of this kernel is almost identical to // forceCalculationKernel. See comments there. 
- int i, j, k, l, n, depth, base, sbase, diff, t; - float tmp; + int i, n, t; float dr[3], h[3], u[3], uc[3]; __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE], node[MAXDEPTH * THREADS5 / WARPSIZE]; @@ -908,12 +897,9 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( float sum = 0.0; extern __shared__ float res[]; - float b, d1, dd5; - - if (0 == threadIdx.x) { - tmp = radiusd; + if (threadIdx.x == 0) { // precompute values that depend only on tree level - dq[0] = tmp * tmp * *itolsqd; + dq[0] = radiusd * radiusd * *itolsqd; for (i = 1; i < maxdepthd; i++) { dq[i] = dq[i - 1] * 0.25f; dq[i - 1] += *epssqd; @@ -928,11 +914,11 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( if (maxdepthd <= MAXDEPTH) { // figure out first thread in each warp (lane 0) - base = static_cast(threadIdx.x) / WARPSIZE; - sbase = base * WARPSIZE; - j = base * MAXDEPTH; + auto const base = static_cast(threadIdx.x) / WARPSIZE; + auto const sbase = base * WARPSIZE; + auto const j = base * MAXDEPTH; - diff = static_cast(threadIdx.x) - sbase; + auto const diff = static_cast(threadIdx.x) - sbase; // make multiple copies to avoid index calculations later if (diff < MAXDEPTH) { dq[diff + j] = dq[diff]; @@ -940,17 +926,17 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( __syncthreads(); // iterate over all bodies assigned to thread - for (k = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + for (auto k = static_cast(threadIdx.x + blockIdx.x * blockDim.x); k < bhpara->nbodies; k += static_cast(blockDim.x * gridDim.x)) { i = bhpara->sort[k]; // get permuted/sorted index // cache position info - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { u[l] = bhpara->u[3 * i + l]; h[l] = 0.0f; } // initialize iteration stack, i.e., push root node onto stack - depth = j; + auto depth = j; if (sbase == threadIdx.x) { node[j] = bhpara->nnodes; pos[j] = 0; @@ -967,8 +953,8 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( } __threadfence_block(); if (n >= 0) { - tmp = 0.0f; - for (l = 0; l < 3; l++) { + auto tmp = 0.0f; + for (int l = 0; l < 3; l++) { dr[l] = -bhpara->r[3 * n + l] + bhpara->r[3 * i + l]; tmp += dr[l] * dr[l]; } @@ -985,15 +971,15 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( // is far enough away (or is a body) #endif if (n != i) { - d1 = sqrtf(tmp /*, 0.5f*/); - dd5 = __fdividef(1.0f, tmp * tmp * d1); - b = 0.0f; - for (l = 0; l < 3; l++) { + auto const d1 = sqrtf(tmp /*, 0.5f*/); + auto const dd5 = __fdividef(1.0f, tmp * tmp * d1); + auto b = 0.0f; + for (int l = 0; l < 3; l++) { uc[l] = bhpara->u[3 * n + l]; b += uc[l] * dr[l]; } - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5; } } else { @@ -1013,7 +999,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( depth--; // done with this level } - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { sum += -u[l] * h[l]; if (h[l] != h[l]) { // nan printf("Energy Kernel: NAN in particle[%d]\n", i); @@ -1183,7 +1169,7 @@ void allocBHmemCopy(int nbodies, BHData *bh_data) { // cubic boxes hence, 8 nodes per particle is a theoretical octree limit: bh_data->nnodes = bh_data->nbodies * 8; - int n_total_threads = 1024 * bh_data->blocks; + int const n_total_threads = 1024 * bh_data->blocks; if (bh_data->nnodes < n_total_threads) bh_data->nnodes = n_total_threads; else diff --git a/src/core/actor/DipolarDirectSum.hpp 
b/src/core/actor/DipolarDirectSum.hpp index aa41cdcde58..1938dd224ab 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -32,10 +32,10 @@ #include -void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos, +void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E); -void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, +void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3]); @@ -59,10 +59,10 @@ class DipolarDirectSum : public Actor { int per[3]; for (int i = 0; i < 3; i++) { box[i] = static_cast(s.box()[i]); - per[i] = (box_geo.periodic(i)); + per[i] = box_geo.periodic(i); } DipolarDirectSum_kernel_wrapper_force( - k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), + k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), s.fGpuBegin(), s.torqueGpuBegin(), box, per); }; void computeEnergy(SystemInterface &s) override { @@ -70,11 +70,11 @@ class DipolarDirectSum : public Actor { int per[3]; for (int i = 0; i < 3; i++) { box[i] = static_cast(s.box()[i]); - per[i] = (box_geo.periodic(i)); + per[i] = box_geo.periodic(i); } DipolarDirectSum_kernel_wrapper_energy( - k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), box, - per, &(reinterpret_cast(s.eGpu())->dipolar)); + k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), + box, per, &(reinterpret_cast(s.eGpu())->dipolar)); }; private: diff --git a/src/core/actor/DipolarDirectSum_cuda.cu b/src/core/actor/DipolarDirectSum_cuda.cu index 0182c140460..8c1e37754b2 100644 --- a/src/core/actor/DipolarDirectSum_cuda.cu +++ b/src/core/actor/DipolarDirectSum_cuda.cu @@ -36,6 +36,13 @@ __device__ inline float scalar_product(float const *a, float const *b) { return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; } +__device__ inline void vector_product(float const *a, float const *b, + float *out) { + out[0] = a[1] * b[2] - a[2] * b[1]; + out[1] = a[2] * b[0] - a[0] * b[2]; + out[2] = a[0] * b[1] - a[1] * b[0]; +} + __device__ inline void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3]) { @@ -46,120 +53,89 @@ __device__ inline void get_mi_vector_dds(float res[3], float const a[3], } } -__device__ void dipole_ia_force(int id, float pf, float const *r1, - float const *r2, float const *dip1, - float const *dip2, float *f1, float *torque1, - float *torque2, float box_l[3], +__device__ void dipole_ia_force(float pf, float const *r1, float const *r2, + float const *dip1, float const *dip2, float *f1, + float *torque1, float *torque2, float box_l[3], int periodic[3]) { - float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv, a, b, - cc, d, ab; -#ifdef ROTATION - float bx, by, bz, ax, ay, az; -#endif - - float dr[3]; - float _r1[3], _r2[3], _dip1[3], _dip2[3]; - - for (int i = 0; i < 3; i++) { - _r1[i] = r1[i]; - _r2[i] = r2[i]; - _dip1[i] = dip1[i]; - _dip2[i] = dip2[i]; - } - // Distance between particles - get_mi_vector_dds(dr, _r1, _r2, box_l, periodic); + float dr[3]; + get_mi_vector_dds(dr, r1, r2, box_l, periodic); // Powers of distance - r_sq = scalar_product(dr, dr); - r_sq_inv = 1 / r_sq; - r_inv = rsqrtf(r_sq); - r3_inv = 1 / r_sq * r_inv; - r5_inv = r3_inv * r_sq_inv; - r7_inv = r5_inv * r_sq_inv; + auto const r_sq = scalar_product(dr, dr); + auto const r_sq_inv = 1.0f / r_sq; + auto const r_inv = 
rsqrtf(r_sq); + auto const r3_inv = 1.0f / r_sq * r_inv; + auto const r5_inv = r3_inv * r_sq_inv; + auto const r7_inv = r5_inv * r_sq_inv; // Dot products - pe1 = scalar_product(_dip1, _dip2); - pe2 = scalar_product(_dip1, dr); - pe3 = scalar_product(_dip2, dr); - pe4 = 3.0f * r5_inv; + auto const pe1 = scalar_product(dip1, dip2); + auto const pe2 = scalar_product(dip1, dr); + auto const pe3 = scalar_product(dip2, dr); + auto const pe4 = 3.0f * r5_inv; // Force - a = pe4 * pe1; - b = -15.0f * pe2 * pe3 * r7_inv; - ab = a + b; - cc = pe4 * pe3; - d = pe4 * pe2; + auto const aa = pe4 * pe1; + auto const bb = -15.0f * pe2 * pe3 * r7_inv; + auto const ab = aa + bb; + auto const cc = pe4 * pe3; + auto const dd = pe4 * pe2; - f1[0] = (pf * (ab * dr[0] + cc * _dip1[0] + d * _dip2[0])); - f1[1] = (pf * (ab * dr[1] + cc * _dip1[1] + d * _dip2[1])); - f1[2] = (pf * (ab * dr[2] + cc * _dip1[2] + d * _dip2[2])); + f1[0] = (pf * (ab * dr[0] + cc * dip1[0] + dd * dip2[0])); + f1[1] = (pf * (ab * dr[1] + cc * dip1[1] + dd * dip2[1])); + f1[2] = (pf * (ab * dr[2] + cc * dip1[2] + dd * dip2[2])); #ifdef ROTATION // Torques - ax = _dip1[1] * _dip2[2] - _dip2[1] * _dip1[2]; - ay = _dip2[0] * _dip1[2] - _dip1[0] * _dip2[2]; - az = _dip1[0] * _dip2[1] - _dip2[0] * _dip1[1]; + float a[3]; + vector_product(dip1, dip2, a); - bx = _dip1[1] * dr[2] - dr[1] * _dip1[2]; - by = dr[0] * _dip1[2] - _dip1[0] * dr[2]; - bz = _dip1[0] * dr[1] - dr[0] * _dip1[1]; + float b[3]; + vector_product(dip1, dr, b); - torque1[0] = (pf * (-ax * r3_inv + bx * cc)); - torque1[1] = (pf * (-ay * r3_inv + by * cc)); - torque1[2] = (pf * (-az * r3_inv + bz * cc)); + torque1[0] = pf * (-a[0] * r3_inv + b[0] * cc); + torque1[1] = pf * (-a[1] * r3_inv + b[1] * cc); + torque1[2] = pf * (-a[2] * r3_inv + b[2] * cc); - bx = _dip2[1] * dr[2] - dr[1] * _dip2[2]; - by = dr[0] * _dip2[2] - _dip2[0] * dr[2]; - bz = _dip2[0] * dr[1] - dr[0] * _dip2[1]; + vector_product(dip2, dr, b); - torque2[0] = pf * (ax * r3_inv + bx * d); - torque2[1] = pf * (ay * r3_inv + by * d); - torque2[2] = pf * (az * r3_inv + bz * d); + torque2[0] = pf * (a[0] * r3_inv + b[0] * dd); + torque2[1] = pf * (a[1] * r3_inv + b[1] * dd); + torque2[2] = pf * (a[2] * r3_inv + b[2] * dd); #endif } -__device__ float dipole_ia_energy(int id, float pf, float const *r1, - float const *r2, float const *dip1, - float const *dip2, float box_l[3], - int periodic[3]) { - float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv; - float dr[3]; - float _r1[3], _r2[3], _dip1[3], _dip2[3]; - - for (int i = 0; i < 3; i++) { - _r1[i] = r1[i]; - _r2[i] = r2[i]; - _dip1[i] = dip1[i]; - _dip2[i] = dip2[i]; - } - +__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2, + float const *dip1, float const *dip2, + float box_l[3], int periodic[3]) { // Distance between particles - get_mi_vector_dds(dr, _r1, _r2, box_l, periodic); + float dr[3]; + get_mi_vector_dds(dr, r1, r2, box_l, periodic); // Powers of distance - r_sq = scalar_product(dr, dr); - r_sq_inv = 1 / r_sq; - r_inv = rsqrtf(r_sq); - r3_inv = 1 / r_sq * r_inv; - r5_inv = r3_inv * r_sq_inv; + auto const r_sq = scalar_product(dr, dr); + auto const r_sq_inv = 1.0f / r_sq; + auto const r_inv = rsqrtf(r_sq); + auto const r3_inv = 1.0f / r_sq * r_inv; + auto const r5_inv = r3_inv * r_sq_inv; // Dot products - pe1 = scalar_product(_dip1, _dip2); - pe2 = scalar_product(_dip1, dr); - pe3 = scalar_product(_dip2, dr); - pe4 = 3.0f * r5_inv; + auto const pe1 = scalar_product(dip1, dip2); + auto const pe2 = scalar_product(dip1, 
dr); + auto const pe3 = scalar_product(dip2, dr); + auto const pe4 = 3.0f * r5_inv; // Energy return pf * (pe1 * r3_inv - pe4 * pe2 * pe3); } -__global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos, - float *dip, float *f, +__global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n, + float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3]) { - auto i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + auto const i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= n) return; @@ -179,17 +155,17 @@ __global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos, // to global memory at the end. // Clear summation vars - for (int j = 0; j < 3; j++) { + for (unsigned int j = 0; j < 3; j++) { // Force fsum[j] = 0; // Torque tsum[j] = 0; } - for (int j = i + 1; j < n; j++) { - dipole_ia_force(i, pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, dip + 3 * j, - fi, ti, tj, box_l, periodic); - for (int k = 0; k < 3; k++) { + for (unsigned int j = i + 1; j < n; j++) { + dipole_ia_force(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, dip + 3 * j, fi, + ti, tj, box_l, periodic); + for (unsigned int k = 0; k < 3; k++) { // Add rhs to global memory atomicAdd(f + 3 * j + k, -fi[k]); atomicAdd((torque + 3 * j + k), tj[k]); @@ -206,7 +182,7 @@ __global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos, } __device__ void dds_sumReduction(float *input, float *sum) { - auto tid = static_cast(threadIdx.x); + auto const tid = static_cast(threadIdx.x); for (auto i = static_cast(blockDim.x); i > 1; i /= 2) { __syncthreads(); if (tid < i / 2) @@ -215,17 +191,17 @@ __device__ void dds_sumReduction(float *input, float *sum) { input[tid] += input[i - 1]; } __syncthreads(); - if (threadIdx.x == 0) { + if (tid == 0) { sum[0] = input[0]; } } -__global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos, - float *dip, float box_l[3], - int periodic[3], +__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n, + float *pos, float *dip, + float box_l[3], int periodic[3], float *energySum) { - auto i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + auto const i = blockIdx.x * blockDim.x + threadIdx.x; float sum = 0.0; extern __shared__ float res[]; @@ -236,8 +212,8 @@ __global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos, if (i < n) { // Summation for particle i - for (int j = i + 1; j < n; j++) { - sum += dipole_ia_energy(i, pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, + for (unsigned int j = i + 1; j < n; j++) { + sum += dipole_ia_energy(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, dip + 3 * j, box_l, periodic); } @@ -251,11 +227,22 @@ __global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos, dds_sumReduction(res, &(energySum[blockIdx.x])); } -void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, +inline void copy_box_data(float **box_l_gpu, int **periodic_gpu, + float const *box_l, int const *periodic) { + auto const s_box = 3u * sizeof(float); + auto const s_per = 3u * sizeof(int); + cuda_safe_mem(cudaMalloc(reinterpret_cast(box_l_gpu), s_box)); + cuda_safe_mem(cudaMalloc(reinterpret_cast(periodic_gpu), s_per)); + cuda_safe_mem(cudaMemcpy(*box_l_gpu, box_l, s_box, cudaMemcpyHostToDevice)); + cuda_safe_mem( + cudaMemcpy(*periodic_gpu, periodic, s_per, cudaMemcpyHostToDevice)); +} + +void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3]) { - const int bs = 64; + unsigned 
int const bs = 64; dim3 grid(1, 1, 1); dim3 block(1, 1, 1); @@ -272,12 +259,7 @@ void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, float *box_l_gpu; int *periodic_gpu; - cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float))); - cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int))); - cuda_safe_mem( - cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int), - cudaMemcpyHostToDevice)); + copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic); KERNELCALL(DipolarDirectSum_kernel_force, grid, block, k, n, pos, dip, f, torque, box_l_gpu, periodic_gpu); @@ -285,11 +267,11 @@ void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, cudaFree(periodic_gpu); } -void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos, +void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E) { - const int bs = 512; + unsigned int const bs = 512; dim3 grid(1, 1, 1); dim3 block(1, 1, 1); @@ -306,15 +288,10 @@ void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos, float *box_l_gpu; int *periodic_gpu; - cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float))); - cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int))); - cuda_safe_mem( - cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int), - cudaMemcpyHostToDevice)); + copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic); float *energySum; - cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x))); + cuda_safe_mem(cudaMalloc(&energySum, sizeof(float) * grid.x)); // This will sum the energies up to the block level KERNELCALL_shared(DipolarDirectSum_kernel_energy, grid, block, diff --git a/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp b/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp index 4060b32cfb3..1053bdcdddb 100644 --- a/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp +++ b/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp @@ -280,12 +280,6 @@ inline double dp3m_pair_energy(Particle const &p1, Particle const &p2, auto const C_r = (3 * B_r + 2 * alpsq * coeff * exp_adist2) * dist2i; - /* - printf("(%4i %4i) pair energy = %f (B_r=%15.12f C_r=%15.12f)\n", - p1.p.identity,p2.p.identity,fac1*(mimj*B_r-mir*mjr*C_r),B_r,C_r); - */ - - /* old line return fac1 * ( mimj*B_r - mir*mjr * C_r );*/ return dipole.prefactor * (mimj * B_r - mir * mjr * C_r); } diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu index 5b3c3c1988f..b03339e7927 100644 --- a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu @@ -149,7 +149,7 @@ __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]); *Nenner += S3; - TE = exp(-sqr(Utils::pi() / (p.alpha)) * NM2); + TE = exp(-sqr(Utils::pi() / (p.alpha)) * NM2); zwi = S3 * TE / NM2; Zaehler[0] += NMX * zwi * Leni[0]; Zaehler[1] += NMY * zwi * Leni[1]; @@ -181,9 +181,9 @@ __global__ void calculate_influence_function_device(const P3MGpuData p) { if (((NX == 0) && (NY == 0) && (NZ == 0)) || ((NX % (p.mesh[0] / 2) == 0) && (NY % (p.mesh[1] / 2) == 0) && - (NZ % (p.mesh[2] / 2) == 0))) - p.G_hat[ind] = 0.0; - else { + (NZ % (p.mesh[2] / 
2) == 0))) { + p.G_hat[ind] = 0; + } else { Aliasing_sums_ik(p, NX, NY, NZ, Zaehler, &Nenner); Dnx = static_cast((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX); @@ -194,7 +194,7 @@ __global__ void calculate_influence_function_device(const P3MGpuData p) { Dnz * Zaehler[2] * Leni[2]; zwi /= ((sqr(Dnx * Leni[0]) + sqr(Dny * Leni[1]) + sqr(Dnz * Leni[2])) * sqr(Nenner)); - p.G_hat[ind] = 2.0f * zwi / Utils::pi(); + p.G_hat[ind] = 2 * zwi / Utils::pi(); } } @@ -226,10 +226,10 @@ __global__ void apply_diff_op(const P3MGpuData p) { static_cast(blockIdx.y), static_cast(threadIdx.x)); - auto const nx = static_cast( - (blockIdx.x > p.mesh[0] / 2) ? blockIdx.x - p.mesh[0] : blockIdx.x); - auto const ny = static_cast( - (blockIdx.y > p.mesh[1] / 2) ? blockIdx.y - p.mesh[1] : blockIdx.y); + auto const bidx = static_cast(blockIdx.x); + auto const bidy = static_cast(blockIdx.y); + auto const nx = (bidx > p.mesh[0] / 2) ? bidx - p.mesh[0] : bidx; + auto const ny = (bidy > p.mesh[1] / 2) ? bidy - p.mesh[1] : bidy; auto const nz = static_cast(threadIdx.x); const FFT_TYPE_COMPLEX meshw = p.charge_mesh[linear_index]; @@ -274,11 +274,12 @@ template __global__ void assign_charge_kernel(const CUDA_particle_data *const pdata, const P3MGpuData par, const int parts_per_block) { - const int part_in_block = threadIdx.x / cao; - const int cao_id_x = threadIdx.x - part_in_block * cao; + auto const part_in_block = static_cast(threadIdx.x) / cao; + auto const cao_id_x = static_cast(threadIdx.x) - part_in_block * cao; /* id of the particle */ - int id = - parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block; + auto const id = + parts_per_block * static_cast(blockIdx.x * gridDim.y + blockIdx.y) + + part_in_block; if (id >= par.n_part) return; /* position relative to the closest gird point */ @@ -310,31 +311,32 @@ __global__ void assign_charge_kernel(const CUDA_particle_data *const pdata, extern __shared__ float weights[]; if (shared) { + auto const offset = static_cast(cao * part_in_block); if ((threadIdx.y < 3) && (threadIdx.z == 0)) { - weights[3 * cao * part_in_block + 3 * cao_id_x + threadIdx.y] = + weights[3 * offset + 3 * static_cast(cao_id_x) + threadIdx.y] = Utils::bspline(cao_id_x, m_pos[threadIdx.y]); } __syncthreads(); - atomicAdd(&(charge_mesh[ind]), - weights[3 * cao * part_in_block + 3 * cao_id_x + 0] * - weights[3 * cao * part_in_block + 3 * threadIdx.y + 1] * - weights[3 * cao * part_in_block + 3 * threadIdx.z + 2] * p.q); + auto const c = + weights[3 * offset + 3 * static_cast(cao_id_x) + 0] * + weights[3 * offset + 3 * threadIdx.y + 1] * + weights[3 * offset + 3 * threadIdx.z + 2] * p.q; + atomicAdd(&(charge_mesh[ind]), c); } else { - atomicAdd(&(charge_mesh[ind]), - Utils::bspline(cao_id_x, m_pos[0]) * - Utils::bspline(threadIdx.y, m_pos[1]) * - Utils::bspline(threadIdx.z, m_pos[2]) * p.q); + auto const c = + Utils::bspline(cao_id_x, m_pos[0]) * + Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * + Utils::bspline(static_cast(threadIdx.z), m_pos[2]) * p.q; + atomicAdd(&(charge_mesh[ind]), c); } } void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) { - dim3 grid, block; - grid.z = 1; - const int cao3 = p.cao * p.cao * p.cao; - const int cao = p.cao; + auto const cao = p.cao; + auto const cao3 = int_pow<3>(cao); int parts_per_block = 1, n_blocks = 1; while ((parts_per_block + 1) * cao3 <= 1024) { @@ -345,20 +347,19 @@ void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) { else n_blocks = p.n_part / parts_per_block + 1; - grid.x = 
n_blocks;
-  grid.y = 1;
+  dim3 block(static_cast<unsigned>(parts_per_block * cao),
+             static_cast<unsigned>(cao), static_cast<unsigned>(cao));
+  dim3 grid(static_cast<unsigned>(n_blocks), 1u, 1u);
 
   while (grid.x > 65536) {
     grid.y++;
-    if ((n_blocks % grid.y) == 0)
-      grid.x = std::max(1, n_blocks / grid.y);
+    if ((static_cast<unsigned>(n_blocks) % grid.y) == 0)
+      grid.x = std::max(1u, static_cast<unsigned>(n_blocks) / grid.y);
     else
-      grid.x = n_blocks / grid.y + 1;
+      grid.x = static_cast<unsigned>(n_blocks) / grid.y + 1;
   }
 
-  block.x = parts_per_block * cao;
-  block.y = cao;
-  block.z = cao;
-
+  auto const data_length =
+      3 * static_cast<std::size_t>(parts_per_block * cao) * sizeof(REAL_TYPE);
   switch (cao) {
   case 1:
     (assign_charge_kernel<1, false>)<<<grid, block>>>(
@@ -370,32 +371,27 @@ void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) {
         pdata, p, parts_per_block);
     break;
   case 3:
     (assign_charge_kernel<
-        3, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        3, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   case 4:
     (assign_charge_kernel<
-        4, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        4, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   case 5:
     (assign_charge_kernel<
-        5, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        5, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   case 6:
     (assign_charge_kernel<
-        6, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        6, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   case 7:
     (assign_charge_kernel<
-        7, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        7, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   default:
@@ -409,11 +405,12 @@ __global__ void assign_forces_kernel(const CUDA_particle_data *const pdata,
                                      const P3MGpuData par,
                                      float *lb_particle_force_gpu,
                                      REAL_TYPE prefactor, int parts_per_block) {
-  const int part_in_block = threadIdx.x / cao;
-  const int cao_id_x = threadIdx.x - part_in_block * cao;
+  auto const part_in_block = static_cast<int>(threadIdx.x) / cao;
+  auto const cao_id_x = static_cast<int>(threadIdx.x) - part_in_block * cao;
   /* id of the particle */
-  int id =
-      parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
+  auto const id =
+      parts_per_block * static_cast<int>(blockIdx.x * gridDim.y + blockIdx.y) +
+      part_in_block;
   if (id >= par.n_part)
     return;
   /* position relative to the closest grid point */
@@ -445,20 +442,22 @@ __global__ void assign_forces_kernel(const CUDA_particle_data *const pdata,
   extern __shared__ float weights[];
 
   if (shared) {
+    auto const offset = static_cast<unsigned>(cao * part_in_block);
     if ((threadIdx.y < 3) && (threadIdx.z == 0)) {
-      weights[3 * cao * part_in_block + 3 * cao_id_x + threadIdx.y] =
+      weights[3 * offset + 3 * static_cast<unsigned>(cao_id_x) + threadIdx.y] =
          Utils::bspline(cao_id_x, m_pos[threadIdx.y]);
     }
 
     __syncthreads();
 
-    c = -prefactor * weights[3 * cao * part_in_block + 3 * cao_id_x + 0] *
-        weights[3 * cao * part_in_block + 3 * threadIdx.y + 1] *
-        weights[3 * cao * part_in_block + 3 * threadIdx.z + 2] * p.q;
+    c = -prefactor *
+        weights[3 * offset + 3 * static_cast<unsigned>(cao_id_x) + 0] *
+        weights[3 * offset + 3 * threadIdx.y + 1] *
+        weights[3 * offset + 3 * threadIdx.z + 2] * p.q;
   } else {
     c = -prefactor * Utils::bspline(cao_id_x, m_pos[0]) *
-        Utils::bspline(threadIdx.y, m_pos[1]) *
-        Utils::bspline(threadIdx.z, m_pos[2]) * p.q;
+        Utils::bspline(static_cast<int>(threadIdx.y), m_pos[1]) *
+        Utils::bspline(static_cast<int>(threadIdx.z), m_pos[2]) * p.q;
   }
 
   const REAL_TYPE *force_mesh_x = (REAL_TYPE *)par.force_mesh_x;
@@ -472,11 +471,8 @@ __global__ void assign_forces_kernel(const CUDA_particle_data *const pdata,
 
 void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p,
                    float *lb_particle_force_gpu, REAL_TYPE prefactor) {
-  dim3 grid, block;
-  grid.z = 1;
-
-  const int cao = p.cao;
-  const int cao3 = cao * cao * cao;
+  auto const cao = p.cao;
+  auto const cao3 = 
int_pow<3>(cao);
   int parts_per_block = 1, n_blocks = 1;
 
   while ((parts_per_block + 1) * cao3 <= 1024) {
@@ -488,22 +484,21 @@ void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p,
   else
     n_blocks = p3m_gpu_data.n_part / parts_per_block + 1;
 
-  grid.x = n_blocks;
-  grid.y = 1;
+  dim3 block(static_cast<unsigned>(parts_per_block * cao),
+             static_cast<unsigned>(cao), static_cast<unsigned>(cao));
+  dim3 grid(static_cast<unsigned>(n_blocks), 1u, 1u);
 
   while (grid.x > 65536) {
     grid.y++;
-    if ((n_blocks % grid.y) == 0)
-      grid.x = std::max(1, n_blocks / grid.y);
+    if ((static_cast<unsigned>(n_blocks) % grid.y) == 0)
+      grid.x = std::max(1u, static_cast<unsigned>(n_blocks) / grid.y);
     else
-      grid.x = n_blocks / grid.y + 1;
+      grid.x = static_cast<unsigned>(n_blocks) / grid.y + 1;
   }
 
-  block.x = parts_per_block * cao;
-  block.y = cao;
-  block.z = cao;
-
   /* Switch for assignment templates, the shared version only is faster for cao
    * > 2 */
+  auto const data_length =
+      3 * static_cast<std::size_t>(parts_per_block * cao) * sizeof(float);
   switch (cao) {
   case 1:
     (assign_forces_kernel<1, false>)<<<grid, block>>>(
@@ -515,32 +510,27 @@ void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p,
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 3:
     (assign_forces_kernel<
-        3, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        3, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 4:
     (assign_forces_kernel<
-        4, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        4, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 5:
     (assign_forces_kernel<
-        5, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        5, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 6:
     (assign_forces_kernel<
-        6, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        6, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 7:
     (assign_forces_kernel<
-        7, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        7, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   default:
@@ -562,26 +552,26 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) {
   auto &espresso_system = EspressoSystemInterface::Instance();
   espresso_system.requestParticleStructGpu();
 
-  bool reinit_if = false, mesh_changed = false;
-  p3m_gpu_data.n_part = gpu_get_particle_pointer().size();
+  bool do_reinit = false, mesh_changed = false;
+  p3m_gpu_data.n_part = static_cast<int>(gpu_get_particle_pointer().size());
 
   if (!p3m_gpu_data_initialized || p3m_gpu_data.alpha != alpha) {
     p3m_gpu_data.alpha = static_cast<REAL_TYPE>(alpha);
-    reinit_if = true;
+    do_reinit = true;
   }
 
   if (!p3m_gpu_data_initialized || p3m_gpu_data.cao != cao) {
     p3m_gpu_data.cao = cao;
     // NOLINTNEXTLINE(bugprone-integer-division)
     p3m_gpu_data.pos_shift = static_cast<REAL_TYPE>((p3m_gpu_data.cao - 1) / 2);
-    reinit_if = true;
+    do_reinit = true;
   }
 
   if (!p3m_gpu_data_initialized || (p3m_gpu_data.mesh[0] != mesh[0]) ||
       (p3m_gpu_data.mesh[1] != mesh[1]) || (p3m_gpu_data.mesh[2] != mesh[2])) {
     std::copy(mesh, mesh + 3, p3m_gpu_data.mesh);
     mesh_changed = true;
-    reinit_if = true;
+    do_reinit = true;
   }
 
   auto const box_l = espresso_system.box();
@@ -589,7 +579,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) {
   if (!p3m_gpu_data_initialized || (p3m_gpu_data.box[0] != box_l[0]) ||
       (p3m_gpu_data.box[1] != box_l[1]) || (p3m_gpu_data.box[2] != box_l[2])) {
     std::copy(box_l.begin(), box_l.end(), p3m_gpu_data.box);
-    reinit_if = true;
+    do_reinit = true;
   }
 
   p3m_gpu_data.mesh_z_padded = (mesh[2] / 2 + 1) * 2;
@@ -620,17 +610,15 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) {
 
   if (!p3m_gpu_data_initialized && p3m_gpu_data.mesh_size > 0) {
     /* Size of the complex mesh Nx * Ny * ( Nz / 2 + 1 ) */
-    const int cmesh_size = p3m_gpu_data.mesh[0] * 
p3m_gpu_data.mesh[1] * - (p3m_gpu_data.mesh[2] / 2 + 1); - - cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.charge_mesh), - cmesh_size * sizeof(FFT_TYPE_COMPLEX))); - cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_x), - cmesh_size * sizeof(FFT_TYPE_COMPLEX))); - cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_y), - cmesh_size * sizeof(FFT_TYPE_COMPLEX))); - cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_z), - cmesh_size * sizeof(FFT_TYPE_COMPLEX))); + auto const cmesh_size = + static_cast(p3m_gpu_data.mesh[0]) * + static_cast(p3m_gpu_data.mesh[1]) * + static_cast(p3m_gpu_data.mesh[2] / 2 + 1); + auto const mesh_len = cmesh_size * sizeof(FFT_TYPE_COMPLEX); + cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.charge_mesh), mesh_len)); + cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_x), mesh_len)); + cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_y), mesh_len)); + cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_z), mesh_len)); cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.G_hat), cmesh_size * sizeof(REAL_TYPE))); @@ -642,14 +630,14 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { } } - if ((reinit_if or !p3m_gpu_data_initialized) && p3m_gpu_data.mesh_size > 0) { + if ((do_reinit or !p3m_gpu_data_initialized) && p3m_gpu_data.mesh_size > 0) { dim3 grid(1, 1, 1); dim3 block(1, 1, 1); - block.x = 512 / mesh[0] + 1; - block.y = mesh[1]; + block.x = static_cast(512 / mesh[0] + 1); + block.y = static_cast(mesh[1]); block.z = 1; - grid.x = mesh[0] / block.x + 1; - grid.z = mesh[2] / 2 + 1; + grid.x = static_cast(mesh[0]) / block.x + 1; + grid.z = static_cast(mesh[2]) / 2 + 1; switch (p3m_gpu_data.cao) { case 1: @@ -692,22 +680,23 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { void p3m_gpu_add_farfield_force() { auto device_particles = gpu_get_particle_pointer(); CUDA_particle_data *lb_particle_gpu = device_particles.data(); - p3m_gpu_data.n_part = device_particles.size(); + p3m_gpu_data.n_part = static_cast(device_particles.size()); float *lb_particle_force_gpu = gpu_get_particle_force_pointer(); if (p3m_gpu_data.n_part == 0) return; - dim3 gridConv(p3m_gpu_data.mesh[0], p3m_gpu_data.mesh[1], 1); - dim3 threadsConv(p3m_gpu_data.mesh[2] / 2 + 1, 1, 1); + dim3 gridConv(static_cast(p3m_gpu_data.mesh[0]), + static_cast(p3m_gpu_data.mesh[1]), 1u); + dim3 threadsConv(static_cast(p3m_gpu_data.mesh[2] / 2 + 1), 1u, 1u); - auto prefactor = - REAL_TYPE(coulomb.prefactor / (p3m_gpu_data.box[0] * p3m_gpu_data.box[1] * - p3m_gpu_data.box[2] * 2.0)); + auto const volume = Utils::product(Utils::Vector3d(p3m_gpu_data.box)); + auto const prefactor = REAL_TYPE(coulomb.prefactor / (volume * 2.0)); cuda_safe_mem(cudaMemset(p3m_gpu_data.charge_mesh, 0, - p3m_gpu_data.mesh_size * sizeof(REAL_TYPE))); + static_cast(p3m_gpu_data.mesh_size) * + sizeof(REAL_TYPE))); /* Interpolate the charges to the mesh */ assign_charges(lb_particle_gpu, p3m_gpu_data); diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu index 98b2aaf5b7e..2d2a40604b8 100644 --- a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu +++ b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu @@ -104,8 +104,7 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi, const double cs = p3m_analytic_cotangent_sum(nz, meshi.z) * p3m_analytic_cotangent_sum(nx, meshi.x) * p3m_analytic_cotangent_sum(ny, meshi.y); - const double ex = - 
exp(-(Utils::pi() * alpha_L_i) * (Utils::pi() * alpha_L_i) * n2); + const double ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2); const double ex2 = sqr(ex); const double U2 = int_pow<2 * cao>(Utils::sinc(meshi.x * nx) * Utils::sinc(meshi.y * ny) * @@ -123,11 +122,11 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi, __global__ void p3m_k_space_error_gpu_kernel_ad(const int3 mesh, const double3 meshi, int cao, double alpha_L, double *he_q) { - int nx = + auto const nx = -mesh.x / 2 + static_cast(blockDim.x * blockIdx.x + threadIdx.x); - int ny = + auto const ny = -mesh.y / 2 + static_cast(blockDim.y * blockIdx.y + threadIdx.y); - int nz = + auto const nz = -mesh.z / 2 + static_cast(blockDim.z * blockIdx.z + threadIdx.z); if ((nx >= mesh.x / 2) || (ny >= mesh.y / 2) || (nz >= mesh.z / 2)) @@ -136,31 +135,25 @@ __global__ void p3m_k_space_error_gpu_kernel_ad(const int3 mesh, int lind = ((nx + mesh.x / 2) * mesh.y * mesh.z + (ny + mesh.y / 2) * mesh.z + (nz + mesh.z / 2)); - double alpha_L_i = 1. / alpha_L; - double n2; - double U2, ex, ex2; - int nmx, nmy, nmz; + auto const alpha_L_i = 1. / alpha_L; double alias1, alias2, alias3, alias4; - alias1 = alias2 = alias3 = alias4 = 0; if ((nx != 0) || (ny != 0) || (nz != 0)) { for (int mx = -1; mx <= 1; mx++) { - nmx = nx + mx * mesh.x; + auto const nmx = static_cast(nx + mx * mesh.x); for (int my = -1; my <= 1; my++) { - nmy = ny + my * mesh.y; + auto const nmy = static_cast(ny + my * mesh.y); for (int mz = -1; mz <= 1; mz++) { - nmz = nz + mz * mesh.z; - - n2 = sqr(nmx) + sqr(nmy) + sqr(nmz); - - ex = exp(-(Utils::pi() * alpha_L_i) * (Utils::pi() * alpha_L_i) * n2); - - ex2 = sqr(ex); + auto const nmz = static_cast(nz + mz * mesh.z); - U2 = pow((double)Utils::sinc(meshi.x * nmx) * - Utils::sinc(meshi.y * nmy) * Utils::sinc(meshi.z * nmz), - 2.0 * cao); + auto const n2 = static_cast(sqr(nmx) + sqr(nmy) + sqr(nmz)); + auto const ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2); + auto const ex2 = sqr(ex); + auto const U2 = + pow(Utils::sinc(meshi.x * nmx) * Utils::sinc(meshi.y * nmy) * + Utils::sinc(meshi.z * nmz), + 2.0 * cao); alias1 += ex2 / n2; alias2 += U2 * ex; @@ -185,11 +178,11 @@ __global__ void p3m_k_space_error_gpu_kernel_ik_i(const int3 mesh, double alpha_L, double *he_q) { - int nx = + auto const nx = -mesh.x / 2 + static_cast(blockDim.x * blockIdx.x + threadIdx.x); - int ny = + auto const ny = -mesh.y / 2 + static_cast(blockDim.y * blockIdx.y + threadIdx.y); - int nz = + auto const nz = -mesh.z / 2 + static_cast(blockDim.z * blockIdx.z + threadIdx.z); if ((nx >= mesh.x / 2) || (ny >= mesh.y / 2) || (nz >= mesh.z / 2)) @@ -198,31 +191,25 @@ __global__ void p3m_k_space_error_gpu_kernel_ik_i(const int3 mesh, int lind = ((nx + mesh.x / 2) * mesh.y * mesh.z + (ny + mesh.y / 2) * mesh.z + (nz + mesh.z / 2)); - double alpha_L_i = 1. / alpha_L; - double n2; - double U2, ex, ex2; - int nmx, nmy, nmz; + auto const alpha_L_i = 1. 
/ alpha_L; double alias1, alias2, alias3, alias4; - alias1 = alias2 = alias3 = alias4 = 0; if ((nx != 0) || (ny != 0) || (nz != 0)) { for (int mx = -1; mx <= 1; mx++) { - nmx = nx + mx * mesh.x; + auto const nmx = static_cast(nx + mx * mesh.x); for (int my = -1; my <= 1; my++) { - nmy = ny + my * mesh.y; + auto const nmy = static_cast(ny + my * mesh.y); for (int mz = -1; mz <= 1; mz++) { - nmz = nz + mz * mesh.z; - - n2 = sqr(nmx) + sqr(nmy) + sqr(nmz); - - ex = exp(-(Utils::pi() * alpha_L_i) * (Utils::pi() * alpha_L_i) * n2); + auto const nmz = static_cast(nz + mz * mesh.z); - ex2 = sqr(ex); - - U2 = pow((double)Utils::sinc(meshi.x * nmx) * - Utils::sinc(meshi.y * nmy) * Utils::sinc(meshi.z * nmz), - 2.0 * cao); + auto const n2 = static_cast(sqr(nmx) + sqr(nmy) + sqr(nmz)); + auto const ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2); + auto const ex2 = sqr(ex); + auto const U2 = + pow(Utils::sinc(meshi.x * nmx) * Utils::sinc(meshi.y * nmy) * + Utils::sinc(meshi.z * nmz), + 2.0 * cao); alias1 += ex2 / n2; alias2 += U2 * ex * (nx * nmx + ny * nmy + nz * nmz) / n2; @@ -251,11 +238,11 @@ __global__ void p3m_k_space_error_gpu_kernel_ad_i(const int3 mesh, double alpha_L, double *he_q) { - int nx = + auto const nx = -mesh.x / 2 + static_cast(blockDim.x * blockIdx.x + threadIdx.x); - int ny = + auto const ny = -mesh.y / 2 + static_cast(blockDim.y * blockIdx.y + threadIdx.y); - int nz = + auto const nz = -mesh.z / 2 + static_cast(blockDim.z * blockIdx.z + threadIdx.z); if ((nx >= mesh.x / 2) || (ny >= mesh.y / 2) || (nz >= mesh.z / 2)) @@ -264,31 +251,25 @@ __global__ void p3m_k_space_error_gpu_kernel_ad_i(const int3 mesh, int lind = ((nx + mesh.x / 2) * mesh.y * mesh.z + (ny + mesh.y / 2) * mesh.z + (nz + mesh.z / 2)); - double alpha_L_i = 1. / alpha_L; - double n2; - double U2, ex, ex2; - int nmx, nmy, nmz; + auto const alpha_L_i = 1. 
/ alpha_L; double alias1, alias2, alias3, alias4, alias5, alias6; - alias1 = alias2 = alias3 = alias4 = alias5 = alias6 = 0; if ((nx != 0) && (ny != 0) && (nz != 0)) { for (int mx = -1; mx <= 1; mx++) { - nmx = nx + mx * mesh.x; + auto const nmx = static_cast(nx + mx * mesh.x); for (int my = -1; my <= 1; my++) { - nmy = ny + my * mesh.y; + auto const nmy = static_cast(ny + my * mesh.y); for (int mz = -1; mz <= 1; mz++) { - nmz = nz + mz * mesh.z; - - n2 = sqr(nmx) + sqr(nmy) + sqr(nmz); - - ex = exp(-(Utils::pi() * alpha_L_i) * (Utils::pi() * alpha_L_i) * n2); + auto const nmz = static_cast(nz + mz * mesh.z); - ex2 = sqr(ex); - - U2 = pow((double)Utils::sinc(meshi.x * nmx) * - Utils::sinc(meshi.y * nmy) * Utils::sinc(meshi.z * nmz), - 2.0 * cao); + auto const n2 = sqr(nmx) + sqr(nmy) + sqr(nmz); + auto const ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2); + auto const ex2 = sqr(ex); + auto const U2 = + pow(Utils::sinc(meshi.x * nmx) * Utils::sinc(meshi.y * nmy) * + Utils::sinc(meshi.z * nmz), + 2.0 * cao); alias1 += ex2 / n2; alias2 += U2 * ex; @@ -318,14 +299,11 @@ double p3m_k_space_error_gpu(double prefactor, const int *mesh, int cao, int npart, double sum_q2, double alpha_L, const double *box) { static thrust::device_vector he_q; + he_q.resize(static_cast(mesh[0] * mesh[1] * mesh[2])); - const std::size_t mesh_size = mesh[0] * mesh[1] * mesh[2]; - - he_q.resize(mesh_size); - - dim3 grid(std::max(1, mesh[0] / 8 + 1), - std::max(1, mesh[1] / 8 + 1), - std::max(1, mesh[2] / 8 + 1)); + dim3 grid(std::max(1u, static_cast(mesh[0] / 8 + 1)), + std::max(1u, static_cast(mesh[1] / 8 + 1)), + std::max(1u, static_cast(mesh[2] / 8 + 1))); dim3 block(8, 8, 8); @@ -372,6 +350,5 @@ double p3m_k_space_error_gpu(double prefactor, const int *mesh, int cao, auto const he_q_final = thrust::reduce(he_q.begin(), he_q.end()); - return 2.0 * prefactor * sum_q2 * sqrt(he_q_final / npart) / - (box[1] * box[2]); + return 2 * prefactor * sum_q2 * sqrt(he_q_final / npart) / (box[1] * box[2]); } From 48592362a65c41fc740b1027824719100dbe3155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 1 Dec 2021 21:58:56 +0100 Subject: [PATCH 03/21] core: Cleanup header includes --- src/core/EspressoSystemInterface.cpp | 2 ++ src/core/EspressoSystemInterface.hpp | 3 +++ src/core/actor/DipolarBarnesHut.hpp | 2 -- src/core/actor/DipolarBarnesHut_cuda.cu | 3 +++ src/core/actor/DipolarDirectSum.hpp | 2 -- src/core/cuda_interface.cpp | 1 + src/core/electrostatics_magnetostatics/coulomb_inline.hpp | 1 + src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu | 3 ++- src/core/event.cpp | 2 -- src/core/grid_based_algorithms/electrokinetics_cuda.cu | 1 + src/core/immersed_boundary/ibm_triel.cpp | 4 ++-- src/core/pair_criteria/pair_criteria.hpp | 1 + 12 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp index 3784f39903f..61bc74e830d 100644 --- a/src/core/EspressoSystemInterface.cpp +++ b/src/core/EspressoSystemInterface.cpp @@ -23,6 +23,8 @@ #include "cuda_interface.hpp" #include "grid.hpp" +#include + /* Initialize instance pointer */ EspressoSystemInterface *EspressoSystemInterface::m_instance = nullptr; diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index cdcad3db7f8..91329c65c7f 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -20,8 +20,11 @@ #define ESPRESSOSYSTEMINTERFACE_H #include "SystemInterface.hpp" +#include "config.hpp" 
#include "cuda_interface.hpp" +#include + #include class EspressoSystemInterface : public SystemInterface { diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 98364fde674..1bf3815527b 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -31,8 +31,6 @@ #include "electrostatics_magnetostatics/dipole.hpp" #include "errorhandling.hpp" -#include - class DipolarBarnesHut : public Actor { public: DipolarBarnesHut(SystemInterface &s, float epssq, float itolsq) { diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index baadc95522e..456895976ff 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -36,6 +36,9 @@ #include +#include +#include + // Method performance/accuracy parameters __constant__ float epssqd[1], itolsqd[1]; // blkcntd is a factual blocks' count. diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 1938dd224ab..ba5cdc9d05c 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -30,8 +30,6 @@ #include "errorhandling.hpp" #include "grid.hpp" -#include - void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E); diff --git a/src/core/cuda_interface.cpp b/src/core/cuda_interface.cpp index 8b5b87d9927..a92fe9e4bfb 100644 --- a/src/core/cuda_interface.cpp +++ b/src/core/cuda_interface.cpp @@ -28,6 +28,7 @@ #include "nonbonded_interactions/nonbonded_interaction_data.hpp" #include "serialization/CUDA_particle_data.hpp" +#include #include #include diff --git a/src/core/electrostatics_magnetostatics/coulomb_inline.hpp b/src/core/electrostatics_magnetostatics/coulomb_inline.hpp index d2a0837ccf4..e3c912c94ae 100644 --- a/src/core/electrostatics_magnetostatics/coulomb_inline.hpp +++ b/src/core/electrostatics_magnetostatics/coulomb_inline.hpp @@ -35,6 +35,7 @@ #include #include +#include #include namespace Coulomb { diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu index 2d2a40604b8..3b6591c4388 100644 --- a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu +++ b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu @@ -36,7 +36,8 @@ #include -#include +#include +#include #if defined(OMPI_MPI_H) || defined(_MPI_H) #error CU-file includes mpi.h! This should not happen! 
diff --git a/src/core/event.cpp b/src/core/event.cpp index 2753a609305..883818372fd 100644 --- a/src/core/event.cpp +++ b/src/core/event.cpp @@ -56,8 +56,6 @@ #include -#include - #include /** whether the thermostat has to be reinitialized before integration */ diff --git a/src/core/grid_based_algorithms/electrokinetics_cuda.cu b/src/core/grid_based_algorithms/electrokinetics_cuda.cu index b7ad6e3ff1c..9640489e2b5 100644 --- a/src/core/grid_based_algorithms/electrokinetics_cuda.cu +++ b/src/core/grid_based_algorithms/electrokinetics_cuda.cu @@ -46,6 +46,7 @@ #include #include +#include #include #include #include diff --git a/src/core/immersed_boundary/ibm_triel.cpp b/src/core/immersed_boundary/ibm_triel.cpp index 8bebd14df86..123ffa587ef 100644 --- a/src/core/immersed_boundary/ibm_triel.cpp +++ b/src/core/immersed_boundary/ibm_triel.cpp @@ -27,8 +27,8 @@ #include #include -#include -#include +#include + #include namespace { diff --git a/src/core/pair_criteria/pair_criteria.hpp b/src/core/pair_criteria/pair_criteria.hpp index ac4d787c693..fde2c8b4a2b 100644 --- a/src/core/pair_criteria/pair_criteria.hpp +++ b/src/core/pair_criteria/pair_criteria.hpp @@ -21,6 +21,7 @@ #include "Particle.hpp" #include "energy_inline.hpp" +#include "grid.hpp" #include "particle_data.hpp" #include From 851d218afe178300ca60d7a758aa1cc21c834e62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 2 Dec 2021 20:53:58 +0100 Subject: [PATCH 04/21] core: Extract GPU check logic --- src/core/cuda_common_cuda.cu | 14 +------------- src/core/cuda_init.hpp | 7 +++++++ src/core/cuda_init_cuda.cu | 14 ++++++++++++++ 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/core/cuda_common_cuda.cu b/src/core/cuda_common_cuda.cu index 1f6b867e9e3..e5d97e8baf3 100644 --- a/src/core/cuda_common_cuda.cu +++ b/src/core/cuda_common_cuda.cu @@ -133,19 +133,7 @@ void resize_buffers(std::size_t number_of_particles) { void gpu_init_particle_comm() { if (this_node == 0 && global_part_vars_host.communication_enabled == 0) { try { - if (cuda_get_n_gpus() == 0) { - fprintf(stderr, "ERROR: No GPU was found.\n"); - errexit(); - } - auto const devID = cuda_get_device(); - auto const compute_capability = cuda_check_gpu_compute_capability(devID); - auto const communication_test = cuda_test_device_access(); - if (compute_capability != ES_OK or communication_test != ES_OK) { - fprintf(stderr, - "ERROR: CUDA device %i is not capable of running ESPResSo.\n", - devID); - errexit(); - } + cuda_check_device(); } catch (cuda_runtime_error const &err) { fprintf(stderr, "ERROR: %s\n", err.what()); errexit(); diff --git a/src/core/cuda_init.hpp b/src/core/cuda_init.hpp index 810cf075d9d..b81d54dc551 100644 --- a/src/core/cuda_init.hpp +++ b/src/core/cuda_init.hpp @@ -92,6 +92,13 @@ int cuda_get_device(); */ int cuda_test_device_access(); +/** + * Check that a device is available, that its compute capability + * is sufficient for ESPResSo, and that data can be written to + * and read from it. Otherwise, throw an exception. + */ +void cuda_check_device(); + /** Gather unique list of CUDA devices on all nodes. * @return vector of device properties. */ diff --git a/src/core/cuda_init_cuda.cu b/src/core/cuda_init_cuda.cu index e434d7132f0..b5616ae0da1 100644 --- a/src/core/cuda_init_cuda.cu +++ b/src/core/cuda_init_cuda.cu @@ -25,6 +25,7 @@ #include #include +#include #if defined(OMPI_MPI_H) || defined(_MPI_H) #error CU-file includes mpi.h! This should not happen! 
@@ -118,4 +119,17 @@ int cuda_test_device_access() {
   return ES_OK;
 }
 
+void cuda_check_device() {
+  if (cuda_get_n_gpus() == 0) {
+    throw cuda_runtime_error("No GPU was found.");
+  }
+  auto const devID = cuda_get_device();
+  auto const compute_capability = cuda_check_gpu_compute_capability(devID);
+  auto const communication_test = cuda_test_device_access();
+  if (compute_capability != ES_OK or communication_test != ES_OK) {
+    throw cuda_runtime_error("CUDA device " + std::to_string(devID) +
+                             " is not capable of running ESPResSo.");
+  }
+}
+
 #endif /* defined(CUDA) */

From 0dd9acb71843779bf10684d8c331e4c0a92180af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?=
Date: Thu, 2 Dec 2021 21:10:26 +0100
Subject: [PATCH 05/21] tests: Check EspressoSystemInterface class

---
 src/core/unit_tests/CMakeLists.txt            |   2 +
 .../EspressoSystemInterface_test.cpp          | 206 ++++++++++++++++++
 2 files changed, 208 insertions(+)
 create mode 100644 src/core/unit_tests/EspressoSystemInterface_test.cpp

diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt
index 6f9c701753c..5e051fce5f3 100644
--- a/src/core/unit_tests/CMakeLists.txt
+++ b/src/core/unit_tests/CMakeLists.txt
@@ -27,6 +27,8 @@ unit_test(NAME RuntimeErrorCollector_test SRC RuntimeErrorCollector_test.cpp
 unit_test(NAME EspressoSystemStandAlone_test SRC
           EspressoSystemStandAlone_test.cpp DEPENDS EspressoCore Boost::mpi
           MPI::MPI_CXX NUM_PROC 2)
+unit_test(NAME EspressoSystemInterface_test SRC
+          EspressoSystemInterface_test.cpp DEPENDS EspressoCore Boost::mpi)
 unit_test(NAME MpiCallbacks_test SRC MpiCallbacks_test.cpp DEPENDS
           EspressoUtils Boost::mpi MPI::MPI_CXX NUM_PROC 2)
 unit_test(NAME ParticleIterator_test SRC ParticleIterator_test.cpp DEPENDS
diff --git a/src/core/unit_tests/EspressoSystemInterface_test.cpp b/src/core/unit_tests/EspressoSystemInterface_test.cpp
new file mode 100644
index 00000000000..a270579384f
--- /dev/null
+++ b/src/core/unit_tests/EspressoSystemInterface_test.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2021 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#define BOOST_TEST_NO_MAIN +#define BOOST_TEST_MODULE EspressoSystemInterface test +#define BOOST_TEST_ALTERNATIVE_INIT_API +#define BOOST_TEST_DYN_LINK +#include + +#include "EspressoSystemInterface.hpp" +#include "communication.hpp" +#include "config.hpp" +#include "particle_data.hpp" +#include "virtual_sites.hpp" +#include "virtual_sites/VirtualSitesOff.hpp" + +#include + +#include +#include + +namespace espresso { +auto &system = EspressoSystemInterface::Instance(); +} + +inline void check_uninitialized_device_pointers() { + BOOST_CHECK_EQUAL(espresso::system.eGpu(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.rGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.rGpuEnd(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.dipGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.dipGpuEnd(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.fGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.fGpuEnd(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.torqueGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.torqueGpuEnd(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.qGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.qGpuEnd(), nullptr); +} + +#ifdef CUDA + +#include "cuda_init.hpp" +#include "cuda_utils.hpp" + +/* Decorator to run a unit test depending on GPU availability. */ +boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) { + bool has_compatible_gpu = false; + try { + cuda_check_device(); + has_compatible_gpu = true; + } catch (cuda_runtime_error const &err) { + } + return has_compatible_gpu; +} + +BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { + check_uninitialized_device_pointers(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + + // features compiled in + auto has_feature_rotation = false; + auto has_feature_electrostatics = false; + auto has_feature_dipoles = false; +#ifdef ROTATION + has_feature_rotation = true; +#endif +#ifdef ELECTROSTATICS + has_feature_electrostatics = true; +#endif +#ifdef DIPOLES + has_feature_dipoles = true; +#endif + + auto const pid = 1; + place_particle(pid, {0., 0., 0.}); + set_particle_type(pid, 0); + + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + espresso::system.update(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + BOOST_TEST(espresso::system.requestParticleStructGpu()); + espresso::system.update(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 1); + + // check position split + BOOST_TEST(espresso::system.hasRGpu()); + BOOST_TEST(espresso::system.requestRGpu()); + espresso::system.update(); + BOOST_TEST(espresso::system.rGpuBegin() != nullptr); + BOOST_TEST(espresso::system.rGpuEnd() != nullptr); + BOOST_CHECK_EQUAL(espresso::system.rGpuEnd() - espresso::system.rGpuBegin(), + 3); + + // check force split + BOOST_TEST(espresso::system.hasFGpu()); + BOOST_TEST(espresso::system.requestFGpu()); + espresso::system.update(); + BOOST_TEST(espresso::system.fGpuBegin() != nullptr); + BOOST_TEST(espresso::system.fGpuEnd() != nullptr); + BOOST_CHECK_EQUAL(espresso::system.fGpuEnd() - espresso::system.fGpuBegin(), + 3); + + // check torque split + BOOST_CHECK_EQUAL(espresso::system.hasTorqueGpu(), has_feature_rotation); + BOOST_CHECK_EQUAL(espresso::system.requestTorqueGpu(), has_feature_rotation); +#ifdef ROTATION + espresso::system.update(); + BOOST_TEST(espresso::system.torqueGpuBegin() != nullptr); + BOOST_TEST(espresso::system.torqueGpuEnd() != nullptr); + BOOST_CHECK_EQUAL( + espresso::system.torqueGpuEnd() - 
espresso::system.torqueGpuBegin(), 3); +#endif + + // check charge split + BOOST_CHECK_EQUAL(espresso::system.hasQGpu(), has_feature_electrostatics); + BOOST_CHECK_EQUAL(espresso::system.requestQGpu(), has_feature_electrostatics); +#ifdef ELECTROSTATICS + espresso::system.update(); + BOOST_TEST(espresso::system.qGpuBegin() != nullptr); + BOOST_TEST(espresso::system.qGpuEnd() != nullptr); + BOOST_CHECK_EQUAL(espresso::system.qGpuEnd() - espresso::system.qGpuBegin(), + 3); +#endif + + // check dipole split + BOOST_CHECK_EQUAL(espresso::system.hasDipGpu(), has_feature_dipoles); + BOOST_CHECK_EQUAL(espresso::system.requestDipGpu(), has_feature_dipoles); +#ifdef DIPOLES + espresso::system.update(); + BOOST_TEST(espresso::system.dipGpuBegin() != nullptr); + BOOST_TEST(espresso::system.dipGpuEnd() != nullptr); + BOOST_CHECK_EQUAL( + espresso::system.dipGpuEnd() - espresso::system.dipGpuBegin(), 3); +#endif + + // clear device memory + remove_particle(pid); + espresso::system.update(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + BOOST_CHECK_EQUAL(espresso::system.rGpuBegin(), espresso::system.rGpuEnd()); + BOOST_CHECK_EQUAL(espresso::system.fGpuBegin(), espresso::system.fGpuEnd()); +#ifdef ELECTROSTATICS + BOOST_CHECK_EQUAL(espresso::system.qGpuBegin(), espresso::system.qGpuEnd()); +#endif +#ifdef DIPOLES + BOOST_CHECK_EQUAL(espresso::system.dipGpuBegin(), + espresso::system.dipGpuEnd()); +#endif +#ifdef ROTATION + BOOST_CHECK_EQUAL(espresso::system.torqueGpuBegin(), + espresso::system.torqueGpuEnd()); +#endif +} + +#else // CUDA + +BOOST_AUTO_TEST_CASE(check_without_cuda) { + check_uninitialized_device_pointers(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + BOOST_TEST(!espresso::system.hasRGpu()); + BOOST_TEST(!espresso::system.requestRGpu()); + BOOST_TEST(!espresso::system.hasDipGpu()); + BOOST_TEST(!espresso::system.requestDipGpu()); + BOOST_TEST(!espresso::system.hasFGpu()); + BOOST_TEST(!espresso::system.requestFGpu()); + BOOST_TEST(!espresso::system.hasTorqueGpu()); + BOOST_TEST(!espresso::system.requestTorqueGpu()); + BOOST_TEST(!espresso::system.hasQGpu()); + BOOST_TEST(!espresso::system.requestQGpu()); +} + +#endif // CUDA + +int main(int argc, char **argv) { + auto mpi_env = mpi_init(argc, argv); + + // this unit test only runs on 1 core + assert(boost::mpi::communicator().size() == 1); + + // initialize the MpiCallbacks framework + Communication::init(mpi_env); +#ifdef VIRTUAL_SITES + set_virtual_sites(std::make_shared()); +#endif + mpi_loop(); + + espresso::system.init(); + + return boost::unit_test::unit_test_main(init_unit_test, argc, argv); +} From b27eee642a5a31bbf8cae4638acca4449a477de2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 16:48:27 +0100 Subject: [PATCH 06/21] core: Remove unused GPU interface for AoS->SoA No features require the particle velocity and director to be converted from an array of structs to a struct of arrays. 
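For context, the helpers deleted below all follow the same AoS-to-SoA split
pattern: one thread per particle reads the array-of-structs particle buffer
and scatters one property into a flat float array. A minimal sketch of that
pattern is given here; ParticleAoS and split_velocities are illustrative
names, not part of the ESPResSo API:

    // One thread per particle: copy the velocity of particle idx from the
    // array-of-structs buffer into a contiguous float array of length 3 * n.
    struct ParticleAoS {
      float v[3]; // velocity; other members omitted
    };

    __global__ void split_velocities(ParticleAoS const *particles, float *v,
                                     unsigned int n) {
      auto idx = blockDim.x * blockIdx.x + threadIdx.x;
      if (idx >= n)
        return;
      ParticleAoS const p = particles[idx];
      idx *= 3;
      v[idx + 0] = p.v[0];
      v[idx + 1] = p.v[1];
      v[idx + 2] = p.v[2];
    }

Since no actor ever requests the velocity or director split, these two
variants of the kernel are dead code and can be removed.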
--- src/core/EspressoSystemInterface.hpp | 35 +------------- src/core/EspressoSystemInterface_cuda.cu | 59 ------------------------ src/core/SystemInterface.hpp | 24 +--------- 3 files changed, 4 insertions(+), 114 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 91329c65c7f..e22507d81f0 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -71,17 +71,6 @@ class EspressoSystemInterface : public SystemInterface { return m_needsDipGpu; }; #endif - float *vGpuBegin() override { return m_v_gpu_begin; }; - float *vGpuEnd() override { return m_v_gpu_end; }; - bool hasVGpu() override { return true; }; - bool requestVGpu() override { - m_needsVGpu = hasVGpu(); - m_splitParticleStructGpu |= m_needsVGpu; - m_gpu |= m_needsVGpu; - if (m_gpu) - enableParticleCommunication(); - return m_needsVGpu; - }; #ifdef ELECTROSTATICS float *qGpuBegin() override { return m_q_gpu_begin; }; @@ -97,18 +86,6 @@ class EspressoSystemInterface : public SystemInterface { }; #endif - float *directorGpuBegin() override { return m_director_gpu_begin; }; - float *directorGpuEnd() override { return m_director_gpu_end; }; - bool hasDirectorGpu() override { return true; }; - bool requestDirectorGpu() override { - m_needsDirectorGpu = hasDirectorGpu(); - m_splitParticleStructGpu |= m_needsDirectorGpu; - m_gpu |= m_needsDirectorGpu; - if (m_gpu) - enableParticleCommunication(); - return m_needsDirectorGpu; - }; - bool requestParticleStructGpu() { m_needsParticleStructGpu = true; m_gpu |= m_needsParticleStructGpu; @@ -169,10 +146,8 @@ class EspressoSystemInterface : public SystemInterface { EspressoSystemInterface() : m_gpu(false), m_r_gpu_begin(nullptr), m_r_gpu_end(nullptr), m_dip_gpu_begin(nullptr), m_dip_gpu_end(nullptr), - m_v_gpu_begin(nullptr), m_v_gpu_end(nullptr), m_q_gpu_begin(nullptr), - m_q_gpu_end(nullptr), m_director_gpu_begin(nullptr), - m_director_gpu_end(nullptr), m_needsParticleStructGpu(false), - m_splitParticleStructGpu(false){}; + m_q_gpu_begin(nullptr), m_q_gpu_end(nullptr), + m_needsParticleStructGpu(false), m_splitParticleStructGpu(false){}; ~EspressoSystemInterface() override = default; void gatherParticles(); @@ -197,15 +172,9 @@ class EspressoSystemInterface : public SystemInterface { float *m_dip_gpu_begin; float *m_dip_gpu_end; - float *m_v_gpu_begin; - float *m_v_gpu_end; - float *m_q_gpu_begin; float *m_q_gpu_end; - float *m_director_gpu_begin; - float *m_director_gpu_end; - bool m_needsParticleStructGpu; bool m_splitParticleStructGpu; }; diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 2904f2994fb..4d62086a3ef 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ b/src/core/EspressoSystemInterface_cuda.cu @@ -80,24 +80,6 @@ __global__ void split_kernel_r(CUDA_particle_data *particles, float *r, r[idx + 2] = p.p[2]; } -#ifdef CUDA -// Velocity -__global__ void split_kernel_v(CUDA_particle_data *particles, float *v, - unsigned int n) { - auto idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx >= n) - return; - - CUDA_particle_data p = particles[idx]; - - idx *= 3; - - v[idx + 0] = p.v[0]; - v[idx + 1] = p.v[1]; - v[idx + 2] = p.v[2]; -} -#endif - #ifdef DIPOLES // Dipole moment __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip, @@ -116,23 +98,6 @@ __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip, } #endif -__global__ void split_kernel_director(CUDA_particle_data *particles, - float 
*director, unsigned int n) { -#ifdef ROTATION - auto idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx >= n) - return; - - CUDA_particle_data p = particles[idx]; - - idx *= 3; - - director[idx + 0] = p.director[0]; - director[idx + 1] = p.director[1]; - director[idx + 2] = p.director[2]; -#endif -} - void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { if (m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == nullptr))) { if (m_r_gpu_begin != nullptr) @@ -148,13 +113,6 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { m_dip_gpu_end = m_dip_gpu_begin + 3 * n; } #endif - if (m_needsVGpu && ((n != m_gpu_npart) || (m_v_gpu_begin == nullptr))) { - if (m_v_gpu_begin != nullptr) - cuda_safe_mem(cudaFree(m_v_gpu_begin)); - cuda_safe_mem(cudaMalloc(&m_v_gpu_begin, 3 * n * sizeof(float))); - m_v_gpu_end = m_v_gpu_begin + 3 * n; - } - if (m_needsQGpu && ((n != m_gpu_npart) || (m_q_gpu_begin == nullptr))) { if (m_q_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_q_gpu_begin)); @@ -162,14 +120,6 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { m_q_gpu_end = m_q_gpu_begin + 3 * n; } - if (m_needsDirectorGpu && - ((n != m_gpu_npart) || (m_director_gpu_begin == nullptr))) { - if (m_director_gpu_begin != nullptr) - cuda_safe_mem(cudaFree(m_director_gpu_begin)); - cuda_safe_mem(cudaMalloc(&m_director_gpu_begin, 3 * n * sizeof(float))); - m_director_gpu_end = m_director_gpu_begin + 3 * n; - } - m_gpu_npart = n; } @@ -191,19 +141,10 @@ void EspressoSystemInterface::split_particle_struct() { if (!m_needsQGpu && m_needsRGpu) split_kernel_r<<>>( device_particles.data(), m_r_gpu_begin, n); -#ifdef CUDA - if (m_needsVGpu) - split_kernel_v<<>>( - device_particles.data(), m_v_gpu_begin, n); -#endif #ifdef DIPOLES if (m_needsDipGpu) split_kernel_dip<<>>( device_particles.data(), m_dip_gpu_begin, n); #endif - - if (m_needsDirectorGpu) - split_kernel_director<<>>( - device_particles.data(), m_director_gpu_begin, n); } diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index 5f78a98581b..73c577dfdfe 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -30,9 +30,8 @@ class SystemInterface { public: SystemInterface() - : m_needsRGpu(false), m_needsVGpu(false), m_needsQGpu(false), - m_needsDirectorGpu(false), m_needsFGpu(false), m_needsDipGpu(false), - m_needsTorqueGpu(false){}; + : m_needsRGpu(false), m_needsQGpu(false), m_needsFGpu(false), + m_needsDipGpu(false), m_needsTorqueGpu(false){}; typedef Utils::Vector3d Vector3; typedef double Real; @@ -63,14 +62,6 @@ class SystemInterface { return m_needsTorqueGpu; } - virtual float *vGpuBegin() { return nullptr; }; - virtual float *vGpuEnd() { return nullptr; }; - virtual bool hasVGpu() { return false; }; - virtual bool requestVGpu() { - m_needsVGpu = hasVGpu(); - return m_needsVGpu; - } - virtual float *qGpuBegin() { return nullptr; }; virtual float *qGpuEnd() { return nullptr; }; virtual bool hasQGpu() { return false; }; @@ -88,30 +79,19 @@ class SystemInterface { return m_needsFGpu; } - virtual float *directorGpuBegin() { return nullptr; }; - virtual float *directorGpuEnd() { return nullptr; }; - virtual bool hasDirectorGpu() { return false; }; - virtual bool requestDirectorGpu() { - m_needsDirectorGpu = hasDirectorGpu(); - return m_needsDirectorGpu; - } - virtual std::size_t npart_gpu() const { return 0; }; virtual Vector3 box() const = 0; virtual bool needsRGpu() { return m_needsRGpu; }; virtual bool needsDipGpu() { return m_needsRGpu; }; virtual bool needsQGpu() 
{ return m_needsQGpu; }; - virtual bool needsDirectorGpu() { return m_needsDirectorGpu; }; virtual bool needsFGpu() { return m_needsFGpu; }; virtual bool needsTorqueGpu() { return m_needsTorqueGpu; }; virtual ~SystemInterface() = default; protected: bool m_needsRGpu; - bool m_needsVGpu; bool m_needsQGpu; - bool m_needsDirectorGpu; bool m_needsFGpu; bool m_needsDipGpu; bool m_needsTorqueGpu; From 9277a8fb89e644eefaa0759af70145c6a6e0fb4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 16:57:01 +0100 Subject: [PATCH 07/21] core: Re-organize SystemInterface Group functions and globals by particle property. Cleanup ifdefs and comments. --- src/core/EspressoSystemInterface.cpp | 4 ---- src/core/EspressoSystemInterface.hpp | 26 ++++++++++++------------ src/core/EspressoSystemInterface_cuda.cu | 7 ++++--- src/core/SystemInterface.hpp | 5 ++--- src/core/actor/DipolarBarnesHut.hpp | 4 ++-- src/core/actor/DipolarDirectSum.hpp | 4 ++-- src/core/cuda_common_cuda.cu | 9 +++++--- src/core/cuda_interface.cpp | 16 ++------------- src/core/cuda_interface.hpp | 7 ++----- 9 files changed, 33 insertions(+), 49 deletions(-) diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp index 61bc74e830d..fc09ddb1994 100644 --- a/src/core/EspressoSystemInterface.cpp +++ b/src/core/EspressoSystemInterface.cpp @@ -25,13 +25,9 @@ #include -/* Initialize instance pointer */ EspressoSystemInterface *EspressoSystemInterface::m_instance = nullptr; -/********************************************************************************************/ - void EspressoSystemInterface::gatherParticles() { -// get particles from other nodes #ifdef CUDA if (m_gpu) { if (gpu_get_global_particle_vars_pointer_host()->communication_enabled) { diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index e22507d81f0..ef1e9fc4d38 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -58,6 +58,7 @@ class EspressoSystemInterface : public SystemInterface { enableParticleCommunication(); return m_needsRGpu; }; + #ifdef DIPOLES float *dipGpuBegin() override { return m_dip_gpu_begin; }; float *dipGpuEnd() override { return m_dip_gpu_end; }; @@ -98,17 +99,6 @@ class EspressoSystemInterface : public SystemInterface { float *fGpuEnd() override { return gpu_get_particle_force_pointer() + 3 * m_gpu_npart; }; - float *eGpu() override { - // cast pointer to struct of floats to array of floats - // https://stackoverflow.com/a/29278260 - return reinterpret_cast(gpu_get_energy_pointer()); - }; - float *torqueGpuBegin() override { - return gpu_get_particle_torque_pointer(); - }; - float *torqueGpuEnd() override { - return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart; - }; bool hasFGpu() override { return true; }; bool requestFGpu() override { m_needsFGpu = hasFGpu(); @@ -119,6 +109,12 @@ class EspressoSystemInterface : public SystemInterface { }; #ifdef ROTATION + float *torqueGpuBegin() override { + return gpu_get_particle_torque_pointer(); + }; + float *torqueGpuEnd() override { + return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart; + }; bool hasTorqueGpu() override { return true; }; bool requestTorqueGpu() override { m_needsTorqueGpu = hasTorqueGpu(); @@ -129,6 +125,12 @@ class EspressoSystemInterface : public SystemInterface { }; #endif + float *eGpu() override { + // cast pointer to struct of floats to array of floats + // https://stackoverflow.com/a/29278260 + return 
reinterpret_cast(gpu_get_energy_pointer()); + }; + #endif // ifdef CUDA Utils::Vector3d box() const override; @@ -179,6 +181,4 @@ class EspressoSystemInterface : public SystemInterface { bool m_splitParticleStructGpu; }; -/********************************************************************************************/ - #endif diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 4d62086a3ef..53cb31b43df 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ b/src/core/EspressoSystemInterface_cuda.cu @@ -113,12 +113,14 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { m_dip_gpu_end = m_dip_gpu_begin + 3 * n; } #endif +#ifdef ELECTROSTATICS if (m_needsQGpu && ((n != m_gpu_npart) || (m_q_gpu_begin == nullptr))) { if (m_q_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_q_gpu_begin)); cuda_safe_mem(cudaMalloc(&m_q_gpu_begin, 3 * n * sizeof(float))); m_q_gpu_end = m_q_gpu_begin + 3 * n; } +#endif m_gpu_npart = n; } @@ -135,16 +137,15 @@ void EspressoSystemInterface::split_particle_struct() { if (m_needsQGpu && m_needsRGpu) split_kernel_rq<<>>( device_particles.data(), m_r_gpu_begin, m_q_gpu_begin, n); - if (m_needsQGpu && !m_needsRGpu) + else if (m_needsQGpu) split_kernel_q<<>>( device_particles.data(), m_q_gpu_begin, n); - if (!m_needsQGpu && m_needsRGpu) + else if (m_needsRGpu) split_kernel_r<<>>( device_particles.data(), m_r_gpu_begin, n); #ifdef DIPOLES if (m_needsDipGpu) split_kernel_dip<<>>( device_particles.data(), m_dip_gpu_begin, n); - #endif } diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index 73c577dfdfe..cd8b1fe1be6 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -25,8 +25,6 @@ #include -/** @todo: Turn needsXY in getter/setter **/ - class SystemInterface { public: SystemInterface() @@ -72,13 +70,14 @@ class SystemInterface { virtual float *fGpuBegin() { return nullptr; }; virtual float *fGpuEnd() { return nullptr; }; - virtual float *eGpu() { return nullptr; }; virtual bool hasFGpu() { return false; }; virtual bool requestFGpu() { m_needsFGpu = hasFGpu(); return m_needsFGpu; } + virtual float *eGpu() { return nullptr; }; + virtual std::size_t npart_gpu() const { return 0; }; virtual Vector3 box() const = 0; diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 1bf3815527b..1db84237d44 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -98,6 +98,6 @@ class DipolarBarnesHut : public Actor { void activate_dipolar_barnes_hut(float epssq, float itolsq); void deactivate_dipolar_barnes_hut(); -#endif - #endif // DIPOLAR_BARNES_HUT + +#endif // ACTOR_DIPOLARBARNESHUT_HPP diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index ba5cdc9d05c..48c13b8aa3a 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -82,6 +82,6 @@ class DipolarDirectSum : public Actor { void activate_dipolar_direct_sum_gpu(); void deactivate_dipolar_direct_sum_gpu(); -#endif +#endif // DIPOLAR_DIRECT_SUM -#endif +#endif // ACTOR_DIPOLARDIRECTSUM_HPP diff --git a/src/core/cuda_common_cuda.cu b/src/core/cuda_common_cuda.cu index e5d97e8baf3..2a084138012 100644 --- a/src/core/cuda_common_cuda.cu +++ b/src/core/cuda_common_cuda.cu @@ -54,7 +54,9 @@ template std::size_t byte_size(SpanLike const &v) { /** struct for particle force */ static device_vector particle_forces_device; +#ifdef ROTATION static device_vector particle_torques_device; 
+#endif /** struct for particle position and velocity */ static device_vector particle_data_device; @@ -63,9 +65,8 @@ static CUDA_energy *energy_device = nullptr; pinned_vector particle_data_host; pinned_vector particle_forces_host; -CUDA_energy energy_host; - pinned_vector particle_torques_host; +CUDA_energy energy_host; cudaStream_t stream[1]; @@ -151,10 +152,12 @@ CUDA_global_part_vars *gpu_get_global_particle_vars_pointer_host() { float *gpu_get_particle_force_pointer() { return raw_data_pointer(particle_forces_device); } -CUDA_energy *gpu_get_energy_pointer() { return energy_device; } +#ifdef ROTATION float *gpu_get_particle_torque_pointer() { return raw_data_pointer(particle_torques_device); } +#endif +CUDA_energy *gpu_get_energy_pointer() { return energy_device; } void copy_part_data_to_gpu(ParticleRange particles) { if (global_part_vars_host.communication_enabled == 1) { diff --git a/src/core/cuda_interface.cpp b/src/core/cuda_interface.cpp index a92fe9e4bfb..26971b8f4eb 100644 --- a/src/core/cuda_interface.cpp +++ b/src/core/cuda_interface.cpp @@ -34,10 +34,6 @@ #include -/* TODO: We should only transfer data for enabled methods, - not for those that are barely compiled in. (fw) -*/ - static void pack_particles(ParticleRange particles, CUDA_particle_data *buffer) { using Utils::Vector3f; @@ -46,19 +42,17 @@ static void pack_particles(ParticleRange particles, for (auto const &part : particles) { buffer[i].p = static_cast(folded_position(part.r.p, box_geo)); -#ifdef CUDA buffer[i].identity = part.p.identity; buffer[i].v = static_cast(part.m.v); #ifdef VIRTUAL_SITES buffer[i].is_virtual = part.p.is_virtual; #endif -#endif #ifdef DIPOLES buffer[i].dip = static_cast(part.calc_dip()); #endif -#if defined(LB_ELECTROHYDRODYNAMICS) && defined(CUDA) +#ifdef LB_ELECTROHYDRODYNAMICS buffer[i].mu_E = static_cast(part.p.mu_E); #endif @@ -141,25 +135,19 @@ void cuda_mpi_send_forces(const ParticleRange &particles, if (this_node > 0) { static std::vector buffer_forces; static std::vector buffer_torques; - /* Alloc buffer */ - buffer_forces.resize(n_elements); + buffer_forces.resize(n_elements); Utils::Mpi::scatter_buffer(buffer_forces.data(), n_elements, comm_cart); #ifdef ROTATION - /* Alloc buffer */ buffer_torques.resize(n_elements); - Utils::Mpi::scatter_buffer(buffer_torques.data(), n_elements, comm_cart); #endif - add_forces_and_torques(particles, buffer_forces, buffer_torques); } else { - /* Scatter forces */ Utils::Mpi::scatter_buffer(host_forces.data(), n_elements, comm_cart); #ifdef ROTATION Utils::Mpi::scatter_buffer(host_torques.data(), n_elements, comm_cart); #endif - add_forces_and_torques(particles, host_forces, host_torques); } } diff --git a/src/core/cuda_interface.hpp b/src/core/cuda_interface.hpp index dcff4826a4e..7844c1db3d8 100644 --- a/src/core/cuda_interface.hpp +++ b/src/core/cuda_interface.hpp @@ -54,25 +54,23 @@ struct CUDA_particle_data { CUDA_ParticleParametersSwimming swim; #endif - /** particle position given from md part*/ Vector3f p; - /** particle id */ int identity; + #ifdef VIRTUAL_SITES bool is_virtual; #else static constexpr const bool is_virtual = false; #endif - /** particle momentum struct velocity p.m->v*/ Vector3f v; #ifdef ROTATION Vector3f director; #endif -#if defined(LB_ELECTROHYDRODYNAMICS) && defined(CUDA) +#ifdef LB_ELECTROHYDRODYNAMICS Vector3f mu_E; #endif @@ -116,7 +114,6 @@ float *gpu_get_particle_torque_pointer(); #endif CUDA_energy *gpu_get_energy_pointer(); -float *gpu_get_particle_torque_pointer(); void gpu_init_particle_comm(); void 
cuda_mpi_get_particles( From 607dad4821b50de0634e5304b6706f1df93f6b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 17:27:15 +0100 Subject: [PATCH 08/21] core: Narrow CUDA electrostatics interface Remove unused class members and function overloads. Hide implementation details of MMM1DGPU from the Python interface. Request GPU SoA for particle torques and positions in the core. Activate MMM1DGPU when added to the list of actors, not during construction. Use automatic memory management for MMM1DGPU. --- src/core/EspressoSystemInterface.hpp | 7 ------- src/core/actor/DipolarBarnesHut.hpp | 3 +++ src/core/actor/DipolarDirectSum.hpp | 3 +++ src/core/actor/Mmm1dgpuForce.hpp | 6 ++---- src/core/actor/Mmm1dgpuForce_cuda.cu | 16 ++++----------- src/python/espressomd/electrostatics.pxd | 25 ++---------------------- src/python/espressomd/electrostatics.pyx | 25 +++++++++--------------- 7 files changed, 23 insertions(+), 62 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index ef1e9fc4d38..0395a75cb81 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -36,13 +36,6 @@ class EspressoSystemInterface : public SystemInterface { return *m_instance; }; - static EspressoSystemInterface *_Instance() { - if (!m_instance) - m_instance = new EspressoSystemInterface; - - return m_instance; - }; - void init() override; void update() override; diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 1db84237d44..35fb9ac0701 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -41,6 +41,9 @@ class DipolarBarnesHut : public Actor { if (!s.requestFGpu()) runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; + if (!s.requestTorqueGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to torques on GPU!"; + if (!s.requestRGpu()) runtimeErrorMsg() << "DipolarBarnesHut needs access to positions on GPU!"; diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 48c13b8aa3a..02a08722813 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -46,6 +46,9 @@ class DipolarDirectSum : public Actor { if (!s.requestFGpu()) runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; + if (!s.requestTorqueGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to torques on GPU!"; + if (!s.requestRGpu()) runtimeErrorMsg() << "DipolarDirectSum needs access to positions on GPU!"; diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp index a767a38cad5..293154c1a30 100644 --- a/src/core/actor/Mmm1dgpuForce.hpp +++ b/src/core/actor/Mmm1dgpuForce.hpp @@ -29,8 +29,7 @@ class Mmm1dgpuForce : public Actor { public: // constructor - Mmm1dgpuForce(SystemInterface &s, float coulomb_prefactor, float maxPWerror, - float far_switch_radius = -1, int bessel_cutoff = -1); + Mmm1dgpuForce(SystemInterface &s); ~Mmm1dgpuForce() override; // interface methods void computeForces(SystemInterface &s) override; @@ -40,8 +39,7 @@ class Mmm1dgpuForce : public Actor { void tune(SystemInterface &s, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff); void set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, - float _far_switch_radius, int _bessel_cutoff, - bool manual = false); + float _far_switch_radius, int _bessel_cutoff); void activate(); void deactivate(); diff --git 
a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index ed4cd132f7a..6c5a7f57a06 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -133,13 +133,10 @@ int modpsi_init() { return 0; } -Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, float _coulomb_prefactor, - float _maxPWerror, float _far_switch_radius, - int _bessel_cutoff) +Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) : numThreads(64), host_boxz(0), host_npart(0), need_tune(true), pairs(-1), - dev_forcePairs(nullptr), dev_energyBlocks(nullptr), - coulomb_prefactor(_coulomb_prefactor), maxPWerror(_maxPWerror), - far_switch_radius(_far_switch_radius), bessel_cutoff(_bessel_cutoff) { + dev_forcePairs(nullptr), dev_energyBlocks(nullptr), coulomb_prefactor(0), + maxPWerror(-1), far_switch_radius(-1), bessel_cutoff(-1) { // interface sanity checks if (!s.requestFGpu()) throw std::runtime_error("Mmm1dgpuForce needs access to forces on GPU!"); @@ -312,7 +309,7 @@ void Mmm1dgpuForce::tune(SystemInterface &s, float _maxPWerror, void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, float _far_switch_radius, - int _bessel_cutoff, bool manual) { + int _bessel_cutoff) { if (_boxz > 0 && _far_switch_radius > _boxz) { throw std::runtime_error( "switching radius must not be larger than box length"); @@ -323,11 +320,6 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, // double colons are needed to access the constant memory variables because // they are file globals and we have identically named class variables cudaSetDevice(d); - if (manual) // tuning needs to be performed again - { - far_switch_radius = _far_switch_radius; - bessel_cutoff = _bessel_cutoff; - } if (_far_switch_radius >= 0) { mmm1d_params.far_switch_radius_2 = _far_switch_radius_2; cuda_safe_mem(cudaMemcpyToSymbol(::far_switch_radius_2, diff --git a/src/python/espressomd/electrostatics.pxd b/src/python/espressomd/electrostatics.pxd index 5d687f144e4..17124972bac 100644 --- a/src/python/espressomd/electrostatics.pxd +++ b/src/python/espressomd/electrostatics.pxd @@ -27,9 +27,7 @@ cdef extern from "SystemInterface.hpp": cdef extern from "EspressoSystemInterface.hpp": cdef cppclass EspressoSystemInterface(SystemInterface): @staticmethod - EspressoSystemInterface * _Instance() - bool requestRGpu() - void update() + EspressoSystemInterface & Instance() IF ELECTROSTATICS: @@ -139,30 +137,11 @@ IF ELECTROSTATICS and MMM1D_GPU: cdef extern from "actor/Mmm1dgpuForce.hpp": cdef cppclass Mmm1dgpuForce: - Mmm1dgpuForce(SystemInterface & s, float coulomb_prefactor, float maxPWerror, float far_switch_radius, int bessel_cutoff) except+ - Mmm1dgpuForce(SystemInterface & s, float coulomb_prefactor, float maxPWerror, float far_switch_radius) except+ - Mmm1dgpuForce(SystemInterface & s, float coulomb_prefactor, float maxPWerror) except+ + Mmm1dgpuForce(SystemInterface & s) except+ void setup(SystemInterface & s) void tune(SystemInterface & s, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff) - void set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff, bool manual) void set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff) - unsigned int numThreads - unsigned int numBlocks(SystemInterface & s) - - float host_boxz - unsigned int host_npart - bool need_tune - - int pairs - float * dev_forcePairs - float * dev_energyBlocks - - float 
coulomb_prefactor, maxPWerror, far_switch_radius - int bessel_cutoff - - float force_benchmark(SystemInterface & s) - void activate() void deactivate() diff --git a/src/python/espressomd/electrostatics.pyx b/src/python/espressomd/electrostatics.pyx index 287e3d39335..269bd511ddf 100644 --- a/src/python/espressomd/electrostatics.pyx +++ b/src/python/espressomd/electrostatics.pyx @@ -16,6 +16,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . # +from libcpp.memory cimport shared_ptr, make_shared from cython.operator cimport dereference include "myconfig.pxi" from .actors cimport Actor @@ -634,21 +635,12 @@ IF ELECTROSTATICS and MMM1D_GPU: tune : :obj:`bool`, optional Specify whether to automatically tune or not. Defaults to ``True``. """ - cdef Mmm1dgpuForce * thisptr + cdef shared_ptr[Mmm1dgpuForce] sip cdef EspressoSystemInterface * interface - cdef char * log - cdef int resp def __cinit__(self): - self.interface = EspressoSystemInterface._Instance() - default_params = self.default_params() - self.thisptr = new Mmm1dgpuForce(dereference(self.interface), 0.0, default_params["maxPWerror"]) - self.interface.update() - self.interface.requestRGpu() - dereference(self.thisptr).activate() - - def __dealloc__(self): - del self.thisptr + self.interface = &EspressoSystemInterface.Instance() + self.sip = make_shared[Mmm1dgpuForce](dereference(self.interface)) def validate_params(self): default_params = self.default_params() @@ -686,27 +678,28 @@ IF ELECTROSTATICS and MMM1D_GPU: set_prefactor(self._params["prefactor"]) default_params = self.default_params() - self.thisptr.set_params( + dereference(self.sip).set_params( < float > box_geo.length()[2], < float > coulomb.prefactor, self._params["maxPWerror"], self._params["far_switch_radius"], self._params["bessel_cutoff"]) def _tune(self): - self.thisptr.setup(dereference(self.interface)) - self.thisptr.tune( + dereference(self.sip).setup(dereference(self.interface)) + dereference(self.sip).tune( dereference(self.interface), self._params["maxPWerror"], self._params["far_switch_radius"], self._params["bessel_cutoff"]) def _activate_method(self): check_neutrality(self._params) self._set_params_in_es_core() + dereference(self.sip).activate() coulomb.method = COULOMB_MMM1D_GPU if self._params["tune"]: self._tune() self._set_params_in_es_core() def _deactivate_method(self): - dereference(self.thisptr).deactivate() + dereference(self.sip).deactivate() IF ELECTROSTATICS: IF SCAFACOS == 1: From 4fc4d7047f34089f7736a903c58373dfc19cbf16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 18:52:15 +0100 Subject: [PATCH 09/21] core: Simplify SystemInterface class --- src/core/EspressoSystemInterface.hpp | 38 ++++++++++++---------- src/core/SystemInterface.hpp | 48 +++++----------------------- 2 files changed, 29 insertions(+), 57 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 0395a75cb81..918e0df08d3 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -29,6 +29,9 @@ class EspressoSystemInterface : public SystemInterface { public: + EspressoSystemInterface() = default; + ~EspressoSystemInterface() override = default; + static EspressoSystemInterface &Instance() { if (!m_instance) m_instance = new EspressoSystemInterface; @@ -138,12 +141,6 @@ class EspressoSystemInterface : public SystemInterface { protected: static EspressoSystemInterface *m_instance; - 
EspressoSystemInterface() - : m_gpu(false), m_r_gpu_begin(nullptr), m_r_gpu_end(nullptr), - m_dip_gpu_begin(nullptr), m_dip_gpu_end(nullptr), - m_q_gpu_begin(nullptr), m_q_gpu_end(nullptr), - m_needsParticleStructGpu(false), m_splitParticleStructGpu(false){}; - ~EspressoSystemInterface() override = default; void gatherParticles(); void split_particle_struct(); @@ -156,22 +153,29 @@ class EspressoSystemInterface : public SystemInterface { } }; void reallocDeviceMemory(std::size_t n); -#endif - std::size_t m_gpu_npart; - bool m_gpu; +private: + std::size_t m_gpu_npart = 0; + bool m_gpu = false; + + float *m_r_gpu_begin = nullptr; + float *m_r_gpu_end = nullptr; - float *m_r_gpu_begin; - float *m_r_gpu_end; + float *m_dip_gpu_begin = nullptr; + float *m_dip_gpu_end = nullptr; - float *m_dip_gpu_begin; - float *m_dip_gpu_end; + float *m_q_gpu_begin = nullptr; + float *m_q_gpu_end = nullptr; - float *m_q_gpu_begin; - float *m_q_gpu_end; + bool m_needsParticleStructGpu = false; + bool m_splitParticleStructGpu = false; - bool m_needsParticleStructGpu; - bool m_splitParticleStructGpu; + bool m_needsRGpu = false; + bool m_needsQGpu = false; + bool m_needsFGpu = false; + bool m_needsDipGpu = false; + bool m_needsTorqueGpu = false; +#endif }; #endif diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index cd8b1fe1be6..cdce5d99f44 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -27,11 +27,8 @@ class SystemInterface { public: - SystemInterface() - : m_needsRGpu(false), m_needsQGpu(false), m_needsFGpu(false), - m_needsDipGpu(false), m_needsTorqueGpu(false){}; - typedef Utils::Vector3d Vector3; - typedef double Real; + SystemInterface() = default; + virtual ~SystemInterface() = default; virtual void init() = 0; virtual void update() = 0; @@ -39,61 +36,32 @@ class SystemInterface { virtual float *rGpuBegin() { return nullptr; }; virtual float *rGpuEnd() { return nullptr; }; virtual bool hasRGpu() { return false; }; - virtual bool requestRGpu() { - m_needsRGpu = hasRGpu(); - return m_needsRGpu; - } + virtual bool requestRGpu() { return false; } virtual float *dipGpuBegin() { return nullptr; }; virtual float *dipGpuEnd() { return nullptr; }; virtual bool hasDipGpu() { return false; }; - virtual bool requestDipGpu() { - m_needsDipGpu = hasDipGpu(); - return m_needsDipGpu; - } + virtual bool requestDipGpu() { return false; } virtual float *torqueGpuBegin() { return nullptr; }; virtual float *torqueGpuEnd() { return nullptr; }; virtual bool hasTorqueGpu() { return false; }; - virtual bool requestTorqueGpu() { - m_needsTorqueGpu = hasTorqueGpu(); - return m_needsTorqueGpu; - } + virtual bool requestTorqueGpu() { return false; } virtual float *qGpuBegin() { return nullptr; }; virtual float *qGpuEnd() { return nullptr; }; virtual bool hasQGpu() { return false; }; - virtual bool requestQGpu() { - m_needsQGpu = hasQGpu(); - return m_needsQGpu; - } + virtual bool requestQGpu() { return false; } virtual float *fGpuBegin() { return nullptr; }; virtual float *fGpuEnd() { return nullptr; }; virtual bool hasFGpu() { return false; }; - virtual bool requestFGpu() { - m_needsFGpu = hasFGpu(); - return m_needsFGpu; - } + virtual bool requestFGpu() { return false; } virtual float *eGpu() { return nullptr; }; virtual std::size_t npart_gpu() const { return 0; }; - virtual Vector3 box() const = 0; - - virtual bool needsRGpu() { return m_needsRGpu; }; - virtual bool needsDipGpu() { return m_needsRGpu; }; - virtual bool needsQGpu() { return m_needsQGpu; }; - virtual bool 
needsFGpu() { return m_needsFGpu; }; - virtual bool needsTorqueGpu() { return m_needsTorqueGpu; }; - virtual ~SystemInterface() = default; - -protected: - bool m_needsRGpu; - bool m_needsQGpu; - bool m_needsFGpu; - bool m_needsDipGpu; - bool m_needsTorqueGpu; + virtual Utils::Vector3d box() const = 0; }; #endif From f505fcd863bc7eed33ab6440973e1fd506f814ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 19:39:43 +0100 Subject: [PATCH 10/21] core: Document SystemInterface class --- src/core/EspressoSystemInterface.hpp | 10 +++++++++ src/core/EspressoSystemInterface_cuda.cu | 7 ++++--- src/core/SystemInterface.hpp | 26 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 918e0df08d3..ae15b33d754 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -27,6 +27,16 @@ #include +/** + * @brief CUDA implementation of @ref SystemInterface. + * + * When data is synchronized between host and device memory, a subset + * of the @ref Particle struct is copied from each particle on the host + * to the corresponding @ref CUDA_particle_data struct on the device via + * @ref EspressoSystemInterface::gatherParticles(). Once the transfer is + * complete, the particle AoS on the device is copied (or "split") to + * a SoA via @ref EspressoSystemInterface::split_particle_struct(). + */ class EspressoSystemInterface : public SystemInterface { public: EspressoSystemInterface() = default; diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 53cb31b43df..8261d1c07ba 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ b/src/core/EspressoSystemInterface_cuda.cu @@ -16,6 +16,10 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ +/** + * @file + * CUDA kernels to convert the particles AoS to a SoA on the device. + */ #include "EspressoSystemInterface.hpp" #include "cuda_interface.hpp" @@ -30,9 +34,6 @@ #error CU-file includes mpi.h! This should not happen! #endif -// These functions will split the particle data structure into individual arrays -// for each property - // Position and charge __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r, float *q, unsigned int n) { diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index cdce5d99f44..884eaf08c3b 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -25,6 +25,32 @@ #include +/** + * Public interface for the ESPResSo system and device memory management. + * + * This interface is responsible for device memory management and memory + * layout conversion (AoS to SoA conversion). On the CPU, particle data + * is stored in an Array of Structures (AoS). On the GPU, some algorithms + * have better performance with a Structure of Arrays (SoA). When data + * is synchronized between host and device memory, particle objects are + * copied to device memory in the form of an AoS. A few particle properties + * are further copied ("split") into a SoA by the split callback. + * + * This interface provides getters in the form xGpuBegin() and + * xGpuEnd() (with @c x a particle property, e.g. @c r for position) + * that return pointers to device memory for each array in the SoA. 
+ * To register a new particle property in the split callback, call the + * corresponding requestXGpu() method, with @c X the uppercase + * version of the particle property @c x. It is currently impossible to + * undo this action. + * + * Since properties in the @ref Particle struct depend on the config file, + * it is necessary to provide a means to check if it is even possible to + * carry out the split. This is achieved by the hasXGpu() methods. + * The same value is returned from requestXGpu() and it needs to + * be checked at the call site to throw an error. This class won't throw + * exceptions. + */ class SystemInterface { public: SystemInterface() = default; From 2a52d199c1f9742004e78cd0138efa33d2967831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Mon, 6 Dec 2021 12:16:01 +0100 Subject: [PATCH 11/21] core: Remove broken past-the-end device pointers --- src/core/EspressoSystemInterface.hpp | 14 -------- src/core/EspressoSystemInterface_cuda.cu | 3 -- src/core/SystemInterface.hpp | 5 --- .../EspressoSystemInterface_test.cpp | 33 ------------------- 4 files changed, 55 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index ae15b33d754..feecf6bd53b 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -54,7 +54,6 @@ class EspressoSystemInterface : public SystemInterface { #ifdef CUDA float *rGpuBegin() override { return m_r_gpu_begin; }; - float *rGpuEnd() override { return m_r_gpu_end; }; bool hasRGpu() override { return true; }; bool requestRGpu() override { m_needsRGpu = hasRGpu(); @@ -67,7 +66,6 @@ class EspressoSystemInterface : public SystemInterface { #ifdef DIPOLES float *dipGpuBegin() override { return m_dip_gpu_begin; }; - float *dipGpuEnd() override { return m_dip_gpu_end; }; bool hasDipGpu() override { return true; }; bool requestDipGpu() override { m_needsDipGpu = hasDipGpu(); @@ -81,7 +79,6 @@ class EspressoSystemInterface : public SystemInterface { #ifdef ELECTROSTATICS float *qGpuBegin() override { return m_q_gpu_begin; }; - float *qGpuEnd() override { return m_q_gpu_end; }; bool hasQGpu() override { return true; }; bool requestQGpu() override { m_needsQGpu = hasQGpu(); @@ -102,9 +99,6 @@ class EspressoSystemInterface : public SystemInterface { }; float *fGpuBegin() override { return gpu_get_particle_force_pointer(); }; - float *fGpuEnd() override { - return gpu_get_particle_force_pointer() + 3 * m_gpu_npart; - }; bool hasFGpu() override { return true; }; bool requestFGpu() override { m_needsFGpu = hasFGpu(); @@ -118,9 +112,6 @@ class EspressoSystemInterface : public SystemInterface { float *torqueGpuBegin() override { return gpu_get_particle_torque_pointer(); }; - float *torqueGpuEnd() override { - return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart; - }; bool hasTorqueGpu() override { return true; }; bool requestTorqueGpu() override { m_needsTorqueGpu = hasTorqueGpu(); @@ -169,13 +160,8 @@ class EspressoSystemInterface : public SystemInterface { bool m_gpu = false; float *m_r_gpu_begin = nullptr; - float *m_r_gpu_end = nullptr; - float *m_dip_gpu_begin = nullptr; - float *m_dip_gpu_end = nullptr; - float *m_q_gpu_begin = nullptr; - float *m_q_gpu_end = nullptr; bool m_needsParticleStructGpu = false; bool m_splitParticleStructGpu = false; diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 8261d1c07ba..5f29ae8fc64 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ 
b/src/core/EspressoSystemInterface_cuda.cu @@ -104,14 +104,12 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { if (m_r_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_r_gpu_begin)); cuda_safe_mem(cudaMalloc(&m_r_gpu_begin, 3 * n * sizeof(float))); - m_r_gpu_end = m_r_gpu_begin + 3 * n; } #ifdef DIPOLES if (m_needsDipGpu && ((n != m_gpu_npart) || (m_dip_gpu_begin == nullptr))) { if (m_dip_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_dip_gpu_begin)); cuda_safe_mem(cudaMalloc(&m_dip_gpu_begin, 3 * n * sizeof(float))); - m_dip_gpu_end = m_dip_gpu_begin + 3 * n; } #endif #ifdef ELECTROSTATICS @@ -119,7 +117,6 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { if (m_q_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_q_gpu_begin)); cuda_safe_mem(cudaMalloc(&m_q_gpu_begin, 3 * n * sizeof(float))); - m_q_gpu_end = m_q_gpu_begin + 3 * n; } #endif diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index 884eaf08c3b..006232efab6 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -60,27 +60,22 @@ class SystemInterface { virtual void update() = 0; virtual float *rGpuBegin() { return nullptr; }; - virtual float *rGpuEnd() { return nullptr; }; virtual bool hasRGpu() { return false; }; virtual bool requestRGpu() { return false; } virtual float *dipGpuBegin() { return nullptr; }; - virtual float *dipGpuEnd() { return nullptr; }; virtual bool hasDipGpu() { return false; }; virtual bool requestDipGpu() { return false; } virtual float *torqueGpuBegin() { return nullptr; }; - virtual float *torqueGpuEnd() { return nullptr; }; virtual bool hasTorqueGpu() { return false; }; virtual bool requestTorqueGpu() { return false; } virtual float *qGpuBegin() { return nullptr; }; - virtual float *qGpuEnd() { return nullptr; }; virtual bool hasQGpu() { return false; }; virtual bool requestQGpu() { return false; } virtual float *fGpuBegin() { return nullptr; }; - virtual float *fGpuEnd() { return nullptr; }; virtual bool hasFGpu() { return false; }; virtual bool requestFGpu() { return false; } diff --git a/src/core/unit_tests/EspressoSystemInterface_test.cpp b/src/core/unit_tests/EspressoSystemInterface_test.cpp index a270579384f..09fb492aea2 100644 --- a/src/core/unit_tests/EspressoSystemInterface_test.cpp +++ b/src/core/unit_tests/EspressoSystemInterface_test.cpp @@ -42,15 +42,10 @@ auto &system = EspressoSystemInterface::Instance(); inline void check_uninitialized_device_pointers() { BOOST_CHECK_EQUAL(espresso::system.eGpu(), nullptr); BOOST_CHECK_EQUAL(espresso::system.rGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.rGpuEnd(), nullptr); BOOST_CHECK_EQUAL(espresso::system.dipGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.dipGpuEnd(), nullptr); BOOST_CHECK_EQUAL(espresso::system.fGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.fGpuEnd(), nullptr); BOOST_CHECK_EQUAL(espresso::system.torqueGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.torqueGpuEnd(), nullptr); BOOST_CHECK_EQUAL(espresso::system.qGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.qGpuEnd(), nullptr); } #ifdef CUDA @@ -103,18 +98,12 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { BOOST_TEST(espresso::system.requestRGpu()); espresso::system.update(); BOOST_TEST(espresso::system.rGpuBegin() != nullptr); - BOOST_TEST(espresso::system.rGpuEnd() != nullptr); - BOOST_CHECK_EQUAL(espresso::system.rGpuEnd() - espresso::system.rGpuBegin(), - 3); // check force split 
BOOST_TEST(espresso::system.hasFGpu()); BOOST_TEST(espresso::system.requestFGpu()); espresso::system.update(); BOOST_TEST(espresso::system.fGpuBegin() != nullptr); - BOOST_TEST(espresso::system.fGpuEnd() != nullptr); - BOOST_CHECK_EQUAL(espresso::system.fGpuEnd() - espresso::system.fGpuBegin(), - 3); // check torque split BOOST_CHECK_EQUAL(espresso::system.hasTorqueGpu(), has_feature_rotation); @@ -122,9 +111,6 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { #ifdef ROTATION espresso::system.update(); BOOST_TEST(espresso::system.torqueGpuBegin() != nullptr); - BOOST_TEST(espresso::system.torqueGpuEnd() != nullptr); - BOOST_CHECK_EQUAL( - espresso::system.torqueGpuEnd() - espresso::system.torqueGpuBegin(), 3); #endif // check charge split @@ -133,9 +119,6 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { #ifdef ELECTROSTATICS espresso::system.update(); BOOST_TEST(espresso::system.qGpuBegin() != nullptr); - BOOST_TEST(espresso::system.qGpuEnd() != nullptr); - BOOST_CHECK_EQUAL(espresso::system.qGpuEnd() - espresso::system.qGpuBegin(), - 3); #endif // check dipole split @@ -144,28 +127,12 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { #ifdef DIPOLES espresso::system.update(); BOOST_TEST(espresso::system.dipGpuBegin() != nullptr); - BOOST_TEST(espresso::system.dipGpuEnd() != nullptr); - BOOST_CHECK_EQUAL( - espresso::system.dipGpuEnd() - espresso::system.dipGpuBegin(), 3); #endif // clear device memory remove_particle(pid); espresso::system.update(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - BOOST_CHECK_EQUAL(espresso::system.rGpuBegin(), espresso::system.rGpuEnd()); - BOOST_CHECK_EQUAL(espresso::system.fGpuBegin(), espresso::system.fGpuEnd()); -#ifdef ELECTROSTATICS - BOOST_CHECK_EQUAL(espresso::system.qGpuBegin(), espresso::system.qGpuEnd()); -#endif -#ifdef DIPOLES - BOOST_CHECK_EQUAL(espresso::system.dipGpuBegin(), - espresso::system.dipGpuEnd()); -#endif -#ifdef ROTATION - BOOST_CHECK_EQUAL(espresso::system.torqueGpuBegin(), - espresso::system.torqueGpuEnd()); -#endif } #else // CUDA From 867cf47d22f821dfe262552e64e1d5855084d76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Mon, 6 Dec 2021 12:27:26 +0100 Subject: [PATCH 12/21] core: Remove magnetostatics global variables --- src/core/actor/DipolarBarnesHut.cpp | 25 +++--- src/core/actor/DipolarBarnesHut.hpp | 26 +++--- src/core/actor/DipolarBarnesHut_cuda.cu | 80 ++++++++++--------- src/core/actor/DipolarBarnesHut_cuda.cuh | 15 ++-- src/core/actor/DipolarDirectSum.cpp | 25 +++--- src/core/actor/DipolarDirectSum.hpp | 22 ++--- .../electrostatics_magnetostatics/dipole.cpp | 16 +--- src/python/espressomd/magnetostatics.pxd | 22 ++++- src/python/espressomd/magnetostatics.pyx | 23 +++++- 9 files changed, 138 insertions(+), 116 deletions(-) diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index ce5fd05b6f6..21a239ec449 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -23,33 +23,26 @@ #include "DipolarBarnesHut.hpp" -#include "EspressoSystemInterface.hpp" #include "actor/ActorList.hpp" #include "electrostatics_magnetostatics/common.hpp" #include "energy.hpp" #include "forces.hpp" -#include - -static std::unique_ptr dipolarBarnesHut; - -void activate_dipolar_barnes_hut(float epssq, float itolsq) { +void DipolarBarnesHut::activate() { // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = 
DIPOLAR_BH_GPU; mpi_bcast_coulomb_params(); - dipolarBarnesHut = std::make_unique( - EspressoSystemInterface::Instance(), epssq, itolsq); - forceActors.push_back(dipolarBarnesHut.get()); - energyActors.push_back(dipolarBarnesHut.get()); + forceActors.push_back(this); + energyActors.push_back(this); } -void deactivate_dipolar_barnes_hut() { - if (dipolarBarnesHut) { - forceActors.remove(dipolarBarnesHut.get()); - energyActors.remove(dipolarBarnesHut.get()); - dipolarBarnesHut.reset(); - } +void DipolarBarnesHut::deactivate() { + dipole.method = DIPOLAR_NONE; + mpi_bcast_coulomb_params(); + + forceActors.remove(this); + energyActors.remove(this); } #endif diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 35fb9ac0701..829a6318d53 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -33,11 +33,7 @@ class DipolarBarnesHut : public Actor { public: - DipolarBarnesHut(SystemInterface &s, float epssq, float itolsq) { - m_k = static_cast(dipole.prefactor); - m_epssq = epssq; - m_itolsq = itolsq; - setBHPrecision(&m_epssq, &m_itolsq); + DipolarBarnesHut(SystemInterface &s) { if (!s.requestFGpu()) runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; @@ -51,6 +47,15 @@ class DipolarBarnesHut : public Actor { runtimeErrorMsg() << "DipolarBarnesHut needs access to dipoles on GPU!"; }; + ~DipolarBarnesHut() override { deallocBH(&m_bh_data); } + + void set_params(float epssq, float itolsq) { + m_k = static_cast(dipole.prefactor); + m_epssq = epssq; + m_itolsq = itolsq; + setBHPrecision(m_epssq, m_itolsq); + } + void computeForces(SystemInterface &s) override { try { allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); @@ -59,7 +64,7 @@ class DipolarBarnesHut : public Actor { return; } - fillConstantPointers(s.rGpuBegin(), s.dipGpuBegin(), m_bh_data); + fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); initBHgpu(m_bh_data.blocks); buildBoxBH(m_bh_data.blocks); buildTreeBH(m_bh_data.blocks); @@ -77,7 +82,7 @@ class DipolarBarnesHut : public Actor { return; } - fillConstantPointers(s.rGpuBegin(), s.dipGpuBegin(), m_bh_data); + fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); initBHgpu(m_bh_data.blocks); buildBoxBH(m_bh_data.blocks); buildTreeBH(m_bh_data.blocks); @@ -89,18 +94,19 @@ class DipolarBarnesHut : public Actor { } }; + void activate(); + void deactivate(); + private: float m_k; float m_epssq; float m_itolsq; + /// Container for pointers to device memory. BHData m_bh_data = {0, 0, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; }; -void activate_dipolar_barnes_hut(float epssq, float itolsq); -void deactivate_dipolar_barnes_hut(); - #endif // DIPOLAR_BARNES_HUT #endif // ACTOR_DIPOLARBARNESHUT_HPP diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index 456895976ff..e33f99fc123 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -1148,16 +1148,38 @@ int energyBH(BHData *bh_data, float k, float *E) { return error_code; } -// Function to set the BH method parameters. 
-void setBHPrecision(float *epssq, float *itolsq) { - cuda_safe_mem(cudaMemcpyToSymbol(epssqd, epssq, sizeof(float), 0, +void setBHPrecision(float epssq, float itolsq) { + cuda_safe_mem(cudaMemcpyToSymbol(epssqd, &epssq, sizeof(float), 0, cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMemcpyToSymbol(itolsqd, itolsq, sizeof(float), 0, + cuda_safe_mem(cudaMemcpyToSymbol(itolsqd, &itolsq, sizeof(float), 0, cudaMemcpyHostToDevice)); } -// An allocation of the GPU device memory and an initialization where it is -// needed. +void deallocBH(BHData *bh_data) { + cuda_safe_mem(cudaFree(bh_data->err)); + cuda_safe_mem(cudaFree(bh_data->max_lps)); + cuda_safe_mem(cudaFree(bh_data->child)); + cuda_safe_mem(cudaFree(bh_data->count)); + cuda_safe_mem(cudaFree(bh_data->start)); + cuda_safe_mem(cudaFree(bh_data->sort)); + cuda_safe_mem(cudaFree(bh_data->mass)); + cuda_safe_mem(cudaFree(bh_data->maxp)); + cuda_safe_mem(cudaFree(bh_data->minp)); + cuda_safe_mem(cudaFree(bh_data->r)); + cuda_safe_mem(cudaFree(bh_data->u)); + bh_data->err = nullptr; + bh_data->max_lps = nullptr; + bh_data->child = nullptr; + bh_data->count = nullptr; + bh_data->start = nullptr; + bh_data->sort = nullptr; + bh_data->mass = nullptr; + bh_data->maxp = nullptr; + bh_data->minp = nullptr; + bh_data->r = nullptr; + bh_data->u = nullptr; +} + void allocBHmemCopy(int nbodies, BHData *bh_data) { if (bh_data->nbodies == nbodies) return; @@ -1178,37 +1200,30 @@ void allocBHmemCopy(int nbodies, BHData *bh_data) { else bh_data->nnodes = (bh_data->nnodes / n_total_threads) * n_total_threads; - if (bh_data->err != nullptr) - cuda_safe_mem(cudaFree(bh_data->err)); + cuda_safe_mem(cudaFree(bh_data->err)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->err), sizeof(int))); - if (bh_data->max_lps != nullptr) - cuda_safe_mem(cudaFree(bh_data->max_lps)); + cuda_safe_mem(cudaFree(bh_data->max_lps)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->max_lps), sizeof(int))); - if (bh_data->child != nullptr) - cuda_safe_mem(cudaFree(bh_data->child)); + cuda_safe_mem(cudaFree(bh_data->child)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->child), sizeof(int) * (bh_data->nnodes + 1) * 8)); - if (bh_data->count != nullptr) - cuda_safe_mem(cudaFree(bh_data->count)); + cuda_safe_mem(cudaFree(bh_data->count)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->count), sizeof(int) * (bh_data->nnodes + 1))); - if (bh_data->start != nullptr) - cuda_safe_mem(cudaFree(bh_data->start)); + cuda_safe_mem(cudaFree(bh_data->start)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->start), sizeof(int) * (bh_data->nnodes + 1))); - if (bh_data->sort != nullptr) - cuda_safe_mem(cudaFree(bh_data->sort)); + cuda_safe_mem(cudaFree(bh_data->sort)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->sort), sizeof(int) * (bh_data->nnodes + 1))); // Weight coefficients of m_bhnnodes nodes: both particles and octant cells - if (bh_data->mass != nullptr) - cuda_safe_mem(cudaFree(bh_data->mass)); + cuda_safe_mem(cudaFree(bh_data->mass)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->mass), sizeof(float) * (bh_data->nnodes + 1))); @@ -1226,39 +1241,32 @@ void allocBHmemCopy(int nbodies, BHData *bh_data) { // are the octree box dynamical spatial constraints // this array is updating per each block at each interaction calculation // within the boundingBoxKernel - if (bh_data->maxp != nullptr) - cuda_safe_mem(cudaFree(bh_data->maxp)); + cuda_safe_mem(cudaFree(bh_data->maxp)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->maxp), sizeof(float) * bh_data->blocks * FACTOR1 * 3)); // (min[3*i], min[3*i+1], 
min[3*i+2]) // are the octree box dynamical spatial constraints // this array is updating per each block at each interaction calculation // within the boundingBoxKernel - if (bh_data->minp != nullptr) - cuda_safe_mem(cudaFree(bh_data->minp)); + cuda_safe_mem(cudaFree(bh_data->minp)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->minp), sizeof(float) * bh_data->blocks * FACTOR1 * 3)); - if (bh_data->r != nullptr) - cuda_safe_mem(cudaFree(bh_data->r)); + cuda_safe_mem(cudaFree(bh_data->r)); cuda_safe_mem( cudaMalloc(&(bh_data->r), 3 * (bh_data->nnodes + 1) * sizeof(float))); - if (bh_data->u != nullptr) - cuda_safe_mem(cudaFree(bh_data->u)); + cuda_safe_mem(cudaFree(bh_data->u)); cuda_safe_mem( cudaMalloc(&(bh_data->u), 3 * (bh_data->nnodes + 1) * sizeof(float))); } -// Populating of array pointers allocated in GPU device before. -// Copy the particle data to the Barnes-Hut related arrays. -void fillConstantPointers(float *r, float *dip, BHData bh_data) { - cuda_safe_mem(cudaMemcpyToSymbol(bhpara, &bh_data, sizeof(BHData), 0, +void fill_bh_data(float const *r, float const *dip, BHData const *bh_data) { + auto const size = 3 * bh_data->nbodies * sizeof(float); + cuda_safe_mem(cudaMemcpyToSymbol(bhpara, bh_data, sizeof(BHData), 0, cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMemcpy(bh_data.r, r, 3 * bh_data.nbodies * sizeof(float), - cudaMemcpyDeviceToDevice)); - cuda_safe_mem(cudaMemcpy(bh_data.u, dip, 3 * bh_data.nbodies * sizeof(float), - cudaMemcpyDeviceToDevice)); + cuda_safe_mem(cudaMemcpy(bh_data->r, r, size, cudaMemcpyDeviceToDevice)); + cuda_safe_mem(cudaMemcpy(bh_data->u, dip, size, cudaMemcpyDeviceToDevice)); } #endif // BARNES_HUT diff --git a/src/core/actor/DipolarBarnesHut_cuda.cuh b/src/core/actor/DipolarBarnesHut_cuda.cuh index 6e64b93cea9..9ad9437042c 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cuh +++ b/src/core/actor/DipolarBarnesHut_cuda.cuh @@ -83,16 +83,21 @@ struct BHData { /// Maximal depth of the Barnes-Hut tree branching. #define MAXDEPTH 32 -/// Function to set the Barnes-Hut parameters. -void setBHPrecision(float *epssq, float *itolsq); +/// Barnes-Hut parameters setter. +void setBHPrecision(float epssq, float itolsq); /// An allocation of the GPU device memory and an initialization where it is /// needed. void allocBHmemCopy(int nbodies, BHData *bh_data); -/// Populating of array pointers allocated in GPU device before. -/// Copy the particle data to the Barnes-Hut related arrays. -void fillConstantPointers(float *r, float *dip, BHData bh_data); +/// A deallocation of the GPU device memory. +void deallocBH(BHData *bh_data); + +/// Copy Barnes-Hut data to @ref bhpara and copy particle data. +/// @param r device particle positions to copy +/// @param dip device particle dipoles to copy +/// @param bh_data Barnes-Hut container +void fill_bh_data(float const *r, float const *dip, BHData const *bh_data); /// Barnes-Hut CUDA initialization. 
void initBHgpu(int blocks); diff --git a/src/core/actor/DipolarDirectSum.cpp b/src/core/actor/DipolarDirectSum.cpp index 2ca0de24bb6..01d48bf487c 100644 --- a/src/core/actor/DipolarDirectSum.cpp +++ b/src/core/actor/DipolarDirectSum.cpp @@ -23,32 +23,25 @@ #include "DipolarDirectSum.hpp" -#include "EspressoSystemInterface.hpp" #include "electrostatics_magnetostatics/common.hpp" #include "energy.hpp" #include "forces.hpp" -#include - -static std::unique_ptr dipolarDirectSum; - -void activate_dipolar_direct_sum_gpu() { +void DipolarDirectSum::activate() { // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = DIPOLAR_DS_GPU; mpi_bcast_coulomb_params(); - dipolarDirectSum = - std::make_unique(EspressoSystemInterface::Instance()); - forceActors.push_back(dipolarDirectSum.get()); - energyActors.push_back(dipolarDirectSum.get()); + forceActors.push_back(this); + energyActors.push_back(this); } -void deactivate_dipolar_direct_sum_gpu() { - if (dipolarDirectSum) { - forceActors.remove(dipolarDirectSum.get()); - energyActors.remove(dipolarDirectSum.get()); - dipolarDirectSum.reset(); - } +void DipolarDirectSum::deactivate() { + dipole.method = DIPOLAR_NONE; + mpi_bcast_coulomb_params(); + + forceActors.remove(this); + energyActors.remove(this); } #endif diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 02a08722813..3dc82d68a79 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -41,8 +41,6 @@ class DipolarDirectSum : public Actor { public: DipolarDirectSum(SystemInterface &s) { - k = static_cast(dipole.prefactor); - if (!s.requestFGpu()) runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; @@ -55,6 +53,9 @@ class DipolarDirectSum : public Actor { if (!s.requestDipGpu()) runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!"; }; + + void set_params() { m_pk = static_cast(dipole.prefactor); } + void computeForces(SystemInterface &s) override { float box[3]; int per[3]; @@ -63,8 +64,8 @@ class DipolarDirectSum : public Actor { per[i] = box_geo.periodic(i); } DipolarDirectSum_kernel_wrapper_force( - k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), - s.fGpuBegin(), s.torqueGpuBegin(), box, per); + m_pk, static_cast(s.npart_gpu()), s.rGpuBegin(), + s.dipGpuBegin(), s.fGpuBegin(), s.torqueGpuBegin(), box, per); }; void computeEnergy(SystemInterface &s) override { float box[3]; @@ -73,18 +74,19 @@ class DipolarDirectSum : public Actor { box[i] = static_cast(s.box()[i]); per[i] = box_geo.periodic(i); } + auto energy = reinterpret_cast(s.eGpu()); DipolarDirectSum_kernel_wrapper_energy( - k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), - box, per, &(reinterpret_cast(s.eGpu())->dipolar)); + m_pk, static_cast(s.npart_gpu()), s.rGpuBegin(), + s.dipGpuBegin(), box, per, &(energy->dipolar)); }; + void activate(); + void deactivate(); + private: - float k; + float m_pk; }; -void activate_dipolar_direct_sum_gpu(); -void deactivate_dipolar_direct_sum_gpu(); - #endif // DIPOLAR_DIRECT_SUM #endif // ACTOR_DIPOLARDIRECTSUM_HPP diff --git a/src/core/electrostatics_magnetostatics/dipole.cpp b/src/core/electrostatics_magnetostatics/dipole.cpp index 629d13bc8cf..77291e3bae0 100644 --- a/src/core/electrostatics_magnetostatics/dipole.cpp +++ b/src/core/electrostatics_magnetostatics/dipole.cpp @@ -23,8 +23,6 @@ #include "electrostatics_magnetostatics/dipole.hpp" -#include "actor/DipolarBarnesHut.hpp" -#include "actor/DipolarDirectSum.hpp" #include 
"electrostatics_magnetostatics/common.hpp" #include "electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp" #include "electrostatics_magnetostatics/mdlc_correction.hpp" @@ -295,19 +293,7 @@ void set_Dprefactor(double prefactor) { mpi_bcast_coulomb_params(); } -void set_method_local(DipolarInteraction method) { -#ifdef DIPOLAR_DIRECT_SUM - if ((dipole.method == DIPOLAR_DS_GPU) && (method != DIPOLAR_DS_GPU)) { - deactivate_dipolar_direct_sum_gpu(); - } -#endif -#ifdef DIPOLAR_BARNES_HUT - if ((dipole.method == DIPOLAR_BH_GPU) && (method != DIPOLAR_BH_GPU)) { - deactivate_dipolar_barnes_hut(); - } -#endif // DIPOLAR_ BARNES_HUT - dipole.method = method; -} +void set_method_local(DipolarInteraction method) { dipole.method = method; } } // namespace Dipole #endif // DIPOLES diff --git a/src/python/espressomd/magnetostatics.pxd b/src/python/espressomd/magnetostatics.pxd index 1bb064674d7..d12f5bee383 100644 --- a/src/python/espressomd/magnetostatics.pxd +++ b/src/python/espressomd/magnetostatics.pxd @@ -19,6 +19,14 @@ from libcpp cimport bool include "myconfig.pxi" +cdef extern from "SystemInterface.hpp": + cdef cppclass SystemInterface: + pass +cdef extern from "EspressoSystemInterface.hpp": + cdef cppclass EspressoSystemInterface(SystemInterface): + @staticmethod + EspressoSystemInterface & Instance() + IF DIPOLES == 1: cdef extern from "electrostatics_magnetostatics/common.hpp": void mpi_bcast_coulomb_params() @@ -49,13 +57,19 @@ IF DIPOLES == 1: IF(CUDA == 1) and (ROTATION == 1): cdef extern from "actor/DipolarDirectSum.hpp": - void activate_dipolar_direct_sum_gpu() - void deactivate_dipolar_direct_sum_gpu() + cdef cppclass DipolarDirectSum: + DipolarDirectSum(SystemInterface & s) + void set_params() + void activate() + void deactivate() IF(DIPOLAR_BARNES_HUT == 1): cdef extern from "actor/DipolarBarnesHut.hpp": - void activate_dipolar_barnes_hut(float epssq, float itolsq) - void deactivate_dipolar_barnes_hut() + cdef cppclass DipolarBarnesHut: + DipolarBarnesHut(SystemInterface & s) + void set_params(float epssq, float itolsq) + void activate() + void deactivate() IF DP3M == 1: from p3m_common cimport P3MParameters diff --git a/src/python/espressomd/magnetostatics.pyx b/src/python/espressomd/magnetostatics.pyx index c360288482b..f6b09376e51 100644 --- a/src/python/espressomd/magnetostatics.pyx +++ b/src/python/espressomd/magnetostatics.pyx @@ -16,6 +16,9 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
# +from libcpp.memory cimport shared_ptr, make_shared +from cython.operator cimport dereference + include "myconfig.pxi" from .actors cimport Actor IF SCAFACOS == 1: @@ -324,6 +327,11 @@ IF DIPOLES == 1: Magnetostatics prefactor (:math:`\\mu_0/(4\\pi)`) """ + cdef shared_ptr[DipolarDirectSum] sip + + def __cinit__(self): + self.sip = make_shared[DipolarDirectSum]( + EspressoSystemInterface.Instance()) def default_params(self): return {} @@ -339,14 +347,15 @@ IF DIPOLES == 1: def _activate_method(self): self._set_params_in_es_core() + dereference(self.sip).activate() def _deactivate_method(self): super()._deactivate_method() - deactivate_dipolar_direct_sum_gpu() + dereference(self.sip).deactivate() def _set_params_in_es_core(self): self.set_magnetostatics_prefactor() - activate_dipolar_direct_sum_gpu() + dereference(self.sip).set_params() IF(DIPOLAR_BARNES_HUT == 1): cdef class DipolarBarnesHutGpu(MagnetostaticInteraction): @@ -358,6 +367,11 @@ IF DIPOLES == 1: TODO: If the system has periodic boundaries, the minimum image convention is applied. """ + cdef shared_ptr[DipolarBarnesHut] sip + + def __cinit__(self): + self.sip = make_shared[DipolarBarnesHut]( + EspressoSystemInterface.Instance()) def default_params(self): return {"epssq": 100.0, @@ -374,12 +388,13 @@ IF DIPOLES == 1: def _activate_method(self): self._set_params_in_es_core() + dereference(self.sip).activate() def _deactivate_method(self): super()._deactivate_method() - deactivate_dipolar_barnes_hut() + dereference(self.sip).deactivate() def _set_params_in_es_core(self): self.set_magnetostatics_prefactor() - activate_dipolar_barnes_hut( + dereference(self.sip).set_params( self._params["epssq"], self._params["itolsq"]) From ac990c551fb637a93de7f1e56cc524fda0322576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 8 Dec 2021 10:52:04 +0100 Subject: [PATCH 13/21] core: Separation of concerns for GPU methods Move implementation details from class definitions in .hpp files to the corresponding .cpp files. Don't activate MMM1D GPU during construction, but when added to the actor list. 
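The life cycle that the refactored GPU actors now share looks roughly like this (a minimal sketch; MyGpuActor is a hypothetical stand-in for Mmm1dgpuForce, DipolarDirectSum and DipolarBarnesHut):

    // Constructor: sanity checks and GPU resource requests only.
    // activate()/deactivate(): registration in the global actor lists,
    // with no global instance and no side effects at construction time.
    class MyGpuActor : public Actor {
    public:
      explicit MyGpuActor(SystemInterface &s) {
        if (!s.requestFGpu())
          runtimeErrorMsg() << "MyGpuActor needs access to forces on GPU!";
      }
      void activate() {
        forceActors.push_back(this);
        energyActors.push_back(this);
      }
      void deactivate() {
        forceActors.remove(this);
        energyActors.remove(this);
      }
      // computeForces()/computeEnergy() overrides omitted in this sketch
    };
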
--- src/core/actor/DipolarBarnesHut.cpp | 68 +++++++++++++++++++++++- src/core/actor/DipolarBarnesHut.hpp | 67 ++--------------------- src/core/actor/DipolarBarnesHut_cuda.cu | 1 - src/core/actor/DipolarDirectSum.cpp | 50 +++++++++++++++++ src/core/actor/DipolarDirectSum.hpp | 56 +++---------------- src/core/actor/DipolarDirectSum_cuda.cu | 2 + src/core/actor/DipolarDirectSum_cuda.cuh | 36 +++++++++++++ src/core/actor/Mmm1dgpuForce.cpp | 15 +++++- src/core/actor/Mmm1dgpuForce.hpp | 23 ++++---- src/core/actor/Mmm1dgpuForce_cuda.cu | 24 +-------- 10 files changed, 195 insertions(+), 147 deletions(-) create mode 100644 src/core/actor/DipolarDirectSum_cuda.cuh diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 21a239ec449..85a6d5bbe6b 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -22,12 +22,32 @@ #ifdef DIPOLAR_BARNES_HUT #include "DipolarBarnesHut.hpp" +#include "DipolarBarnesHut_cuda.cuh" -#include "actor/ActorList.hpp" #include "electrostatics_magnetostatics/common.hpp" +#include "electrostatics_magnetostatics/dipole.hpp" + +#include "SystemInterface.hpp" +#include "cuda_interface.hpp" +#include "cuda_utils.hpp" #include "energy.hpp" +#include "errorhandling.hpp" #include "forces.hpp" +DipolarBarnesHut::DipolarBarnesHut(SystemInterface &s) { + if (!s.requestFGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; + + if (!s.requestTorqueGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to torques on GPU!"; + + if (!s.requestRGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to positions on GPU!"; + + if (!s.requestDipGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to dipoles on GPU!"; +} + void DipolarBarnesHut::activate() { // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = DIPOLAR_BH_GPU; @@ -45,4 +65,50 @@ void DipolarBarnesHut::deactivate() { energyActors.remove(this); } +void DipolarBarnesHut::set_params(float epssq, float itolsq) { + m_prefactor = static_cast(dipole.prefactor); + m_epssq = epssq; + m_itolsq = itolsq; + setBHPrecision(m_epssq, m_itolsq); +} + +void DipolarBarnesHut::computeForces(SystemInterface &s) { + try { + allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); + } catch (cuda_runtime_error const &err) { + runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); + return; + } + + fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); + initBHgpu(m_bh_data.blocks); + buildBoxBH(m_bh_data.blocks); + buildTreeBH(m_bh_data.blocks); + summarizeBH(m_bh_data.blocks); + sortBH(m_bh_data.blocks); + if (forceBH(&m_bh_data, m_prefactor, s.fGpuBegin(), s.torqueGpuBegin())) { + runtimeErrorMsg() << "kernels encountered a functional error"; + } +} + +void DipolarBarnesHut::computeEnergy(SystemInterface &s) { + try { + allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); + } catch (cuda_runtime_error const &err) { + runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); + return; + } + + fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); + initBHgpu(m_bh_data.blocks); + buildBoxBH(m_bh_data.blocks); + buildTreeBH(m_bh_data.blocks); + summarizeBH(m_bh_data.blocks); + sortBH(m_bh_data.blocks); + auto energy = reinterpret_cast(s.eGpu()); + if (energyBH(&m_bh_data, m_prefactor, &(energy->dipolar))) { + runtimeErrorMsg() << "kernels encountered a functional error"; + } +} + #endif diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 
829a6318d53..4cf76ecd5ec 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -26,79 +26,22 @@ #include "Actor.hpp" #include "DipolarBarnesHut_cuda.cuh" #include "SystemInterface.hpp" -#include "cuda_interface.hpp" -#include "cuda_utils.hpp" -#include "electrostatics_magnetostatics/dipole.hpp" -#include "errorhandling.hpp" class DipolarBarnesHut : public Actor { public: - DipolarBarnesHut(SystemInterface &s) { - if (!s.requestFGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; - - if (!s.requestTorqueGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to torques on GPU!"; - - if (!s.requestRGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to positions on GPU!"; - - if (!s.requestDipGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to dipoles on GPU!"; - }; - + DipolarBarnesHut(SystemInterface &s); ~DipolarBarnesHut() override { deallocBH(&m_bh_data); } - void set_params(float epssq, float itolsq) { - m_k = static_cast(dipole.prefactor); - m_epssq = epssq; - m_itolsq = itolsq; - setBHPrecision(m_epssq, m_itolsq); - } - - void computeForces(SystemInterface &s) override { - try { - allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); - } catch (cuda_runtime_error const &err) { - runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); - return; - } - - fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); - initBHgpu(m_bh_data.blocks); - buildBoxBH(m_bh_data.blocks); - buildTreeBH(m_bh_data.blocks); - summarizeBH(m_bh_data.blocks); - sortBH(m_bh_data.blocks); - if (forceBH(&m_bh_data, m_k, s.fGpuBegin(), s.torqueGpuBegin())) { - runtimeErrorMsg() << "kernels encountered a functional error"; - } - }; - void computeEnergy(SystemInterface &s) override { - try { - allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); - } catch (cuda_runtime_error const &err) { - runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); - return; - } + void set_params(float epssq, float itolsq); - fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); - initBHgpu(m_bh_data.blocks); - buildBoxBH(m_bh_data.blocks); - buildTreeBH(m_bh_data.blocks); - summarizeBH(m_bh_data.blocks); - sortBH(m_bh_data.blocks); - if (energyBH(&m_bh_data, m_k, - &(reinterpret_cast(s.eGpu())->dipolar))) { - runtimeErrorMsg() << "kernels encountered a functional error"; - } - }; + void computeForces(SystemInterface &s) override; + void computeEnergy(SystemInterface &s) override; void activate(); void deactivate(); private: - float m_k; + float m_prefactor; float m_epssq; float m_itolsq; /// Container for pointers to device memory. 
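With the destructor above, the device buffers now follow the lifetime of the actor object. A usage sketch (illustrative only; 100.0 and 4.0 mirror the default-like epssq/itolsq values on the Python side, and cudaFree(nullptr) is documented as a no-op, which is what keeps deallocBH() safe for buffers that were never allocated):

    {
      auto &s = EspressoSystemInterface::Instance();
      DipolarBarnesHut bh(s);       // requests GPU arrays, allocates nothing yet
      bh.set_params(100.0f, 4.0f);  // epssq, itolsq
      bh.activate();                // joins forceActors and energyActors
      // computeForces()/computeEnergy() reallocate device buffers on demand
      bh.deactivate();
    } // ~DipolarBarnesHut() calls deallocBH(&m_bh_data) exactly once
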
diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index e33f99fc123..67d083efdc0 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -29,7 +29,6 @@ #include "cuda_init.hpp" #include "cuda_utils.cuh" -#include "errorhandling.hpp" #include #include diff --git a/src/core/actor/DipolarDirectSum.cpp b/src/core/actor/DipolarDirectSum.cpp index 01d48bf487c..a6da6b4019d 100644 --- a/src/core/actor/DipolarDirectSum.cpp +++ b/src/core/actor/DipolarDirectSum.cpp @@ -22,10 +22,31 @@ #ifdef DIPOLAR_DIRECT_SUM #include "DipolarDirectSum.hpp" +#include "DipolarDirectSum_cuda.cuh" #include "electrostatics_magnetostatics/common.hpp" +#include "electrostatics_magnetostatics/dipole.hpp" + +#include "SystemInterface.hpp" +#include "cuda_interface.hpp" #include "energy.hpp" +#include "errorhandling.hpp" #include "forces.hpp" +#include "grid.hpp" + +DipolarDirectSum::DipolarDirectSum(SystemInterface &s) { + if (!s.requestFGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; + + if (!s.requestTorqueGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to torques on GPU!"; + + if (!s.requestRGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to positions on GPU!"; + + if (!s.requestDipGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!"; +} void DipolarDirectSum::activate() { // also necessary on 1 CPU or GPU, does more than just broadcasting @@ -44,4 +65,33 @@ void DipolarDirectSum::deactivate() { energyActors.remove(this); } +void DipolarDirectSum::set_params() { + m_prefactor = static_cast(dipole.prefactor); +} + +void DipolarDirectSum::computeForces(SystemInterface &s) { + float box[3]; + int per[3]; + for (int i = 0; i < 3; i++) { + box[i] = static_cast(s.box()[i]); + per[i] = box_geo.periodic(i); + } + DipolarDirectSum_kernel_wrapper_force( + m_prefactor, static_cast(s.npart_gpu()), s.rGpuBegin(), + s.dipGpuBegin(), s.fGpuBegin(), s.torqueGpuBegin(), box, per); +} + +void DipolarDirectSum::computeEnergy(SystemInterface &s) { + float box[3]; + int per[3]; + for (int i = 0; i < 3; i++) { + box[i] = static_cast(s.box()[i]); + per[i] = box_geo.periodic(i); + } + auto energy = reinterpret_cast(s.eGpu()); + DipolarDirectSum_kernel_wrapper_energy( + m_prefactor, static_cast(s.npart_gpu()), s.rGpuBegin(), + s.dipGpuBegin(), box, per, &(energy->dipolar)); +} + #endif diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 3dc82d68a79..940cc9082d9 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -25,66 +25,22 @@ #include "Actor.hpp" #include "SystemInterface.hpp" -#include "cuda_interface.hpp" -#include "electrostatics_magnetostatics/dipole.hpp" -#include "errorhandling.hpp" -#include "grid.hpp" - -void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, - float *dip, float box_l[3], - int periodic[3], float *E); -void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, - float *dip, float *f, float *torque, - float box_l[3], int periodic[3]); class DipolarDirectSum : public Actor { public: - DipolarDirectSum(SystemInterface &s) { - - if (!s.requestFGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; - - if (!s.requestTorqueGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to torques on GPU!"; - - if (!s.requestRGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to positions 
on GPU!"; - - if (!s.requestDipGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!"; - }; + DipolarDirectSum(SystemInterface &s); + ~DipolarDirectSum() override = default; - void set_params() { m_pk = static_cast(dipole.prefactor); } + void set_params(); - void computeForces(SystemInterface &s) override { - float box[3]; - int per[3]; - for (int i = 0; i < 3; i++) { - box[i] = static_cast(s.box()[i]); - per[i] = box_geo.periodic(i); - } - DipolarDirectSum_kernel_wrapper_force( - m_pk, static_cast(s.npart_gpu()), s.rGpuBegin(), - s.dipGpuBegin(), s.fGpuBegin(), s.torqueGpuBegin(), box, per); - }; - void computeEnergy(SystemInterface &s) override { - float box[3]; - int per[3]; - for (int i = 0; i < 3; i++) { - box[i] = static_cast(s.box()[i]); - per[i] = box_geo.periodic(i); - } - auto energy = reinterpret_cast(s.eGpu()); - DipolarDirectSum_kernel_wrapper_energy( - m_pk, static_cast(s.npart_gpu()), s.rGpuBegin(), - s.dipGpuBegin(), box, per, &(energy->dipolar)); - }; + void computeForces(SystemInterface &s) override; + void computeEnergy(SystemInterface &s) override; void activate(); void deactivate(); private: - float m_pk; + float m_prefactor; }; #endif // DIPOLAR_DIRECT_SUM diff --git a/src/core/actor/DipolarDirectSum_cuda.cu b/src/core/actor/DipolarDirectSum_cuda.cu index 8c1e37754b2..532f551e85a 100644 --- a/src/core/actor/DipolarDirectSum_cuda.cu +++ b/src/core/actor/DipolarDirectSum_cuda.cu @@ -21,6 +21,8 @@ #ifdef DIPOLAR_DIRECT_SUM +#include "DipolarDirectSum_cuda.cuh" + #include "cuda_utils.cuh" #include diff --git a/src/core/actor/DipolarDirectSum_cuda.cuh b/src/core/actor/DipolarDirectSum_cuda.cuh new file mode 100644 index 00000000000..31b3171a45b --- /dev/null +++ b/src/core/actor/DipolarDirectSum_cuda.cuh @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2010-2019 The ESPResSo project + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef DIPOLARDIRECTSUM_CUH_ +#define DIPOLARDIRECTSUM_CUH_ + +#include "config.hpp" + +#ifdef DIPOLAR_DIRECT_SUM + +void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, + float *dip, float box_l[3], + int periodic[3], float *E); +void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, + float *dip, float *f, float *torque, + float box_l[3], int periodic[3]); + +#endif // DIPOLAR_DIRECT_SUM + +#endif /* DIPOLARDIRECTSUM_CUH_ */ diff --git a/src/core/actor/Mmm1dgpuForce.cpp b/src/core/actor/Mmm1dgpuForce.cpp index dcfcdd857c1..7baedba8e19 100644 --- a/src/core/actor/Mmm1dgpuForce.cpp +++ b/src/core/actor/Mmm1dgpuForce.cpp @@ -28,13 +28,26 @@ #include -void Mmm1dgpuForce::sanity_checks() { +Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) { + // interface sanity checks + if (!s.requestFGpu()) + throw std::runtime_error("Mmm1dgpuForce needs access to forces on GPU!"); + + if (!s.requestRGpu()) + throw std::runtime_error("Mmm1dgpuForce needs access to positions on GPU!"); + + if (!s.requestQGpu()) + throw std::runtime_error("Mmm1dgpuForce needs access to charges on GPU!"); + + // system sanity checks if (box_geo.periodic(0) || box_geo.periodic(1) || !box_geo.periodic(2)) { throw std::runtime_error("MMM1D requires periodicity (0, 0, 1)"); } if (cell_structure.decomposition_type() != CELL_STRUCTURE_NSQUARE) { throw std::runtime_error("MMM1D requires the N-square cellsystem"); } + + modpsi_init(); } void Mmm1dgpuForce::activate() { diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp index 293154c1a30..ea16efb3d31 100644 --- a/src/core/actor/Mmm1dgpuForce.hpp +++ b/src/core/actor/Mmm1dgpuForce.hpp @@ -28,9 +28,9 @@ class Mmm1dgpuForce : public Actor { public: - // constructor Mmm1dgpuForce(SystemInterface &s); ~Mmm1dgpuForce() override; + // interface methods void computeForces(SystemInterface &s) override; void computeEnergy(SystemInterface &s) override; @@ -45,32 +45,37 @@ class Mmm1dgpuForce : public Actor { private: // CUDA parameters - unsigned int numThreads; + unsigned int numThreads = 64; unsigned int numBlocks(SystemInterface const &s) const; // the box length currently set on the GPU // Needed to make sure it hasn't been modified after inter coulomb was used. - float host_boxz; + float host_boxz = 0; // the number of particles we had during the last run. 
Needed to check if we // have to realloc dev_forcePairs - unsigned int host_npart; - bool need_tune; + unsigned int host_npart = 0; + bool need_tune = true; // pairs==0: return forces using atomicAdd // pairs==1: return force pairs // pairs==2: return forces using a global memory reduction - int pairs; + int pairs = -1; // variables for forces and energies calculated pre-reduction - float *dev_forcePairs, *dev_energyBlocks; + float *dev_forcePairs = nullptr; + float *dev_energyBlocks = nullptr; // MMM1D parameters - float coulomb_prefactor, maxPWerror, far_switch_radius; - int bessel_cutoff; + float coulomb_prefactor = 0; + float maxPWerror = -1; + float far_switch_radius = -1; + int bessel_cutoff = -1; // run a single force calculation and return the time it takes using // high-precision CUDA timers float force_benchmark(SystemInterface &s); + void modpsi_init(); + // some functions to move MPI dependencies out of the .cu file void sanity_checks(); }; diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index 6c5a7f57a06..0755b319686 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -86,7 +86,7 @@ __device__ float dev_mod_psi_odd(int n, float x) { static_cast(device_linModPsi_lengths[2 * n + 1]), x * x); } -int modpsi_init() { +void Mmm1dgpuForce::modpsi_init() { create_mod_psi_up_to(modpsi_order); // linearized array on host @@ -129,28 +129,6 @@ int modpsi_init() { auto const n_modPsi = static_cast(modPsi.size() >> 1); cuda_safe_mem(cudaMemcpyToSymbol(device_n_modPsi, &n_modPsi, sizeof(int))); } - - return 0; -} - -Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) - : numThreads(64), host_boxz(0), host_npart(0), need_tune(true), pairs(-1), - dev_forcePairs(nullptr), dev_energyBlocks(nullptr), coulomb_prefactor(0), - maxPWerror(-1), far_switch_radius(-1), bessel_cutoff(-1) { - // interface sanity checks - if (!s.requestFGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to forces on GPU!"); - - if (!s.requestRGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to positions on GPU!"); - - if (!s.requestQGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to charges on GPU!"); - - // system sanity checks - sanity_checks(); - - modpsi_init(); } void Mmm1dgpuForce::setup(SystemInterface &s) { From 2401244c1e92ce02595db2a93bfacf9fd2850b58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 8 Dec 2021 13:04:19 +0100 Subject: [PATCH 14/21] core: Reduce visibility of magnetostatics globals The `dipole` global variable and the associated enum values are no longer used by the magnetostatics Cython interface. 
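The coupling between the core and the Cython layer is thereby reduced to a few free functions; schematically (a recap of the declarations touched below, not a verbatim header excerpt):

    // electrostatics_magnetostatics/dipole.hpp, namespace Dipole:
    double get_Dprefactor();      // read access, replaces `dipole.prefactor`
    void set_Dprefactor(double);  // validates the prefactor (throws if < 0)
    void disable_method_local();  // replaces `dipole.method = DIPOLAR_NONE`

    // electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp:
    int mdds_get_n_replica();     // replaces the extern `mdds_n_replica`
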
--- src/core/actor/DipolarBarnesHut.cpp | 1 - src/core/actor/Mmm1dgpuForce.cpp | 10 +++++++ .../electrostatics_magnetostatics/dipole.cpp | 8 ++++-- .../electrostatics_magnetostatics/dipole.hpp | 3 ++ .../magnetic_non_p3m_methods.cpp | 10 +++++-- .../magnetic_non_p3m_methods.hpp | 4 +-- src/python/espressomd/magnetostatics.pxd | 20 ++----------- src/python/espressomd/magnetostatics.pyx | 28 +++++++++++-------- src/python/espressomd/scafacos.pyx | 2 +- 9 files changed, 47 insertions(+), 39 deletions(-) diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 85a6d5bbe6b..2b067dc2584 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -49,7 +49,6 @@ DipolarBarnesHut::DipolarBarnesHut(SystemInterface &s) { } void DipolarBarnesHut::activate() { - // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = DIPOLAR_BH_GPU; mpi_bcast_coulomb_params(); diff --git a/src/core/actor/Mmm1dgpuForce.cpp b/src/core/actor/Mmm1dgpuForce.cpp index 7baedba8e19..827337d8c1a 100644 --- a/src/core/actor/Mmm1dgpuForce.cpp +++ b/src/core/actor/Mmm1dgpuForce.cpp @@ -21,6 +21,10 @@ #ifdef MMM1D_GPU #include "actor/Mmm1dgpuForce.hpp" + +#include "electrostatics_magnetostatics/common.hpp" +#include "electrostatics_magnetostatics/coulomb.hpp" + #include "cells.hpp" #include "energy.hpp" #include "forces.hpp" @@ -51,11 +55,17 @@ Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) { } void Mmm1dgpuForce::activate() { + coulomb.method = COULOMB_MMM1D_GPU; + mpi_bcast_coulomb_params(); + forceActors.push_back(this); energyActors.push_back(this); } void Mmm1dgpuForce::deactivate() { + coulomb.method = COULOMB_NONE; + mpi_bcast_coulomb_params(); + forceActors.remove(this); energyActors.remove(this); } diff --git a/src/core/electrostatics_magnetostatics/dipole.cpp b/src/core/electrostatics_magnetostatics/dipole.cpp index 77291e3bae0..4295ac62647 100644 --- a/src/core/electrostatics_magnetostatics/dipole.cpp +++ b/src/core/electrostatics_magnetostatics/dipole.cpp @@ -78,7 +78,7 @@ bool sanity_checks() { mdlc_sanity_checks(); // fall through case DIPOLAR_DS: - mdds_sanity_checks(mdds_n_replica); + mdds_sanity_checks(); break; default: break; @@ -287,13 +287,15 @@ void set_Dprefactor(double prefactor) { if (prefactor < 0.0) { throw std::invalid_argument("Dipolar prefactor has to be >= 0"); } - dipole.prefactor = prefactor; - mpi_bcast_coulomb_params(); } +double get_Dprefactor() { return dipole.prefactor; } + void set_method_local(DipolarInteraction method) { dipole.method = method; } +void disable_method_local() { dipole.method = DIPOLAR_NONE; } + } // namespace Dipole #endif // DIPOLES diff --git a/src/core/electrostatics_magnetostatics/dipole.hpp b/src/core/electrostatics_magnetostatics/dipole.hpp index 7ef92038ef3..a6cdb49ebf9 100644 --- a/src/core/electrostatics_magnetostatics/dipole.hpp +++ b/src/core/electrostatics_magnetostatics/dipole.hpp @@ -86,8 +86,11 @@ void bcast_params(const boost::mpi::communicator &comm); /** @brief Set the dipolar prefactor */ void set_Dprefactor(double prefactor); +/** @brief Get the dipolar prefactor */ +double get_Dprefactor(); void set_method_local(DipolarInteraction method); +void disable_method_local(); } // namespace Dipole #else // DIPOLES namespace Dipole { diff --git a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp index 52fbb901f8b..619af521368 100644 --- 
a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp +++ b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp @@ -137,9 +137,11 @@ double dawaanr_calculations(bool force_flag, bool energy_flag, ============================================================================= */ -int mdds_n_replica = 0; +static int mdds_n_replica = 0; -void mdds_sanity_checks(int n_replica) { +int mdds_get_n_replica() { return mdds_n_replica; } + +void sanity_checks(int n_replica) { if (box_geo.periodic(0) and box_geo.periodic(1) and box_geo.periodic(2) and n_replica == 0) { throw std::runtime_error("Dipolar direct sum with replica does not " @@ -147,6 +149,8 @@ void mdds_sanity_checks(int n_replica) { } } +void mdds_sanity_checks() { sanity_checks(mdds_n_replica); } + double magnetic_dipolar_direct_sum_calculations(bool force_flag, bool energy_flag, ParticleRange const &particles) { @@ -320,7 +324,7 @@ void mdds_set_params(int n_replica) { if (n_replica < 0) { throw std::runtime_error("Dipolar direct sum requires n_replica >= 0."); } - mdds_sanity_checks(n_replica); + sanity_checks(n_replica); if (n_replica == 0) { fprintf(stderr, "Careful: the number of extra replicas to take into " "account during the direct sum calculation is zero\n"); diff --git a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp index dd1664e9d93..7565f32cbe9 100644 --- a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp +++ b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp @@ -62,7 +62,7 @@ void dawaanr_set_params(); */ /** Sanity checks for the magnetic dipolar direct sum. */ -void mdds_sanity_checks(int n_replica); +void mdds_sanity_checks(); /** Core of the method: here you compute all the magnetic forces, torques and * the energy for the whole system using direct sum @@ -76,7 +76,7 @@ double magnetic_dipolar_direct_sum_calculations(bool force_flag, */ void mdds_set_params(int n_replica); -extern int mdds_n_replica; +int mdds_get_n_replica(); #endif /*of ifdef DIPOLES */ #endif /* of ifndef MAG_NON_P3M_H */ diff --git a/src/python/espressomd/magnetostatics.pxd b/src/python/espressomd/magnetostatics.pxd index d12f5bee383..fb6893f3ec6 100644 --- a/src/python/espressomd/magnetostatics.pxd +++ b/src/python/espressomd/magnetostatics.pxd @@ -31,29 +31,15 @@ IF DIPOLES == 1: cdef extern from "electrostatics_magnetostatics/common.hpp": void mpi_bcast_coulomb_params() - cdef extern from "electrostatics_magnetostatics/dipole.hpp": - ctypedef enum DipolarInteraction: - DIPOLAR_NONE = 0, - DIPOLAR_P3M, - DIPOLAR_MDLC_P3M, - DIPOLAR_ALL_WITH_ALL_AND_NO_REPLICA, - DIPOLAR_DS, - DIPOLAR_MDLC_DS, - DIPOLAR_SCAFACOS - - ctypedef struct Dipole_parameters: - double prefactor - DipolarInteraction method - - cdef extern Dipole_parameters dipole - cdef extern from "electrostatics_magnetostatics/dipole.hpp" namespace "Dipole": void set_Dprefactor(double prefactor) except + + double get_Dprefactor() + void disable_method_local() cdef extern from "electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp": void dawaanr_set_params() except + void mdds_set_params(int n_replica) except + - int mdds_n_replica + int mdds_get_n_replica() IF(CUDA == 1) and (ROTATION == 1): cdef extern from "actor/DipolarDirectSum.hpp": diff --git a/src/python/espressomd/magnetostatics.pyx b/src/python/espressomd/magnetostatics.pyx index f6b09376e51..3dc0571a3bf 100644 --- a/src/python/espressomd/magnetostatics.pyx +++ 
b/src/python/espressomd/magnetostatics.pyx @@ -45,6 +45,13 @@ IF DIPOLES == 1: if not self._params["prefactor"] >= 0: raise ValueError("prefactor should be a positive float") + def get_magnetostatics_prefactor(self): + """ + Get the magnetostatics prefactor + + """ + return get_Dprefactor() + def set_magnetostatics_prefactor(self): """ Set the magnetostatics prefactor @@ -54,12 +61,9 @@ IF DIPOLES == 1: # also necessary on 1 CPU or GPU, does more than just broadcasting mpi_bcast_coulomb_params() - def _get_active_method_from_es_core(self): - return dipole.method - def _deactivate_method(self): set_Dprefactor(0.0) - dipole.method = DIPOLAR_NONE + disable_method_local() mpi_bcast_coulomb_params() IF DP3M == 1: @@ -148,7 +152,7 @@ IF DP3M == 1: def _get_params_from_es_core(self): params = {} params.update(dp3m.params) - params["prefactor"] = dipole.prefactor + params["prefactor"] = self.get_magnetostatics_prefactor() params["tune"] = self._params["tune"] params["timings"] = self._params["timings"] return params @@ -220,7 +224,7 @@ IF DIPOLES == 1: return ("prefactor",) def _get_params_from_es_core(self): - return {"prefactor": dipole.prefactor} + return {"prefactor": self.get_magnetostatics_prefactor()} def _activate_method(self): self._set_params_in_es_core() @@ -258,7 +262,8 @@ IF DIPOLES == 1: return ("prefactor", "n_replica") def _get_params_from_es_core(self): - return {"prefactor": dipole.prefactor, "n_replica": mdds_n_replica} + return {"prefactor": self.get_magnetostatics_prefactor(), + "n_replica": mdds_get_n_replica()} def _activate_method(self): self._set_params_in_es_core() @@ -296,13 +301,12 @@ IF DIPOLES == 1: Actor.__init__(self, *args, **kwargs) def _activate_method(self): - set_Dprefactor(self._params["prefactor"]) + self.set_magnetostatics_prefactor() self._set_params_in_es_core() def _deactivate_method(self): - dipole.method = DIPOLAR_NONE scafacos.free_handle(self.dipolar) - mpi_bcast_coulomb_params() + super()._deactivate_method() def default_params(self): return {} @@ -343,7 +347,7 @@ IF DIPOLES == 1: return ("prefactor",) def _get_params_from_es_core(self): - return {"prefactor": dipole.prefactor} + return {"prefactor": self.get_magnetostatics_prefactor()} def _activate_method(self): self._set_params_in_es_core() @@ -384,7 +388,7 @@ IF DIPOLES == 1: return ("prefactor", "epssq", "itolsq") def _get_params_from_es_core(self): - return {"prefactor": dipole.prefactor} + return {"prefactor": self.get_magnetostatics_prefactor()} def _activate_method(self): self._set_params_in_es_core() diff --git a/src/python/espressomd/scafacos.pyx b/src/python/espressomd/scafacos.pyx index 07394e77e8c..04dd88496da 100644 --- a/src/python/espressomd/scafacos.pyx +++ b/src/python/espressomd/scafacos.pyx @@ -98,7 +98,7 @@ IF SCAFACOS == 1: # Re-add the prefactor to the parameter set if self.dipolar: IF DIPOLES == 1: - res["prefactor"] = magnetostatics.dipole.prefactor + res["prefactor"] = magnetostatics.get_Dprefactor() pass else: IF ELECTROSTATICS == 1: From 0e6c336c6f3934a87cbe84f0368a819b313cf270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 8 Dec 2021 16:38:41 +0100 Subject: [PATCH 15/21] core: Proper error propagation --- src/core/EspressoSystemInterface.hpp | 18 +++---- src/core/SystemInterface.hpp | 36 +++++++++---- src/core/actor/DipolarBarnesHut.cpp | 15 ++---- src/core/actor/DipolarDirectSum.cpp | 16 ++---- src/core/actor/Mmm1dgpuForce.cpp | 12 ++--- .../EspressoSystemInterface_test.cpp | 51 +++++++++---------- src/python/espressomd/magnetostatics.pxd | 
4 +- 7 files changed, 69 insertions(+), 83 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index feecf6bd53b..838520b0ae1 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -55,57 +55,52 @@ class EspressoSystemInterface : public SystemInterface { #ifdef CUDA float *rGpuBegin() override { return m_r_gpu_begin; }; bool hasRGpu() override { return true; }; - bool requestRGpu() override { + void requestRGpu() override { m_needsRGpu = hasRGpu(); m_splitParticleStructGpu |= m_needsRGpu; m_gpu |= m_needsRGpu; if (m_gpu) enableParticleCommunication(); - return m_needsRGpu; }; #ifdef DIPOLES float *dipGpuBegin() override { return m_dip_gpu_begin; }; bool hasDipGpu() override { return true; }; - bool requestDipGpu() override { + void requestDipGpu() override { m_needsDipGpu = hasDipGpu(); m_splitParticleStructGpu |= m_needsRGpu; m_gpu |= m_needsRGpu; if (m_gpu) enableParticleCommunication(); - return m_needsDipGpu; }; #endif #ifdef ELECTROSTATICS float *qGpuBegin() override { return m_q_gpu_begin; }; bool hasQGpu() override { return true; }; - bool requestQGpu() override { + void requestQGpu() override { m_needsQGpu = hasQGpu(); m_splitParticleStructGpu |= m_needsQGpu; m_gpu |= m_needsQGpu; if (m_gpu) enableParticleCommunication(); - return m_needsQGpu; }; #endif - bool requestParticleStructGpu() { + void requestParticleStructGpu() { m_needsParticleStructGpu = true; m_gpu |= m_needsParticleStructGpu; if (m_gpu) enableParticleCommunication(); - return true; }; float *fGpuBegin() override { return gpu_get_particle_force_pointer(); }; bool hasFGpu() override { return true; }; - bool requestFGpu() override { + void requestFGpu() override { m_needsFGpu = hasFGpu(); m_gpu |= m_needsFGpu; if (m_gpu) enableParticleCommunication(); - return m_needsFGpu; }; #ifdef ROTATION @@ -113,12 +108,11 @@ class EspressoSystemInterface : public SystemInterface { return gpu_get_particle_torque_pointer(); }; bool hasTorqueGpu() override { return true; }; - bool requestTorqueGpu() override { + void requestTorqueGpu() override { m_needsTorqueGpu = hasTorqueGpu(); m_gpu |= m_needsTorqueGpu; if (m_gpu) enableParticleCommunication(); - return m_needsTorqueGpu; }; #endif diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index 006232efab6..15b79825362 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -24,6 +24,8 @@ #include #include +#include +#include /** * Public interface for the ESPResSo system and device memory management. @@ -46,10 +48,11 @@ * * Since properties in the @ref Particle struct depend on the config file, * it is necessary to provide a means to check if it is even possible to - * carry out the split. This is achieved by the hasXGpu() methods. - * The same value is returned from requestXGpu() and it needs to - * be checked at the call site to throw an error. This class won't throw - * exceptions. + * carry out the split. This is achieved by the hasXGpu() methods, + * whose return value must be checked at the call site to raise a runtime + * error. This is important in MPI callbacks, since they have limited support + * for standard exceptions. Methods requestXGpu() will throw an + * exception if the particle property cannot be split on device memory. 
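 *
 * A hedged call-site sketch (editorial illustration, not part of this
 * patch): MPI callbacks test the hasXGpu() predicate and report
 * through the runtime error collector, while regular code paths call
 * requestXGpu() and let the exception propagate:
 * @code{.cpp}
 * if (!system.hasQGpu()) {
 *   runtimeErrorMsg() << "charges not available on GPU"; // callback-safe
 * }
 * system.requestQGpu(); // may throw std::runtime_error
 * @endcode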
*/ class SystemInterface { public: @@ -61,28 +64,43 @@ class SystemInterface { virtual float *rGpuBegin() { return nullptr; }; virtual bool hasRGpu() { return false; }; - virtual bool requestRGpu() { return false; } + virtual void requestRGpu() { + throw std::runtime_error(error_message("positions")); + } virtual float *dipGpuBegin() { return nullptr; }; virtual bool hasDipGpu() { return false; }; - virtual bool requestDipGpu() { return false; } + virtual void requestDipGpu() { + throw std::runtime_error(error_message("dipoles")); + } virtual float *torqueGpuBegin() { return nullptr; }; virtual bool hasTorqueGpu() { return false; }; - virtual bool requestTorqueGpu() { return false; } + virtual void requestTorqueGpu() { + throw std::runtime_error(error_message("torques")); + } virtual float *qGpuBegin() { return nullptr; }; virtual bool hasQGpu() { return false; }; - virtual bool requestQGpu() { return false; } + virtual void requestQGpu() { + throw std::runtime_error(error_message("charges")); + } virtual float *fGpuBegin() { return nullptr; }; virtual bool hasFGpu() { return false; }; - virtual bool requestFGpu() { return false; } + virtual void requestFGpu() { + throw std::runtime_error(error_message("forces")); + } virtual float *eGpu() { return nullptr; }; virtual std::size_t npart_gpu() const { return 0; }; virtual Utils::Vector3d box() const = 0; + +private: + std::string error_message(std::string property) const { + return "No GPU available or particle " + property + " not compiled in."; + } }; #endif diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 2b067dc2584..582d30827c3 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -35,17 +35,10 @@ #include "forces.hpp" DipolarBarnesHut::DipolarBarnesHut(SystemInterface &s) { - if (!s.requestFGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; - - if (!s.requestTorqueGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to torques on GPU!"; - - if (!s.requestRGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to positions on GPU!"; - - if (!s.requestDipGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to dipoles on GPU!"; + s.requestFGpu(); + s.requestTorqueGpu(); + s.requestRGpu(); + s.requestDipGpu(); } void DipolarBarnesHut::activate() { diff --git a/src/core/actor/DipolarDirectSum.cpp b/src/core/actor/DipolarDirectSum.cpp index a6da6b4019d..95f4cd4dc6b 100644 --- a/src/core/actor/DipolarDirectSum.cpp +++ b/src/core/actor/DipolarDirectSum.cpp @@ -30,22 +30,14 @@ #include "SystemInterface.hpp" #include "cuda_interface.hpp" #include "energy.hpp" -#include "errorhandling.hpp" #include "forces.hpp" #include "grid.hpp" DipolarDirectSum::DipolarDirectSum(SystemInterface &s) { - if (!s.requestFGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; - - if (!s.requestTorqueGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to torques on GPU!"; - - if (!s.requestRGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to positions on GPU!"; - - if (!s.requestDipGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!"; + s.requestFGpu(); + s.requestTorqueGpu(); + s.requestRGpu(); + s.requestDipGpu(); } void DipolarDirectSum::activate() { diff --git a/src/core/actor/Mmm1dgpuForce.cpp b/src/core/actor/Mmm1dgpuForce.cpp index 827337d8c1a..d5695ceede6 100644 --- a/src/core/actor/Mmm1dgpuForce.cpp +++ b/src/core/actor/Mmm1dgpuForce.cpp @@ -33,15 
+33,9 @@ #include Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) { - // interface sanity checks - if (!s.requestFGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to forces on GPU!"); - - if (!s.requestRGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to positions on GPU!"); - - if (!s.requestQGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to charges on GPU!"); + s.requestFGpu(); + s.requestRGpu(); + s.requestQGpu(); // system sanity checks if (box_geo.periodic(0) || box_geo.periodic(1) || !box_geo.periodic(2)) { diff --git a/src/core/unit_tests/EspressoSystemInterface_test.cpp b/src/core/unit_tests/EspressoSystemInterface_test.cpp index 09fb492aea2..61eace5d331 100644 --- a/src/core/unit_tests/EspressoSystemInterface_test.cpp +++ b/src/core/unit_tests/EspressoSystemInterface_test.cpp @@ -68,20 +68,6 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { check_uninitialized_device_pointers(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - // features compiled in - auto has_feature_rotation = false; - auto has_feature_electrostatics = false; - auto has_feature_dipoles = false; -#ifdef ROTATION - has_feature_rotation = true; -#endif -#ifdef ELECTROSTATICS - has_feature_electrostatics = true; -#endif -#ifdef DIPOLES - has_feature_dipoles = true; -#endif - auto const pid = 1; place_particle(pid, {0., 0., 0.}); set_particle_type(pid, 0); @@ -89,44 +75,53 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); espresso::system.update(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - BOOST_TEST(espresso::system.requestParticleStructGpu()); + espresso::system.requestParticleStructGpu(); espresso::system.update(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 1); // check position split BOOST_TEST(espresso::system.hasRGpu()); - BOOST_TEST(espresso::system.requestRGpu()); + espresso::system.requestRGpu(); espresso::system.update(); BOOST_TEST(espresso::system.rGpuBegin() != nullptr); // check force split BOOST_TEST(espresso::system.hasFGpu()); - BOOST_TEST(espresso::system.requestFGpu()); + espresso::system.requestFGpu(); espresso::system.update(); BOOST_TEST(espresso::system.fGpuBegin() != nullptr); // check torque split - BOOST_CHECK_EQUAL(espresso::system.hasTorqueGpu(), has_feature_rotation); - BOOST_CHECK_EQUAL(espresso::system.requestTorqueGpu(), has_feature_rotation); #ifdef ROTATION + BOOST_CHECK(espresso::system.hasTorqueGpu()); + espresso::system.requestTorqueGpu(); espresso::system.update(); BOOST_TEST(espresso::system.torqueGpuBegin() != nullptr); +#else + BOOST_CHECK(!espresso::system.hasTorqueGpu()); + BOOST_CHECK_THROW(espresso::system.requestTorqueGpu(), std::runtime_error); #endif // check charge split - BOOST_CHECK_EQUAL(espresso::system.hasQGpu(), has_feature_electrostatics); - BOOST_CHECK_EQUAL(espresso::system.requestQGpu(), has_feature_electrostatics); #ifdef ELECTROSTATICS + BOOST_CHECK(espresso::system.hasQGpu()); + espresso::system.requestQGpu(); espresso::system.update(); BOOST_TEST(espresso::system.qGpuBegin() != nullptr); +#else + BOOST_CHECK(!espresso::system.hasQGpu()); + BOOST_CHECK_THROW(espresso::system.requestQGpu(), std::runtime_error); #endif // check dipole split - BOOST_CHECK_EQUAL(espresso::system.hasDipGpu(), has_feature_dipoles); - BOOST_CHECK_EQUAL(espresso::system.requestDipGpu(), has_feature_dipoles); #ifdef DIPOLES + BOOST_CHECK(espresso::system.hasDipGpu()); + 
espresso::system.requestDipGpu(); espresso::system.update(); BOOST_TEST(espresso::system.dipGpuBegin() != nullptr); +#else + BOOST_CHECK(!espresso::system.hasDipGpu()); + BOOST_CHECK_THROW(espresso::system.requestDipGpu(), std::runtime_error); #endif // clear device memory @@ -141,15 +136,15 @@ BOOST_AUTO_TEST_CASE(check_without_cuda) { check_uninitialized_device_pointers(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); BOOST_TEST(!espresso::system.hasRGpu()); - BOOST_TEST(!espresso::system.requestRGpu()); BOOST_TEST(!espresso::system.hasDipGpu()); - BOOST_TEST(!espresso::system.requestDipGpu()); BOOST_TEST(!espresso::system.hasFGpu()); - BOOST_TEST(!espresso::system.requestFGpu()); BOOST_TEST(!espresso::system.hasTorqueGpu()); - BOOST_TEST(!espresso::system.requestTorqueGpu()); BOOST_TEST(!espresso::system.hasQGpu()); - BOOST_TEST(!espresso::system.requestQGpu()); + BOOST_CHECK_THROW(espresso::system.requestRGpu(), std::runtime_error); + BOOST_CHECK_THROW(espresso::system.requestDipGpu(), std::runtime_error); + BOOST_CHECK_THROW(espresso::system.requestFGpu(), std::runtime_error); + BOOST_CHECK_THROW(espresso::system.requestTorqueGpu(), std::runtime_error); + BOOST_CHECK_THROW(espresso::system.requestQGpu(), std::runtime_error); } #endif // CUDA diff --git a/src/python/espressomd/magnetostatics.pxd b/src/python/espressomd/magnetostatics.pxd index fb6893f3ec6..45df8411b2f 100644 --- a/src/python/espressomd/magnetostatics.pxd +++ b/src/python/espressomd/magnetostatics.pxd @@ -44,7 +44,7 @@ IF DIPOLES == 1: IF(CUDA == 1) and (ROTATION == 1): cdef extern from "actor/DipolarDirectSum.hpp": cdef cppclass DipolarDirectSum: - DipolarDirectSum(SystemInterface & s) + DipolarDirectSum(SystemInterface & s) except + void set_params() void activate() void deactivate() @@ -52,7 +52,7 @@ IF DIPOLES == 1: IF(DIPOLAR_BARNES_HUT == 1): cdef extern from "actor/DipolarBarnesHut.hpp": cdef cppclass DipolarBarnesHut: - DipolarBarnesHut(SystemInterface & s) + DipolarBarnesHut(SystemInterface & s) except + void set_params(float epssq, float itolsq) void activate() void deactivate() From d29e63302c1c112acd4e717055fc5c2c6034fef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 9 Dec 2021 09:39:04 +0100 Subject: [PATCH 16/21] core: Remove code duplication --- src/core/actor/DipolarBarnesHut.cpp | 36 +++++++++++++---------------- src/core/actor/DipolarBarnesHut.hpp | 1 + 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 582d30827c3..67d82ad2960 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -64,12 +64,12 @@ void DipolarBarnesHut::set_params(float epssq, float itolsq) { setBHPrecision(m_epssq, m_itolsq); } -void DipolarBarnesHut::computeForces(SystemInterface &s) { +int DipolarBarnesHut::initialize_data_structure(SystemInterface &s) { try { allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); } catch (cuda_runtime_error const &err) { runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); - return; + return ES_ERROR; } fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); @@ -78,28 +78,24 @@ void DipolarBarnesHut::computeForces(SystemInterface &s) { buildTreeBH(m_bh_data.blocks); summarizeBH(m_bh_data.blocks); sortBH(m_bh_data.blocks); - if (forceBH(&m_bh_data, m_prefactor, s.fGpuBegin(), s.torqueGpuBegin())) { - runtimeErrorMsg() << "kernels encountered a functional error"; - } + + return ES_OK; } -void 
DipolarBarnesHut::computeEnergy(SystemInterface &s) { - try { - allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); - } catch (cuda_runtime_error const &err) { - runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); - return; +void DipolarBarnesHut::computeForces(SystemInterface &s) { + if (initialize_data_structure(s) == ES_OK) { + if (forceBH(&m_bh_data, m_prefactor, s.fGpuBegin(), s.torqueGpuBegin())) { + runtimeErrorMsg() << "kernels encountered a functional error"; + } } +} - fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); - initBHgpu(m_bh_data.blocks); - buildBoxBH(m_bh_data.blocks); - buildTreeBH(m_bh_data.blocks); - summarizeBH(m_bh_data.blocks); - sortBH(m_bh_data.blocks); - auto energy = reinterpret_cast(s.eGpu()); - if (energyBH(&m_bh_data, m_prefactor, &(energy->dipolar))) { - runtimeErrorMsg() << "kernels encountered a functional error"; +void DipolarBarnesHut::computeEnergy(SystemInterface &s) { + if (initialize_data_structure(s) == ES_OK) { + auto energy = reinterpret_cast(s.eGpu()); + if (energyBH(&m_bh_data, m_prefactor, &(energy->dipolar))) { + runtimeErrorMsg() << "kernels encountered a functional error"; + } } } diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 4cf76ecd5ec..4c380b39d63 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -48,6 +48,7 @@ class DipolarBarnesHut : public Actor { BHData m_bh_data = {0, 0, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; + int initialize_data_structure(SystemInterface &s); }; #endif // DIPOLAR_BARNES_HUT From 799aee08771866d28a45de760fbd80c6b0624f72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 9 Dec 2021 10:00:04 +0100 Subject: [PATCH 17/21] core: Remove unused GPU kernels --- src/core/grid_based_algorithms/lbgpu_cuda.cu | 22 ------------------- .../virtual_sites/lb_inertialess_tracers.hpp | 10 +-------- .../lb_inertialess_tracers_cuda.cu | 11 ---------- 3 files changed, 1 insertion(+), 42 deletions(-) diff --git a/src/core/grid_based_algorithms/lbgpu_cuda.cu b/src/core/grid_based_algorithms/lbgpu_cuda.cu index cc80bec200d..becd3ec8ba0 100644 --- a/src/core/grid_based_algorithms/lbgpu_cuda.cu +++ b/src/core/grid_based_algorithms/lbgpu_cuda.cu @@ -1035,28 +1035,6 @@ __device__ void calc_values_from_m(Utils::Array const &mode_single, pi_out = stress_from_stress_modes(stress_modes(d_v_single, mode_single)); } -/** Calculate temperature of the fluid kernel - * @param[out] cpu_jsquared Result - * @param[in] n_a Local node residing in array a - * @param[out] number_of_non_boundary_nodes Local node residing in array a - */ -__global__ void temperature(LB_nodes_gpu n_a, float *cpu_jsquared, - int *number_of_non_boundary_nodes) { - Utils::Array mode; - float jsquared = 0.0f; - unsigned int index = blockIdx.y * gridDim.x * blockDim.x + - blockDim.x * blockIdx.x + threadIdx.x; - - if (index < para->number_of_nodes) { - if (!n_a.boundary[index]) { - calc_mass_and_momentum_mode(mode, n_a, index); - jsquared = mode[1] * mode[1] + mode[2] * mode[2] + mode[3] * mode[3]; - atomicAdd(cpu_jsquared, jsquared); - atomicAdd(number_of_non_boundary_nodes, 1); - } - } -} - /** Interpolation kernel. 
* See @cite duenweg09a * @param u Distance to grid point in units of agrid diff --git a/src/core/virtual_sites/lb_inertialess_tracers.hpp b/src/core/virtual_sites/lb_inertialess_tracers.hpp index dbdf1b30de6..8377e8d33d4 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers.hpp +++ b/src/core/virtual_sites/lb_inertialess_tracers.hpp @@ -28,18 +28,10 @@ #include "ParticleRange.hpp" -// Main functions for CPU & GPU void IBM_UpdateParticlePositions(ParticleRange const &particles, double time_step); - -// Main functions for CPU implementation - called from integrate.cpp void IBM_ForcesIntoFluid_CPU(); - -// Main functions for GPU implementation - called from integrate.cpp -// These are in ibm_cuda.cu void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles); -void IBM_ResetLBForces_GPU(); - -#endif +#endif // VIRTUAL_SITES_INERTIALESS_TRACERS #endif diff --git a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu index 021f03373af..8fa0363d6a0 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu +++ b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu @@ -280,17 +280,6 @@ __global__ void ResetLBForces_Kernel(LB_node_force_density_gpu node_f, } } -/** Call a kernel to reset the forces on the LB nodes to the external force. */ -void IBM_ResetLBForces_GPU() { - if (this_node == 0) { - dim3 dim_grid = - calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block); - - KERNELCALL(ResetLBForces_Kernel, dim_grid, threads_per_block, node_f, - para_gpu); - } -} - /** Transfer particle forces into the LB fluid. * Called from @ref integrate. * This must be the first CUDA-IBM function to be called because it also does From a8f1403035b5beb849232a1fcd850ca69ebbd011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 9 Dec 2021 10:33:12 +0100 Subject: [PATCH 18/21] core: Hide GPU interface implementation details --- src/core/EspressoSystemInterface.cpp | 12 ++++++++++++ src/core/EspressoSystemInterface.hpp | 26 +++++++------------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp index fc09ddb1994..559b4d51830 100644 --- a/src/core/EspressoSystemInterface.cpp +++ b/src/core/EspressoSystemInterface.cpp @@ -40,6 +40,18 @@ void EspressoSystemInterface::gatherParticles() { #endif } +#ifdef CUDA +void EspressoSystemInterface::enableParticleCommunication() { + if (m_gpu) { + if (!gpu_get_global_particle_vars_pointer_host()->communication_enabled) { + gpu_init_particle_comm(); + cuda_bcast_global_part_params(); + reallocDeviceMemory(gpu_get_particle_pointer().size()); + } + } +} +#endif + void EspressoSystemInterface::init() { gatherParticles(); } void EspressoSystemInterface::update() { gatherParticles(); } diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 838520b0ae1..136ff5d0180 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -59,8 +59,7 @@ class EspressoSystemInterface : public SystemInterface { m_needsRGpu = hasRGpu(); m_splitParticleStructGpu |= m_needsRGpu; m_gpu |= m_needsRGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; #ifdef DIPOLES @@ -70,8 +69,7 @@ class EspressoSystemInterface : public SystemInterface { m_needsDipGpu = hasDipGpu(); m_splitParticleStructGpu |= m_needsRGpu; m_gpu |= m_needsRGpu; - if (m_gpu) - enableParticleCommunication(); + 
enableParticleCommunication(); }; #endif @@ -82,16 +80,14 @@ class EspressoSystemInterface : public SystemInterface { m_needsQGpu = hasQGpu(); m_splitParticleStructGpu |= m_needsQGpu; m_gpu |= m_needsQGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; #endif void requestParticleStructGpu() { m_needsParticleStructGpu = true; m_gpu |= m_needsParticleStructGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; float *fGpuBegin() override { return gpu_get_particle_force_pointer(); }; @@ -99,8 +95,7 @@ class EspressoSystemInterface : public SystemInterface { void requestFGpu() override { m_needsFGpu = hasFGpu(); m_gpu |= m_needsFGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; #ifdef ROTATION @@ -111,8 +106,7 @@ class EspressoSystemInterface : public SystemInterface { void requestTorqueGpu() override { m_needsTorqueGpu = hasTorqueGpu(); m_gpu |= m_needsTorqueGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; #endif @@ -140,13 +134,7 @@ class EspressoSystemInterface : public SystemInterface { void gatherParticles(); void split_particle_struct(); #ifdef CUDA - void enableParticleCommunication() { - if (!gpu_get_global_particle_vars_pointer_host()->communication_enabled) { - gpu_init_particle_comm(); - cuda_bcast_global_part_params(); - reallocDeviceMemory(gpu_get_particle_pointer().size()); - } - }; + void enableParticleCommunication(); void reallocDeviceMemory(std::size_t n); private: From cc7c4d6e1e5824c03edbd6a912735dc3e60e7b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 9 Dec 2021 17:50:24 +0100 Subject: [PATCH 19/21] core: Pass global variable this_node by value --- src/core/EspressoSystemInterface.cpp | 4 +- src/core/cuda_common_cuda.cu | 14 +-- src/core/cuda_interface.hpp | 6 +- .../p3m-common.cpp | 4 +- src/core/forces.cpp | 2 +- src/core/grid_based_algorithms/lbgpu.cpp | 2 +- src/core/grid_based_algorithms/lbgpu_cuda.cu | 4 - .../VirtualSitesInertialessTracers.cpp | 8 +- .../virtual_sites/lb_inertialess_tracers.cpp | 4 +- .../virtual_sites/lb_inertialess_tracers.hpp | 4 +- .../lb_inertialess_tracers_cuda.cu | 109 ++++++++---------- .../lb_inertialess_tracers_cuda_interface.hpp | 3 +- 12 files changed, 76 insertions(+), 88 deletions(-) diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp index 559b4d51830..17c7015b2ad 100644 --- a/src/core/EspressoSystemInterface.cpp +++ b/src/core/EspressoSystemInterface.cpp @@ -31,7 +31,7 @@ void EspressoSystemInterface::gatherParticles() { #ifdef CUDA if (m_gpu) { if (gpu_get_global_particle_vars_pointer_host()->communication_enabled) { - copy_part_data_to_gpu(cell_structure.local_particles()); + copy_part_data_to_gpu(cell_structure.local_particles(), this_node); reallocDeviceMemory(gpu_get_particle_pointer().size()); if (m_splitParticleStructGpu && (this_node == 0)) split_particle_struct(); @@ -44,7 +44,7 @@ void EspressoSystemInterface::gatherParticles() { void EspressoSystemInterface::enableParticleCommunication() { if (m_gpu) { if (!gpu_get_global_particle_vars_pointer_host()->communication_enabled) { - gpu_init_particle_comm(); + gpu_init_particle_comm(this_node); cuda_bcast_global_part_params(); reallocDeviceMemory(gpu_get_particle_pointer().size()); } diff --git a/src/core/cuda_common_cuda.cu b/src/core/cuda_common_cuda.cu index 2a084138012..b0e9bf38148 100644 --- a/src/core/cuda_common_cuda.cu +++ b/src/core/cuda_common_cuda.cu @@ -36,8 
+36,6 @@ #include #include -extern int this_node; - template using device_vector = thrust::device_vector>; @@ -76,10 +74,10 @@ void cuda_check_errors_exit(const dim3 &block, const dim3 &grid, cudaError_t CU_err = cudaGetLastError(); if (CU_err != cudaSuccess) { fprintf(stderr, - "%d: error \"%s\" calling %s with dim %d %d %d, grid %d %d " + "error \"%s\" calling %s with dim %d %d %d, grid %d %d " "%d in %s:%u\n", - this_node, cudaGetErrorString(CU_err), function, block.x, block.y, - block.z, grid.x, grid.y, grid.z, file, line); + cudaGetErrorString(CU_err), function, block.x, block.y, block.z, + grid.x, grid.y, grid.z, file, line); errexit(); } } @@ -131,7 +129,7 @@ void resize_buffers(std::size_t number_of_particles) { * sizeof(CUDA_global_part_vars), MPI_BYTE, 0, comm_cart)` (when executed * on all nodes) */ -void gpu_init_particle_comm() { +void gpu_init_particle_comm(int this_node) { if (this_node == 0 && global_part_vars_host.communication_enabled == 0) { try { cuda_check_device(); @@ -159,7 +157,7 @@ float *gpu_get_particle_torque_pointer() { #endif CUDA_energy *gpu_get_energy_pointer() { return energy_device; } -void copy_part_data_to_gpu(ParticleRange particles) { +void copy_part_data_to_gpu(ParticleRange particles, int this_node) { if (global_part_vars_host.communication_enabled == 1) { cuda_mpi_get_particles(particles, particle_data_host); @@ -182,7 +180,7 @@ void copy_part_data_to_gpu(ParticleRange particles) { /** setup and call kernel to copy particle forces to host */ -void copy_forces_from_GPU(ParticleRange &particles) { +void copy_forces_from_GPU(ParticleRange &particles, int this_node) { if (global_part_vars_host.communication_enabled == 1) { /* Copy result from device memory to host memory*/ if (this_node == 0 && (not particle_forces_device.empty())) { diff --git a/src/core/cuda_interface.hpp b/src/core/cuda_interface.hpp index 7844c1db3d8..af9eda408b3 100644 --- a/src/core/cuda_interface.hpp +++ b/src/core/cuda_interface.hpp @@ -102,7 +102,7 @@ struct CUDA_global_part_vars { unsigned int communication_enabled; }; -void copy_forces_from_GPU(ParticleRange &particles); +void copy_forces_from_GPU(ParticleRange &particles, int this_node); CUDA_energy copy_energy_from_GPU(); void clear_energy_on_GPU(); @@ -114,12 +114,12 @@ float *gpu_get_particle_torque_pointer(); #endif CUDA_energy *gpu_get_energy_pointer(); -void gpu_init_particle_comm(); +void gpu_init_particle_comm(int this_node); void cuda_mpi_get_particles( const ParticleRange &particles, pinned_vector &particle_data_host); -void copy_part_data_to_gpu(ParticleRange particles); +void copy_part_data_to_gpu(ParticleRange particles, int this_node); /** * @brief Distribute forces to the worker nodes, and add them to the particles. 
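// Editorial note (hedged sketch; the "after" caller is the one visible in
// the forces.cpp hunk below): passing this_node by value turns a hidden
// dependency on a global owned by communication.hpp into an explicit
// function input that is trivial to stub in unit tests:
//
//   extern int this_node;                               // before
//   void copy_forces_from_GPU(ParticleRange &particles);
//
//   void copy_forces_from_GPU(ParticleRange &particles, // after
//                             int this_node);
//   copy_forces_from_GPU(particles, this_node);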
diff --git a/src/core/electrostatics_magnetostatics/p3m-common.cpp b/src/core/electrostatics_magnetostatics/p3m-common.cpp index 6ef78228bd0..48bbcdbf376 100644 --- a/src/core/electrostatics_magnetostatics/p3m-common.cpp +++ b/src/core/electrostatics_magnetostatics/p3m-common.cpp @@ -24,6 +24,7 @@ #include "p3m-common.hpp" #if defined(P3M) || defined(DP3M) +#include "communication.hpp" #include "errorhandling.hpp" #include @@ -33,9 +34,6 @@ #include #include -/* For debug messages */ -extern int this_node; - void p3m_add_block(double const *in, double *out, int const start[3], int const size[3], int const dim[3]) { /* fast,mid and slow changing indices */ diff --git a/src/core/forces.cpp b/src/core/forces.cpp index 8dfefee3dfe..9c050768dfe 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -156,7 +156,7 @@ void force_calc(CellStructure &cell_structure, double time_step, double kT) { ghost_particles, time_step); #ifdef CUDA - copy_forces_from_GPU(particles); + copy_forces_from_GPU(particles, this_node); #endif // VIRTUAL_SITES distribute forces diff --git a/src/core/grid_based_algorithms/lbgpu.cpp b/src/core/grid_based_algorithms/lbgpu.cpp index c5de7c8d7d6..3d01fecb276 100644 --- a/src/core/grid_based_algorithms/lbgpu.cpp +++ b/src/core/grid_based_algorithms/lbgpu.cpp @@ -164,7 +164,7 @@ void lb_init_gpu() { lb_init_GPU(lbpar_gpu); - gpu_init_particle_comm(); + gpu_init_particle_comm(this_node); cuda_bcast_global_part_params(); } diff --git a/src/core/grid_based_algorithms/lbgpu_cuda.cu b/src/core/grid_based_algorithms/lbgpu_cuda.cu index becd3ec8ba0..3d80fd71a52 100644 --- a/src/core/grid_based_algorithms/lbgpu_cuda.cu +++ b/src/core/grid_based_algorithms/lbgpu_cuda.cu @@ -59,8 +59,6 @@ #include #include -extern int this_node; - /** struct for hydrodynamic fields: this is for internal use * (i.e. 
stores values in LB units) and should not be used for * printing values @@ -2151,8 +2149,6 @@ void lb_init_boundaries_GPU(std::size_t host_n_lb_boundaries, int *host_boundary_node_list, int *host_boundary_index_list, float *host_lb_boundary_velocity) { - if (this_node != 0) - return; float *boundary_velocity = nullptr; int *boundary_node_list = nullptr; diff --git a/src/core/virtual_sites/VirtualSitesInertialessTracers.cpp b/src/core/virtual_sites/VirtualSitesInertialessTracers.cpp index 567d8715f1b..fcd2396c148 100644 --- a/src/core/virtual_sites/VirtualSitesInertialessTracers.cpp +++ b/src/core/virtual_sites/VirtualSitesInertialessTracers.cpp @@ -21,6 +21,7 @@ #ifdef VIRTUAL_SITES_INERTIALESS_TRACERS #include "VirtualSitesInertialessTracers.hpp" #include "cells.hpp" +#include "communication.hpp" #include "errorhandling.hpp" #include "grid_based_algorithms/lb_interface.hpp" #include "virtual_sites/lb_inertialess_tracers.hpp" @@ -34,7 +35,7 @@ void VirtualSitesInertialessTracers::after_force_calc() { } #ifdef CUDA if (lattice_switch == ActiveLB::GPU) { - IBM_ForcesIntoFluid_GPU(cell_structure.local_particles()); + IBM_ForcesIntoFluid_GPU(cell_structure.local_particles(), this_node); return; } #endif @@ -49,7 +50,8 @@ void VirtualSitesInertialessTracers::after_force_calc() { void VirtualSitesInertialessTracers::after_lb_propagation(double time_step) { #ifdef VIRTUAL_SITES_INERTIALESS_TRACERS - IBM_UpdateParticlePositions(cell_structure.local_particles(), time_step); -#endif // VS inertialess tracers + IBM_UpdateParticlePositions(cell_structure.local_particles(), time_step, + this_node); +#endif } #endif diff --git a/src/core/virtual_sites/lb_inertialess_tracers.cpp b/src/core/virtual_sites/lb_inertialess_tracers.cpp index bce83e17163..5d006a60f78 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers.cpp +++ b/src/core/virtual_sites/lb_inertialess_tracers.cpp @@ -91,13 +91,13 @@ void IBM_ForcesIntoFluid_CPU() { * Called from the integration loop right after the LB update. 
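 * Velocities are first fetched from the active lattice
 * (ParticleVelocitiesFromLB_CPU() or ParticleVelocitiesFromLB_GPU()),
 * then the tracer positions are advanced with an explicit Euler step.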
*/ void IBM_UpdateParticlePositions(ParticleRange const &particles, - double time_step) { + double time_step, int this_node) { // Get velocities if (lattice_switch == ActiveLB::CPU) ParticleVelocitiesFromLB_CPU(); #ifdef CUDA if (lattice_switch == ActiveLB::GPU) - ParticleVelocitiesFromLB_GPU(particles); + ParticleVelocitiesFromLB_GPU(particles, this_node); #endif // Do update: Euler diff --git a/src/core/virtual_sites/lb_inertialess_tracers.hpp b/src/core/virtual_sites/lb_inertialess_tracers.hpp index 8377e8d33d4..876774553a9 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers.hpp +++ b/src/core/virtual_sites/lb_inertialess_tracers.hpp @@ -29,9 +29,9 @@ #include "ParticleRange.hpp" void IBM_UpdateParticlePositions(ParticleRange const &particles, - double time_step); + double time_step, int this_node); void IBM_ForcesIntoFluid_CPU(); -void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles); +void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles, int this_node); #endif // VIRTUAL_SITES_INERTIALESS_TRACERS #endif diff --git a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu index 8fa0363d6a0..03f4cf9476a 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu +++ b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu @@ -39,9 +39,6 @@ #include -// To avoid including communication.hpp -extern int this_node; - // Other functions for internal use void InitCUDA_IBM(std::size_t numParticles); @@ -285,7 +282,7 @@ __global__ void ResetLBForces_Kernel(LB_node_force_density_gpu node_f, * This must be the first CUDA-IBM function to be called because it also does * some initialization. */ -void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles) { +void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles, int this_node) { // This function does // (1) Gather forces from all particles via MPI // (2) Copy forces to the GPU @@ -294,8 +291,9 @@ void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles) { auto const numParticles = gpu_get_particle_pointer().size(); // Storage only needed on head node - if (IBM_ParticleDataInput_host.empty() || !IBM_initialized || - numParticles != IBM_numParticlesCache) + if (this_node == 0 && + (IBM_ParticleDataInput_host.empty() || !IBM_initialized || + numParticles != IBM_numParticlesCache)) InitCUDA_IBM(numParticles); // We gather particle positions and forces from all nodes @@ -320,70 +318,65 @@ void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles) { void InitCUDA_IBM(std::size_t const numParticles) { - // GPU only on head node - if (this_node == 0) { - - // Check if we have to delete - if (!IBM_ParticleDataInput_host.empty()) { - IBM_ParticleDataInput_host.clear(); - IBM_ParticleDataOutput_host.clear(); - cuda_safe_mem(cudaFree(IBM_ParticleDataInput_device)); - cuda_safe_mem(cudaFree(IBM_ParticleDataOutput_device)); - cuda_safe_mem(cudaFree(lb_boundary_velocity_IBM)); - } + // Check if we have to delete + if (!IBM_ParticleDataInput_host.empty()) { + IBM_ParticleDataInput_host.clear(); + IBM_ParticleDataOutput_host.clear(); + cuda_safe_mem(cudaFree(IBM_ParticleDataInput_device)); + cuda_safe_mem(cudaFree(IBM_ParticleDataOutput_device)); + cuda_safe_mem(cudaFree(lb_boundary_velocity_IBM)); + } + + // Back and forth communication of positions and velocities + IBM_ParticleDataInput_host.resize(numParticles); + IBM_ParticleDataOutput_host.resize(numParticles); + cuda_safe_mem(cudaMalloc((void **)&IBM_ParticleDataInput_device, + numParticles * 
sizeof(IBM_CUDA_ParticleDataInput))); + cuda_safe_mem(cudaMalloc((void **)&IBM_ParticleDataOutput_device, + numParticles * sizeof(IBM_CUDA_ParticleDataOutput))); - // Back and forth communication of positions and velocities - IBM_ParticleDataInput_host.resize(numParticles); - IBM_ParticleDataOutput_host.resize(numParticles); - cuda_safe_mem( - cudaMalloc((void **)&IBM_ParticleDataInput_device, - numParticles * sizeof(IBM_CUDA_ParticleDataInput))); - cuda_safe_mem( - cudaMalloc((void **)&IBM_ParticleDataOutput_device, - numParticles * sizeof(IBM_CUDA_ParticleDataOutput))); - - // Use LB parameters - lb_get_para_pointer(¶_gpu); - - // Copy boundary velocities to the GPU - // First put them into correct format + // Use LB parameters + lb_get_para_pointer(¶_gpu); + + // Copy boundary velocities to the GPU + // First put them into correct format #ifdef LB_BOUNDARIES_GPU - auto *host_lb_boundary_velocity = - new float[3 * (LBBoundaries::lbboundaries.size() + 1)]; - - for (int n = 0; n < LBBoundaries::lbboundaries.size(); n++) { - host_lb_boundary_velocity[3 * n + 0] = - static_cast(LBBoundaries::lbboundaries[n]->velocity()[0]); - host_lb_boundary_velocity[3 * n + 1] = - static_cast(LBBoundaries::lbboundaries[n]->velocity()[1]); - host_lb_boundary_velocity[3 * n + 2] = - static_cast(LBBoundaries::lbboundaries[n]->velocity()[2]); - } + auto *host_lb_boundary_velocity = + new float[3 * (LBBoundaries::lbboundaries.size() + 1)]; + + for (int n = 0; n < LBBoundaries::lbboundaries.size(); n++) { + host_lb_boundary_velocity[3 * n + 0] = + static_cast(LBBoundaries::lbboundaries[n]->velocity()[0]); + host_lb_boundary_velocity[3 * n + 1] = + static_cast(LBBoundaries::lbboundaries[n]->velocity()[1]); + host_lb_boundary_velocity[3 * n + 2] = + static_cast(LBBoundaries::lbboundaries[n]->velocity()[2]); + } - host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 0] = 0.0f; - host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 1] = 0.0f; - host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 2] = 0.0f; + host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 0] = 0.0f; + host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 1] = 0.0f; + host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 2] = 0.0f; - cuda_safe_mem( - cudaMalloc((void **)&lb_boundary_velocity_IBM, - 3 * LBBoundaries::lbboundaries.size() * sizeof(float))); - cuda_safe_mem( - cudaMemcpy(lb_boundary_velocity_IBM, host_lb_boundary_velocity, - 3 * LBBoundaries::lbboundaries.size() * sizeof(float), - cudaMemcpyHostToDevice)); + cuda_safe_mem( + cudaMalloc((void **)&lb_boundary_velocity_IBM, + 3 * LBBoundaries::lbboundaries.size() * sizeof(float))); + cuda_safe_mem( + cudaMemcpy(lb_boundary_velocity_IBM, host_lb_boundary_velocity, + 3 * LBBoundaries::lbboundaries.size() * sizeof(float), + cudaMemcpyHostToDevice)); - delete[] host_lb_boundary_velocity; + delete[] host_lb_boundary_velocity; #endif - IBM_numParticlesCache = numParticles; - IBM_initialized = true; - } + IBM_numParticlesCache = numParticles; + IBM_initialized = true; } /** Call a kernel function to interpolate the velocity at each IBM particle's * position. Store velocity in the particle data structure. 
 */
-void ParticleVelocitiesFromLB_GPU(ParticleRange const &particles) {
+void ParticleVelocitiesFromLB_GPU(ParticleRange const &particles,
+                                  int this_node) {
   // This function performs three steps:
   // (1) interpolate velocities on GPU
   // (2) transfer velocities back to CPU
diff --git a/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp b/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp
index 6baecd4d2d5..59afe0dd838 100644
--- a/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp
+++ b/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp
@@ -38,7 +38,8 @@ void IBM_cuda_mpi_send_velocities(ParticleRange const &particles);

 void IBM_cuda_mpi_get_particles(ParticleRange const &particles);

-void ParticleVelocitiesFromLB_GPU(ParticleRange const &particles);
+void ParticleVelocitiesFromLB_GPU(ParticleRange const &particles,
+                                  int this_node);

 // ******** data types for CUDA and MPI communication ******
 struct IBM_CUDA_ParticleDataInput {

From e38486cb995adeb49d81d18ac7596fd9d18aa5b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?=
Date: Fri, 10 Dec 2021 12:26:16 +0100
Subject: [PATCH 20/21] core: Improve separation of concerns

- decouple thermostats from long-range GPU methods
- narrow includes of thermostat.hpp, cells.hpp, rotation.hpp
- break cyclic dependency in bonded interactions
- move functions to different files as necessary
- pass cell_structure global as function argument as necessary
---
 .../bonded_interaction_data.hpp               |  1 -
 .../bonded_interactions/thermalized_bond.hpp  | 57 ------------
 .../thermalized_bond_kernel.hpp               | 92 +++++++++++++++++++
 .../electrostatics_magnetostatics/coulomb.cpp |  2 +-
 .../electrostatics_magnetostatics/icc.cpp     | 40 ++++----
 .../electrostatics_magnetostatics/icc.hpp     |  8 +-
 src/core/energy_inline.hpp                    |  2 +-
 src/core/forces.cpp                           | 65 ++++++++++++-
 src/core/forces.hpp                           |  7 +-
 src/core/forces_inline.hpp                    | 53 +----------
 src/core/integrate.cpp                        |  1 +
 src/core/rattle.cpp                           |  4 +
 src/core/rattle.hpp                           |  5 +-
 src/python/espressomd/particle_data.pxd       |  2 +-
 14 files changed, 192 insertions(+), 147 deletions(-)
 create mode 100644 src/core/bonded_interactions/thermalized_bond_kernel.hpp

diff --git a/src/core/bonded_interactions/bonded_interaction_data.hpp b/src/core/bonded_interactions/bonded_interaction_data.hpp
index f243eea4ba0..0b864ad8fa7 100644
--- a/src/core/bonded_interactions/bonded_interaction_data.hpp
+++ b/src/core/bonded_interactions/bonded_interaction_data.hpp
@@ -39,7 +39,6 @@
 #include "object-in-fluid/oif_global_forces_params.hpp"
 #include "object-in-fluid/oif_local_forces.hpp"
 #include "quartic.hpp"
-#include "rattle.hpp"
 #include "rigid_bond.hpp"
 #include "thermalized_bond.hpp"

diff --git a/src/core/bonded_interactions/thermalized_bond.hpp b/src/core/bonded_interactions/thermalized_bond.hpp
index e838231b84e..10c12041e85 100644
--- a/src/core/bonded_interactions/thermalized_bond.hpp
+++ b/src/core/bonded_interactions/thermalized_bond.hpp
@@ -32,14 +32,11 @@ extern int n_thermalized_bonds;

 #include "Particle.hpp"
-#include "random.hpp"
-#include "thermostat.hpp"

 #include
 #include
-#include
 #include

 /** Parameters for Thermalized bond */
 struct ThermalizedBond {
@@ -80,58 +77,4 @@ struct ThermalizedBond {
   }
 };

-/** Separately thermalizes the com and distance of a particle pair.
- * @param[in] p1 First particle.
- * @param[in] p2 Second particle.
- * @param[in] dx %Distance between the particles.
- * @return the forces on @p p1 and @p p2 - */ -inline boost::optional> -ThermalizedBond::forces(Particle const &p1, Particle const &p2, - Utils::Vector3d const &dx) const { - // Bond broke? - if (r_cut > 0.0 && dx.norm() > r_cut) { - return {}; - } - - auto const mass_tot = p1.p.mass + p2.p.mass; - auto const mass_tot_inv = 1.0 / mass_tot; - auto const sqrt_mass_tot = sqrt(mass_tot); - auto const sqrt_mass_red = sqrt(p1.p.mass * p2.p.mass / mass_tot); - auto const com_vel = mass_tot_inv * (p1.p.mass * p1.m.v + p2.p.mass * p2.m.v); - auto const dist_vel = p2.m.v - p1.m.v; - - extern ThermalizedBondThermostat thermalized_bond; - Utils::Vector3d force1{}; - Utils::Vector3d force2{}; - auto const noise = Random::noise_uniform( - thermalized_bond.rng_counter(), thermalized_bond.rng_seed(), - p1.p.identity, p2.p.identity); - - for (int i = 0; i < 3; i++) { - double force_lv_com, force_lv_dist; - - // Langevin thermostat for center of mass - if (pref2_com > 0.0) { - force_lv_com = - -pref1_com * com_vel[i] + sqrt_mass_tot * pref2_com * noise[i]; - } else { - force_lv_com = -pref1_com * com_vel[i]; - } - - // Langevin thermostat for distance p1->p2 - if (pref2_dist > 0.0) { - force_lv_dist = - -pref1_dist * dist_vel[i] + sqrt_mass_red * pref2_dist * noise[i]; - } else { - force_lv_dist = -pref1_dist * dist_vel[i]; - } - // Add forces - force1[i] = p1.p.mass * mass_tot_inv * force_lv_com - force_lv_dist; - force2[i] = p2.p.mass * mass_tot_inv * force_lv_com + force_lv_dist; - } - - return std::make_tuple(force1, force2); -} - #endif diff --git a/src/core/bonded_interactions/thermalized_bond_kernel.hpp b/src/core/bonded_interactions/thermalized_bond_kernel.hpp new file mode 100644 index 00000000000..47cc82adc49 --- /dev/null +++ b/src/core/bonded_interactions/thermalized_bond_kernel.hpp @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2010-2019 The ESPResSo project + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 + * Max-Planck-Institute for Polymer Research, Theory Group + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef THERMALIZED_DIST_KERNEL_H +#define THERMALIZED_DIST_KERNEL_H + +#include "thermalized_bond.hpp" + +#include "Particle.hpp" +#include "random.hpp" +#include "thermostat.hpp" + +#include + +#include + +#include +#include + +/** Separately thermalizes the com and distance of a particle pair. + * @param[in] p1 First particle. + * @param[in] p2 Second particle. + * @param[in] dx %Distance between the particles. + * @return the forces on @p p1 and @p p2 + */ +inline boost::optional> +ThermalizedBond::forces(Particle const &p1, Particle const &p2, + Utils::Vector3d const &dx) const { + // Bond broke? 
+ if (r_cut > 0.0 && dx.norm() > r_cut) { + return {}; + } + + auto const mass_tot = p1.p.mass + p2.p.mass; + auto const mass_tot_inv = 1.0 / mass_tot; + auto const sqrt_mass_tot = sqrt(mass_tot); + auto const sqrt_mass_red = sqrt(p1.p.mass * p2.p.mass / mass_tot); + auto const com_vel = mass_tot_inv * (p1.p.mass * p1.m.v + p2.p.mass * p2.m.v); + auto const dist_vel = p2.m.v - p1.m.v; + + extern ThermalizedBondThermostat thermalized_bond; + Utils::Vector3d force1{}; + Utils::Vector3d force2{}; + auto const noise = Random::noise_uniform( + thermalized_bond.rng_counter(), thermalized_bond.rng_seed(), + p1.p.identity, p2.p.identity); + + for (int i = 0; i < 3; i++) { + double force_lv_com, force_lv_dist; + + // Langevin thermostat for center of mass + if (pref2_com > 0.0) { + force_lv_com = + -pref1_com * com_vel[i] + sqrt_mass_tot * pref2_com * noise[i]; + } else { + force_lv_com = -pref1_com * com_vel[i]; + } + + // Langevin thermostat for distance p1->p2 + if (pref2_dist > 0.0) { + force_lv_dist = + -pref1_dist * dist_vel[i] + sqrt_mass_red * pref2_dist * noise[i]; + } else { + force_lv_dist = -pref1_dist * dist_vel[i]; + } + // Add forces + force1[i] = p1.p.mass * mass_tot_inv * force_lv_com - force_lv_dist; + force2[i] = p2.p.mass * mass_tot_inv * force_lv_com + force_lv_dist; + } + + return std::make_tuple(force1, force2); +} + +#endif diff --git a/src/core/electrostatics_magnetostatics/coulomb.cpp b/src/core/electrostatics_magnetostatics/coulomb.cpp index 525bca1eefa..f2adfd61209 100644 --- a/src/core/electrostatics_magnetostatics/coulomb.cpp +++ b/src/core/electrostatics_magnetostatics/coulomb.cpp @@ -160,7 +160,7 @@ void deactivate() { } void update_dependent_particles() { - icc_iteration(cell_structure.local_particles(), + icc_iteration(cell_structure, cell_structure.local_particles(), cell_structure.ghost_particles()); } diff --git a/src/core/electrostatics_magnetostatics/icc.cpp b/src/core/electrostatics_magnetostatics/icc.cpp index 67de472a2b4..ea2cbcabf0c 100644 --- a/src/core/electrostatics_magnetostatics/icc.cpp +++ b/src/core/electrostatics_magnetostatics/icc.cpp @@ -32,6 +32,7 @@ #include "icc.hpp" +#include "CellStructure.hpp" #include "Particle.hpp" #include "ParticleRange.hpp" #include "cells.hpp" @@ -46,6 +47,7 @@ #include +#include #include #include #include @@ -54,16 +56,14 @@ icc_struct icc_cfg; -void init_forces_icc(const ParticleRange &particles, - const ParticleRange &ghosts_particles); - /** Calculate the electrostatic forces between source charges (= real charges) * and wall charges. For each electrostatic method, the proper functions * for short- and long-range parts are called. Long-range parts are calculated * directly, short-range parts need helper functions according to the particle * data organisation. This is a modified version of \ref force_calc. 
*/ -void force_calc_icc(const ParticleRange &particles, +void force_calc_icc(CellStructure &cell_structure, + const ParticleRange &particles, const ParticleRange &ghost_particles); /** Variant of @ref add_non_bonded_pair_force where only %Coulomb @@ -81,7 +81,9 @@ inline void add_non_bonded_pair_force_icc(Particle &p1, Particle &p2, p2.f.f += std::get<2>(forces); #endif } -void icc_iteration(const ParticleRange &particles, + +void icc_iteration(CellStructure &cell_structure, + const ParticleRange &particles, const ParticleRange &ghost_particles) { if (icc_cfg.n_icc == 0) return; @@ -96,8 +98,8 @@ void icc_iteration(const ParticleRange &particles, for (int j = 0; j < icc_cfg.num_iteration; j++) { double charge_density_max = 0.; - force_calc_icc(particles, ghost_particles); /* Calculate electrostatic - forces (SR+LR) excluding source source interaction*/ + // calculate electrostatic forces (SR+LR) excluding self-interactions + force_calc_icc(cell_structure, particles, ghost_particles); cell_structure.ghosts_reduce_forces(); double diff = 0; @@ -177,30 +179,26 @@ void icc_iteration(const ParticleRange &particles, on_particle_charge_change(); } -void force_calc_icc(const ParticleRange &particles, +void force_calc_icc(CellStructure &cell_structure, + const ParticleRange &particles, const ParticleRange &ghost_particles) { - init_forces_icc(particles, ghost_particles); + // reset forces + for (auto &p : particles) { + p.f.f = {}; + } + for (auto &p : ghost_particles) { + p.f.f = {}; + } + // calc ICC forces cell_structure.non_bonded_loop( [](Particle &p1, Particle &p2, Distance const &d) { - /* calc non-bonded interactions */ add_non_bonded_pair_force_icc(p1, p2, d.vec21, sqrt(d.dist2), d.dist2); }); Coulomb::calc_long_range_force(particles); } -void init_forces_icc(const ParticleRange &particles, - const ParticleRange &ghosts_particles) { - for (auto &p : particles) { - p.f.f = {}; - } - - for (auto &p : ghosts_particles) { - p.f.f = {}; - } -} - void mpi_icc_init_local(const icc_struct &icc_cfg_) { icc_cfg = icc_cfg_; diff --git a/src/core/electrostatics_magnetostatics/icc.hpp b/src/core/electrostatics_magnetostatics/icc.hpp index 358cbb46586..05c351ccc6f 100644 --- a/src/core/electrostatics_magnetostatics/icc.hpp +++ b/src/core/electrostatics_magnetostatics/icc.hpp @@ -48,10 +48,11 @@ #if defined(ELECTROSTATICS) -#include +#include "CellStructure.hpp" +#include "ParticleRange.hpp" + #include -#include #include /** ICC data structure */ @@ -104,7 +105,8 @@ extern icc_struct icc_cfg; /** The main iterative scheme, where the surface element charges are calculated * self-consistently. */ -void icc_iteration(const ParticleRange &particles, +void icc_iteration(CellStructure &cell_structure, + const ParticleRange &particles, const ParticleRange &ghost_particles); /** Perform ICC initialization. 
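// Editorial note (hedged; the call below is quoted from the coulomb.cpp
// hunk above): passing the cell structure explicitly replaces the implicit
// coupling to a global and makes the data flow visible at the call site:
//
//   icc_iteration(cell_structure, cell_structure.local_particles(),
//                 cell_structure.ghost_particles());
//
// The same dependency-injection pattern is applied to force_calc_icc().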
diff --git a/src/core/energy_inline.hpp b/src/core/energy_inline.hpp index cd1026ac61d..41cc2fa565b 100644 --- a/src/core/energy_inline.hpp +++ b/src/core/energy_inline.hpp @@ -57,7 +57,7 @@ #include "Observable_stat.hpp" #include "Particle.hpp" -#include "cells.hpp" +#include "bond_error.hpp" #include "errorhandling.hpp" #include "exclusions.hpp" diff --git a/src/core/forces.cpp b/src/core/forces.cpp index 9c050768dfe..d0758d36e2a 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -27,6 +27,7 @@ #include "EspressoSystemInterface.hpp" +#include "cells.hpp" #include "collision.hpp" #include "comfixed_global.hpp" #include "communication.hpp" @@ -45,7 +46,10 @@ #include "nonbonded_interactions/VerletCriterion.hpp" #include "nonbonded_interactions/nonbonded_interaction_data.hpp" #include "npt.hpp" +#include "rotation.hpp" #include "short_range_loop.hpp" +#include "thermostat.hpp" +#include "thermostats/langevin_inline.hpp" #include "virtual_sites.hpp" #include @@ -54,7 +58,58 @@ ActorList forceActors; -void init_forces(const ParticleRange &particles, double time_step, double kT) { +/** Initialize the forces for a ghost particle */ +inline ParticleForce init_ghost_force(Particle const &) { return {}; } + +/** External particle forces */ +inline ParticleForce external_force(Particle const &p) { + ParticleForce f = {}; + +#ifdef EXTERNAL_FORCES + f.f += p.p.ext_force; +#ifdef ROTATION + f.torque += p.p.ext_torque; +#endif +#endif + +#ifdef ENGINE + // apply a swimming force in the direction of + // the particle's orientation axis + if (p.p.swim.swimming) { + f.f += p.p.swim.f_swim * p.r.calc_director(); + } +#endif + + return f; +} + +inline ParticleForce thermostat_force(Particle const &p, double time_step, + double kT) { + extern LangevinThermostat langevin; + if (!(thermo_switch & THERMO_LANGEVIN)) { + return {}; + } + +#ifdef ROTATION + return {friction_thermo_langevin(langevin, p, time_step, kT), + p.p.rotation ? 
convert_vector_body_to_space( + p, friction_thermo_langevin_rotation( + langevin, p, time_step, kT)) + : Utils::Vector3d{}}; +#else + return friction_thermo_langevin(langevin, p, time_step, kT); +#endif +} + +/** Initialize the forces for a real particle */ +inline ParticleForce init_real_particle_force(Particle const &p, + double time_step, double kT) { + return thermostat_force(p, time_step, kT) + external_force(p); +} + +static void init_forces(const ParticleRange &particles, + const ParticleRange &ghost_particles, double time_step, + double kT) { ESPRESSO_PROFILER_CXX_MARK_FUNCTION; /* The force initialization depends on the used thermostat and the thermodynamic ensemble */ @@ -68,13 +123,13 @@ void init_forces(const ParticleRange &particles, double time_step, double kT) { set torque to zero for all and rescale quaternions */ for (auto &p : particles) { - p.f = init_local_particle_force(p, time_step, kT); + p.f = init_real_particle_force(p, time_step, kT); } /* initialize ghost forces with zero set torque to zero for all and rescale quaternions */ - for (auto &p : cell_structure.ghost_particles()) { + for (auto &p : ghost_particles) { p.f = init_ghost_force(p); } } @@ -98,9 +153,9 @@ void force_calc(CellStructure &cell_structure, double time_step, double kT) { auto particles = cell_structure.local_particles(); auto ghost_particles = cell_structure.ghost_particles(); #ifdef ELECTROSTATICS - icc_iteration(particles, cell_structure.ghost_particles()); + icc_iteration(cell_structure, particles, ghost_particles); #endif - init_forces(particles, time_step, kT); + init_forces(particles, ghost_particles, time_step, kT); for (auto &forceActor : forceActors) { forceActor->computeForces(espresso_system); diff --git a/src/core/forces.hpp b/src/core/forces.hpp index 299456f5027..21166dff9e5 100644 --- a/src/core/forces.hpp +++ b/src/core/forces.hpp @@ -30,11 +30,12 @@ * Implementation in forces.cpp. 
*/ +#include "CellStructure.hpp" +#include "ParticleRange.hpp" #include "actor/Actor.hpp" #include "actor/ActorList.hpp" -#include "bonded_interactions/bonded_interaction_data.hpp" -#include "cells.hpp" -#include "nonbonded_interactions/nonbonded_interaction_data.hpp" + +#include extern ActorList forceActors; diff --git a/src/core/forces_inline.hpp b/src/core/forces_inline.hpp index 815424897e5..d5758e7b90b 100644 --- a/src/core/forces_inline.hpp +++ b/src/core/forces_inline.hpp @@ -26,6 +26,7 @@ #include "forces.hpp" #include "bonded_interactions/bonded_interaction_data.hpp" +#include "bonded_interactions/thermalized_bond_kernel.hpp" #include "immersed_boundary/ibm_tribend.hpp" #include "immersed_boundary/ibm_triel.hpp" #include "nonbonded_interactions/bmhtf-nacl.hpp" @@ -61,11 +62,10 @@ #endif #include "Particle.hpp" +#include "bond_error.hpp" #include "errorhandling.hpp" #include "exclusions.hpp" -#include "rotation.hpp" #include "thermostat.hpp" -#include "thermostats/langevin_inline.hpp" #include #include @@ -75,55 +75,6 @@ #include -/** Initialize the forces for a ghost particle */ -inline ParticleForce init_ghost_force(Particle const &) { return {}; } - -/** External particle forces */ -inline ParticleForce external_force(Particle const &p) { - ParticleForce f = {}; - -#ifdef EXTERNAL_FORCES - f.f += p.p.ext_force; -#ifdef ROTATION - f.torque += p.p.ext_torque; -#endif -#endif - -#ifdef ENGINE - // apply a swimming force in the direction of - // the particle's orientation axis - if (p.p.swim.swimming) { - f.f += p.p.swim.f_swim * p.r.calc_director(); - } -#endif - - return f; -} - -inline ParticleForce thermostat_force(Particle const &p, double time_step, - double kT) { - extern LangevinThermostat langevin; - if (!(thermo_switch & THERMO_LANGEVIN)) { - return {}; - } - -#ifdef ROTATION - return {friction_thermo_langevin(langevin, p, time_step, kT), - p.p.rotation ? convert_vector_body_to_space( - p, friction_thermo_langevin_rotation( - langevin, p, time_step, kT)) - : Utils::Vector3d{}}; -#else - return friction_thermo_langevin(langevin, p, time_step, kT); -#endif -} - -/** Initialize the forces for a real particle */ -inline ParticleForce init_local_particle_force(Particle const &part, - double time_step, double kT) { - return thermostat_force(part, time_step, kT) + external_force(part); -} - inline ParticleForce calc_non_bonded_pair_force(Particle const &p1, Particle const &p2, IA_parameters const &ia_params, diff --git a/src/core/integrate.cpp b/src/core/integrate.cpp index 011ae63ec9b..799f9285be6 100644 --- a/src/core/integrate.cpp +++ b/src/core/integrate.cpp @@ -35,6 +35,7 @@ #include "ParticleRange.hpp" #include "accumulators.hpp" +#include "bonded_interactions/rigid_bond.hpp" #include "cells.hpp" #include "collision.hpp" #include "communication.hpp" diff --git a/src/core/rattle.cpp b/src/core/rattle.cpp index 497cdc52093..e141405167c 100644 --- a/src/core/rattle.cpp +++ b/src/core/rattle.cpp @@ -25,6 +25,10 @@ #include "CellStructure.hpp" #include "Particle.hpp" +#include "ParticleRange.hpp" +#include "bonded_interactions/bonded_interaction_data.hpp" +#include "bonded_interactions/rigid_bond.hpp" +#include "cells.hpp" #include "communication.hpp" #include "errorhandling.hpp" #include "grid.hpp" diff --git a/src/core/rattle.hpp b/src/core/rattle.hpp index b35d9dca9dd..a5dd83b2e62 100644 --- a/src/core/rattle.hpp +++ b/src/core/rattle.hpp @@ -27,13 +27,12 @@ * For more information see \ref rattle.cpp. 
 */
-#include "bonded_interactions/bonded_interaction_data.hpp"
-#include "bonded_interactions/rigid_bond.hpp"
-#include "cells.hpp"
 #include "config.hpp"
 
 #ifdef BOND_CONSTRAINT
 
+#include "CellStructure.hpp"
+
 /** Transfer the current particle positions from @ref ParticlePosition::p
  *  "Particle::r::p" to @ref ParticlePosition::p_last_timestep
  *  "Particle::r::p_last_timestep"
diff --git a/src/python/espressomd/particle_data.pxd b/src/python/espressomd/particle_data.pxd
index 7e1785bf9bb..0b095a27063 100644
--- a/src/python/espressomd/particle_data.pxd
+++ b/src/python/espressomd/particle_data.pxd
@@ -202,7 +202,7 @@ cdef extern from "rotation.hpp":
     Vector3d convert_vector_space_to_body(const particle & p, const Vector3d & v)
     void rotate_particle(int pid, const Vector3d & axis, double angle)
 
-cdef extern from "rattle.hpp":
+cdef extern from "bonded_interactions/rigid_bond.hpp":
     extern int n_rigidbonds
 
 cdef class ParticleHandle:

From aae26c92c2a694b09ea4c841a2cefd8fe35930ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?=
Date: Thu, 23 Dec 2021 14:20:14 +0100
Subject: [PATCH 21/21] core: Apply suggested changes

Co-authored-by: Alexander Reinauer <38552369+reinaual@users.noreply.github.com>
---
 src/core/EspressoSystemInterface.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp
index 136ff5d0180..ef5b27e973d 100644
--- a/src/core/EspressoSystemInterface.hpp
+++ b/src/core/EspressoSystemInterface.hpp
@@ -111,7 +111,7 @@ class EspressoSystemInterface : public SystemInterface {
 #endif
 
   float *eGpu() override {
-    // cast pointer to struct of floats to array of floats
+    // cast pointer from struct of floats to array of floats
     // https://stackoverflow.com/a/29278260
     return reinterpret_cast<float *>(gpu_get_energy_pointer());
   };
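As a closing remark on the last hunk: the corrected comment in eGpu() documents the reinterpretation of a struct whose members are all floats as a contiguous float array (see the linked Stack Overflow answer). A minimal sketch of that pattern follows; the EnergyValues struct and its field names are invented for illustration and do not reflect the actual GPU energy struct.

#include <type_traits>

// Illustrative stand-in for a struct of float energy contributions.
struct EnergyValues {
  float bonded, non_bonded, coulomb, dipolar;
};

static_assert(std::is_standard_layout<EnergyValues>::value,
              "the cast requires a standard-layout type");
static_assert(sizeof(EnergyValues) == 4 * sizeof(float),
              "the cast requires no padding between the float members");

// View the struct as a float array, as eGpu() does:
// https://stackoverflow.com/a/29278260
inline float *as_float_array(EnergyValues *e) {
  return reinterpret_cast<float *>(e);
}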