From 60b4f0ebd6fd5f569f2c8e3f14a8dbc391f28506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 1 Dec 2021 18:24:51 +0100 Subject: [PATCH 01/21] core: Cleanup globals and remove unnecessary macros --- src/core/EspressoSystemInterface.hpp | 5 ++-- src/core/actor/DipolarBarnesHut.cpp | 6 ++-- src/core/actor/DipolarBarnesHut.hpp | 2 -- src/core/actor/DipolarBarnesHut_cuda.cu | 5 +--- src/core/actor/DipolarDirectSum.cpp | 4 +-- src/core/actor/DipolarDirectSum.hpp | 2 -- src/core/actor/DipolarDirectSum_cuda.cu | 22 +++++++------- src/core/cuda_interface.cpp | 2 +- .../p3m_gpu_cuda.cu | 29 +++++++++---------- src/core/energy.cpp | 5 ++-- src/core/forces.cpp | 7 +++-- 11 files changed, 41 insertions(+), 48 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 4f00171452a..cdcad3db7f8 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -24,9 +24,6 @@ #include -/* Syntactic sugar */ -#define espressoSystemInterface EspressoSystemInterface::Instance() - class EspressoSystemInterface : public SystemInterface { public: static EspressoSystemInterface &Instance() { @@ -83,6 +80,7 @@ class EspressoSystemInterface : public SystemInterface { return m_needsVGpu; }; +#ifdef ELECTROSTATICS float *qGpuBegin() override { return m_q_gpu_begin; }; float *qGpuEnd() override { return m_q_gpu_end; }; bool hasQGpu() override { return true; }; @@ -94,6 +92,7 @@ class EspressoSystemInterface : public SystemInterface { enableParticleCommunication(); return m_needsQGpu; }; +#endif float *directorGpuBegin() override { return m_director_gpu_begin; }; float *directorGpuEnd() override { return m_director_gpu_end; }; diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 9040cf02705..ce5fd05b6f6 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -31,15 +31,15 @@ #include -std::unique_ptr dipolarBarnesHut; +static std::unique_ptr dipolarBarnesHut; void activate_dipolar_barnes_hut(float epssq, float itolsq) { // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = DIPOLAR_BH_GPU; mpi_bcast_coulomb_params(); - dipolarBarnesHut = std::make_unique(espressoSystemInterface, - epssq, itolsq); + dipolarBarnesHut = std::make_unique( + EspressoSystemInterface::Instance(), epssq, itolsq); forceActors.push_back(dipolarBarnesHut.get()); energyActors.push_back(dipolarBarnesHut.get()); } diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 8631e8c9eb6..98364fde674 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -100,8 +100,6 @@ class DipolarBarnesHut : public Actor { void activate_dipolar_barnes_hut(float epssq, float itolsq); void deactivate_dipolar_barnes_hut(); -extern std::unique_ptr dipolarBarnesHut; - #endif #endif // DIPOLAR_BARNES_HUT diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index f2e2185b896..8f0f11ba330 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -36,8 +36,6 @@ #include -#define IND (blockDim.x * blockIdx.x + threadIdx.x) - // Method performance/accuracy parameters __constant__ float epssqd[1], itolsqd[1]; // blkcntd is a factual blocks' count. 
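The `#define IND` removal above is the pattern this patch applies throughout: a function-like macro becomes a typed `auto const` local (or a `__device__` helper), so the index obeys normal scoping and type checking instead of textual substitution. A minimal sketch of the idiom; the kernel name and body are illustrative only, not part of the patch:

// Before: untyped textual substitution, visible in every translation unit.
#define IND (blockDim.x * blockIdx.x + threadIdx.x)

// After: a typed local computed once per thread (unsigned int here).
__global__ void scale_kernel(float *data, unsigned int n) {
  auto const ind = blockDim.x * blockIdx.x + threadIdx.x;
  if (ind < n)
    data[ind] *= 2.0f;
}
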
@@ -73,8 +71,7 @@ __device__ void dds_sumReduction_BH(float *input, float *sum) { /******************************************************************************/ __global__ void initializationKernel() { - int ind; - ind = IND; + auto const ind = blockDim.x * blockIdx.x + threadIdx.x; if (ind == 0) { *bhpara->err = 0; *bhpara->max_lps = 0; diff --git a/src/core/actor/DipolarDirectSum.cpp b/src/core/actor/DipolarDirectSum.cpp index 55cb6858819..2ca0de24bb6 100644 --- a/src/core/actor/DipolarDirectSum.cpp +++ b/src/core/actor/DipolarDirectSum.cpp @@ -30,7 +30,7 @@ #include -std::unique_ptr dipolarDirectSum; +static std::unique_ptr dipolarDirectSum; void activate_dipolar_direct_sum_gpu() { // also necessary on 1 CPU or GPU, does more than just broadcasting @@ -38,7 +38,7 @@ void activate_dipolar_direct_sum_gpu() { mpi_bcast_coulomb_params(); dipolarDirectSum = - std::make_unique(espressoSystemInterface); + std::make_unique(EspressoSystemInterface::Instance()); forceActors.push_back(dipolarDirectSum.get()); energyActors.push_back(dipolarDirectSum.get()); } diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 0e9dd8d505b..aa41cdcde58 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -84,8 +84,6 @@ class DipolarDirectSum : public Actor { void activate_dipolar_direct_sum_gpu(); void deactivate_dipolar_direct_sum_gpu(); -extern std::unique_ptr dipolarDirectSum; - #endif #endif diff --git a/src/core/actor/DipolarDirectSum_cuda.cu b/src/core/actor/DipolarDirectSum_cuda.cu index 02df0d4e9e7..0182c140460 100644 --- a/src/core/actor/DipolarDirectSum_cuda.cu +++ b/src/core/actor/DipolarDirectSum_cuda.cu @@ -32,6 +32,10 @@ #error CU-file includes mpi.h! This should not happen! #endif +__device__ inline float scalar_product(float const *a, float const *b) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; +} + __device__ inline void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3]) { @@ -42,8 +46,6 @@ __device__ inline void get_mi_vector_dds(float res[3], float const a[3], } } -#define scalar(a, b) ((a)[0] * (b)[0] + (a)[1] * (b)[1] + (a)[2] * (b)[2]) - __device__ void dipole_ia_force(int id, float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float *f1, float *torque1, @@ -69,7 +71,7 @@ __device__ void dipole_ia_force(int id, float pf, float const *r1, get_mi_vector_dds(dr, _r1, _r2, box_l, periodic); // Powers of distance - r_sq = scalar(dr, dr); + r_sq = scalar_product(dr, dr); r_sq_inv = 1 / r_sq; r_inv = rsqrtf(r_sq); r3_inv = 1 / r_sq * r_inv; @@ -77,9 +79,9 @@ __device__ void dipole_ia_force(int id, float pf, float const *r1, r7_inv = r5_inv * r_sq_inv; // Dot products - pe1 = scalar(_dip1, _dip2); - pe2 = scalar(_dip1, dr); - pe3 = scalar(_dip2, dr); + pe1 = scalar_product(_dip1, _dip2); + pe2 = scalar_product(_dip1, dr); + pe3 = scalar_product(_dip2, dr); pe4 = 3.0f * r5_inv; // Force @@ -136,16 +138,16 @@ __device__ float dipole_ia_energy(int id, float pf, float const *r1, get_mi_vector_dds(dr, _r1, _r2, box_l, periodic); // Powers of distance - r_sq = scalar(dr, dr); + r_sq = scalar_product(dr, dr); r_sq_inv = 1 / r_sq; r_inv = rsqrtf(r_sq); r3_inv = 1 / r_sq * r_inv; r5_inv = r3_inv * r_sq_inv; // Dot products - pe1 = scalar(_dip1, _dip2); - pe2 = scalar(_dip1, dr); - pe3 = scalar(_dip2, dr); + pe1 = scalar_product(_dip1, _dip2); + pe2 = scalar_product(_dip1, dr); + pe3 = scalar_product(_dip2, dr); pe4 = 3.0f * r5_inv; 
// Energy diff --git a/src/core/cuda_interface.cpp b/src/core/cuda_interface.cpp index 7d8353efe68..8b5b87d9927 100644 --- a/src/core/cuda_interface.cpp +++ b/src/core/cuda_interface.cpp @@ -166,7 +166,7 @@ void cuda_mpi_send_forces(const ParticleRange &particles, static void cuda_bcast_global_part_params_local() { MPI_Bcast(gpu_get_global_particle_vars_pointer_host(), sizeof(CUDA_global_part_vars), MPI_BYTE, 0, comm_cart); - espressoSystemInterface.requestParticleStructGpu(); + EspressoSystemInterface::Instance().requestParticleStructGpu(); } REGISTER_CALLBACK(cuda_bcast_global_part_params_local) diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu index 1df4f265478..5b3c3c1988f 100644 --- a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu @@ -51,7 +51,6 @@ #include "electrostatics_magnetostatics/p3m_gpu.hpp" -#include "BoxGeometry.hpp" #include "EspressoSystemInterface.hpp" #include "cuda_interface.hpp" #include "cuda_utils.cuh" @@ -76,8 +75,6 @@ using Utils::int_pow; using Utils::sqr; -extern BoxGeometry box_geo; - struct P3MGpuData { /** Charge mesh */ FFT_TYPE_COMPLEX *charge_mesh; @@ -115,7 +112,7 @@ struct p3m_gpu_fft_plans_t { cufftHandle back_plan; } p3m_gpu_fft_plans; -static char p3m_gpu_data_initialized = 0; +static bool p3m_gpu_data_initialized = false; template __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, @@ -562,33 +559,34 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { if (mesh[0] == -1 && mesh[1] == -1 && mesh[2] == -1) throw std::runtime_error("P3M: invalid mesh size"); - espressoSystemInterface.requestParticleStructGpu(); + auto &espresso_system = EspressoSystemInterface::Instance(); + espresso_system.requestParticleStructGpu(); bool reinit_if = false, mesh_changed = false; p3m_gpu_data.n_part = gpu_get_particle_pointer().size(); - if ((p3m_gpu_data_initialized == 0) || (p3m_gpu_data.alpha != alpha)) { + if (!p3m_gpu_data_initialized || p3m_gpu_data.alpha != alpha) { p3m_gpu_data.alpha = static_cast(alpha); reinit_if = true; } - if ((p3m_gpu_data_initialized == 0) || (p3m_gpu_data.cao != cao)) { + if (!p3m_gpu_data_initialized || p3m_gpu_data.cao != cao) { p3m_gpu_data.cao = cao; // NOLINTNEXTLINE(bugprone-integer-division) p3m_gpu_data.pos_shift = static_cast((p3m_gpu_data.cao - 1) / 2); reinit_if = true; } - if ((p3m_gpu_data_initialized == 0) || (p3m_gpu_data.mesh[0] != mesh[0]) || + if (!p3m_gpu_data_initialized || (p3m_gpu_data.mesh[0] != mesh[0]) || (p3m_gpu_data.mesh[1] != mesh[1]) || (p3m_gpu_data.mesh[2] != mesh[2])) { std::copy(mesh, mesh + 3, p3m_gpu_data.mesh); mesh_changed = true; reinit_if = true; } - auto const box_l = box_geo.length(); + auto const box_l = espresso_system.box(); - if ((p3m_gpu_data_initialized == 0) || (p3m_gpu_data.box[0] != box_l[0]) || + if (!p3m_gpu_data_initialized || (p3m_gpu_data.box[0] != box_l[0]) || (p3m_gpu_data.box[1] != box_l[1]) || (p3m_gpu_data.box[2] != box_l[2])) { std::copy(box_l.begin(), box_l.end(), p3m_gpu_data.box); reinit_if = true; @@ -602,7 +600,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { static_cast(p3m_gpu_data.mesh[i]) / p3m_gpu_data.box[i]; } - if ((p3m_gpu_data_initialized == 1) && mesh_changed) { + if (p3m_gpu_data_initialized && mesh_changed) { cuda_safe_mem(cudaFree(p3m_gpu_data.charge_mesh)); p3m_gpu_data.charge_mesh = nullptr; cuda_safe_mem(cudaFree(p3m_gpu_data.force_mesh_x)); @@ -617,10 +615,10 
@@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { cufftDestroy(p3m_gpu_fft_plans.forw_plan); cufftDestroy(p3m_gpu_fft_plans.back_plan); - p3m_gpu_data_initialized = 0; + p3m_gpu_data_initialized = false; } - if ((p3m_gpu_data_initialized == 0) && (p3m_gpu_data.mesh_size > 0)) { + if (!p3m_gpu_data_initialized && p3m_gpu_data.mesh_size > 0) { /* Size of the complex mesh Nx * Ny * ( Nz / 2 + 1 ) */ const int cmesh_size = p3m_gpu_data.mesh[0] * p3m_gpu_data.mesh[1] * (p3m_gpu_data.mesh[2] / 2 + 1); @@ -644,8 +642,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { } } - if ((reinit_if || (p3m_gpu_data_initialized == 0)) && - (p3m_gpu_data.mesh_size > 0)) { + if ((reinit_if or !p3m_gpu_data_initialized) && p3m_gpu_data.mesh_size > 0) { dim3 grid(1, 1, 1); dim3 block(1, 1, 1); block.x = 512 / mesh[0] + 1; @@ -686,7 +683,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { } } if (p3m_gpu_data.mesh_size > 0) - p3m_gpu_data_initialized = 1; + p3m_gpu_data_initialized = true; } /** diff --git a/src/core/energy.cpp b/src/core/energy.cpp index ffce964d177..b9f9a6b5e26 100644 --- a/src/core/energy.cpp +++ b/src/core/energy.cpp @@ -57,11 +57,12 @@ static std::shared_ptr calculate_energy_local() { clear_energy_on_GPU(); #endif - EspressoSystemInterface::Instance().update(); + auto &espresso_system = EspressoSystemInterface::Instance(); + espresso_system.update(); // Compute the energies from the energyActors for (auto &energyActor : energyActors) - energyActor->computeEnergy(espressoSystemInterface); + energyActor->computeEnergy(espresso_system); on_observable_calc(); diff --git a/src/core/forces.cpp b/src/core/forces.cpp index f054ba60b7a..8dfefee3dfe 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -88,7 +88,8 @@ void init_forces_ghosts(const ParticleRange &particles) { void force_calc(CellStructure &cell_structure, double time_step, double kT) { ESPRESSO_PROFILER_CXX_MARK_FUNCTION; - espressoSystemInterface.update(); + auto &espresso_system = EspressoSystemInterface::Instance(); + espresso_system.update(); #ifdef COLLISION_DETECTION prepare_local_collision_queue(); @@ -102,9 +103,9 @@ void force_calc(CellStructure &cell_structure, double time_step, double kT) { init_forces(particles, time_step, kT); for (auto &forceActor : forceActors) { - forceActor->computeForces(espressoSystemInterface); + forceActor->computeForces(espresso_system); #ifdef ROTATION - forceActor->computeTorques(espressoSystemInterface); + forceActor->computeTorques(espresso_system); #endif } From 7104cb2fa40be29505d6530cd1d1d7ca2f572a4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 1 Dec 2021 21:46:18 +0100 Subject: [PATCH 02/21] core: Modernize CUDA code Use `auto const`, reduce scope of local variables, reduce code duplication, simplify expressions (e.g. vector products), fix conversion warnings. 
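The "simplify expressions (e.g. vector products)" item replaces hand-unrolled cross products with a reusable `__device__` helper, shown in context in the `DipolarDirectSum_cuda.cu` hunks below. A condensed before/after sketch (component names abbreviated from the actual diff):

// Before: each cross product spelled out into six per-component locals.
float ax = dip1[1] * dip2[2] - dip2[1] * dip1[2];
float ay = dip2[0] * dip1[2] - dip1[0] * dip2[2];
float az = dip1[0] * dip2[1] - dip2[0] * dip1[1];

// After: one helper shared by all torque terms.
__device__ inline void vector_product(float const *a, float const *b,
                                      float *out) {
  out[0] = a[1] * b[2] - a[2] * b[1];
  out[1] = a[2] * b[0] - a[0] * b[2];
  out[2] = a[0] * b[1] - a[1] * b[0];
}

float a[3];
vector_product(dip1, dip2, a); // replaces ax, ay, az
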
--- src/core/actor/DipolarBarnesHut_cuda.cu | 228 ++++++++---------- src/core/actor/DipolarDirectSum.hpp | 14 +- src/core/actor/DipolarDirectSum_cuda.cu | 201 +++++++-------- .../p3m-dipolar.hpp | 6 - .../p3m_gpu_cuda.cu | 205 ++++++++-------- .../p3m_gpu_error_cuda.cu | 119 ++++----- 6 files changed, 348 insertions(+), 425 deletions(-) diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index 8f0f11ba330..baadc95522e 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -85,13 +85,11 @@ __global__ void initializationKernel() { /******************************************************************************/ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { - int i, j, k, l, t, inc, n_blocks; - float val; // min/max positions per the thread: float minp[3], maxp[3]; // min/max positions per block: __shared__ float smin[3 * THREADS1], smax[3 * THREADS1]; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { minp[l] = maxp[l] = bhpara->r[l]; } @@ -101,24 +99,24 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { // inc = [number of blocks: gridDim.x] * [THREADS1 per block within given // kernel]. Hence, this approach could handle an infinite number of bodies // (particles) - i = static_cast(threadIdx.x); - inc = THREADS1 * gridDim.x; + auto i = static_cast(threadIdx.x); + int const inc = THREADS1 * static_cast(gridDim.x); // j is an absolute index of the particle. // It is shifted over a count of the passed block threads behind: blockIdx.x * // THREADS1. NOTE: this loop is extrema search among all particles of the // given thread in the present block. However, one is not among all threads of // this block. - for (j = i + static_cast(blockIdx.x) * THREADS1; j < bhpara->nbodies; - j += inc) - for (l = 0; l < 3; l++) { - val = bhpara->r[3 * j + l]; + for (auto j = i + static_cast(blockIdx.x) * THREADS1; + j < bhpara->nbodies; j += inc) + for (int l = 0; l < 3; l++) { + auto const val = bhpara->r[3 * j + l]; minp[l] = min(minp[l], val); maxp[l] = max(maxp[l], val); } // For a start point of a reduction in the given block shared memory // of the i-th thread extrema: - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { smin[3 * i + l] = minp[l]; smax[3 * i + l] = maxp[l]; } @@ -130,11 +128,11 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { // the thread with a lower index will operate longer and // final result (here: the shared memory extrema) // is flowing down towards zero thread. - for (t = THREADS1 / 2; t > 0; t /= 2) { + for (int t = THREADS1 / 2; t > 0; t /= 2) { __syncthreads(); if (i < t) { - k = i + t; - for (l = 0; l < 3; l++) { + auto k = i + t; + for (int l = 0; l < 3; l++) { smin[3 * i + l] = minp[l] = min(minp[l], smin[3 * k + l]); smax[3 * i + l] = maxp[l] = max(maxp[l], smax[3 * k + l]); // very last minp/maxp assignment will be made by zero thread (i == 0) @@ -147,8 +145,8 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { // and other per-block operations. 
if (i == 0) { // per k-th block - k = static_cast(blockIdx.x); - for (l = 0; l < 3; l++) { + auto k = static_cast(blockIdx.x); + for (int l = 0; l < 3; l++) { // global memory storage of the per-block extrema bhpara->minp[3 * k + l] = minp[l]; bhpara->maxp[3 * k + l] = maxp[l]; @@ -156,19 +154,19 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { // contain de facto already reduced (see above) shared extrema smin/smax } - n_blocks = static_cast(gridDim.x) - 1; + auto const n_blocks = static_cast(gridDim.x) - 1; // The block increment is performing by its zero thread. if (n_blocks == atomicInc((unsigned int *)&blkcntd, n_blocks)) { // I'm the (randomly) last block, so combine all other blocks' results // over the index j: - for (j = 0; j <= n_blocks; j++) - for (l = 0; l < 3; l++) { - minp[l] = min(minp[l], bhpara->minp[3 * j + l]); - maxp[l] = max(maxp[l], bhpara->maxp[3 * j + l]); + for (int im = 0; im <= n_blocks; im++) + for (int l = 0; l < 3; l++) { + minp[l] = min(minp[l], bhpara->minp[3 * im + l]); + maxp[l] = max(maxp[l], bhpara->maxp[3 * im + l]); } // Compute 'radius': - val = max(maxp[0] - minp[0], maxp[1] - minp[1]); + auto const val = max(maxp[0] - minp[0], maxp[1] - minp[1]); radiusd = max(val, maxp[2] - minp[2]) * 0.5f; // NOTE: now the extrema are global. @@ -187,7 +185,7 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { bhpara->start[k] = 0; // Position of the root node should be in the center of just defined BH // box: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) bhpara->r[3 * k + l] = (minp[l] + maxp[l]) * 0.5f; // Init further tree building octo- meaning their absence at the // beginning: @@ -202,32 +200,31 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() { /******************************************************************************/ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { - int i, j, k, l, depth, localmaxdepth, skip, inc, lps; + int j, depth; float r; float pos[3]; float p[3]; - int ch, n, cell, locked, patch; - float radius; + int n; float root[3]; // Radius is determined in boundingBoxKernel - radius = radiusd; + auto const radius = radiusd; // The root node has been created at the end of the boundingBoxKernel. // Cache the root data: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) root[l] = bhpara->r[3 * bhpara->nnodes + l]; // Maximum tree depth within the given thread. - localmaxdepth = 1; + int localmaxdepth = 1; // Skip the branch following and start from the root. - skip = 1; + int skip = 1; // Number of loops for the threads sync algorithm - lps = 0; + int lps = 0; // Increment to move among the bodies assigned to the given thread. // Hence, one should step over all other threads in GPU with // a quantity of blockDim.x * gridDim.x. - inc = static_cast(blockDim.x * gridDim.x); + auto const inc = static_cast(blockDim.x * gridDim.x); // Just a regular 1D GPU index - i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + auto i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); // Iterate over all bodies assigned to thread. while (i < bhpara->nbodies) { @@ -235,7 +232,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // New body, so start traversing at root. Skip it further. 
skip = 0; // Particle position corresponding to the given thread and block: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) p[l] = bhpara->r[3 * i + l]; // Let's start a moving via the tree from the root node 8 * nbodiesd: n = bhpara->nnodes; @@ -244,14 +241,14 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // Determine which child to follow. // j=0..7 determines the octant in a binary representations j = 0; - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) if (root[l] < p[l]) j += static_cast(pow(2, l)); } // Follow path to leaf cell. Should not happen at the first iteration of // this loop. - ch = bhpara->child[n * 8 + j]; + int ch = bhpara->child[n * 8 + j]; // Global memory writing related threads sync if (lps++ > THREADS2) { @@ -277,7 +274,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { j = 0; // Determine which child octant to follow based on body coordinates. // j=0..7 determines the octant in a binary representations. - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) if (bhpara->r[3 * n + l] < p[l]) j += static_cast(pow(2, l)); ch = bhpara->child[n * 8 + j]; @@ -285,7 +282,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // Now we are deep enough in the tree, passed all levels of cells and // reached the body (particle). if (ch != -2) { // Skip if child pointer is locked (-2) and try again later. - locked = n * 8 + j; + int const locked = n * 8 + j; // Try to lock and iterate towards next body: if (ch == atomicCAS((int *)&bhpara->child[locked], ch, -2)) { // If we are here then child[locked] was equal to "ch" and now is @@ -306,7 +303,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // If -1 (i.e. no child index) then just insert a new body bhpara->child[locked] = i; } else { - patch = -1; + int patch = -1; // There already is a body and/or cell(s) in this position. // We should start from a loop of the deepest child cell // determination. Then, we need to create new cells and to distribute @@ -318,7 +315,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // nbodiesd nodes. These lists will be correctly handled by the // sortKernel later. depth++; - cell = atomicSub((int *)&bottomd, 1) - 1; + int const cell = atomicSub((int *)&bottomd, 1) - 1; if (cell <= bhpara->nbodies) { // This should not happen. A cell cannot have such index. Error. *bhpara->err = 1; @@ -333,7 +330,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // within above "while (ch >= nbodiesd)" loop. // The new cell will be placed below relatively the center of // corresponding j-th octant: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) pos[l] = static_cast((j >> l) & 1) * r; // Note, that negative octants correspond to pos[l] == 0 and // positive octants correspond to pos[l] == r. @@ -366,12 +363,12 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // need to add -r to obtain -r and r for negative and positive // octants. 
Now, the child (cell) octants centers are deriving from // the parent (n) octant center: - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) pos[l] = bhpara->r[3 * cell + l] = bhpara->r[3 * n + l] - r + pos[l]; // By default, the new cell has no children in all k-th octants: - for (k = 0; k < 8; k++) + for (int k = 0; k < 8; k++) bhpara->child[cell * 8 + k] = -1; // This condition should always be true cause "patch" is -1 at the @@ -384,7 +381,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { // pos[l] already contains the child cell coordinates. // Let's assign "child" then. First the octant should be selected: j = 0; - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) if (pos[l] < bhpara->r[3 * ch + l]) j += static_cast(pow(2, l)); // New element just appeared in the chain of cells. Hence, that what @@ -400,7 +397,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { __threadfence(); // Let's handle the particle position (p[l]) corresponding to the // given thread and block against new octant cell (pos[l]): - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) if (pos[l] < p[l]) j += static_cast(pow(2, l)); @@ -449,8 +446,7 @@ __global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() { /******************************************************************************/ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { - int i, j, k, l, im, ch, inc, missing, missing_max, cnt, bottom, lps; - int iteration, repeat_flag; + int i, j, cnt; // the node "mass" and its count respectively: float m, cm; // position of equivalent total dipole and its magnitude: @@ -462,25 +458,26 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { // no children by default: for (i = 0; i < 8; i++) child[i * THREADS3 + threadIdx.x] = -1; - bottom = bottomd; + auto const bottom = bottomd; // Increment towards other particles assigned to the given thread: - inc = static_cast(blockDim.x * gridDim.x); + auto const inc = static_cast(blockDim.x * gridDim.x); // Nodes iteration "k" should start from the "bottomd" level of the cells, // which is a minimal index of the last created cell. // Starting "k" value should be aligned using the warp size // according to the designed threads performance. // k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x; - k = bottom + static_cast(threadIdx.x + blockIdx.x * blockDim.x); + auto k = bottom + static_cast(threadIdx.x + blockIdx.x * blockDim.x); // Threads below the bottom line could proceed to their next cells. // if (k < bottom) k += inc; // Assume no missing children: - missing = 0; - iteration = 0; - repeat_flag = 0; + int missing = 0; + int missing_max = 0; + int iteration = 0; + int repeat_flag = 0; __syncthreads(); // throttle // threads sync related - lps = 0; + int lps = 0; // Iterate over all cells (not particles) assigned to the thread: while (k <= bhpara->nnodes) { if (lps++ > THREADS3) { @@ -492,14 +489,14 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { if (missing == 0) { // New cell, so initialize: cm = 0.0f; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { p[l] = 0.0f; u[l] = 0.0f; } cnt = 0; j = 0; for (i = 0; i < 8; i++) { - ch = bhpara->child[k * 8 + i]; + auto const ch = bhpara->child[k * 8 + i]; if (ch >= 0) { if (i != j) { // Move child to front (needed later for a speed only). 
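The child-octant selection that recurs in the tree-building hunks above encodes one bit per axis: bit l of j is set when the body lies on the positive side of the cell center along axis l, so j enumerates the eight octants 0..7. A standalone sketch of that computation; the helper name `octant_index` is hypothetical, and a shift stands in for the patch's `pow(2, l)` purely for clarity:

__device__ inline int octant_index(float const *center, float const *pos) {
  int j = 0;
  for (int l = 0; l < 3; l++)
    if (center[l] < pos[l])
      j += 1 << l; // same value as static_cast<int>(pow(2, l))
  return j;
}
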
@@ -532,7 +529,7 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { } // add child's contribution cm += m; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { p[l] += bhpara->r[3 * ch + l] * m; u[l] += bhpara->u[3 * ch + l]; } @@ -548,9 +545,9 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { //__syncthreads(); // throttle if (missing != 0) { - for (im = 0; im < missing_max; im++) { + for (int im = 0; im < missing_max; im++) { // poll missing child - ch = child[im * THREADS3 + threadIdx.x]; + auto const ch = child[im * THREADS3 + threadIdx.x]; if (ch >= 0) { m = bhpara->mass[ch]; // Is a child the particle? Only particles have non-negative mass @@ -567,7 +564,7 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { } // add child's contribution cm += m; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { p[l] += bhpara->r[3 * ch + l] * m; u[l] += bhpara->u[3 * ch + l]; } @@ -588,7 +585,7 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { // all children are ready, so store computed information bhpara->count[k] = cnt; m = 1.0f / cm; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { bhpara->r[3 * k + l] = p[l] * m; bhpara->u[3 * k + l] = u[l]; } @@ -627,23 +624,21 @@ __global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() { // same octant cells) together, and these grouped bodies are crucial to speed up // forceCalculationKernel and energyCalculationKernel __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() { - int i, k, ch, dec, start, bottom, lps; - - bottom = bottomd; - dec = static_cast(blockDim.x * gridDim.x); + auto const bottom = bottomd; + auto const dec = static_cast(blockDim.x * gridDim.x); // Start from the end of the nnodesd == 8 * nbodiesd. // Reverse order is required now cause octant cells which are more close // to the root have a larger count of entities inside (countd[k]). // Particles should be sorted over all entities count in the tree array // representation made by treeBuildingKernel. - k = bhpara->nnodes + 1 - dec + - static_cast(threadIdx.x + blockIdx.x * blockDim.x); + int k = bhpara->nnodes + 1 - dec + + static_cast(threadIdx.x + blockIdx.x * blockDim.x); // threads sync related - lps = 0; + int lps = 0; // iterate over all cells assigned to thread while (k >= bottom) { - start = bhpara->start[k]; + int start = bhpara->start[k]; // Threads sync related if (lps++ > THREADS4) { *bhpara->max_lps = lps; @@ -652,8 +647,8 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() { // Let's start from the root which has only startd=0 defined // in boundingBoxKernel. All other bodies and cells have -1. if (start >= 0) { - for (i = 0; i < 8; i++) { - ch = bhpara->child[k * 8 + i]; + for (int i = 0; i < 8; i++) { + int const ch = bhpara->child[k * 8 + i]; if (ch >= bhpara->nbodies) { // child is a cell bhpara->start[ch] = start; // set start ID of child @@ -684,8 +679,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() { __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( float pf, float *force, float *torque) { - int i, j, k, l, n, depth, base, sbase, diff, t; - float tmp; + int i, t; // dr is a distance between particles. // f,h, and N are a target force, field, and torque respectively. // u and uc are dipole moments of two particles. 
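The energy kernels in these patches end with a shared-memory reduction (`dds_sumReduction_BH` here, `dds_sumReduction` in `DipolarDirectSum_cuda.cu`): every thread deposits a partial sum in shared memory, then the lower half of the block repeatedly folds the upper half onto itself until thread 0 holds the block total. A simplified sketch of that halving pattern, assuming a power-of-two block size (the real helpers also handle odd sizes); `block_sum` is an illustrative name:

__device__ void block_sum(float *res, float *sum) {
  auto const tid = threadIdx.x;
  for (auto stride = blockDim.x / 2u; stride > 0u; stride /= 2u) {
    __syncthreads();
    if (tid < stride)
      res[tid] += res[tid + stride]; // fold upper half onto lower half
  }
  if (tid == 0)
    sum[0] = res[0]; // e.g. one entry of energySum per block
}
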
@@ -698,11 +692,9 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE], node[MAXDEPTH * THREADS5 / WARPSIZE]; __shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE]; - float b, b2, d1, dd5; - float bb2d7, umd5; // Zero thread of the block initialize shared data for all warps. - if (0 == threadIdx.x) { + if (threadIdx.x == 0) { // Precompute values that depend only on tree level. // The method's parameters (a trade-off accuracy/performance) // which determine that the @@ -715,8 +707,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // and the cell depending on a cell level. // Original tree box edge (2*radiusd) should be divided *0.5 // as much as the tree depth takes place. - tmp = radiusd; - dq[0] = tmp * tmp * *itolsqd; + dq[0] = radiusd * radiusd * *itolsqd; for (i = 1; i < maxdepthd; i++) { dq[i] = dq[i - 1] * 0.25f; dq[i - 1] += *epssqd; @@ -735,15 +726,15 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // Only maximal Barnes-Hut tree depth is allowed. if (maxdepthd <= MAXDEPTH) { // How many warps are behind the current thread (?): - base = static_cast(threadIdx.x) / WARPSIZE; + auto const base = static_cast(threadIdx.x) / WARPSIZE; // Figure out first thread in each warp (lane 0): - sbase = base * WARPSIZE; + auto const sbase = base * WARPSIZE; // Initial stack index is its MAXDEPTH portion start for the given warp // count base: - j = base * MAXDEPTH; + auto const j = base * MAXDEPTH; // How far the thread is from the warp beginning (?): - diff = static_cast(threadIdx.x) - sbase; + auto const diff = static_cast(threadIdx.x) - sbase; // Make multiple copies to avoid index calculations later: if (diff < MAXDEPTH) { // Each thread copies its own dq[] element to a part of @@ -753,12 +744,12 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( __syncthreads(); // Iterate over all bodies assigned to thread: - for (k = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + for (auto k = static_cast(threadIdx.x + blockIdx.x * blockDim.x); k < bhpara->nbodies; k += static_cast(blockDim.x * gridDim.x)) { // Sorted body indexes assigned to me: i = bhpara->sort[k]; // get permuted/sorted index // Cache the particle position info: - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { u[l] = bhpara->u[3 * i + l]; h[l] = 0.0f; f[l] = 0.0f; @@ -769,7 +760,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // Hence, we should start from the root node (whole Barnes-Hut cube). // Initialize iteration stack, i.e., push root node onto stack. // Let's start from zero octant. - depth = j; + auto depth = j; if (sbase == threadIdx.x) { node[j] = bhpara->nnodes; pos[j] = 0; @@ -780,7 +771,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // Hence, there are still some children to consider. while ((t = pos[depth]) < 8) { // Node on top of stack has more children to process: - n = bhpara->child[node[depth] * 8 + t]; // load child pointer + auto const n = bhpara->child[node[depth] * 8 + t]; // child pointer if (sbase == threadIdx.x) { // I'm the first thread in the warp. 
// Let me check for the current depth level of the tree @@ -792,8 +783,8 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( // There is a child (octant cell) with a dipole moment uxd[3 * n + l] // and the center position bhpara->r[3 * n + l]: if (n >= 0) { - tmp = 0.0f; // compute distance squared - for (l = 0; l < 3; l++) { + auto tmp = 0.0f; // compute distance squared + for (int l = 0; l < 3; l++) { dr[l] = -bhpara->r[3 * n + l] + bhpara->r[3 * i + l]; tmp += dr[l] * dr[l]; } @@ -825,12 +816,12 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( #endif if (n != i) { - d1 = sqrtf(tmp /*, 0.5f*/); - dd5 = __fdividef(1.0f, tmp * tmp * d1); - b = 0.0f; - b2 = 0.0f; - umd5 = 0.0f; - for (l = 0; l < 3; l++) { + auto const d1 = sqrtf(tmp /*, 0.5f*/); + auto const dd5 = __fdividef(1.0f, tmp * tmp * d1); + auto b = 0.0f; + auto b2 = 0.0f; + auto umd5 = 0.0f; + for (int l = 0; l < 3; l++) { uc[l] = bhpara->u[3 * n + l]; b += uc[l] * dr[l]; b2 += u[l] * dr[l]; @@ -838,9 +829,9 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( } umd5 *= -3.0f * dd5; - bb2d7 = 15.0f * b * b2 * __fdividef(dd5, tmp); + auto const bb2d7 = 15.0f * b * b2 * __fdividef(dd5, tmp); - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5; f[l] += -dr[l] * (umd5 + bb2d7) + 3.0f * (b * u[l] + b2 * uc[l]) * dd5; @@ -874,7 +865,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( N[1] = u[2] * h[0] - u[0] * h[2]; N[2] = u[0] * h[1] - u[1] * h[0]; - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { if (f[l] != f[l] || h[l] != h[l]) { // nan printf("Force Kernel: NAN in particle[%d]\n", i); printf("x = %f, y = %f, z = %f,\n", bhpara->u[3 * i + 0], @@ -890,8 +881,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( } /******************************************************************************/ -/*** compute energy - * ************************************************************/ +/*** compute energy ***********************************************************/ /******************************************************************************/ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( @@ -899,8 +889,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( // NOTE: the algorithm of this kernel is almost identical to // forceCalculationKernel. See comments there. 
- int i, j, k, l, n, depth, base, sbase, diff, t; - float tmp; + int i, n, t; float dr[3], h[3], u[3], uc[3]; __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE], node[MAXDEPTH * THREADS5 / WARPSIZE]; @@ -908,12 +897,9 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( float sum = 0.0; extern __shared__ float res[]; - float b, d1, dd5; - - if (0 == threadIdx.x) { - tmp = radiusd; + if (threadIdx.x == 0) { // precompute values that depend only on tree level - dq[0] = tmp * tmp * *itolsqd; + dq[0] = radiusd * radiusd * *itolsqd; for (i = 1; i < maxdepthd; i++) { dq[i] = dq[i - 1] * 0.25f; dq[i - 1] += *epssqd; @@ -928,11 +914,11 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( if (maxdepthd <= MAXDEPTH) { // figure out first thread in each warp (lane 0) - base = static_cast(threadIdx.x) / WARPSIZE; - sbase = base * WARPSIZE; - j = base * MAXDEPTH; + auto const base = static_cast(threadIdx.x) / WARPSIZE; + auto const sbase = base * WARPSIZE; + auto const j = base * MAXDEPTH; - diff = static_cast(threadIdx.x) - sbase; + auto const diff = static_cast(threadIdx.x) - sbase; // make multiple copies to avoid index calculations later if (diff < MAXDEPTH) { dq[diff + j] = dq[diff]; @@ -940,17 +926,17 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( __syncthreads(); // iterate over all bodies assigned to thread - for (k = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + for (auto k = static_cast(threadIdx.x + blockIdx.x * blockDim.x); k < bhpara->nbodies; k += static_cast(blockDim.x * gridDim.x)) { i = bhpara->sort[k]; // get permuted/sorted index // cache position info - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { u[l] = bhpara->u[3 * i + l]; h[l] = 0.0f; } // initialize iteration stack, i.e., push root node onto stack - depth = j; + auto depth = j; if (sbase == threadIdx.x) { node[j] = bhpara->nnodes; pos[j] = 0; @@ -967,8 +953,8 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( } __threadfence_block(); if (n >= 0) { - tmp = 0.0f; - for (l = 0; l < 3; l++) { + auto tmp = 0.0f; + for (int l = 0; l < 3; l++) { dr[l] = -bhpara->r[3 * n + l] + bhpara->r[3 * i + l]; tmp += dr[l] * dr[l]; } @@ -985,15 +971,15 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( // is far enough away (or is a body) #endif if (n != i) { - d1 = sqrtf(tmp /*, 0.5f*/); - dd5 = __fdividef(1.0f, tmp * tmp * d1); - b = 0.0f; - for (l = 0; l < 3; l++) { + auto const d1 = sqrtf(tmp /*, 0.5f*/); + auto const dd5 = __fdividef(1.0f, tmp * tmp * d1); + auto b = 0.0f; + for (int l = 0; l < 3; l++) { uc[l] = bhpara->u[3 * n + l]; b += uc[l] * dr[l]; } - for (l = 0; l < 3; l++) + for (int l = 0; l < 3; l++) h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5; } } else { @@ -1013,7 +999,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( depth--; // done with this level } - for (l = 0; l < 3; l++) { + for (int l = 0; l < 3; l++) { sum += -u[l] * h[l]; if (h[l] != h[l]) { // nan printf("Energy Kernel: NAN in particle[%d]\n", i); @@ -1183,7 +1169,7 @@ void allocBHmemCopy(int nbodies, BHData *bh_data) { // cubic boxes hence, 8 nodes per particle is a theoretical octree limit: bh_data->nnodes = bh_data->nbodies * 8; - int n_total_threads = 1024 * bh_data->blocks; + int const n_total_threads = 1024 * bh_data->blocks; if (bh_data->nnodes < n_total_threads) bh_data->nnodes = n_total_threads; else diff --git a/src/core/actor/DipolarDirectSum.hpp 
b/src/core/actor/DipolarDirectSum.hpp index aa41cdcde58..1938dd224ab 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -32,10 +32,10 @@ #include -void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos, +void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E); -void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, +void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3]); @@ -59,10 +59,10 @@ class DipolarDirectSum : public Actor { int per[3]; for (int i = 0; i < 3; i++) { box[i] = static_cast(s.box()[i]); - per[i] = (box_geo.periodic(i)); + per[i] = box_geo.periodic(i); } DipolarDirectSum_kernel_wrapper_force( - k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), + k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), s.fGpuBegin(), s.torqueGpuBegin(), box, per); }; void computeEnergy(SystemInterface &s) override { @@ -70,11 +70,11 @@ class DipolarDirectSum : public Actor { int per[3]; for (int i = 0; i < 3; i++) { box[i] = static_cast(s.box()[i]); - per[i] = (box_geo.periodic(i)); + per[i] = box_geo.periodic(i); } DipolarDirectSum_kernel_wrapper_energy( - k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), box, - per, &(reinterpret_cast(s.eGpu())->dipolar)); + k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), + box, per, &(reinterpret_cast(s.eGpu())->dipolar)); }; private: diff --git a/src/core/actor/DipolarDirectSum_cuda.cu b/src/core/actor/DipolarDirectSum_cuda.cu index 0182c140460..8c1e37754b2 100644 --- a/src/core/actor/DipolarDirectSum_cuda.cu +++ b/src/core/actor/DipolarDirectSum_cuda.cu @@ -36,6 +36,13 @@ __device__ inline float scalar_product(float const *a, float const *b) { return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; } +__device__ inline void vector_product(float const *a, float const *b, + float *out) { + out[0] = a[1] * b[2] - a[2] * b[1]; + out[1] = a[2] * b[0] - a[0] * b[2]; + out[2] = a[0] * b[1] - a[1] * b[0]; +} + __device__ inline void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3]) { @@ -46,120 +53,89 @@ __device__ inline void get_mi_vector_dds(float res[3], float const a[3], } } -__device__ void dipole_ia_force(int id, float pf, float const *r1, - float const *r2, float const *dip1, - float const *dip2, float *f1, float *torque1, - float *torque2, float box_l[3], +__device__ void dipole_ia_force(float pf, float const *r1, float const *r2, + float const *dip1, float const *dip2, float *f1, + float *torque1, float *torque2, float box_l[3], int periodic[3]) { - float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv, a, b, - cc, d, ab; -#ifdef ROTATION - float bx, by, bz, ax, ay, az; -#endif - - float dr[3]; - float _r1[3], _r2[3], _dip1[3], _dip2[3]; - - for (int i = 0; i < 3; i++) { - _r1[i] = r1[i]; - _r2[i] = r2[i]; - _dip1[i] = dip1[i]; - _dip2[i] = dip2[i]; - } - // Distance between particles - get_mi_vector_dds(dr, _r1, _r2, box_l, periodic); + float dr[3]; + get_mi_vector_dds(dr, r1, r2, box_l, periodic); // Powers of distance - r_sq = scalar_product(dr, dr); - r_sq_inv = 1 / r_sq; - r_inv = rsqrtf(r_sq); - r3_inv = 1 / r_sq * r_inv; - r5_inv = r3_inv * r_sq_inv; - r7_inv = r5_inv * r_sq_inv; + auto const r_sq = scalar_product(dr, dr); + auto const r_sq_inv = 1.0f / r_sq; + auto const r_inv = 
rsqrtf(r_sq); + auto const r3_inv = 1.0f / r_sq * r_inv; + auto const r5_inv = r3_inv * r_sq_inv; + auto const r7_inv = r5_inv * r_sq_inv; // Dot products - pe1 = scalar_product(_dip1, _dip2); - pe2 = scalar_product(_dip1, dr); - pe3 = scalar_product(_dip2, dr); - pe4 = 3.0f * r5_inv; + auto const pe1 = scalar_product(dip1, dip2); + auto const pe2 = scalar_product(dip1, dr); + auto const pe3 = scalar_product(dip2, dr); + auto const pe4 = 3.0f * r5_inv; // Force - a = pe4 * pe1; - b = -15.0f * pe2 * pe3 * r7_inv; - ab = a + b; - cc = pe4 * pe3; - d = pe4 * pe2; + auto const aa = pe4 * pe1; + auto const bb = -15.0f * pe2 * pe3 * r7_inv; + auto const ab = aa + bb; + auto const cc = pe4 * pe3; + auto const dd = pe4 * pe2; - f1[0] = (pf * (ab * dr[0] + cc * _dip1[0] + d * _dip2[0])); - f1[1] = (pf * (ab * dr[1] + cc * _dip1[1] + d * _dip2[1])); - f1[2] = (pf * (ab * dr[2] + cc * _dip1[2] + d * _dip2[2])); + f1[0] = (pf * (ab * dr[0] + cc * dip1[0] + dd * dip2[0])); + f1[1] = (pf * (ab * dr[1] + cc * dip1[1] + dd * dip2[1])); + f1[2] = (pf * (ab * dr[2] + cc * dip1[2] + dd * dip2[2])); #ifdef ROTATION // Torques - ax = _dip1[1] * _dip2[2] - _dip2[1] * _dip1[2]; - ay = _dip2[0] * _dip1[2] - _dip1[0] * _dip2[2]; - az = _dip1[0] * _dip2[1] - _dip2[0] * _dip1[1]; + float a[3]; + vector_product(dip1, dip2, a); - bx = _dip1[1] * dr[2] - dr[1] * _dip1[2]; - by = dr[0] * _dip1[2] - _dip1[0] * dr[2]; - bz = _dip1[0] * dr[1] - dr[0] * _dip1[1]; + float b[3]; + vector_product(dip1, dr, b); - torque1[0] = (pf * (-ax * r3_inv + bx * cc)); - torque1[1] = (pf * (-ay * r3_inv + by * cc)); - torque1[2] = (pf * (-az * r3_inv + bz * cc)); + torque1[0] = pf * (-a[0] * r3_inv + b[0] * cc); + torque1[1] = pf * (-a[1] * r3_inv + b[1] * cc); + torque1[2] = pf * (-a[2] * r3_inv + b[2] * cc); - bx = _dip2[1] * dr[2] - dr[1] * _dip2[2]; - by = dr[0] * _dip2[2] - _dip2[0] * dr[2]; - bz = _dip2[0] * dr[1] - dr[0] * _dip2[1]; + vector_product(dip2, dr, b); - torque2[0] = pf * (ax * r3_inv + bx * d); - torque2[1] = pf * (ay * r3_inv + by * d); - torque2[2] = pf * (az * r3_inv + bz * d); + torque2[0] = pf * (a[0] * r3_inv + b[0] * dd); + torque2[1] = pf * (a[1] * r3_inv + b[1] * dd); + torque2[2] = pf * (a[2] * r3_inv + b[2] * dd); #endif } -__device__ float dipole_ia_energy(int id, float pf, float const *r1, - float const *r2, float const *dip1, - float const *dip2, float box_l[3], - int periodic[3]) { - float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv; - float dr[3]; - float _r1[3], _r2[3], _dip1[3], _dip2[3]; - - for (int i = 0; i < 3; i++) { - _r1[i] = r1[i]; - _r2[i] = r2[i]; - _dip1[i] = dip1[i]; - _dip2[i] = dip2[i]; - } - +__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2, + float const *dip1, float const *dip2, + float box_l[3], int periodic[3]) { // Distance between particles - get_mi_vector_dds(dr, _r1, _r2, box_l, periodic); + float dr[3]; + get_mi_vector_dds(dr, r1, r2, box_l, periodic); // Powers of distance - r_sq = scalar_product(dr, dr); - r_sq_inv = 1 / r_sq; - r_inv = rsqrtf(r_sq); - r3_inv = 1 / r_sq * r_inv; - r5_inv = r3_inv * r_sq_inv; + auto const r_sq = scalar_product(dr, dr); + auto const r_sq_inv = 1.0f / r_sq; + auto const r_inv = rsqrtf(r_sq); + auto const r3_inv = 1.0f / r_sq * r_inv; + auto const r5_inv = r3_inv * r_sq_inv; // Dot products - pe1 = scalar_product(_dip1, _dip2); - pe2 = scalar_product(_dip1, dr); - pe3 = scalar_product(_dip2, dr); - pe4 = 3.0f * r5_inv; + auto const pe1 = scalar_product(dip1, dip2); + auto const pe2 = scalar_product(dip1, 
dr); + auto const pe3 = scalar_product(dip2, dr); + auto const pe4 = 3.0f * r5_inv; // Energy return pf * (pe1 * r3_inv - pe4 * pe2 * pe3); } -__global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos, - float *dip, float *f, +__global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n, + float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3]) { - auto i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + auto const i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= n) return; @@ -179,17 +155,17 @@ __global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos, // to global memory at the end. // Clear summation vars - for (int j = 0; j < 3; j++) { + for (unsigned int j = 0; j < 3; j++) { // Force fsum[j] = 0; // Torque tsum[j] = 0; } - for (int j = i + 1; j < n; j++) { - dipole_ia_force(i, pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, dip + 3 * j, - fi, ti, tj, box_l, periodic); - for (int k = 0; k < 3; k++) { + for (unsigned int j = i + 1; j < n; j++) { + dipole_ia_force(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, dip + 3 * j, fi, + ti, tj, box_l, periodic); + for (unsigned int k = 0; k < 3; k++) { // Add rhs to global memory atomicAdd(f + 3 * j + k, -fi[k]); atomicAdd((torque + 3 * j + k), tj[k]); @@ -206,7 +182,7 @@ __global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos, } __device__ void dds_sumReduction(float *input, float *sum) { - auto tid = static_cast(threadIdx.x); + auto const tid = static_cast(threadIdx.x); for (auto i = static_cast(blockDim.x); i > 1; i /= 2) { __syncthreads(); if (tid < i / 2) @@ -215,17 +191,17 @@ __device__ void dds_sumReduction(float *input, float *sum) { input[tid] += input[i - 1]; } __syncthreads(); - if (threadIdx.x == 0) { + if (tid == 0) { sum[0] = input[0]; } } -__global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos, - float *dip, float box_l[3], - int periodic[3], +__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n, + float *pos, float *dip, + float box_l[3], int periodic[3], float *energySum) { - auto i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + auto const i = blockIdx.x * blockDim.x + threadIdx.x; float sum = 0.0; extern __shared__ float res[]; @@ -236,8 +212,8 @@ __global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos, if (i < n) { // Summation for particle i - for (int j = i + 1; j < n; j++) { - sum += dipole_ia_energy(i, pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, + for (unsigned int j = i + 1; j < n; j++) { + sum += dipole_ia_energy(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, dip + 3 * j, box_l, periodic); } @@ -251,11 +227,22 @@ __global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos, dds_sumReduction(res, &(energySum[blockIdx.x])); } -void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, +inline void copy_box_data(float **box_l_gpu, int **periodic_gpu, + float const *box_l, int const *periodic) { + auto const s_box = 3u * sizeof(float); + auto const s_per = 3u * sizeof(int); + cuda_safe_mem(cudaMalloc(reinterpret_cast(box_l_gpu), s_box)); + cuda_safe_mem(cudaMalloc(reinterpret_cast(periodic_gpu), s_per)); + cuda_safe_mem(cudaMemcpy(*box_l_gpu, box_l, s_box, cudaMemcpyHostToDevice)); + cuda_safe_mem( + cudaMemcpy(*periodic_gpu, periodic, s_per, cudaMemcpyHostToDevice)); +} + +void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3]) { - const int bs = 64; + unsigned 
int const bs = 64; dim3 grid(1, 1, 1); dim3 block(1, 1, 1); @@ -272,12 +259,7 @@ void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, float *box_l_gpu; int *periodic_gpu; - cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float))); - cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int))); - cuda_safe_mem( - cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int), - cudaMemcpyHostToDevice)); + copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic); KERNELCALL(DipolarDirectSum_kernel_force, grid, block, k, n, pos, dip, f, torque, box_l_gpu, periodic_gpu); @@ -285,11 +267,11 @@ void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, cudaFree(periodic_gpu); } -void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos, +void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E) { - const int bs = 512; + unsigned int const bs = 512; dim3 grid(1, 1, 1); dim3 block(1, 1, 1); @@ -306,15 +288,10 @@ void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos, float *box_l_gpu; int *periodic_gpu; - cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float))); - cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int))); - cuda_safe_mem( - cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int), - cudaMemcpyHostToDevice)); + copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic); float *energySum; - cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x))); + cuda_safe_mem(cudaMalloc(&energySum, sizeof(float) * grid.x)); // This will sum the energies up to the block level KERNELCALL_shared(DipolarDirectSum_kernel_energy, grid, block, diff --git a/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp b/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp index 4060b32cfb3..1053bdcdddb 100644 --- a/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp +++ b/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp @@ -280,12 +280,6 @@ inline double dp3m_pair_energy(Particle const &p1, Particle const &p2, auto const C_r = (3 * B_r + 2 * alpsq * coeff * exp_adist2) * dist2i; - /* - printf("(%4i %4i) pair energy = %f (B_r=%15.12f C_r=%15.12f)\n", - p1.p.identity,p2.p.identity,fac1*(mimj*B_r-mir*mjr*C_r),B_r,C_r); - */ - - /* old line return fac1 * ( mimj*B_r - mir*mjr * C_r );*/ return dipole.prefactor * (mimj * B_r - mir * mjr * C_r); } diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu index 5b3c3c1988f..b03339e7927 100644 --- a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu @@ -149,7 +149,7 @@ __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]); *Nenner += S3; - TE = exp(-sqr(Utils::pi() / (p.alpha)) * NM2); + TE = exp(-sqr(Utils::pi() / (p.alpha)) * NM2); zwi = S3 * TE / NM2; Zaehler[0] += NMX * zwi * Leni[0]; Zaehler[1] += NMY * zwi * Leni[1]; @@ -181,9 +181,9 @@ __global__ void calculate_influence_function_device(const P3MGpuData p) { if (((NX == 0) && (NY == 0) && (NZ == 0)) || ((NX % (p.mesh[0] / 2) == 0) && (NY % (p.mesh[1] / 2) == 0) && - (NZ % (p.mesh[2] / 2) == 0))) - p.G_hat[ind] = 0.0; - else { + (NZ % (p.mesh[2] / 
2) == 0))) { + p.G_hat[ind] = 0; + } else { Aliasing_sums_ik(p, NX, NY, NZ, Zaehler, &Nenner); Dnx = static_cast((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX); @@ -194,7 +194,7 @@ __global__ void calculate_influence_function_device(const P3MGpuData p) { Dnz * Zaehler[2] * Leni[2]; zwi /= ((sqr(Dnx * Leni[0]) + sqr(Dny * Leni[1]) + sqr(Dnz * Leni[2])) * sqr(Nenner)); - p.G_hat[ind] = 2.0f * zwi / Utils::pi(); + p.G_hat[ind] = 2 * zwi / Utils::pi(); } } @@ -226,10 +226,10 @@ __global__ void apply_diff_op(const P3MGpuData p) { static_cast(blockIdx.y), static_cast(threadIdx.x)); - auto const nx = static_cast( - (blockIdx.x > p.mesh[0] / 2) ? blockIdx.x - p.mesh[0] : blockIdx.x); - auto const ny = static_cast( - (blockIdx.y > p.mesh[1] / 2) ? blockIdx.y - p.mesh[1] : blockIdx.y); + auto const bidx = static_cast(blockIdx.x); + auto const bidy = static_cast(blockIdx.y); + auto const nx = (bidx > p.mesh[0] / 2) ? bidx - p.mesh[0] : bidx; + auto const ny = (bidy > p.mesh[1] / 2) ? bidy - p.mesh[1] : bidy; auto const nz = static_cast(threadIdx.x); const FFT_TYPE_COMPLEX meshw = p.charge_mesh[linear_index]; @@ -274,11 +274,12 @@ template __global__ void assign_charge_kernel(const CUDA_particle_data *const pdata, const P3MGpuData par, const int parts_per_block) { - const int part_in_block = threadIdx.x / cao; - const int cao_id_x = threadIdx.x - part_in_block * cao; + auto const part_in_block = static_cast(threadIdx.x) / cao; + auto const cao_id_x = static_cast(threadIdx.x) - part_in_block * cao; /* id of the particle */ - int id = - parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block; + auto const id = + parts_per_block * static_cast(blockIdx.x * gridDim.y + blockIdx.y) + + part_in_block; if (id >= par.n_part) return; /* position relative to the closest gird point */ @@ -310,31 +311,32 @@ __global__ void assign_charge_kernel(const CUDA_particle_data *const pdata, extern __shared__ float weights[]; if (shared) { + auto const offset = static_cast(cao * part_in_block); if ((threadIdx.y < 3) && (threadIdx.z == 0)) { - weights[3 * cao * part_in_block + 3 * cao_id_x + threadIdx.y] = + weights[3 * offset + 3 * static_cast(cao_id_x) + threadIdx.y] = Utils::bspline(cao_id_x, m_pos[threadIdx.y]); } __syncthreads(); - atomicAdd(&(charge_mesh[ind]), - weights[3 * cao * part_in_block + 3 * cao_id_x + 0] * - weights[3 * cao * part_in_block + 3 * threadIdx.y + 1] * - weights[3 * cao * part_in_block + 3 * threadIdx.z + 2] * p.q); + auto const c = + weights[3 * offset + 3 * static_cast(cao_id_x) + 0] * + weights[3 * offset + 3 * threadIdx.y + 1] * + weights[3 * offset + 3 * threadIdx.z + 2] * p.q; + atomicAdd(&(charge_mesh[ind]), c); } else { - atomicAdd(&(charge_mesh[ind]), - Utils::bspline(cao_id_x, m_pos[0]) * - Utils::bspline(threadIdx.y, m_pos[1]) * - Utils::bspline(threadIdx.z, m_pos[2]) * p.q); + auto const c = + Utils::bspline(cao_id_x, m_pos[0]) * + Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * + Utils::bspline(static_cast(threadIdx.z), m_pos[2]) * p.q; + atomicAdd(&(charge_mesh[ind]), c); } } void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) { - dim3 grid, block; - grid.z = 1; - const int cao3 = p.cao * p.cao * p.cao; - const int cao = p.cao; + auto const cao = p.cao; + auto const cao3 = int_pow<3>(cao); int parts_per_block = 1, n_blocks = 1; while ((parts_per_block + 1) * cao3 <= 1024) { @@ -345,20 +347,19 @@ void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) { else n_blocks = p.n_part / parts_per_block + 1; - grid.x = 
n_blocks;
-  grid.y = 1;
+  dim3 block(static_cast<unsigned>(parts_per_block * cao),
+             static_cast<unsigned>(cao), static_cast<unsigned>(cao));
+  dim3 grid(static_cast<unsigned>(n_blocks), 1u, 1u);
 
   while (grid.x > 65536) {
     grid.y++;
-    if ((n_blocks % grid.y) == 0)
-      grid.x = std::max(1, n_blocks / grid.y);
+    if ((static_cast<unsigned>(n_blocks) % grid.y) == 0)
+      grid.x = std::max(1u, static_cast<unsigned>(n_blocks) / grid.y);
     else
-      grid.x = n_blocks / grid.y + 1;
+      grid.x = static_cast<unsigned>(n_blocks) / grid.y + 1;
   }
 
-  block.x = parts_per_block * cao;
-  block.y = cao;
-  block.z = cao;
-
+  auto const data_length =
+      3 * static_cast<std::size_t>(parts_per_block * cao) * sizeof(REAL_TYPE);
   switch (cao) {
   case 1:
     (assign_charge_kernel<1, false>)<<<grid, block>>>(
@@ -370,32 +371,27 @@ void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) {
         pdata, p, parts_per_block);
     break;
   case 3:
     (assign_charge_kernel<
-        3, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        3, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   case 4:
     (assign_charge_kernel<
-        4, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        4, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   case 5:
     (assign_charge_kernel<
-        5, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        5, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   case 6:
     (assign_charge_kernel<
-        6, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        6, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   case 7:
     (assign_charge_kernel<
-        7, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(REAL_TYPE)>>>(
+        7, true>)<<<grid, block, data_length>>>(
         pdata, p, parts_per_block);
     break;
   default:
@@ -409,11 +405,12 @@ __global__ void assign_forces_kernel(const CUDA_particle_data *const pdata,
                                      const P3MGpuData par,
                                      float *lb_particle_force_gpu,
                                      REAL_TYPE prefactor, int parts_per_block) {
-  const int part_in_block = threadIdx.x / cao;
-  const int cao_id_x = threadIdx.x - part_in_block * cao;
+  auto const part_in_block = static_cast<int>(threadIdx.x) / cao;
+  auto const cao_id_x = static_cast<int>(threadIdx.x) - part_in_block * cao;
   /* id of the particle */
-  int id =
-      parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
+  auto const id =
+      parts_per_block * static_cast<int>(blockIdx.x * gridDim.y + blockIdx.y) +
+      part_in_block;
   if (id >= par.n_part)
     return;
   /* position relative to the closest grid point */
@@ -445,20 +442,22 @@ __global__ void assign_forces_kernel(const CUDA_particle_data *const pdata,
   extern __shared__ float weights[];
 
   if (shared) {
+    auto const offset = static_cast<unsigned>(cao * part_in_block);
     if ((threadIdx.y < 3) && (threadIdx.z == 0)) {
-      weights[3 * cao * part_in_block + 3 * cao_id_x + threadIdx.y] =
+      weights[3 * offset + 3 * static_cast<unsigned>(cao_id_x) + threadIdx.y] =
          Utils::bspline(cao_id_x, m_pos[threadIdx.y]);
     }
 
     __syncthreads();
 
-    c = -prefactor * weights[3 * cao * part_in_block + 3 * cao_id_x + 0] *
-        weights[3 * cao * part_in_block + 3 * threadIdx.y + 1] *
-        weights[3 * cao * part_in_block + 3 * threadIdx.z + 2] * p.q;
+    c = -prefactor *
+        weights[3 * offset + 3 * static_cast<unsigned>(cao_id_x) + 0] *
+        weights[3 * offset + 3 * threadIdx.y + 1] *
+        weights[3 * offset + 3 * threadIdx.z + 2] * p.q;
   } else {
     c = -prefactor * Utils::bspline(cao_id_x, m_pos[0]) *
-        Utils::bspline(threadIdx.y, m_pos[1]) *
-        Utils::bspline(threadIdx.z, m_pos[2]) * p.q;
+        Utils::bspline(static_cast<int>(threadIdx.y), m_pos[1]) *
+        Utils::bspline(static_cast<int>(threadIdx.z), m_pos[2]) * p.q;
   }
 
   const REAL_TYPE *force_mesh_x = (REAL_TYPE *)par.force_mesh_x;
@@ -472,11 +471,8 @@ __global__ void assign_forces_kernel(const CUDA_particle_data *const pdata,
 
 void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p,
                    float *lb_particle_force_gpu, REAL_TYPE prefactor) {
-  dim3 grid, block;
-  grid.z = 1;
-
-  const int cao = p.cao;
-  const int cao3 = cao * cao * cao;
+  auto const cao = p.cao;
+  auto const cao3 = 
int_pow<3>(cao);
   int parts_per_block = 1, n_blocks = 1;
 
   while ((parts_per_block + 1) * cao3 <= 1024) {
@@ -488,22 +484,21 @@ void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p,
   else
     n_blocks = p3m_gpu_data.n_part / parts_per_block + 1;
 
-  grid.x = n_blocks;
-  grid.y = 1;
+  dim3 block(static_cast<unsigned>(parts_per_block * cao),
+             static_cast<unsigned>(cao), static_cast<unsigned>(cao));
+  dim3 grid(static_cast<unsigned>(n_blocks), 1u, 1u);
 
   while (grid.x > 65536) {
     grid.y++;
-    if ((n_blocks % grid.y) == 0)
-      grid.x = std::max(1, n_blocks / grid.y);
+    if ((static_cast<unsigned>(n_blocks) % grid.y) == 0)
+      grid.x = std::max(1u, static_cast<unsigned>(n_blocks) / grid.y);
     else
-      grid.x = n_blocks / grid.y + 1;
+      grid.x = static_cast<unsigned>(n_blocks) / grid.y + 1;
   }
 
-  block.x = parts_per_block * cao;
-  block.y = cao;
-  block.z = cao;
-
   /* Switch for assignment templates, the shared version only is faster for cao
    * > 2 */
+  auto const data_length =
+      3 * static_cast<std::size_t>(parts_per_block * cao) * sizeof(float);
   switch (cao) {
   case 1:
     (assign_forces_kernel<1, false>)<<<grid, block>>>(
@@ -515,32 +510,27 @@ void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p,
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 3:
     (assign_forces_kernel<
-        3, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        3, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 4:
     (assign_forces_kernel<
-        4, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        4, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 5:
     (assign_forces_kernel<
-        5, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        5, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 6:
     (assign_forces_kernel<
-        6, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        6, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   case 7:
     (assign_forces_kernel<
-        7, true>)<<<grid, block, 3 * parts_per_block * cao * sizeof(float)>>>(
+        7, true>)<<<grid, block, data_length>>>(
         pdata, p, lb_particle_force_gpu, prefactor, parts_per_block);
     break;
   default:
@@ -562,26 +552,26 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) {
   auto &espresso_system = EspressoSystemInterface::Instance();
   espresso_system.requestParticleStructGpu();
 
-  bool reinit_if = false, mesh_changed = false;
-  p3m_gpu_data.n_part = gpu_get_particle_pointer().size();
+  bool do_reinit = false, mesh_changed = false;
+  p3m_gpu_data.n_part = static_cast<int>(gpu_get_particle_pointer().size());
 
   if (!p3m_gpu_data_initialized || p3m_gpu_data.alpha != alpha) {
     p3m_gpu_data.alpha = static_cast<REAL_TYPE>(alpha);
-    reinit_if = true;
+    do_reinit = true;
   }
 
   if (!p3m_gpu_data_initialized || p3m_gpu_data.cao != cao) {
     p3m_gpu_data.cao = cao;
     // NOLINTNEXTLINE(bugprone-integer-division)
     p3m_gpu_data.pos_shift = static_cast<REAL_TYPE>((p3m_gpu_data.cao - 1) / 2);
-    reinit_if = true;
+    do_reinit = true;
   }
 
   if (!p3m_gpu_data_initialized || (p3m_gpu_data.mesh[0] != mesh[0]) ||
       (p3m_gpu_data.mesh[1] != mesh[1]) || (p3m_gpu_data.mesh[2] != mesh[2])) {
     std::copy(mesh, mesh + 3, p3m_gpu_data.mesh);
     mesh_changed = true;
-    reinit_if = true;
+    do_reinit = true;
   }
 
   auto const box_l = espresso_system.box();
@@ -589,7 +579,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) {
   if (!p3m_gpu_data_initialized || (p3m_gpu_data.box[0] != box_l[0]) ||
       (p3m_gpu_data.box[1] != box_l[1]) || (p3m_gpu_data.box[2] != box_l[2])) {
     std::copy(box_l.begin(), box_l.end(), p3m_gpu_data.box);
-    reinit_if = true;
+    do_reinit = true;
   }
 
   p3m_gpu_data.mesh_z_padded = (mesh[2] / 2 + 1) * 2;
@@ -620,17 +610,15 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) {
 
   if (!p3m_gpu_data_initialized && p3m_gpu_data.mesh_size > 0) {
     /* Size of the complex mesh Nx * Ny * ( Nz / 2 + 1 ) */
-    const int cmesh_size = p3m_gpu_data.mesh[0] * 
p3m_gpu_data.mesh[1] * - (p3m_gpu_data.mesh[2] / 2 + 1); - - cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.charge_mesh), - cmesh_size * sizeof(FFT_TYPE_COMPLEX))); - cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_x), - cmesh_size * sizeof(FFT_TYPE_COMPLEX))); - cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_y), - cmesh_size * sizeof(FFT_TYPE_COMPLEX))); - cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_z), - cmesh_size * sizeof(FFT_TYPE_COMPLEX))); + auto const cmesh_size = + static_cast(p3m_gpu_data.mesh[0]) * + static_cast(p3m_gpu_data.mesh[1]) * + static_cast(p3m_gpu_data.mesh[2] / 2 + 1); + auto const mesh_len = cmesh_size * sizeof(FFT_TYPE_COMPLEX); + cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.charge_mesh), mesh_len)); + cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_x), mesh_len)); + cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_y), mesh_len)); + cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_z), mesh_len)); cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.G_hat), cmesh_size * sizeof(REAL_TYPE))); @@ -642,14 +630,14 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { } } - if ((reinit_if or !p3m_gpu_data_initialized) && p3m_gpu_data.mesh_size > 0) { + if ((do_reinit or !p3m_gpu_data_initialized) && p3m_gpu_data.mesh_size > 0) { dim3 grid(1, 1, 1); dim3 block(1, 1, 1); - block.x = 512 / mesh[0] + 1; - block.y = mesh[1]; + block.x = static_cast(512 / mesh[0] + 1); + block.y = static_cast(mesh[1]); block.z = 1; - grid.x = mesh[0] / block.x + 1; - grid.z = mesh[2] / 2 + 1; + grid.x = static_cast(mesh[0]) / block.x + 1; + grid.z = static_cast(mesh[2]) / 2 + 1; switch (p3m_gpu_data.cao) { case 1: @@ -692,22 +680,23 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { void p3m_gpu_add_farfield_force() { auto device_particles = gpu_get_particle_pointer(); CUDA_particle_data *lb_particle_gpu = device_particles.data(); - p3m_gpu_data.n_part = device_particles.size(); + p3m_gpu_data.n_part = static_cast(device_particles.size()); float *lb_particle_force_gpu = gpu_get_particle_force_pointer(); if (p3m_gpu_data.n_part == 0) return; - dim3 gridConv(p3m_gpu_data.mesh[0], p3m_gpu_data.mesh[1], 1); - dim3 threadsConv(p3m_gpu_data.mesh[2] / 2 + 1, 1, 1); + dim3 gridConv(static_cast(p3m_gpu_data.mesh[0]), + static_cast(p3m_gpu_data.mesh[1]), 1u); + dim3 threadsConv(static_cast(p3m_gpu_data.mesh[2] / 2 + 1), 1u, 1u); - auto prefactor = - REAL_TYPE(coulomb.prefactor / (p3m_gpu_data.box[0] * p3m_gpu_data.box[1] * - p3m_gpu_data.box[2] * 2.0)); + auto const volume = Utils::product(Utils::Vector3d(p3m_gpu_data.box)); + auto const prefactor = REAL_TYPE(coulomb.prefactor / (volume * 2.0)); cuda_safe_mem(cudaMemset(p3m_gpu_data.charge_mesh, 0, - p3m_gpu_data.mesh_size * sizeof(REAL_TYPE))); + static_cast(p3m_gpu_data.mesh_size) * + sizeof(REAL_TYPE))); /* Interpolate the charges to the mesh */ assign_charges(lb_particle_gpu, p3m_gpu_data); diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu index 98b2aaf5b7e..2d2a40604b8 100644 --- a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu +++ b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu @@ -104,8 +104,7 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi, const double cs = p3m_analytic_cotangent_sum(nz, meshi.z) * p3m_analytic_cotangent_sum(nx, meshi.x) * p3m_analytic_cotangent_sum(ny, meshi.y); - const double ex = - 
exp(-(Utils::pi() * alpha_L_i) * (Utils::pi() * alpha_L_i) * n2); + const double ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2); const double ex2 = sqr(ex); const double U2 = int_pow<2 * cao>(Utils::sinc(meshi.x * nx) * Utils::sinc(meshi.y * ny) * @@ -123,11 +122,11 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi, __global__ void p3m_k_space_error_gpu_kernel_ad(const int3 mesh, const double3 meshi, int cao, double alpha_L, double *he_q) { - int nx = + auto const nx = -mesh.x / 2 + static_cast(blockDim.x * blockIdx.x + threadIdx.x); - int ny = + auto const ny = -mesh.y / 2 + static_cast(blockDim.y * blockIdx.y + threadIdx.y); - int nz = + auto const nz = -mesh.z / 2 + static_cast(blockDim.z * blockIdx.z + threadIdx.z); if ((nx >= mesh.x / 2) || (ny >= mesh.y / 2) || (nz >= mesh.z / 2)) @@ -136,31 +135,25 @@ __global__ void p3m_k_space_error_gpu_kernel_ad(const int3 mesh, int lind = ((nx + mesh.x / 2) * mesh.y * mesh.z + (ny + mesh.y / 2) * mesh.z + (nz + mesh.z / 2)); - double alpha_L_i = 1. / alpha_L; - double n2; - double U2, ex, ex2; - int nmx, nmy, nmz; + auto const alpha_L_i = 1. / alpha_L; double alias1, alias2, alias3, alias4; - alias1 = alias2 = alias3 = alias4 = 0; if ((nx != 0) || (ny != 0) || (nz != 0)) { for (int mx = -1; mx <= 1; mx++) { - nmx = nx + mx * mesh.x; + auto const nmx = static_cast(nx + mx * mesh.x); for (int my = -1; my <= 1; my++) { - nmy = ny + my * mesh.y; + auto const nmy = static_cast(ny + my * mesh.y); for (int mz = -1; mz <= 1; mz++) { - nmz = nz + mz * mesh.z; - - n2 = sqr(nmx) + sqr(nmy) + sqr(nmz); - - ex = exp(-(Utils::pi() * alpha_L_i) * (Utils::pi() * alpha_L_i) * n2); - - ex2 = sqr(ex); + auto const nmz = static_cast(nz + mz * mesh.z); - U2 = pow((double)Utils::sinc(meshi.x * nmx) * - Utils::sinc(meshi.y * nmy) * Utils::sinc(meshi.z * nmz), - 2.0 * cao); + auto const n2 = static_cast(sqr(nmx) + sqr(nmy) + sqr(nmz)); + auto const ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2); + auto const ex2 = sqr(ex); + auto const U2 = + pow(Utils::sinc(meshi.x * nmx) * Utils::sinc(meshi.y * nmy) * + Utils::sinc(meshi.z * nmz), + 2.0 * cao); alias1 += ex2 / n2; alias2 += U2 * ex; @@ -185,11 +178,11 @@ __global__ void p3m_k_space_error_gpu_kernel_ik_i(const int3 mesh, double alpha_L, double *he_q) { - int nx = + auto const nx = -mesh.x / 2 + static_cast(blockDim.x * blockIdx.x + threadIdx.x); - int ny = + auto const ny = -mesh.y / 2 + static_cast(blockDim.y * blockIdx.y + threadIdx.y); - int nz = + auto const nz = -mesh.z / 2 + static_cast(blockDim.z * blockIdx.z + threadIdx.z); if ((nx >= mesh.x / 2) || (ny >= mesh.y / 2) || (nz >= mesh.z / 2)) @@ -198,31 +191,25 @@ __global__ void p3m_k_space_error_gpu_kernel_ik_i(const int3 mesh, int lind = ((nx + mesh.x / 2) * mesh.y * mesh.z + (ny + mesh.y / 2) * mesh.z + (nz + mesh.z / 2)); - double alpha_L_i = 1. / alpha_L; - double n2; - double U2, ex, ex2; - int nmx, nmy, nmz; + auto const alpha_L_i = 1. 
/ alpha_L; double alias1, alias2, alias3, alias4; - alias1 = alias2 = alias3 = alias4 = 0; if ((nx != 0) || (ny != 0) || (nz != 0)) { for (int mx = -1; mx <= 1; mx++) { - nmx = nx + mx * mesh.x; + auto const nmx = static_cast(nx + mx * mesh.x); for (int my = -1; my <= 1; my++) { - nmy = ny + my * mesh.y; + auto const nmy = static_cast(ny + my * mesh.y); for (int mz = -1; mz <= 1; mz++) { - nmz = nz + mz * mesh.z; - - n2 = sqr(nmx) + sqr(nmy) + sqr(nmz); - - ex = exp(-(Utils::pi() * alpha_L_i) * (Utils::pi() * alpha_L_i) * n2); + auto const nmz = static_cast(nz + mz * mesh.z); - ex2 = sqr(ex); - - U2 = pow((double)Utils::sinc(meshi.x * nmx) * - Utils::sinc(meshi.y * nmy) * Utils::sinc(meshi.z * nmz), - 2.0 * cao); + auto const n2 = static_cast(sqr(nmx) + sqr(nmy) + sqr(nmz)); + auto const ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2); + auto const ex2 = sqr(ex); + auto const U2 = + pow(Utils::sinc(meshi.x * nmx) * Utils::sinc(meshi.y * nmy) * + Utils::sinc(meshi.z * nmz), + 2.0 * cao); alias1 += ex2 / n2; alias2 += U2 * ex * (nx * nmx + ny * nmy + nz * nmz) / n2; @@ -251,11 +238,11 @@ __global__ void p3m_k_space_error_gpu_kernel_ad_i(const int3 mesh, double alpha_L, double *he_q) { - int nx = + auto const nx = -mesh.x / 2 + static_cast(blockDim.x * blockIdx.x + threadIdx.x); - int ny = + auto const ny = -mesh.y / 2 + static_cast(blockDim.y * blockIdx.y + threadIdx.y); - int nz = + auto const nz = -mesh.z / 2 + static_cast(blockDim.z * blockIdx.z + threadIdx.z); if ((nx >= mesh.x / 2) || (ny >= mesh.y / 2) || (nz >= mesh.z / 2)) @@ -264,31 +251,25 @@ __global__ void p3m_k_space_error_gpu_kernel_ad_i(const int3 mesh, int lind = ((nx + mesh.x / 2) * mesh.y * mesh.z + (ny + mesh.y / 2) * mesh.z + (nz + mesh.z / 2)); - double alpha_L_i = 1. / alpha_L; - double n2; - double U2, ex, ex2; - int nmx, nmy, nmz; + auto const alpha_L_i = 1. 
/ alpha_L; double alias1, alias2, alias3, alias4, alias5, alias6; - alias1 = alias2 = alias3 = alias4 = alias5 = alias6 = 0; if ((nx != 0) && (ny != 0) && (nz != 0)) { for (int mx = -1; mx <= 1; mx++) { - nmx = nx + mx * mesh.x; + auto const nmx = static_cast(nx + mx * mesh.x); for (int my = -1; my <= 1; my++) { - nmy = ny + my * mesh.y; + auto const nmy = static_cast(ny + my * mesh.y); for (int mz = -1; mz <= 1; mz++) { - nmz = nz + mz * mesh.z; - - n2 = sqr(nmx) + sqr(nmy) + sqr(nmz); - - ex = exp(-(Utils::pi() * alpha_L_i) * (Utils::pi() * alpha_L_i) * n2); + auto const nmz = static_cast(nz + mz * mesh.z); - ex2 = sqr(ex); - - U2 = pow((double)Utils::sinc(meshi.x * nmx) * - Utils::sinc(meshi.y * nmy) * Utils::sinc(meshi.z * nmz), - 2.0 * cao); + auto const n2 = sqr(nmx) + sqr(nmy) + sqr(nmz); + auto const ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2); + auto const ex2 = sqr(ex); + auto const U2 = + pow(Utils::sinc(meshi.x * nmx) * Utils::sinc(meshi.y * nmy) * + Utils::sinc(meshi.z * nmz), + 2.0 * cao); alias1 += ex2 / n2; alias2 += U2 * ex; @@ -318,14 +299,11 @@ double p3m_k_space_error_gpu(double prefactor, const int *mesh, int cao, int npart, double sum_q2, double alpha_L, const double *box) { static thrust::device_vector he_q; + he_q.resize(static_cast(mesh[0] * mesh[1] * mesh[2])); - const std::size_t mesh_size = mesh[0] * mesh[1] * mesh[2]; - - he_q.resize(mesh_size); - - dim3 grid(std::max(1, mesh[0] / 8 + 1), - std::max(1, mesh[1] / 8 + 1), - std::max(1, mesh[2] / 8 + 1)); + dim3 grid(std::max(1u, static_cast(mesh[0] / 8 + 1)), + std::max(1u, static_cast(mesh[1] / 8 + 1)), + std::max(1u, static_cast(mesh[2] / 8 + 1))); dim3 block(8, 8, 8); @@ -372,6 +350,5 @@ double p3m_k_space_error_gpu(double prefactor, const int *mesh, int cao, auto const he_q_final = thrust::reduce(he_q.begin(), he_q.end()); - return 2.0 * prefactor * sum_q2 * sqrt(he_q_final / npart) / - (box[1] * box[2]); + return 2 * prefactor * sum_q2 * sqrt(he_q_final / npart) / (box[1] * box[2]); } From 48592362a65c41fc740b1027824719100dbe3155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 1 Dec 2021 21:58:56 +0100 Subject: [PATCH 03/21] core: Cleanup header includes --- src/core/EspressoSystemInterface.cpp | 2 ++ src/core/EspressoSystemInterface.hpp | 3 +++ src/core/actor/DipolarBarnesHut.hpp | 2 -- src/core/actor/DipolarBarnesHut_cuda.cu | 3 +++ src/core/actor/DipolarDirectSum.hpp | 2 -- src/core/cuda_interface.cpp | 1 + src/core/electrostatics_magnetostatics/coulomb_inline.hpp | 1 + src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu | 3 ++- src/core/event.cpp | 2 -- src/core/grid_based_algorithms/electrokinetics_cuda.cu | 1 + src/core/immersed_boundary/ibm_triel.cpp | 4 ++-- src/core/pair_criteria/pair_criteria.hpp | 1 + 12 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp index 3784f39903f..61bc74e830d 100644 --- a/src/core/EspressoSystemInterface.cpp +++ b/src/core/EspressoSystemInterface.cpp @@ -23,6 +23,8 @@ #include "cuda_interface.hpp" #include "grid.hpp" +#include + /* Initialize instance pointer */ EspressoSystemInterface *EspressoSystemInterface::m_instance = nullptr; diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index cdcad3db7f8..91329c65c7f 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -20,8 +20,11 @@ #define ESPRESSOSYSTEMINTERFACE_H #include "SystemInterface.hpp" +#include "config.hpp" 
#include "cuda_interface.hpp" +#include + #include class EspressoSystemInterface : public SystemInterface { diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 98364fde674..1bf3815527b 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -31,8 +31,6 @@ #include "electrostatics_magnetostatics/dipole.hpp" #include "errorhandling.hpp" -#include - class DipolarBarnesHut : public Actor { public: DipolarBarnesHut(SystemInterface &s, float epssq, float itolsq) { diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index baadc95522e..456895976ff 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -36,6 +36,9 @@ #include +#include +#include + // Method performance/accuracy parameters __constant__ float epssqd[1], itolsqd[1]; // blkcntd is a factual blocks' count. diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 1938dd224ab..ba5cdc9d05c 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -30,8 +30,6 @@ #include "errorhandling.hpp" #include "grid.hpp" -#include - void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E); diff --git a/src/core/cuda_interface.cpp b/src/core/cuda_interface.cpp index 8b5b87d9927..a92fe9e4bfb 100644 --- a/src/core/cuda_interface.cpp +++ b/src/core/cuda_interface.cpp @@ -28,6 +28,7 @@ #include "nonbonded_interactions/nonbonded_interaction_data.hpp" #include "serialization/CUDA_particle_data.hpp" +#include #include #include diff --git a/src/core/electrostatics_magnetostatics/coulomb_inline.hpp b/src/core/electrostatics_magnetostatics/coulomb_inline.hpp index d2a0837ccf4..e3c912c94ae 100644 --- a/src/core/electrostatics_magnetostatics/coulomb_inline.hpp +++ b/src/core/electrostatics_magnetostatics/coulomb_inline.hpp @@ -35,6 +35,7 @@ #include #include +#include #include namespace Coulomb { diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu index 2d2a40604b8..3b6591c4388 100644 --- a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu +++ b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu @@ -36,7 +36,8 @@ #include -#include +#include +#include #if defined(OMPI_MPI_H) || defined(_MPI_H) #error CU-file includes mpi.h! This should not happen! 
diff --git a/src/core/event.cpp b/src/core/event.cpp index 2753a609305..883818372fd 100644 --- a/src/core/event.cpp +++ b/src/core/event.cpp @@ -56,8 +56,6 @@ #include -#include - #include /** whether the thermostat has to be reinitialized before integration */ diff --git a/src/core/grid_based_algorithms/electrokinetics_cuda.cu b/src/core/grid_based_algorithms/electrokinetics_cuda.cu index b7ad6e3ff1c..9640489e2b5 100644 --- a/src/core/grid_based_algorithms/electrokinetics_cuda.cu +++ b/src/core/grid_based_algorithms/electrokinetics_cuda.cu @@ -46,6 +46,7 @@ #include #include +#include #include #include #include diff --git a/src/core/immersed_boundary/ibm_triel.cpp b/src/core/immersed_boundary/ibm_triel.cpp index 8bebd14df86..123ffa587ef 100644 --- a/src/core/immersed_boundary/ibm_triel.cpp +++ b/src/core/immersed_boundary/ibm_triel.cpp @@ -27,8 +27,8 @@ #include #include -#include -#include +#include + #include namespace { diff --git a/src/core/pair_criteria/pair_criteria.hpp b/src/core/pair_criteria/pair_criteria.hpp index ac4d787c693..fde2c8b4a2b 100644 --- a/src/core/pair_criteria/pair_criteria.hpp +++ b/src/core/pair_criteria/pair_criteria.hpp @@ -21,6 +21,7 @@ #include "Particle.hpp" #include "energy_inline.hpp" +#include "grid.hpp" #include "particle_data.hpp" #include From 851d218afe178300ca60d7a758aa1cc21c834e62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 2 Dec 2021 20:53:58 +0100 Subject: [PATCH 04/21] core: Extract GPU check logic --- src/core/cuda_common_cuda.cu | 14 +------------- src/core/cuda_init.hpp | 7 +++++++ src/core/cuda_init_cuda.cu | 14 ++++++++++++++ 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/core/cuda_common_cuda.cu b/src/core/cuda_common_cuda.cu index 1f6b867e9e3..e5d97e8baf3 100644 --- a/src/core/cuda_common_cuda.cu +++ b/src/core/cuda_common_cuda.cu @@ -133,19 +133,7 @@ void resize_buffers(std::size_t number_of_particles) { void gpu_init_particle_comm() { if (this_node == 0 && global_part_vars_host.communication_enabled == 0) { try { - if (cuda_get_n_gpus() == 0) { - fprintf(stderr, "ERROR: No GPU was found.\n"); - errexit(); - } - auto const devID = cuda_get_device(); - auto const compute_capability = cuda_check_gpu_compute_capability(devID); - auto const communication_test = cuda_test_device_access(); - if (compute_capability != ES_OK or communication_test != ES_OK) { - fprintf(stderr, - "ERROR: CUDA device %i is not capable of running ESPResSo.\n", - devID); - errexit(); - } + cuda_check_device(); } catch (cuda_runtime_error const &err) { fprintf(stderr, "ERROR: %s\n", err.what()); errexit(); diff --git a/src/core/cuda_init.hpp b/src/core/cuda_init.hpp index 810cf075d9d..b81d54dc551 100644 --- a/src/core/cuda_init.hpp +++ b/src/core/cuda_init.hpp @@ -92,6 +92,13 @@ int cuda_get_device(); */ int cuda_test_device_access(); +/** + * Check that a device is available, that its compute capability + * is sufficient for ESPResSo, and that data can be written to + * and read from it. Otherwise, throw an exception. + */ +void cuda_check_device(); + /** Gather unique list of CUDA devices on all nodes. * @return vector of device properties. */ diff --git a/src/core/cuda_init_cuda.cu b/src/core/cuda_init_cuda.cu index e434d7132f0..b5616ae0da1 100644 --- a/src/core/cuda_init_cuda.cu +++ b/src/core/cuda_init_cuda.cu @@ -25,6 +25,7 @@ #include #include +#include #if defined(OMPI_MPI_H) || defined(_MPI_H) #error CU-file includes mpi.h! This should not happen! 
@@ -118,4 +119,17 @@ int cuda_test_device_access() {
   return ES_OK;
 }
 
+void cuda_check_device() {
+  if (cuda_get_n_gpus() == 0) {
+    throw cuda_runtime_error("No GPU was found.");
+  }
+  auto const devID = cuda_get_device();
+  auto const compute_capability = cuda_check_gpu_compute_capability(devID);
+  auto const communication_test = cuda_test_device_access();
+  if (compute_capability != ES_OK or communication_test != ES_OK) {
+    throw cuda_runtime_error("CUDA device " + std::to_string(devID) +
+                             " is not capable of running ESPResSo.");
+  }
+}
+
 #endif /* defined(CUDA) */

From 0dd9acb71843779bf10684d8c331e4c0a92180af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?=
Date: Thu, 2 Dec 2021 21:10:26 +0100
Subject: [PATCH 05/21] tests: Check EspressoSystemInterface class

---
 src/core/unit_tests/CMakeLists.txt            |   2 +
 .../EspressoSystemInterface_test.cpp          | 206 ++++++++++++++++++
 2 files changed, 208 insertions(+)
 create mode 100644 src/core/unit_tests/EspressoSystemInterface_test.cpp

diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt
index 6f9c701753c..5e051fce5f3 100644
--- a/src/core/unit_tests/CMakeLists.txt
+++ b/src/core/unit_tests/CMakeLists.txt
@@ -27,6 +27,8 @@ unit_test(NAME RuntimeErrorCollector_test SRC RuntimeErrorCollector_test.cpp
 unit_test(NAME EspressoSystemStandAlone_test SRC
           EspressoSystemStandAlone_test.cpp DEPENDS EspressoCore Boost::mpi
           MPI::MPI_CXX NUM_PROC 2)
+unit_test(NAME EspressoSystemInterface_test SRC
+          EspressoSystemInterface_test.cpp DEPENDS EspressoCore Boost::mpi)
 unit_test(NAME MpiCallbacks_test SRC MpiCallbacks_test.cpp DEPENDS
           EspressoUtils Boost::mpi MPI::MPI_CXX NUM_PROC 2)
 unit_test(NAME ParticleIterator_test SRC ParticleIterator_test.cpp DEPENDS
diff --git a/src/core/unit_tests/EspressoSystemInterface_test.cpp b/src/core/unit_tests/EspressoSystemInterface_test.cpp
new file mode 100644
index 00000000000..a270579384f
--- /dev/null
+++ b/src/core/unit_tests/EspressoSystemInterface_test.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2021 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#define BOOST_TEST_NO_MAIN +#define BOOST_TEST_MODULE EspressoSystemInterface test +#define BOOST_TEST_ALTERNATIVE_INIT_API +#define BOOST_TEST_DYN_LINK +#include + +#include "EspressoSystemInterface.hpp" +#include "communication.hpp" +#include "config.hpp" +#include "particle_data.hpp" +#include "virtual_sites.hpp" +#include "virtual_sites/VirtualSitesOff.hpp" + +#include + +#include +#include + +namespace espresso { +auto &system = EspressoSystemInterface::Instance(); +} + +inline void check_uninitialized_device_pointers() { + BOOST_CHECK_EQUAL(espresso::system.eGpu(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.rGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.rGpuEnd(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.dipGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.dipGpuEnd(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.fGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.fGpuEnd(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.torqueGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.torqueGpuEnd(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.qGpuBegin(), nullptr); + BOOST_CHECK_EQUAL(espresso::system.qGpuEnd(), nullptr); +} + +#ifdef CUDA + +#include "cuda_init.hpp" +#include "cuda_utils.hpp" + +/* Decorator to run a unit test depending on GPU availability. */ +boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) { + bool has_compatible_gpu = false; + try { + cuda_check_device(); + has_compatible_gpu = true; + } catch (cuda_runtime_error const &err) { + } + return has_compatible_gpu; +} + +BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { + check_uninitialized_device_pointers(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + + // features compiled in + auto has_feature_rotation = false; + auto has_feature_electrostatics = false; + auto has_feature_dipoles = false; +#ifdef ROTATION + has_feature_rotation = true; +#endif +#ifdef ELECTROSTATICS + has_feature_electrostatics = true; +#endif +#ifdef DIPOLES + has_feature_dipoles = true; +#endif + + auto const pid = 1; + place_particle(pid, {0., 0., 0.}); + set_particle_type(pid, 0); + + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + espresso::system.update(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + BOOST_TEST(espresso::system.requestParticleStructGpu()); + espresso::system.update(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 1); + + // check position split + BOOST_TEST(espresso::system.hasRGpu()); + BOOST_TEST(espresso::system.requestRGpu()); + espresso::system.update(); + BOOST_TEST(espresso::system.rGpuBegin() != nullptr); + BOOST_TEST(espresso::system.rGpuEnd() != nullptr); + BOOST_CHECK_EQUAL(espresso::system.rGpuEnd() - espresso::system.rGpuBegin(), + 3); + + // check force split + BOOST_TEST(espresso::system.hasFGpu()); + BOOST_TEST(espresso::system.requestFGpu()); + espresso::system.update(); + BOOST_TEST(espresso::system.fGpuBegin() != nullptr); + BOOST_TEST(espresso::system.fGpuEnd() != nullptr); + BOOST_CHECK_EQUAL(espresso::system.fGpuEnd() - espresso::system.fGpuBegin(), + 3); + + // check torque split + BOOST_CHECK_EQUAL(espresso::system.hasTorqueGpu(), has_feature_rotation); + BOOST_CHECK_EQUAL(espresso::system.requestTorqueGpu(), has_feature_rotation); +#ifdef ROTATION + espresso::system.update(); + BOOST_TEST(espresso::system.torqueGpuBegin() != nullptr); + BOOST_TEST(espresso::system.torqueGpuEnd() != nullptr); + BOOST_CHECK_EQUAL( + espresso::system.torqueGpuEnd() - 
espresso::system.torqueGpuBegin(), 3); +#endif + + // check charge split + BOOST_CHECK_EQUAL(espresso::system.hasQGpu(), has_feature_electrostatics); + BOOST_CHECK_EQUAL(espresso::system.requestQGpu(), has_feature_electrostatics); +#ifdef ELECTROSTATICS + espresso::system.update(); + BOOST_TEST(espresso::system.qGpuBegin() != nullptr); + BOOST_TEST(espresso::system.qGpuEnd() != nullptr); + BOOST_CHECK_EQUAL(espresso::system.qGpuEnd() - espresso::system.qGpuBegin(), + 3); +#endif + + // check dipole split + BOOST_CHECK_EQUAL(espresso::system.hasDipGpu(), has_feature_dipoles); + BOOST_CHECK_EQUAL(espresso::system.requestDipGpu(), has_feature_dipoles); +#ifdef DIPOLES + espresso::system.update(); + BOOST_TEST(espresso::system.dipGpuBegin() != nullptr); + BOOST_TEST(espresso::system.dipGpuEnd() != nullptr); + BOOST_CHECK_EQUAL( + espresso::system.dipGpuEnd() - espresso::system.dipGpuBegin(), 3); +#endif + + // clear device memory + remove_particle(pid); + espresso::system.update(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + BOOST_CHECK_EQUAL(espresso::system.rGpuBegin(), espresso::system.rGpuEnd()); + BOOST_CHECK_EQUAL(espresso::system.fGpuBegin(), espresso::system.fGpuEnd()); +#ifdef ELECTROSTATICS + BOOST_CHECK_EQUAL(espresso::system.qGpuBegin(), espresso::system.qGpuEnd()); +#endif +#ifdef DIPOLES + BOOST_CHECK_EQUAL(espresso::system.dipGpuBegin(), + espresso::system.dipGpuEnd()); +#endif +#ifdef ROTATION + BOOST_CHECK_EQUAL(espresso::system.torqueGpuBegin(), + espresso::system.torqueGpuEnd()); +#endif +} + +#else // CUDA + +BOOST_AUTO_TEST_CASE(check_without_cuda) { + check_uninitialized_device_pointers(); + BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); + BOOST_TEST(!espresso::system.hasRGpu()); + BOOST_TEST(!espresso::system.requestRGpu()); + BOOST_TEST(!espresso::system.hasDipGpu()); + BOOST_TEST(!espresso::system.requestDipGpu()); + BOOST_TEST(!espresso::system.hasFGpu()); + BOOST_TEST(!espresso::system.requestFGpu()); + BOOST_TEST(!espresso::system.hasTorqueGpu()); + BOOST_TEST(!espresso::system.requestTorqueGpu()); + BOOST_TEST(!espresso::system.hasQGpu()); + BOOST_TEST(!espresso::system.requestQGpu()); +} + +#endif // CUDA + +int main(int argc, char **argv) { + auto mpi_env = mpi_init(argc, argv); + + // this unit test only runs on 1 core + assert(boost::mpi::communicator().size() == 1); + + // initialize the MpiCallbacks framework + Communication::init(mpi_env); +#ifdef VIRTUAL_SITES + set_virtual_sites(std::make_shared()); +#endif + mpi_loop(); + + espresso::system.init(); + + return boost::unit_test::unit_test_main(init_unit_test, argc, argv); +} From b27eee642a5a31bbf8cae4638acca4449a477de2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 16:48:27 +0100 Subject: [PATCH 06/21] core: Remove unused GPU interface for AoS->SoA No features require the particle velocity and director to be converted from an array of structs to a struct of arrays. 
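For context, the helpers deleted below all follow the same AoS-to-SoA split
pattern: one thread per particle reads the array-of-structs particle buffer
and scatters one property into a flat float array. A minimal sketch of that
pattern is given here; ParticleAoS and split_velocities are illustrative
names, not part of the ESPResSo API:

    // One thread per particle: copy the velocity of particle idx from the
    // array-of-structs buffer into a contiguous float array of length 3 * n.
    struct ParticleAoS {
      float v[3]; // velocity; other members omitted
    };

    __global__ void split_velocities(ParticleAoS const *particles, float *v,
                                     unsigned int n) {
      auto idx = blockDim.x * blockIdx.x + threadIdx.x;
      if (idx >= n)
        return;
      ParticleAoS const p = particles[idx];
      idx *= 3;
      v[idx + 0] = p.v[0];
      v[idx + 1] = p.v[1];
      v[idx + 2] = p.v[2];
    }

Since no actor ever requests the velocity or director split, these two
variants of the kernel are dead code and can be removed.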
--- src/core/EspressoSystemInterface.hpp | 35 +------------- src/core/EspressoSystemInterface_cuda.cu | 59 ------------------------ src/core/SystemInterface.hpp | 24 +--------- 3 files changed, 4 insertions(+), 114 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 91329c65c7f..e22507d81f0 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -71,17 +71,6 @@ class EspressoSystemInterface : public SystemInterface { return m_needsDipGpu; }; #endif - float *vGpuBegin() override { return m_v_gpu_begin; }; - float *vGpuEnd() override { return m_v_gpu_end; }; - bool hasVGpu() override { return true; }; - bool requestVGpu() override { - m_needsVGpu = hasVGpu(); - m_splitParticleStructGpu |= m_needsVGpu; - m_gpu |= m_needsVGpu; - if (m_gpu) - enableParticleCommunication(); - return m_needsVGpu; - }; #ifdef ELECTROSTATICS float *qGpuBegin() override { return m_q_gpu_begin; }; @@ -97,18 +86,6 @@ class EspressoSystemInterface : public SystemInterface { }; #endif - float *directorGpuBegin() override { return m_director_gpu_begin; }; - float *directorGpuEnd() override { return m_director_gpu_end; }; - bool hasDirectorGpu() override { return true; }; - bool requestDirectorGpu() override { - m_needsDirectorGpu = hasDirectorGpu(); - m_splitParticleStructGpu |= m_needsDirectorGpu; - m_gpu |= m_needsDirectorGpu; - if (m_gpu) - enableParticleCommunication(); - return m_needsDirectorGpu; - }; - bool requestParticleStructGpu() { m_needsParticleStructGpu = true; m_gpu |= m_needsParticleStructGpu; @@ -169,10 +146,8 @@ class EspressoSystemInterface : public SystemInterface { EspressoSystemInterface() : m_gpu(false), m_r_gpu_begin(nullptr), m_r_gpu_end(nullptr), m_dip_gpu_begin(nullptr), m_dip_gpu_end(nullptr), - m_v_gpu_begin(nullptr), m_v_gpu_end(nullptr), m_q_gpu_begin(nullptr), - m_q_gpu_end(nullptr), m_director_gpu_begin(nullptr), - m_director_gpu_end(nullptr), m_needsParticleStructGpu(false), - m_splitParticleStructGpu(false){}; + m_q_gpu_begin(nullptr), m_q_gpu_end(nullptr), + m_needsParticleStructGpu(false), m_splitParticleStructGpu(false){}; ~EspressoSystemInterface() override = default; void gatherParticles(); @@ -197,15 +172,9 @@ class EspressoSystemInterface : public SystemInterface { float *m_dip_gpu_begin; float *m_dip_gpu_end; - float *m_v_gpu_begin; - float *m_v_gpu_end; - float *m_q_gpu_begin; float *m_q_gpu_end; - float *m_director_gpu_begin; - float *m_director_gpu_end; - bool m_needsParticleStructGpu; bool m_splitParticleStructGpu; }; diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 2904f2994fb..4d62086a3ef 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ b/src/core/EspressoSystemInterface_cuda.cu @@ -80,24 +80,6 @@ __global__ void split_kernel_r(CUDA_particle_data *particles, float *r, r[idx + 2] = p.p[2]; } -#ifdef CUDA -// Velocity -__global__ void split_kernel_v(CUDA_particle_data *particles, float *v, - unsigned int n) { - auto idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx >= n) - return; - - CUDA_particle_data p = particles[idx]; - - idx *= 3; - - v[idx + 0] = p.v[0]; - v[idx + 1] = p.v[1]; - v[idx + 2] = p.v[2]; -} -#endif - #ifdef DIPOLES // Dipole moment __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip, @@ -116,23 +98,6 @@ __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip, } #endif -__global__ void split_kernel_director(CUDA_particle_data *particles, - float 
*director, unsigned int n) { -#ifdef ROTATION - auto idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx >= n) - return; - - CUDA_particle_data p = particles[idx]; - - idx *= 3; - - director[idx + 0] = p.director[0]; - director[idx + 1] = p.director[1]; - director[idx + 2] = p.director[2]; -#endif -} - void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { if (m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == nullptr))) { if (m_r_gpu_begin != nullptr) @@ -148,13 +113,6 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { m_dip_gpu_end = m_dip_gpu_begin + 3 * n; } #endif - if (m_needsVGpu && ((n != m_gpu_npart) || (m_v_gpu_begin == nullptr))) { - if (m_v_gpu_begin != nullptr) - cuda_safe_mem(cudaFree(m_v_gpu_begin)); - cuda_safe_mem(cudaMalloc(&m_v_gpu_begin, 3 * n * sizeof(float))); - m_v_gpu_end = m_v_gpu_begin + 3 * n; - } - if (m_needsQGpu && ((n != m_gpu_npart) || (m_q_gpu_begin == nullptr))) { if (m_q_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_q_gpu_begin)); @@ -162,14 +120,6 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { m_q_gpu_end = m_q_gpu_begin + 3 * n; } - if (m_needsDirectorGpu && - ((n != m_gpu_npart) || (m_director_gpu_begin == nullptr))) { - if (m_director_gpu_begin != nullptr) - cuda_safe_mem(cudaFree(m_director_gpu_begin)); - cuda_safe_mem(cudaMalloc(&m_director_gpu_begin, 3 * n * sizeof(float))); - m_director_gpu_end = m_director_gpu_begin + 3 * n; - } - m_gpu_npart = n; } @@ -191,19 +141,10 @@ void EspressoSystemInterface::split_particle_struct() { if (!m_needsQGpu && m_needsRGpu) split_kernel_r<<>>( device_particles.data(), m_r_gpu_begin, n); -#ifdef CUDA - if (m_needsVGpu) - split_kernel_v<<>>( - device_particles.data(), m_v_gpu_begin, n); -#endif #ifdef DIPOLES if (m_needsDipGpu) split_kernel_dip<<>>( device_particles.data(), m_dip_gpu_begin, n); #endif - - if (m_needsDirectorGpu) - split_kernel_director<<>>( - device_particles.data(), m_director_gpu_begin, n); } diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index 5f78a98581b..73c577dfdfe 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -30,9 +30,8 @@ class SystemInterface { public: SystemInterface() - : m_needsRGpu(false), m_needsVGpu(false), m_needsQGpu(false), - m_needsDirectorGpu(false), m_needsFGpu(false), m_needsDipGpu(false), - m_needsTorqueGpu(false){}; + : m_needsRGpu(false), m_needsQGpu(false), m_needsFGpu(false), + m_needsDipGpu(false), m_needsTorqueGpu(false){}; typedef Utils::Vector3d Vector3; typedef double Real; @@ -63,14 +62,6 @@ class SystemInterface { return m_needsTorqueGpu; } - virtual float *vGpuBegin() { return nullptr; }; - virtual float *vGpuEnd() { return nullptr; }; - virtual bool hasVGpu() { return false; }; - virtual bool requestVGpu() { - m_needsVGpu = hasVGpu(); - return m_needsVGpu; - } - virtual float *qGpuBegin() { return nullptr; }; virtual float *qGpuEnd() { return nullptr; }; virtual bool hasQGpu() { return false; }; @@ -88,30 +79,19 @@ class SystemInterface { return m_needsFGpu; } - virtual float *directorGpuBegin() { return nullptr; }; - virtual float *directorGpuEnd() { return nullptr; }; - virtual bool hasDirectorGpu() { return false; }; - virtual bool requestDirectorGpu() { - m_needsDirectorGpu = hasDirectorGpu(); - return m_needsDirectorGpu; - } - virtual std::size_t npart_gpu() const { return 0; }; virtual Vector3 box() const = 0; virtual bool needsRGpu() { return m_needsRGpu; }; virtual bool needsDipGpu() { return m_needsRGpu; }; virtual bool needsQGpu() 
{ return m_needsQGpu; }; - virtual bool needsDirectorGpu() { return m_needsDirectorGpu; }; virtual bool needsFGpu() { return m_needsFGpu; }; virtual bool needsTorqueGpu() { return m_needsTorqueGpu; }; virtual ~SystemInterface() = default; protected: bool m_needsRGpu; - bool m_needsVGpu; bool m_needsQGpu; - bool m_needsDirectorGpu; bool m_needsFGpu; bool m_needsDipGpu; bool m_needsTorqueGpu; From 9277a8fb89e644eefaa0759af70145c6a6e0fb4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 16:57:01 +0100 Subject: [PATCH 07/21] core: Re-organize SystemInterface Group functions and globals by particle property. Cleanup ifdefs and comments. --- src/core/EspressoSystemInterface.cpp | 4 ---- src/core/EspressoSystemInterface.hpp | 26 ++++++++++++------------ src/core/EspressoSystemInterface_cuda.cu | 7 ++++--- src/core/SystemInterface.hpp | 5 ++--- src/core/actor/DipolarBarnesHut.hpp | 4 ++-- src/core/actor/DipolarDirectSum.hpp | 4 ++-- src/core/cuda_common_cuda.cu | 9 +++++--- src/core/cuda_interface.cpp | 16 ++------------- src/core/cuda_interface.hpp | 7 ++----- 9 files changed, 33 insertions(+), 49 deletions(-) diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp index 61bc74e830d..fc09ddb1994 100644 --- a/src/core/EspressoSystemInterface.cpp +++ b/src/core/EspressoSystemInterface.cpp @@ -25,13 +25,9 @@ #include -/* Initialize instance pointer */ EspressoSystemInterface *EspressoSystemInterface::m_instance = nullptr; -/********************************************************************************************/ - void EspressoSystemInterface::gatherParticles() { -// get particles from other nodes #ifdef CUDA if (m_gpu) { if (gpu_get_global_particle_vars_pointer_host()->communication_enabled) { diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index e22507d81f0..ef1e9fc4d38 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -58,6 +58,7 @@ class EspressoSystemInterface : public SystemInterface { enableParticleCommunication(); return m_needsRGpu; }; + #ifdef DIPOLES float *dipGpuBegin() override { return m_dip_gpu_begin; }; float *dipGpuEnd() override { return m_dip_gpu_end; }; @@ -98,17 +99,6 @@ class EspressoSystemInterface : public SystemInterface { float *fGpuEnd() override { return gpu_get_particle_force_pointer() + 3 * m_gpu_npart; }; - float *eGpu() override { - // cast pointer to struct of floats to array of floats - // https://stackoverflow.com/a/29278260 - return reinterpret_cast(gpu_get_energy_pointer()); - }; - float *torqueGpuBegin() override { - return gpu_get_particle_torque_pointer(); - }; - float *torqueGpuEnd() override { - return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart; - }; bool hasFGpu() override { return true; }; bool requestFGpu() override { m_needsFGpu = hasFGpu(); @@ -119,6 +109,12 @@ class EspressoSystemInterface : public SystemInterface { }; #ifdef ROTATION + float *torqueGpuBegin() override { + return gpu_get_particle_torque_pointer(); + }; + float *torqueGpuEnd() override { + return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart; + }; bool hasTorqueGpu() override { return true; }; bool requestTorqueGpu() override { m_needsTorqueGpu = hasTorqueGpu(); @@ -129,6 +125,12 @@ class EspressoSystemInterface : public SystemInterface { }; #endif + float *eGpu() override { + // cast pointer to struct of floats to array of floats + // https://stackoverflow.com/a/29278260 + return 
reinterpret_cast(gpu_get_energy_pointer()); + }; + #endif // ifdef CUDA Utils::Vector3d box() const override; @@ -179,6 +181,4 @@ class EspressoSystemInterface : public SystemInterface { bool m_splitParticleStructGpu; }; -/********************************************************************************************/ - #endif diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 4d62086a3ef..53cb31b43df 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ b/src/core/EspressoSystemInterface_cuda.cu @@ -113,12 +113,14 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { m_dip_gpu_end = m_dip_gpu_begin + 3 * n; } #endif +#ifdef ELECTROSTATICS if (m_needsQGpu && ((n != m_gpu_npart) || (m_q_gpu_begin == nullptr))) { if (m_q_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_q_gpu_begin)); cuda_safe_mem(cudaMalloc(&m_q_gpu_begin, 3 * n * sizeof(float))); m_q_gpu_end = m_q_gpu_begin + 3 * n; } +#endif m_gpu_npart = n; } @@ -135,16 +137,15 @@ void EspressoSystemInterface::split_particle_struct() { if (m_needsQGpu && m_needsRGpu) split_kernel_rq<<>>( device_particles.data(), m_r_gpu_begin, m_q_gpu_begin, n); - if (m_needsQGpu && !m_needsRGpu) + else if (m_needsQGpu) split_kernel_q<<>>( device_particles.data(), m_q_gpu_begin, n); - if (!m_needsQGpu && m_needsRGpu) + else if (m_needsRGpu) split_kernel_r<<>>( device_particles.data(), m_r_gpu_begin, n); #ifdef DIPOLES if (m_needsDipGpu) split_kernel_dip<<>>( device_particles.data(), m_dip_gpu_begin, n); - #endif } diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index 73c577dfdfe..cd8b1fe1be6 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -25,8 +25,6 @@ #include -/** @todo: Turn needsXY in getter/setter **/ - class SystemInterface { public: SystemInterface() @@ -72,13 +70,14 @@ class SystemInterface { virtual float *fGpuBegin() { return nullptr; }; virtual float *fGpuEnd() { return nullptr; }; - virtual float *eGpu() { return nullptr; }; virtual bool hasFGpu() { return false; }; virtual bool requestFGpu() { m_needsFGpu = hasFGpu(); return m_needsFGpu; } + virtual float *eGpu() { return nullptr; }; + virtual std::size_t npart_gpu() const { return 0; }; virtual Vector3 box() const = 0; diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 1bf3815527b..1db84237d44 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -98,6 +98,6 @@ class DipolarBarnesHut : public Actor { void activate_dipolar_barnes_hut(float epssq, float itolsq); void deactivate_dipolar_barnes_hut(); -#endif - #endif // DIPOLAR_BARNES_HUT + +#endif // ACTOR_DIPOLARBARNESHUT_HPP diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index ba5cdc9d05c..48c13b8aa3a 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -82,6 +82,6 @@ class DipolarDirectSum : public Actor { void activate_dipolar_direct_sum_gpu(); void deactivate_dipolar_direct_sum_gpu(); -#endif +#endif // DIPOLAR_DIRECT_SUM -#endif +#endif // ACTOR_DIPOLARDIRECTSUM_HPP diff --git a/src/core/cuda_common_cuda.cu b/src/core/cuda_common_cuda.cu index e5d97e8baf3..2a084138012 100644 --- a/src/core/cuda_common_cuda.cu +++ b/src/core/cuda_common_cuda.cu @@ -54,7 +54,9 @@ template std::size_t byte_size(SpanLike const &v) { /** struct for particle force */ static device_vector particle_forces_device; +#ifdef ROTATION static device_vector particle_torques_device; 
+#endif /** struct for particle position and velocity */ static device_vector particle_data_device; @@ -63,9 +65,8 @@ static CUDA_energy *energy_device = nullptr; pinned_vector particle_data_host; pinned_vector particle_forces_host; -CUDA_energy energy_host; - pinned_vector particle_torques_host; +CUDA_energy energy_host; cudaStream_t stream[1]; @@ -151,10 +152,12 @@ CUDA_global_part_vars *gpu_get_global_particle_vars_pointer_host() { float *gpu_get_particle_force_pointer() { return raw_data_pointer(particle_forces_device); } -CUDA_energy *gpu_get_energy_pointer() { return energy_device; } +#ifdef ROTATION float *gpu_get_particle_torque_pointer() { return raw_data_pointer(particle_torques_device); } +#endif +CUDA_energy *gpu_get_energy_pointer() { return energy_device; } void copy_part_data_to_gpu(ParticleRange particles) { if (global_part_vars_host.communication_enabled == 1) { diff --git a/src/core/cuda_interface.cpp b/src/core/cuda_interface.cpp index a92fe9e4bfb..26971b8f4eb 100644 --- a/src/core/cuda_interface.cpp +++ b/src/core/cuda_interface.cpp @@ -34,10 +34,6 @@ #include -/* TODO: We should only transfer data for enabled methods, - not for those that are barely compiled in. (fw) -*/ - static void pack_particles(ParticleRange particles, CUDA_particle_data *buffer) { using Utils::Vector3f; @@ -46,19 +42,17 @@ static void pack_particles(ParticleRange particles, for (auto const &part : particles) { buffer[i].p = static_cast(folded_position(part.r.p, box_geo)); -#ifdef CUDA buffer[i].identity = part.p.identity; buffer[i].v = static_cast(part.m.v); #ifdef VIRTUAL_SITES buffer[i].is_virtual = part.p.is_virtual; #endif -#endif #ifdef DIPOLES buffer[i].dip = static_cast(part.calc_dip()); #endif -#if defined(LB_ELECTROHYDRODYNAMICS) && defined(CUDA) +#ifdef LB_ELECTROHYDRODYNAMICS buffer[i].mu_E = static_cast(part.p.mu_E); #endif @@ -141,25 +135,19 @@ void cuda_mpi_send_forces(const ParticleRange &particles, if (this_node > 0) { static std::vector buffer_forces; static std::vector buffer_torques; - /* Alloc buffer */ - buffer_forces.resize(n_elements); + buffer_forces.resize(n_elements); Utils::Mpi::scatter_buffer(buffer_forces.data(), n_elements, comm_cart); #ifdef ROTATION - /* Alloc buffer */ buffer_torques.resize(n_elements); - Utils::Mpi::scatter_buffer(buffer_torques.data(), n_elements, comm_cart); #endif - add_forces_and_torques(particles, buffer_forces, buffer_torques); } else { - /* Scatter forces */ Utils::Mpi::scatter_buffer(host_forces.data(), n_elements, comm_cart); #ifdef ROTATION Utils::Mpi::scatter_buffer(host_torques.data(), n_elements, comm_cart); #endif - add_forces_and_torques(particles, host_forces, host_torques); } } diff --git a/src/core/cuda_interface.hpp b/src/core/cuda_interface.hpp index dcff4826a4e..7844c1db3d8 100644 --- a/src/core/cuda_interface.hpp +++ b/src/core/cuda_interface.hpp @@ -54,25 +54,23 @@ struct CUDA_particle_data { CUDA_ParticleParametersSwimming swim; #endif - /** particle position given from md part*/ Vector3f p; - /** particle id */ int identity; + #ifdef VIRTUAL_SITES bool is_virtual; #else static constexpr const bool is_virtual = false; #endif - /** particle momentum struct velocity p.m->v*/ Vector3f v; #ifdef ROTATION Vector3f director; #endif -#if defined(LB_ELECTROHYDRODYNAMICS) && defined(CUDA) +#ifdef LB_ELECTROHYDRODYNAMICS Vector3f mu_E; #endif @@ -116,7 +114,6 @@ float *gpu_get_particle_torque_pointer(); #endif CUDA_energy *gpu_get_energy_pointer(); -float *gpu_get_particle_torque_pointer(); void gpu_init_particle_comm(); void 
cuda_mpi_get_particles( From 607dad4821b50de0634e5304b6706f1df93f6b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 17:27:15 +0100 Subject: [PATCH 08/21] core: Narrow CUDA electrostatics interface Remove unused class members and function overloads. Hide implementation details of MMM1DGPU from the Python interface. Request GPU SoA for particle torques and positions in the core. Activate MMM1DGPU when added to the list of actors, not during construction. Use automatic memory management for MMM1DGPU. --- src/core/EspressoSystemInterface.hpp | 7 ------- src/core/actor/DipolarBarnesHut.hpp | 3 +++ src/core/actor/DipolarDirectSum.hpp | 3 +++ src/core/actor/Mmm1dgpuForce.hpp | 6 ++---- src/core/actor/Mmm1dgpuForce_cuda.cu | 16 ++++----------- src/python/espressomd/electrostatics.pxd | 25 ++---------------------- src/python/espressomd/electrostatics.pyx | 25 +++++++++--------------- 7 files changed, 23 insertions(+), 62 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index ef1e9fc4d38..0395a75cb81 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -36,13 +36,6 @@ class EspressoSystemInterface : public SystemInterface { return *m_instance; }; - static EspressoSystemInterface *_Instance() { - if (!m_instance) - m_instance = new EspressoSystemInterface; - - return m_instance; - }; - void init() override; void update() override; diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 1db84237d44..35fb9ac0701 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -41,6 +41,9 @@ class DipolarBarnesHut : public Actor { if (!s.requestFGpu()) runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; + if (!s.requestTorqueGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to torques on GPU!"; + if (!s.requestRGpu()) runtimeErrorMsg() << "DipolarBarnesHut needs access to positions on GPU!"; diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 48c13b8aa3a..02a08722813 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -46,6 +46,9 @@ class DipolarDirectSum : public Actor { if (!s.requestFGpu()) runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; + if (!s.requestTorqueGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to torques on GPU!"; + if (!s.requestRGpu()) runtimeErrorMsg() << "DipolarDirectSum needs access to positions on GPU!"; diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp index a767a38cad5..293154c1a30 100644 --- a/src/core/actor/Mmm1dgpuForce.hpp +++ b/src/core/actor/Mmm1dgpuForce.hpp @@ -29,8 +29,7 @@ class Mmm1dgpuForce : public Actor { public: // constructor - Mmm1dgpuForce(SystemInterface &s, float coulomb_prefactor, float maxPWerror, - float far_switch_radius = -1, int bessel_cutoff = -1); + Mmm1dgpuForce(SystemInterface &s); ~Mmm1dgpuForce() override; // interface methods void computeForces(SystemInterface &s) override; @@ -40,8 +39,7 @@ class Mmm1dgpuForce : public Actor { void tune(SystemInterface &s, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff); void set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, - float _far_switch_radius, int _bessel_cutoff, - bool manual = false); + float _far_switch_radius, int _bessel_cutoff); void activate(); void deactivate(); diff --git 
a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index ed4cd132f7a..6c5a7f57a06 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -133,13 +133,10 @@ int modpsi_init() { return 0; } -Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, float _coulomb_prefactor, - float _maxPWerror, float _far_switch_radius, - int _bessel_cutoff) +Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) : numThreads(64), host_boxz(0), host_npart(0), need_tune(true), pairs(-1), - dev_forcePairs(nullptr), dev_energyBlocks(nullptr), - coulomb_prefactor(_coulomb_prefactor), maxPWerror(_maxPWerror), - far_switch_radius(_far_switch_radius), bessel_cutoff(_bessel_cutoff) { + dev_forcePairs(nullptr), dev_energyBlocks(nullptr), coulomb_prefactor(0), + maxPWerror(-1), far_switch_radius(-1), bessel_cutoff(-1) { // interface sanity checks if (!s.requestFGpu()) throw std::runtime_error("Mmm1dgpuForce needs access to forces on GPU!"); @@ -312,7 +309,7 @@ void Mmm1dgpuForce::tune(SystemInterface &s, float _maxPWerror, void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, float _far_switch_radius, - int _bessel_cutoff, bool manual) { + int _bessel_cutoff) { if (_boxz > 0 && _far_switch_radius > _boxz) { throw std::runtime_error( "switching radius must not be larger than box length"); @@ -323,11 +320,6 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, // double colons are needed to access the constant memory variables because // they are file globals and we have identically named class variables cudaSetDevice(d); - if (manual) // tuning needs to be performed again - { - far_switch_radius = _far_switch_radius; - bessel_cutoff = _bessel_cutoff; - } if (_far_switch_radius >= 0) { mmm1d_params.far_switch_radius_2 = _far_switch_radius_2; cuda_safe_mem(cudaMemcpyToSymbol(::far_switch_radius_2, diff --git a/src/python/espressomd/electrostatics.pxd b/src/python/espressomd/electrostatics.pxd index 5d687f144e4..17124972bac 100644 --- a/src/python/espressomd/electrostatics.pxd +++ b/src/python/espressomd/electrostatics.pxd @@ -27,9 +27,7 @@ cdef extern from "SystemInterface.hpp": cdef extern from "EspressoSystemInterface.hpp": cdef cppclass EspressoSystemInterface(SystemInterface): @staticmethod - EspressoSystemInterface * _Instance() - bool requestRGpu() - void update() + EspressoSystemInterface & Instance() IF ELECTROSTATICS: @@ -139,30 +137,11 @@ IF ELECTROSTATICS and MMM1D_GPU: cdef extern from "actor/Mmm1dgpuForce.hpp": cdef cppclass Mmm1dgpuForce: - Mmm1dgpuForce(SystemInterface & s, float coulomb_prefactor, float maxPWerror, float far_switch_radius, int bessel_cutoff) except+ - Mmm1dgpuForce(SystemInterface & s, float coulomb_prefactor, float maxPWerror, float far_switch_radius) except+ - Mmm1dgpuForce(SystemInterface & s, float coulomb_prefactor, float maxPWerror) except+ + Mmm1dgpuForce(SystemInterface & s) except+ void setup(SystemInterface & s) void tune(SystemInterface & s, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff) - void set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff, bool manual) void set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff) - unsigned int numThreads - unsigned int numBlocks(SystemInterface & s) - - float host_boxz - unsigned int host_npart - bool need_tune - - int pairs - float * dev_forcePairs - float * dev_energyBlocks - - float 
coulomb_prefactor, maxPWerror, far_switch_radius - int bessel_cutoff - - float force_benchmark(SystemInterface & s) - void activate() void deactivate() diff --git a/src/python/espressomd/electrostatics.pyx b/src/python/espressomd/electrostatics.pyx index 287e3d39335..269bd511ddf 100644 --- a/src/python/espressomd/electrostatics.pyx +++ b/src/python/espressomd/electrostatics.pyx @@ -16,6 +16,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . # +from libcpp.memory cimport shared_ptr, make_shared from cython.operator cimport dereference include "myconfig.pxi" from .actors cimport Actor @@ -634,21 +635,12 @@ IF ELECTROSTATICS and MMM1D_GPU: tune : :obj:`bool`, optional Specify whether to automatically tune or not. Defaults to ``True``. """ - cdef Mmm1dgpuForce * thisptr + cdef shared_ptr[Mmm1dgpuForce] sip cdef EspressoSystemInterface * interface - cdef char * log - cdef int resp def __cinit__(self): - self.interface = EspressoSystemInterface._Instance() - default_params = self.default_params() - self.thisptr = new Mmm1dgpuForce(dereference(self.interface), 0.0, default_params["maxPWerror"]) - self.interface.update() - self.interface.requestRGpu() - dereference(self.thisptr).activate() - - def __dealloc__(self): - del self.thisptr + self.interface = &EspressoSystemInterface.Instance() + self.sip = make_shared[Mmm1dgpuForce](dereference(self.interface)) def validate_params(self): default_params = self.default_params() @@ -686,27 +678,28 @@ IF ELECTROSTATICS and MMM1D_GPU: set_prefactor(self._params["prefactor"]) default_params = self.default_params() - self.thisptr.set_params( + dereference(self.sip).set_params( < float > box_geo.length()[2], < float > coulomb.prefactor, self._params["maxPWerror"], self._params["far_switch_radius"], self._params["bessel_cutoff"]) def _tune(self): - self.thisptr.setup(dereference(self.interface)) - self.thisptr.tune( + dereference(self.sip).setup(dereference(self.interface)) + dereference(self.sip).tune( dereference(self.interface), self._params["maxPWerror"], self._params["far_switch_radius"], self._params["bessel_cutoff"]) def _activate_method(self): check_neutrality(self._params) self._set_params_in_es_core() + dereference(self.sip).activate() coulomb.method = COULOMB_MMM1D_GPU if self._params["tune"]: self._tune() self._set_params_in_es_core() def _deactivate_method(self): - dereference(self.thisptr).deactivate() + dereference(self.sip).deactivate() IF ELECTROSTATICS: IF SCAFACOS == 1: From 4fc4d7047f34089f7736a903c58373dfc19cbf16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 18:52:15 +0100 Subject: [PATCH 09/21] core: Simplify SystemInterface class --- src/core/EspressoSystemInterface.hpp | 38 ++++++++++++---------- src/core/SystemInterface.hpp | 48 +++++----------------------- 2 files changed, 29 insertions(+), 57 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 0395a75cb81..918e0df08d3 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -29,6 +29,9 @@ class EspressoSystemInterface : public SystemInterface { public: + EspressoSystemInterface() = default; + ~EspressoSystemInterface() override = default; + static EspressoSystemInterface &Instance() { if (!m_instance) m_instance = new EspressoSystemInterface; @@ -138,12 +141,6 @@ class EspressoSystemInterface : public SystemInterface { protected: static EspressoSystemInterface *m_instance; - 
EspressoSystemInterface() - : m_gpu(false), m_r_gpu_begin(nullptr), m_r_gpu_end(nullptr), - m_dip_gpu_begin(nullptr), m_dip_gpu_end(nullptr), - m_q_gpu_begin(nullptr), m_q_gpu_end(nullptr), - m_needsParticleStructGpu(false), m_splitParticleStructGpu(false){}; - ~EspressoSystemInterface() override = default; void gatherParticles(); void split_particle_struct(); @@ -156,22 +153,29 @@ class EspressoSystemInterface : public SystemInterface { } }; void reallocDeviceMemory(std::size_t n); -#endif - std::size_t m_gpu_npart; - bool m_gpu; +private: + std::size_t m_gpu_npart = 0; + bool m_gpu = false; + + float *m_r_gpu_begin = nullptr; + float *m_r_gpu_end = nullptr; - float *m_r_gpu_begin; - float *m_r_gpu_end; + float *m_dip_gpu_begin = nullptr; + float *m_dip_gpu_end = nullptr; - float *m_dip_gpu_begin; - float *m_dip_gpu_end; + float *m_q_gpu_begin = nullptr; + float *m_q_gpu_end = nullptr; - float *m_q_gpu_begin; - float *m_q_gpu_end; + bool m_needsParticleStructGpu = false; + bool m_splitParticleStructGpu = false; - bool m_needsParticleStructGpu; - bool m_splitParticleStructGpu; + bool m_needsRGpu = false; + bool m_needsQGpu = false; + bool m_needsFGpu = false; + bool m_needsDipGpu = false; + bool m_needsTorqueGpu = false; +#endif }; #endif diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index cd8b1fe1be6..cdce5d99f44 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -27,11 +27,8 @@ class SystemInterface { public: - SystemInterface() - : m_needsRGpu(false), m_needsQGpu(false), m_needsFGpu(false), - m_needsDipGpu(false), m_needsTorqueGpu(false){}; - typedef Utils::Vector3d Vector3; - typedef double Real; + SystemInterface() = default; + virtual ~SystemInterface() = default; virtual void init() = 0; virtual void update() = 0; @@ -39,61 +36,32 @@ class SystemInterface { virtual float *rGpuBegin() { return nullptr; }; virtual float *rGpuEnd() { return nullptr; }; virtual bool hasRGpu() { return false; }; - virtual bool requestRGpu() { - m_needsRGpu = hasRGpu(); - return m_needsRGpu; - } + virtual bool requestRGpu() { return false; } virtual float *dipGpuBegin() { return nullptr; }; virtual float *dipGpuEnd() { return nullptr; }; virtual bool hasDipGpu() { return false; }; - virtual bool requestDipGpu() { - m_needsDipGpu = hasDipGpu(); - return m_needsDipGpu; - } + virtual bool requestDipGpu() { return false; } virtual float *torqueGpuBegin() { return nullptr; }; virtual float *torqueGpuEnd() { return nullptr; }; virtual bool hasTorqueGpu() { return false; }; - virtual bool requestTorqueGpu() { - m_needsTorqueGpu = hasTorqueGpu(); - return m_needsTorqueGpu; - } + virtual bool requestTorqueGpu() { return false; } virtual float *qGpuBegin() { return nullptr; }; virtual float *qGpuEnd() { return nullptr; }; virtual bool hasQGpu() { return false; }; - virtual bool requestQGpu() { - m_needsQGpu = hasQGpu(); - return m_needsQGpu; - } + virtual bool requestQGpu() { return false; } virtual float *fGpuBegin() { return nullptr; }; virtual float *fGpuEnd() { return nullptr; }; virtual bool hasFGpu() { return false; }; - virtual bool requestFGpu() { - m_needsFGpu = hasFGpu(); - return m_needsFGpu; - } + virtual bool requestFGpu() { return false; } virtual float *eGpu() { return nullptr; }; virtual std::size_t npart_gpu() const { return 0; }; - virtual Vector3 box() const = 0; - - virtual bool needsRGpu() { return m_needsRGpu; }; - virtual bool needsDipGpu() { return m_needsRGpu; }; - virtual bool needsQGpu() { return m_needsQGpu; }; - virtual bool 
needsFGpu() { return m_needsFGpu; }; - virtual bool needsTorqueGpu() { return m_needsTorqueGpu; }; - virtual ~SystemInterface() = default; - -protected: - bool m_needsRGpu; - bool m_needsQGpu; - bool m_needsFGpu; - bool m_needsDipGpu; - bool m_needsTorqueGpu; + virtual Utils::Vector3d box() const = 0; }; #endif From f505fcd863bc7eed33ab6440973e1fd506f814ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 3 Dec 2021 19:39:43 +0100 Subject: [PATCH 10/21] core: Document SystemInterface class --- src/core/EspressoSystemInterface.hpp | 10 +++++++++ src/core/EspressoSystemInterface_cuda.cu | 7 ++++--- src/core/SystemInterface.hpp | 26 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 918e0df08d3..ae15b33d754 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -27,6 +27,16 @@ #include +/** + * @brief CUDA implementation of @ref SystemInterface. + * + * When data is synchronized between host and device memory, a subset + * of the @ref Particle struct is copied from each particle on the host + * to the corresponding @ref CUDA_particle_data struct on the device via + * @ref EspressoSystemInterface::gatherParticles(). Once the transfer is + * complete, the particle AoS on the device is copied (or "split") to + * a SoA via @ref EspressoSystemInterface::split_particle_struct(). + */ class EspressoSystemInterface : public SystemInterface { public: EspressoSystemInterface() = default; diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 53cb31b43df..8261d1c07ba 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ b/src/core/EspressoSystemInterface_cuda.cu @@ -16,6 +16,10 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ +/** + * @file + * CUDA kernels to convert the particles AoS to a SoA on the device. + */ #include "EspressoSystemInterface.hpp" #include "cuda_interface.hpp" @@ -30,9 +34,6 @@ #error CU-file includes mpi.h! This should not happen! #endif -// These functions will split the particle data structure into individual arrays -// for each property - // Position and charge __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r, float *q, unsigned int n) { diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index cdce5d99f44..884eaf08c3b 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -25,6 +25,32 @@ #include +/** + * Public interface for the ESPResSo system and device memory management. + * + * This interface is responsible for device memory management and memory + * layout conversion (AoS to SoA conversion). On the CPU, particle data + * is stored in an Array of Structures (AoS). On the GPU, some algorithms + * have better performance with a Structure of Arrays (SoA). When data + * is synchronized between host and device memory, particle objects are + * copied to device memory in the form of an AoS. A few particle properties + * are further copied ("split") into a SoA by the split callback. + * + * This interface provides getters in the form xGpuBegin() and + * xGpuEnd() (with @c x a particle property, e.g. @c r for position) + * that return pointers to device memory for each array in the SoA. 
+ * To register a new particle property in the split callback, call the + * corresponding requestXGpu() method, with @c X the uppercase + * version of the particle property @c x. It is currently impossible to + * undo this action. + * + * Since properties in the @ref Particle struct depend on the config file, + * it is necessary to provide a means to check if it is even possible to + * carry out the split. This is achieved by the hasXGpu() methods. + * The same value is returned from requestXGpu() and it needs to + * be checked at the call site to throw an error. This class won't throw + * exceptions. + */ class SystemInterface { public: SystemInterface() = default; From 2a52d199c1f9742004e78cd0138efa33d2967831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Mon, 6 Dec 2021 12:16:01 +0100 Subject: [PATCH 11/21] core: Remove broken past-the-end device pointers --- src/core/EspressoSystemInterface.hpp | 14 -------- src/core/EspressoSystemInterface_cuda.cu | 3 -- src/core/SystemInterface.hpp | 5 --- .../EspressoSystemInterface_test.cpp | 33 ------------------- 4 files changed, 55 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index ae15b33d754..feecf6bd53b 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -54,7 +54,6 @@ class EspressoSystemInterface : public SystemInterface { #ifdef CUDA float *rGpuBegin() override { return m_r_gpu_begin; }; - float *rGpuEnd() override { return m_r_gpu_end; }; bool hasRGpu() override { return true; }; bool requestRGpu() override { m_needsRGpu = hasRGpu(); @@ -67,7 +66,6 @@ class EspressoSystemInterface : public SystemInterface { #ifdef DIPOLES float *dipGpuBegin() override { return m_dip_gpu_begin; }; - float *dipGpuEnd() override { return m_dip_gpu_end; }; bool hasDipGpu() override { return true; }; bool requestDipGpu() override { m_needsDipGpu = hasDipGpu(); @@ -81,7 +79,6 @@ class EspressoSystemInterface : public SystemInterface { #ifdef ELECTROSTATICS float *qGpuBegin() override { return m_q_gpu_begin; }; - float *qGpuEnd() override { return m_q_gpu_end; }; bool hasQGpu() override { return true; }; bool requestQGpu() override { m_needsQGpu = hasQGpu(); @@ -102,9 +99,6 @@ class EspressoSystemInterface : public SystemInterface { }; float *fGpuBegin() override { return gpu_get_particle_force_pointer(); }; - float *fGpuEnd() override { - return gpu_get_particle_force_pointer() + 3 * m_gpu_npart; - }; bool hasFGpu() override { return true; }; bool requestFGpu() override { m_needsFGpu = hasFGpu(); @@ -118,9 +112,6 @@ class EspressoSystemInterface : public SystemInterface { float *torqueGpuBegin() override { return gpu_get_particle_torque_pointer(); }; - float *torqueGpuEnd() override { - return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart; - }; bool hasTorqueGpu() override { return true; }; bool requestTorqueGpu() override { m_needsTorqueGpu = hasTorqueGpu(); @@ -169,13 +160,8 @@ class EspressoSystemInterface : public SystemInterface { bool m_gpu = false; float *m_r_gpu_begin = nullptr; - float *m_r_gpu_end = nullptr; - float *m_dip_gpu_begin = nullptr; - float *m_dip_gpu_end = nullptr; - float *m_q_gpu_begin = nullptr; - float *m_q_gpu_end = nullptr; bool m_needsParticleStructGpu = false; bool m_splitParticleStructGpu = false; diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 8261d1c07ba..5f29ae8fc64 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ 
b/src/core/EspressoSystemInterface_cuda.cu @@ -104,14 +104,12 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { if (m_r_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_r_gpu_begin)); cuda_safe_mem(cudaMalloc(&m_r_gpu_begin, 3 * n * sizeof(float))); - m_r_gpu_end = m_r_gpu_begin + 3 * n; } #ifdef DIPOLES if (m_needsDipGpu && ((n != m_gpu_npart) || (m_dip_gpu_begin == nullptr))) { if (m_dip_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_dip_gpu_begin)); cuda_safe_mem(cudaMalloc(&m_dip_gpu_begin, 3 * n * sizeof(float))); - m_dip_gpu_end = m_dip_gpu_begin + 3 * n; } #endif #ifdef ELECTROSTATICS @@ -119,7 +117,6 @@ void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { if (m_q_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_q_gpu_begin)); cuda_safe_mem(cudaMalloc(&m_q_gpu_begin, 3 * n * sizeof(float))); - m_q_gpu_end = m_q_gpu_begin + 3 * n; } #endif diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index 884eaf08c3b..006232efab6 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -60,27 +60,22 @@ class SystemInterface { virtual void update() = 0; virtual float *rGpuBegin() { return nullptr; }; - virtual float *rGpuEnd() { return nullptr; }; virtual bool hasRGpu() { return false; }; virtual bool requestRGpu() { return false; } virtual float *dipGpuBegin() { return nullptr; }; - virtual float *dipGpuEnd() { return nullptr; }; virtual bool hasDipGpu() { return false; }; virtual bool requestDipGpu() { return false; } virtual float *torqueGpuBegin() { return nullptr; }; - virtual float *torqueGpuEnd() { return nullptr; }; virtual bool hasTorqueGpu() { return false; }; virtual bool requestTorqueGpu() { return false; } virtual float *qGpuBegin() { return nullptr; }; - virtual float *qGpuEnd() { return nullptr; }; virtual bool hasQGpu() { return false; }; virtual bool requestQGpu() { return false; } virtual float *fGpuBegin() { return nullptr; }; - virtual float *fGpuEnd() { return nullptr; }; virtual bool hasFGpu() { return false; }; virtual bool requestFGpu() { return false; } diff --git a/src/core/unit_tests/EspressoSystemInterface_test.cpp b/src/core/unit_tests/EspressoSystemInterface_test.cpp index a270579384f..09fb492aea2 100644 --- a/src/core/unit_tests/EspressoSystemInterface_test.cpp +++ b/src/core/unit_tests/EspressoSystemInterface_test.cpp @@ -42,15 +42,10 @@ auto &system = EspressoSystemInterface::Instance(); inline void check_uninitialized_device_pointers() { BOOST_CHECK_EQUAL(espresso::system.eGpu(), nullptr); BOOST_CHECK_EQUAL(espresso::system.rGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.rGpuEnd(), nullptr); BOOST_CHECK_EQUAL(espresso::system.dipGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.dipGpuEnd(), nullptr); BOOST_CHECK_EQUAL(espresso::system.fGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.fGpuEnd(), nullptr); BOOST_CHECK_EQUAL(espresso::system.torqueGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.torqueGpuEnd(), nullptr); BOOST_CHECK_EQUAL(espresso::system.qGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.qGpuEnd(), nullptr); } #ifdef CUDA @@ -103,18 +98,12 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { BOOST_TEST(espresso::system.requestRGpu()); espresso::system.update(); BOOST_TEST(espresso::system.rGpuBegin() != nullptr); - BOOST_TEST(espresso::system.rGpuEnd() != nullptr); - BOOST_CHECK_EQUAL(espresso::system.rGpuEnd() - espresso::system.rGpuBegin(), - 3); // check force split 
BOOST_TEST(espresso::system.hasFGpu()); BOOST_TEST(espresso::system.requestFGpu()); espresso::system.update(); BOOST_TEST(espresso::system.fGpuBegin() != nullptr); - BOOST_TEST(espresso::system.fGpuEnd() != nullptr); - BOOST_CHECK_EQUAL(espresso::system.fGpuEnd() - espresso::system.fGpuBegin(), - 3); // check torque split BOOST_CHECK_EQUAL(espresso::system.hasTorqueGpu(), has_feature_rotation); @@ -122,9 +111,6 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { #ifdef ROTATION espresso::system.update(); BOOST_TEST(espresso::system.torqueGpuBegin() != nullptr); - BOOST_TEST(espresso::system.torqueGpuEnd() != nullptr); - BOOST_CHECK_EQUAL( - espresso::system.torqueGpuEnd() - espresso::system.torqueGpuBegin(), 3); #endif // check charge split @@ -133,9 +119,6 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { #ifdef ELECTROSTATICS espresso::system.update(); BOOST_TEST(espresso::system.qGpuBegin() != nullptr); - BOOST_TEST(espresso::system.qGpuEnd() != nullptr); - BOOST_CHECK_EQUAL(espresso::system.qGpuEnd() - espresso::system.qGpuBegin(), - 3); #endif // check dipole split @@ -144,28 +127,12 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { #ifdef DIPOLES espresso::system.update(); BOOST_TEST(espresso::system.dipGpuBegin() != nullptr); - BOOST_TEST(espresso::system.dipGpuEnd() != nullptr); - BOOST_CHECK_EQUAL( - espresso::system.dipGpuEnd() - espresso::system.dipGpuBegin(), 3); #endif // clear device memory remove_particle(pid); espresso::system.update(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - BOOST_CHECK_EQUAL(espresso::system.rGpuBegin(), espresso::system.rGpuEnd()); - BOOST_CHECK_EQUAL(espresso::system.fGpuBegin(), espresso::system.fGpuEnd()); -#ifdef ELECTROSTATICS - BOOST_CHECK_EQUAL(espresso::system.qGpuBegin(), espresso::system.qGpuEnd()); -#endif -#ifdef DIPOLES - BOOST_CHECK_EQUAL(espresso::system.dipGpuBegin(), - espresso::system.dipGpuEnd()); -#endif -#ifdef ROTATION - BOOST_CHECK_EQUAL(espresso::system.torqueGpuBegin(), - espresso::system.torqueGpuEnd()); -#endif } #else // CUDA From 867cf47d22f821dfe262552e64e1d5855084d76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Mon, 6 Dec 2021 12:27:26 +0100 Subject: [PATCH 12/21] core: Remove magnetostatics global variables --- src/core/actor/DipolarBarnesHut.cpp | 25 +++--- src/core/actor/DipolarBarnesHut.hpp | 26 +++--- src/core/actor/DipolarBarnesHut_cuda.cu | 80 ++++++++++--------- src/core/actor/DipolarBarnesHut_cuda.cuh | 15 ++-- src/core/actor/DipolarDirectSum.cpp | 25 +++--- src/core/actor/DipolarDirectSum.hpp | 22 ++--- .../electrostatics_magnetostatics/dipole.cpp | 16 +--- src/python/espressomd/magnetostatics.pxd | 22 ++++- src/python/espressomd/magnetostatics.pyx | 23 +++++- 9 files changed, 138 insertions(+), 116 deletions(-) diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index ce5fd05b6f6..21a239ec449 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -23,33 +23,26 @@ #include "DipolarBarnesHut.hpp" -#include "EspressoSystemInterface.hpp" #include "actor/ActorList.hpp" #include "electrostatics_magnetostatics/common.hpp" #include "energy.hpp" #include "forces.hpp" -#include - -static std::unique_ptr dipolarBarnesHut; - -void activate_dipolar_barnes_hut(float epssq, float itolsq) { +void DipolarBarnesHut::activate() { // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = 
DIPOLAR_BH_GPU; mpi_bcast_coulomb_params(); - dipolarBarnesHut = std::make_unique( - EspressoSystemInterface::Instance(), epssq, itolsq); - forceActors.push_back(dipolarBarnesHut.get()); - energyActors.push_back(dipolarBarnesHut.get()); + forceActors.push_back(this); + energyActors.push_back(this); } -void deactivate_dipolar_barnes_hut() { - if (dipolarBarnesHut) { - forceActors.remove(dipolarBarnesHut.get()); - energyActors.remove(dipolarBarnesHut.get()); - dipolarBarnesHut.reset(); - } +void DipolarBarnesHut::deactivate() { + dipole.method = DIPOLAR_NONE; + mpi_bcast_coulomb_params(); + + forceActors.remove(this); + energyActors.remove(this); } #endif diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 35fb9ac0701..829a6318d53 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -33,11 +33,7 @@ class DipolarBarnesHut : public Actor { public: - DipolarBarnesHut(SystemInterface &s, float epssq, float itolsq) { - m_k = static_cast(dipole.prefactor); - m_epssq = epssq; - m_itolsq = itolsq; - setBHPrecision(&m_epssq, &m_itolsq); + DipolarBarnesHut(SystemInterface &s) { if (!s.requestFGpu()) runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; @@ -51,6 +47,15 @@ class DipolarBarnesHut : public Actor { runtimeErrorMsg() << "DipolarBarnesHut needs access to dipoles on GPU!"; }; + ~DipolarBarnesHut() override { deallocBH(&m_bh_data); } + + void set_params(float epssq, float itolsq) { + m_k = static_cast(dipole.prefactor); + m_epssq = epssq; + m_itolsq = itolsq; + setBHPrecision(m_epssq, m_itolsq); + } + void computeForces(SystemInterface &s) override { try { allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); @@ -59,7 +64,7 @@ class DipolarBarnesHut : public Actor { return; } - fillConstantPointers(s.rGpuBegin(), s.dipGpuBegin(), m_bh_data); + fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); initBHgpu(m_bh_data.blocks); buildBoxBH(m_bh_data.blocks); buildTreeBH(m_bh_data.blocks); @@ -77,7 +82,7 @@ class DipolarBarnesHut : public Actor { return; } - fillConstantPointers(s.rGpuBegin(), s.dipGpuBegin(), m_bh_data); + fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); initBHgpu(m_bh_data.blocks); buildBoxBH(m_bh_data.blocks); buildTreeBH(m_bh_data.blocks); @@ -89,18 +94,19 @@ class DipolarBarnesHut : public Actor { } }; + void activate(); + void deactivate(); + private: float m_k; float m_epssq; float m_itolsq; + /// Container for pointers to device memory. BHData m_bh_data = {0, 0, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; }; -void activate_dipolar_barnes_hut(float epssq, float itolsq); -void deactivate_dipolar_barnes_hut(); - #endif // DIPOLAR_BARNES_HUT #endif // ACTOR_DIPOLARBARNESHUT_HPP diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index 456895976ff..e33f99fc123 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -1148,16 +1148,38 @@ int energyBH(BHData *bh_data, float k, float *E) { return error_code; } -// Function to set the BH method parameters. 
-void setBHPrecision(float *epssq, float *itolsq) { - cuda_safe_mem(cudaMemcpyToSymbol(epssqd, epssq, sizeof(float), 0, +void setBHPrecision(float epssq, float itolsq) { + cuda_safe_mem(cudaMemcpyToSymbol(epssqd, &epssq, sizeof(float), 0, cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMemcpyToSymbol(itolsqd, itolsq, sizeof(float), 0, + cuda_safe_mem(cudaMemcpyToSymbol(itolsqd, &itolsq, sizeof(float), 0, cudaMemcpyHostToDevice)); } -// An allocation of the GPU device memory and an initialization where it is -// needed. +void deallocBH(BHData *bh_data) { + cuda_safe_mem(cudaFree(bh_data->err)); + cuda_safe_mem(cudaFree(bh_data->max_lps)); + cuda_safe_mem(cudaFree(bh_data->child)); + cuda_safe_mem(cudaFree(bh_data->count)); + cuda_safe_mem(cudaFree(bh_data->start)); + cuda_safe_mem(cudaFree(bh_data->sort)); + cuda_safe_mem(cudaFree(bh_data->mass)); + cuda_safe_mem(cudaFree(bh_data->maxp)); + cuda_safe_mem(cudaFree(bh_data->minp)); + cuda_safe_mem(cudaFree(bh_data->r)); + cuda_safe_mem(cudaFree(bh_data->u)); + bh_data->err = nullptr; + bh_data->max_lps = nullptr; + bh_data->child = nullptr; + bh_data->count = nullptr; + bh_data->start = nullptr; + bh_data->sort = nullptr; + bh_data->mass = nullptr; + bh_data->maxp = nullptr; + bh_data->minp = nullptr; + bh_data->r = nullptr; + bh_data->u = nullptr; +} + void allocBHmemCopy(int nbodies, BHData *bh_data) { if (bh_data->nbodies == nbodies) return; @@ -1178,37 +1200,30 @@ void allocBHmemCopy(int nbodies, BHData *bh_data) { else bh_data->nnodes = (bh_data->nnodes / n_total_threads) * n_total_threads; - if (bh_data->err != nullptr) - cuda_safe_mem(cudaFree(bh_data->err)); + cuda_safe_mem(cudaFree(bh_data->err)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->err), sizeof(int))); - if (bh_data->max_lps != nullptr) - cuda_safe_mem(cudaFree(bh_data->max_lps)); + cuda_safe_mem(cudaFree(bh_data->max_lps)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->max_lps), sizeof(int))); - if (bh_data->child != nullptr) - cuda_safe_mem(cudaFree(bh_data->child)); + cuda_safe_mem(cudaFree(bh_data->child)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->child), sizeof(int) * (bh_data->nnodes + 1) * 8)); - if (bh_data->count != nullptr) - cuda_safe_mem(cudaFree(bh_data->count)); + cuda_safe_mem(cudaFree(bh_data->count)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->count), sizeof(int) * (bh_data->nnodes + 1))); - if (bh_data->start != nullptr) - cuda_safe_mem(cudaFree(bh_data->start)); + cuda_safe_mem(cudaFree(bh_data->start)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->start), sizeof(int) * (bh_data->nnodes + 1))); - if (bh_data->sort != nullptr) - cuda_safe_mem(cudaFree(bh_data->sort)); + cuda_safe_mem(cudaFree(bh_data->sort)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->sort), sizeof(int) * (bh_data->nnodes + 1))); // Weight coefficients of m_bhnnodes nodes: both particles and octant cells - if (bh_data->mass != nullptr) - cuda_safe_mem(cudaFree(bh_data->mass)); + cuda_safe_mem(cudaFree(bh_data->mass)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->mass), sizeof(float) * (bh_data->nnodes + 1))); @@ -1226,39 +1241,32 @@ void allocBHmemCopy(int nbodies, BHData *bh_data) { // are the octree box dynamical spatial constraints // this array is updating per each block at each interaction calculation // within the boundingBoxKernel - if (bh_data->maxp != nullptr) - cuda_safe_mem(cudaFree(bh_data->maxp)); + cuda_safe_mem(cudaFree(bh_data->maxp)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->maxp), sizeof(float) * bh_data->blocks * FACTOR1 * 3)); // (min[3*i], min[3*i+1], 
min[3*i+2]) // are the octree box dynamical spatial constraints // this array is updating per each block at each interaction calculation // within the boundingBoxKernel - if (bh_data->minp != nullptr) - cuda_safe_mem(cudaFree(bh_data->minp)); + cuda_safe_mem(cudaFree(bh_data->minp)); cuda_safe_mem(cudaMalloc((void **)&(bh_data->minp), sizeof(float) * bh_data->blocks * FACTOR1 * 3)); - if (bh_data->r != nullptr) - cuda_safe_mem(cudaFree(bh_data->r)); + cuda_safe_mem(cudaFree(bh_data->r)); cuda_safe_mem( cudaMalloc(&(bh_data->r), 3 * (bh_data->nnodes + 1) * sizeof(float))); - if (bh_data->u != nullptr) - cuda_safe_mem(cudaFree(bh_data->u)); + cuda_safe_mem(cudaFree(bh_data->u)); cuda_safe_mem( cudaMalloc(&(bh_data->u), 3 * (bh_data->nnodes + 1) * sizeof(float))); } -// Populating of array pointers allocated in GPU device before. -// Copy the particle data to the Barnes-Hut related arrays. -void fillConstantPointers(float *r, float *dip, BHData bh_data) { - cuda_safe_mem(cudaMemcpyToSymbol(bhpara, &bh_data, sizeof(BHData), 0, +void fill_bh_data(float const *r, float const *dip, BHData const *bh_data) { + auto const size = 3 * bh_data->nbodies * sizeof(float); + cuda_safe_mem(cudaMemcpyToSymbol(bhpara, bh_data, sizeof(BHData), 0, cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMemcpy(bh_data.r, r, 3 * bh_data.nbodies * sizeof(float), - cudaMemcpyDeviceToDevice)); - cuda_safe_mem(cudaMemcpy(bh_data.u, dip, 3 * bh_data.nbodies * sizeof(float), - cudaMemcpyDeviceToDevice)); + cuda_safe_mem(cudaMemcpy(bh_data->r, r, size, cudaMemcpyDeviceToDevice)); + cuda_safe_mem(cudaMemcpy(bh_data->u, dip, size, cudaMemcpyDeviceToDevice)); } #endif // BARNES_HUT diff --git a/src/core/actor/DipolarBarnesHut_cuda.cuh b/src/core/actor/DipolarBarnesHut_cuda.cuh index 6e64b93cea9..9ad9437042c 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cuh +++ b/src/core/actor/DipolarBarnesHut_cuda.cuh @@ -83,16 +83,21 @@ struct BHData { /// Maximal depth of the Barnes-Hut tree branching. #define MAXDEPTH 32 -/// Function to set the Barnes-Hut parameters. -void setBHPrecision(float *epssq, float *itolsq); +/// Barnes-Hut parameters setter. +void setBHPrecision(float epssq, float itolsq); /// An allocation of the GPU device memory and an initialization where it is /// needed. void allocBHmemCopy(int nbodies, BHData *bh_data); -/// Populating of array pointers allocated in GPU device before. -/// Copy the particle data to the Barnes-Hut related arrays. -void fillConstantPointers(float *r, float *dip, BHData bh_data); +/// A deallocation of the GPU device memory. +void deallocBH(BHData *bh_data); + +/// Copy Barnes-Hut data to @ref bhpara and copy particle data. +/// @param r device particle positions to copy +/// @param dip device particle dipoles to copy +/// @param bh_data Barnes-Hut container +void fill_bh_data(float const *r, float const *dip, BHData const *bh_data); /// Barnes-Hut CUDA initialization. 
void initBHgpu(int blocks); diff --git a/src/core/actor/DipolarDirectSum.cpp b/src/core/actor/DipolarDirectSum.cpp index 2ca0de24bb6..01d48bf487c 100644 --- a/src/core/actor/DipolarDirectSum.cpp +++ b/src/core/actor/DipolarDirectSum.cpp @@ -23,32 +23,25 @@ #include "DipolarDirectSum.hpp" -#include "EspressoSystemInterface.hpp" #include "electrostatics_magnetostatics/common.hpp" #include "energy.hpp" #include "forces.hpp" -#include - -static std::unique_ptr dipolarDirectSum; - -void activate_dipolar_direct_sum_gpu() { +void DipolarDirectSum::activate() { // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = DIPOLAR_DS_GPU; mpi_bcast_coulomb_params(); - dipolarDirectSum = - std::make_unique(EspressoSystemInterface::Instance()); - forceActors.push_back(dipolarDirectSum.get()); - energyActors.push_back(dipolarDirectSum.get()); + forceActors.push_back(this); + energyActors.push_back(this); } -void deactivate_dipolar_direct_sum_gpu() { - if (dipolarDirectSum) { - forceActors.remove(dipolarDirectSum.get()); - energyActors.remove(dipolarDirectSum.get()); - dipolarDirectSum.reset(); - } +void DipolarDirectSum::deactivate() { + dipole.method = DIPOLAR_NONE; + mpi_bcast_coulomb_params(); + + forceActors.remove(this); + energyActors.remove(this); } #endif diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 02a08722813..3dc82d68a79 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -41,8 +41,6 @@ class DipolarDirectSum : public Actor { public: DipolarDirectSum(SystemInterface &s) { - k = static_cast(dipole.prefactor); - if (!s.requestFGpu()) runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; @@ -55,6 +53,9 @@ class DipolarDirectSum : public Actor { if (!s.requestDipGpu()) runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!"; }; + + void set_params() { m_pk = static_cast(dipole.prefactor); } + void computeForces(SystemInterface &s) override { float box[3]; int per[3]; @@ -63,8 +64,8 @@ class DipolarDirectSum : public Actor { per[i] = box_geo.periodic(i); } DipolarDirectSum_kernel_wrapper_force( - k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), - s.fGpuBegin(), s.torqueGpuBegin(), box, per); + m_pk, static_cast(s.npart_gpu()), s.rGpuBegin(), + s.dipGpuBegin(), s.fGpuBegin(), s.torqueGpuBegin(), box, per); }; void computeEnergy(SystemInterface &s) override { float box[3]; @@ -73,18 +74,19 @@ class DipolarDirectSum : public Actor { box[i] = static_cast(s.box()[i]); per[i] = box_geo.periodic(i); } + auto energy = reinterpret_cast(s.eGpu()); DipolarDirectSum_kernel_wrapper_energy( - k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), - box, per, &(reinterpret_cast(s.eGpu())->dipolar)); + m_pk, static_cast(s.npart_gpu()), s.rGpuBegin(), + s.dipGpuBegin(), box, per, &(energy->dipolar)); }; + void activate(); + void deactivate(); + private: - float k; + float m_pk; }; -void activate_dipolar_direct_sum_gpu(); -void deactivate_dipolar_direct_sum_gpu(); - #endif // DIPOLAR_DIRECT_SUM #endif // ACTOR_DIPOLARDIRECTSUM_HPP diff --git a/src/core/electrostatics_magnetostatics/dipole.cpp b/src/core/electrostatics_magnetostatics/dipole.cpp index 629d13bc8cf..77291e3bae0 100644 --- a/src/core/electrostatics_magnetostatics/dipole.cpp +++ b/src/core/electrostatics_magnetostatics/dipole.cpp @@ -23,8 +23,6 @@ #include "electrostatics_magnetostatics/dipole.hpp" -#include "actor/DipolarBarnesHut.hpp" -#include "actor/DipolarDirectSum.hpp" #include 
"electrostatics_magnetostatics/common.hpp" #include "electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp" #include "electrostatics_magnetostatics/mdlc_correction.hpp" @@ -295,19 +293,7 @@ void set_Dprefactor(double prefactor) { mpi_bcast_coulomb_params(); } -void set_method_local(DipolarInteraction method) { -#ifdef DIPOLAR_DIRECT_SUM - if ((dipole.method == DIPOLAR_DS_GPU) && (method != DIPOLAR_DS_GPU)) { - deactivate_dipolar_direct_sum_gpu(); - } -#endif -#ifdef DIPOLAR_BARNES_HUT - if ((dipole.method == DIPOLAR_BH_GPU) && (method != DIPOLAR_BH_GPU)) { - deactivate_dipolar_barnes_hut(); - } -#endif // DIPOLAR_ BARNES_HUT - dipole.method = method; -} +void set_method_local(DipolarInteraction method) { dipole.method = method; } } // namespace Dipole #endif // DIPOLES diff --git a/src/python/espressomd/magnetostatics.pxd b/src/python/espressomd/magnetostatics.pxd index 1bb064674d7..d12f5bee383 100644 --- a/src/python/espressomd/magnetostatics.pxd +++ b/src/python/espressomd/magnetostatics.pxd @@ -19,6 +19,14 @@ from libcpp cimport bool include "myconfig.pxi" +cdef extern from "SystemInterface.hpp": + cdef cppclass SystemInterface: + pass +cdef extern from "EspressoSystemInterface.hpp": + cdef cppclass EspressoSystemInterface(SystemInterface): + @staticmethod + EspressoSystemInterface & Instance() + IF DIPOLES == 1: cdef extern from "electrostatics_magnetostatics/common.hpp": void mpi_bcast_coulomb_params() @@ -49,13 +57,19 @@ IF DIPOLES == 1: IF(CUDA == 1) and (ROTATION == 1): cdef extern from "actor/DipolarDirectSum.hpp": - void activate_dipolar_direct_sum_gpu() - void deactivate_dipolar_direct_sum_gpu() + cdef cppclass DipolarDirectSum: + DipolarDirectSum(SystemInterface & s) + void set_params() + void activate() + void deactivate() IF(DIPOLAR_BARNES_HUT == 1): cdef extern from "actor/DipolarBarnesHut.hpp": - void activate_dipolar_barnes_hut(float epssq, float itolsq) - void deactivate_dipolar_barnes_hut() + cdef cppclass DipolarBarnesHut: + DipolarBarnesHut(SystemInterface & s) + void set_params(float epssq, float itolsq) + void activate() + void deactivate() IF DP3M == 1: from p3m_common cimport P3MParameters diff --git a/src/python/espressomd/magnetostatics.pyx b/src/python/espressomd/magnetostatics.pyx index c360288482b..f6b09376e51 100644 --- a/src/python/espressomd/magnetostatics.pyx +++ b/src/python/espressomd/magnetostatics.pyx @@ -16,6 +16,9 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
# +from libcpp.memory cimport shared_ptr, make_shared +from cython.operator cimport dereference + include "myconfig.pxi" from .actors cimport Actor IF SCAFACOS == 1: @@ -324,6 +327,11 @@ IF DIPOLES == 1: Magnetostatics prefactor (:math:`\\mu_0/(4\\pi)`) """ + cdef shared_ptr[DipolarDirectSum] sip + + def __cinit__(self): + self.sip = make_shared[DipolarDirectSum]( + EspressoSystemInterface.Instance()) def default_params(self): return {} @@ -339,14 +347,15 @@ IF DIPOLES == 1: def _activate_method(self): self._set_params_in_es_core() + dereference(self.sip).activate() def _deactivate_method(self): super()._deactivate_method() - deactivate_dipolar_direct_sum_gpu() + dereference(self.sip).deactivate() def _set_params_in_es_core(self): self.set_magnetostatics_prefactor() - activate_dipolar_direct_sum_gpu() + dereference(self.sip).set_params() IF(DIPOLAR_BARNES_HUT == 1): cdef class DipolarBarnesHutGpu(MagnetostaticInteraction): @@ -358,6 +367,11 @@ IF DIPOLES == 1: TODO: If the system has periodic boundaries, the minimum image convention is applied. """ + cdef shared_ptr[DipolarBarnesHut] sip + + def __cinit__(self): + self.sip = make_shared[DipolarBarnesHut]( + EspressoSystemInterface.Instance()) def default_params(self): return {"epssq": 100.0, @@ -374,12 +388,13 @@ IF DIPOLES == 1: def _activate_method(self): self._set_params_in_es_core() + dereference(self.sip).activate() def _deactivate_method(self): super()._deactivate_method() - deactivate_dipolar_barnes_hut() + dereference(self.sip).deactivate() def _set_params_in_es_core(self): self.set_magnetostatics_prefactor() - activate_dipolar_barnes_hut( + dereference(self.sip).set_params( self._params["epssq"], self._params["itolsq"]) From ac990c551fb637a93de7f1e56cc524fda0322576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 8 Dec 2021 10:52:04 +0100 Subject: [PATCH 13/21] core: Separation of concerns for GPU methods Move implementation details from class definitions in .hpp files to the corresponding .cpp files. Don't activate MMM1D GPU during construction, but when added to the actor list. 
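The life cycle that the refactored GPU actors now share looks roughly like this (a minimal sketch; MyGpuActor is a hypothetical stand-in for Mmm1dgpuForce, DipolarDirectSum and DipolarBarnesHut):

    // Constructor: sanity checks and GPU resource requests only.
    // activate()/deactivate(): registration in the global actor lists,
    // with no global instance and no side effects at construction time.
    class MyGpuActor : public Actor {
    public:
      explicit MyGpuActor(SystemInterface &s) {
        if (!s.requestFGpu())
          runtimeErrorMsg() << "MyGpuActor needs access to forces on GPU!";
      }
      void activate() {
        forceActors.push_back(this);
        energyActors.push_back(this);
      }
      void deactivate() {
        forceActors.remove(this);
        energyActors.remove(this);
      }
      // computeForces()/computeEnergy() overrides omitted in this sketch
    };
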
--- src/core/actor/DipolarBarnesHut.cpp | 68 +++++++++++++++++++++++- src/core/actor/DipolarBarnesHut.hpp | 67 ++--------------------- src/core/actor/DipolarBarnesHut_cuda.cu | 1 - src/core/actor/DipolarDirectSum.cpp | 50 +++++++++++++++++ src/core/actor/DipolarDirectSum.hpp | 56 +++---------------- src/core/actor/DipolarDirectSum_cuda.cu | 2 + src/core/actor/DipolarDirectSum_cuda.cuh | 36 +++++++++++++ src/core/actor/Mmm1dgpuForce.cpp | 15 +++++- src/core/actor/Mmm1dgpuForce.hpp | 23 ++++---- src/core/actor/Mmm1dgpuForce_cuda.cu | 24 +-------- 10 files changed, 195 insertions(+), 147 deletions(-) create mode 100644 src/core/actor/DipolarDirectSum_cuda.cuh diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 21a239ec449..85a6d5bbe6b 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -22,12 +22,32 @@ #ifdef DIPOLAR_BARNES_HUT #include "DipolarBarnesHut.hpp" +#include "DipolarBarnesHut_cuda.cuh" -#include "actor/ActorList.hpp" #include "electrostatics_magnetostatics/common.hpp" +#include "electrostatics_magnetostatics/dipole.hpp" + +#include "SystemInterface.hpp" +#include "cuda_interface.hpp" +#include "cuda_utils.hpp" #include "energy.hpp" +#include "errorhandling.hpp" #include "forces.hpp" +DipolarBarnesHut::DipolarBarnesHut(SystemInterface &s) { + if (!s.requestFGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; + + if (!s.requestTorqueGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to torques on GPU!"; + + if (!s.requestRGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to positions on GPU!"; + + if (!s.requestDipGpu()) + runtimeErrorMsg() << "DipolarBarnesHut needs access to dipoles on GPU!"; +} + void DipolarBarnesHut::activate() { // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = DIPOLAR_BH_GPU; @@ -45,4 +65,50 @@ void DipolarBarnesHut::deactivate() { energyActors.remove(this); } +void DipolarBarnesHut::set_params(float epssq, float itolsq) { + m_prefactor = static_cast(dipole.prefactor); + m_epssq = epssq; + m_itolsq = itolsq; + setBHPrecision(m_epssq, m_itolsq); +} + +void DipolarBarnesHut::computeForces(SystemInterface &s) { + try { + allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); + } catch (cuda_runtime_error const &err) { + runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); + return; + } + + fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); + initBHgpu(m_bh_data.blocks); + buildBoxBH(m_bh_data.blocks); + buildTreeBH(m_bh_data.blocks); + summarizeBH(m_bh_data.blocks); + sortBH(m_bh_data.blocks); + if (forceBH(&m_bh_data, m_prefactor, s.fGpuBegin(), s.torqueGpuBegin())) { + runtimeErrorMsg() << "kernels encountered a functional error"; + } +} + +void DipolarBarnesHut::computeEnergy(SystemInterface &s) { + try { + allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); + } catch (cuda_runtime_error const &err) { + runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); + return; + } + + fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); + initBHgpu(m_bh_data.blocks); + buildBoxBH(m_bh_data.blocks); + buildTreeBH(m_bh_data.blocks); + summarizeBH(m_bh_data.blocks); + sortBH(m_bh_data.blocks); + auto energy = reinterpret_cast(s.eGpu()); + if (energyBH(&m_bh_data, m_prefactor, &(energy->dipolar))) { + runtimeErrorMsg() << "kernels encountered a functional error"; + } +} + #endif diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 
829a6318d53..4cf76ecd5ec 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -26,79 +26,22 @@ #include "Actor.hpp" #include "DipolarBarnesHut_cuda.cuh" #include "SystemInterface.hpp" -#include "cuda_interface.hpp" -#include "cuda_utils.hpp" -#include "electrostatics_magnetostatics/dipole.hpp" -#include "errorhandling.hpp" class DipolarBarnesHut : public Actor { public: - DipolarBarnesHut(SystemInterface &s) { - if (!s.requestFGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; - - if (!s.requestTorqueGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to torques on GPU!"; - - if (!s.requestRGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to positions on GPU!"; - - if (!s.requestDipGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to dipoles on GPU!"; - }; - + DipolarBarnesHut(SystemInterface &s); ~DipolarBarnesHut() override { deallocBH(&m_bh_data); } - void set_params(float epssq, float itolsq) { - m_k = static_cast(dipole.prefactor); - m_epssq = epssq; - m_itolsq = itolsq; - setBHPrecision(m_epssq, m_itolsq); - } - - void computeForces(SystemInterface &s) override { - try { - allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); - } catch (cuda_runtime_error const &err) { - runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); - return; - } - - fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); - initBHgpu(m_bh_data.blocks); - buildBoxBH(m_bh_data.blocks); - buildTreeBH(m_bh_data.blocks); - summarizeBH(m_bh_data.blocks); - sortBH(m_bh_data.blocks); - if (forceBH(&m_bh_data, m_k, s.fGpuBegin(), s.torqueGpuBegin())) { - runtimeErrorMsg() << "kernels encountered a functional error"; - } - }; - void computeEnergy(SystemInterface &s) override { - try { - allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); - } catch (cuda_runtime_error const &err) { - runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); - return; - } + void set_params(float epssq, float itolsq); - fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); - initBHgpu(m_bh_data.blocks); - buildBoxBH(m_bh_data.blocks); - buildTreeBH(m_bh_data.blocks); - summarizeBH(m_bh_data.blocks); - sortBH(m_bh_data.blocks); - if (energyBH(&m_bh_data, m_k, - &(reinterpret_cast(s.eGpu())->dipolar))) { - runtimeErrorMsg() << "kernels encountered a functional error"; - } - }; + void computeForces(SystemInterface &s) override; + void computeEnergy(SystemInterface &s) override; void activate(); void deactivate(); private: - float m_k; + float m_prefactor; float m_epssq; float m_itolsq; /// Container for pointers to device memory. 
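With the destructor above, the device buffers now follow the lifetime of the actor object. A usage sketch (illustrative only; 100.0 and 4.0 mirror the default-like epssq/itolsq values on the Python side, and cudaFree(nullptr) is documented as a no-op, which is what keeps deallocBH() safe for buffers that were never allocated):

    {
      auto &s = EspressoSystemInterface::Instance();
      DipolarBarnesHut bh(s);       // requests GPU arrays, allocates nothing yet
      bh.set_params(100.0f, 4.0f);  // epssq, itolsq
      bh.activate();                // joins forceActors and energyActors
      // computeForces()/computeEnergy() reallocate device buffers on demand
      bh.deactivate();
    } // ~DipolarBarnesHut() calls deallocBH(&m_bh_data) exactly once
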
diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index e33f99fc123..67d083efdc0 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -29,7 +29,6 @@ #include "cuda_init.hpp" #include "cuda_utils.cuh" -#include "errorhandling.hpp" #include #include diff --git a/src/core/actor/DipolarDirectSum.cpp b/src/core/actor/DipolarDirectSum.cpp index 01d48bf487c..a6da6b4019d 100644 --- a/src/core/actor/DipolarDirectSum.cpp +++ b/src/core/actor/DipolarDirectSum.cpp @@ -22,10 +22,31 @@ #ifdef DIPOLAR_DIRECT_SUM #include "DipolarDirectSum.hpp" +#include "DipolarDirectSum_cuda.cuh" #include "electrostatics_magnetostatics/common.hpp" +#include "electrostatics_magnetostatics/dipole.hpp" + +#include "SystemInterface.hpp" +#include "cuda_interface.hpp" #include "energy.hpp" +#include "errorhandling.hpp" #include "forces.hpp" +#include "grid.hpp" + +DipolarDirectSum::DipolarDirectSum(SystemInterface &s) { + if (!s.requestFGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; + + if (!s.requestTorqueGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to torques on GPU!"; + + if (!s.requestRGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to positions on GPU!"; + + if (!s.requestDipGpu()) + runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!"; +} void DipolarDirectSum::activate() { // also necessary on 1 CPU or GPU, does more than just broadcasting @@ -44,4 +65,33 @@ void DipolarDirectSum::deactivate() { energyActors.remove(this); } +void DipolarDirectSum::set_params() { + m_prefactor = static_cast(dipole.prefactor); +} + +void DipolarDirectSum::computeForces(SystemInterface &s) { + float box[3]; + int per[3]; + for (int i = 0; i < 3; i++) { + box[i] = static_cast(s.box()[i]); + per[i] = box_geo.periodic(i); + } + DipolarDirectSum_kernel_wrapper_force( + m_prefactor, static_cast(s.npart_gpu()), s.rGpuBegin(), + s.dipGpuBegin(), s.fGpuBegin(), s.torqueGpuBegin(), box, per); +} + +void DipolarDirectSum::computeEnergy(SystemInterface &s) { + float box[3]; + int per[3]; + for (int i = 0; i < 3; i++) { + box[i] = static_cast(s.box()[i]); + per[i] = box_geo.periodic(i); + } + auto energy = reinterpret_cast(s.eGpu()); + DipolarDirectSum_kernel_wrapper_energy( + m_prefactor, static_cast(s.npart_gpu()), s.rGpuBegin(), + s.dipGpuBegin(), box, per, &(energy->dipolar)); +} + #endif diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 3dc82d68a79..940cc9082d9 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -25,66 +25,22 @@ #include "Actor.hpp" #include "SystemInterface.hpp" -#include "cuda_interface.hpp" -#include "electrostatics_magnetostatics/dipole.hpp" -#include "errorhandling.hpp" -#include "grid.hpp" - -void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, - float *dip, float box_l[3], - int periodic[3], float *E); -void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, - float *dip, float *f, float *torque, - float box_l[3], int periodic[3]); class DipolarDirectSum : public Actor { public: - DipolarDirectSum(SystemInterface &s) { - - if (!s.requestFGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; - - if (!s.requestTorqueGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to torques on GPU!"; - - if (!s.requestRGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to positions 
on GPU!"; - - if (!s.requestDipGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!"; - }; + DipolarDirectSum(SystemInterface &s); + ~DipolarDirectSum() override = default; - void set_params() { m_pk = static_cast(dipole.prefactor); } + void set_params(); - void computeForces(SystemInterface &s) override { - float box[3]; - int per[3]; - for (int i = 0; i < 3; i++) { - box[i] = static_cast(s.box()[i]); - per[i] = box_geo.periodic(i); - } - DipolarDirectSum_kernel_wrapper_force( - m_pk, static_cast(s.npart_gpu()), s.rGpuBegin(), - s.dipGpuBegin(), s.fGpuBegin(), s.torqueGpuBegin(), box, per); - }; - void computeEnergy(SystemInterface &s) override { - float box[3]; - int per[3]; - for (int i = 0; i < 3; i++) { - box[i] = static_cast(s.box()[i]); - per[i] = box_geo.periodic(i); - } - auto energy = reinterpret_cast(s.eGpu()); - DipolarDirectSum_kernel_wrapper_energy( - m_pk, static_cast(s.npart_gpu()), s.rGpuBegin(), - s.dipGpuBegin(), box, per, &(energy->dipolar)); - }; + void computeForces(SystemInterface &s) override; + void computeEnergy(SystemInterface &s) override; void activate(); void deactivate(); private: - float m_pk; + float m_prefactor; }; #endif // DIPOLAR_DIRECT_SUM diff --git a/src/core/actor/DipolarDirectSum_cuda.cu b/src/core/actor/DipolarDirectSum_cuda.cu index 8c1e37754b2..532f551e85a 100644 --- a/src/core/actor/DipolarDirectSum_cuda.cu +++ b/src/core/actor/DipolarDirectSum_cuda.cu @@ -21,6 +21,8 @@ #ifdef DIPOLAR_DIRECT_SUM +#include "DipolarDirectSum_cuda.cuh" + #include "cuda_utils.cuh" #include diff --git a/src/core/actor/DipolarDirectSum_cuda.cuh b/src/core/actor/DipolarDirectSum_cuda.cuh new file mode 100644 index 00000000000..31b3171a45b --- /dev/null +++ b/src/core/actor/DipolarDirectSum_cuda.cuh @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2010-2019 The ESPResSo project + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef DIPOLARDIRECTSUM_CUH_ +#define DIPOLARDIRECTSUM_CUH_ + +#include "config.hpp" + +#ifdef DIPOLAR_DIRECT_SUM + +void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, + float *dip, float box_l[3], + int periodic[3], float *E); +void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, + float *dip, float *f, float *torque, + float box_l[3], int periodic[3]); + +#endif // DIPOLAR_DIRECT_SUM + +#endif /* DIPOLARDIRECTSUM_CUH_ */ diff --git a/src/core/actor/Mmm1dgpuForce.cpp b/src/core/actor/Mmm1dgpuForce.cpp index dcfcdd857c1..7baedba8e19 100644 --- a/src/core/actor/Mmm1dgpuForce.cpp +++ b/src/core/actor/Mmm1dgpuForce.cpp @@ -28,13 +28,26 @@ #include -void Mmm1dgpuForce::sanity_checks() { +Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) { + // interface sanity checks + if (!s.requestFGpu()) + throw std::runtime_error("Mmm1dgpuForce needs access to forces on GPU!"); + + if (!s.requestRGpu()) + throw std::runtime_error("Mmm1dgpuForce needs access to positions on GPU!"); + + if (!s.requestQGpu()) + throw std::runtime_error("Mmm1dgpuForce needs access to charges on GPU!"); + + // system sanity checks if (box_geo.periodic(0) || box_geo.periodic(1) || !box_geo.periodic(2)) { throw std::runtime_error("MMM1D requires periodicity (0, 0, 1)"); } if (cell_structure.decomposition_type() != CELL_STRUCTURE_NSQUARE) { throw std::runtime_error("MMM1D requires the N-square cellsystem"); } + + modpsi_init(); } void Mmm1dgpuForce::activate() { diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp index 293154c1a30..ea16efb3d31 100644 --- a/src/core/actor/Mmm1dgpuForce.hpp +++ b/src/core/actor/Mmm1dgpuForce.hpp @@ -28,9 +28,9 @@ class Mmm1dgpuForce : public Actor { public: - // constructor Mmm1dgpuForce(SystemInterface &s); ~Mmm1dgpuForce() override; + // interface methods void computeForces(SystemInterface &s) override; void computeEnergy(SystemInterface &s) override; @@ -45,32 +45,37 @@ class Mmm1dgpuForce : public Actor { private: // CUDA parameters - unsigned int numThreads; + unsigned int numThreads = 64; unsigned int numBlocks(SystemInterface const &s) const; // the box length currently set on the GPU // Needed to make sure it hasn't been modified after inter coulomb was used. - float host_boxz; + float host_boxz = 0; // the number of particles we had during the last run. 
Needed to check if we // have to realloc dev_forcePairs - unsigned int host_npart; - bool need_tune; + unsigned int host_npart = 0; + bool need_tune = true; // pairs==0: return forces using atomicAdd // pairs==1: return force pairs // pairs==2: return forces using a global memory reduction - int pairs; + int pairs = -1; // variables for forces and energies calculated pre-reduction - float *dev_forcePairs, *dev_energyBlocks; + float *dev_forcePairs = nullptr; + float *dev_energyBlocks = nullptr; // MMM1D parameters - float coulomb_prefactor, maxPWerror, far_switch_radius; - int bessel_cutoff; + float coulomb_prefactor = 0; + float maxPWerror = -1; + float far_switch_radius = -1; + int bessel_cutoff = -1; // run a single force calculation and return the time it takes using // high-precision CUDA timers float force_benchmark(SystemInterface &s); + void modpsi_init(); + // some functions to move MPI dependencies out of the .cu file void sanity_checks(); }; diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index 6c5a7f57a06..0755b319686 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -86,7 +86,7 @@ __device__ float dev_mod_psi_odd(int n, float x) { static_cast(device_linModPsi_lengths[2 * n + 1]), x * x); } -int modpsi_init() { +void Mmm1dgpuForce::modpsi_init() { create_mod_psi_up_to(modpsi_order); // linearized array on host @@ -129,28 +129,6 @@ int modpsi_init() { auto const n_modPsi = static_cast(modPsi.size() >> 1); cuda_safe_mem(cudaMemcpyToSymbol(device_n_modPsi, &n_modPsi, sizeof(int))); } - - return 0; -} - -Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) - : numThreads(64), host_boxz(0), host_npart(0), need_tune(true), pairs(-1), - dev_forcePairs(nullptr), dev_energyBlocks(nullptr), coulomb_prefactor(0), - maxPWerror(-1), far_switch_radius(-1), bessel_cutoff(-1) { - // interface sanity checks - if (!s.requestFGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to forces on GPU!"); - - if (!s.requestRGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to positions on GPU!"); - - if (!s.requestQGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to charges on GPU!"); - - // system sanity checks - sanity_checks(); - - modpsi_init(); } void Mmm1dgpuForce::setup(SystemInterface &s) { From 2401244c1e92ce02595db2a93bfacf9fd2850b58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 8 Dec 2021 13:04:19 +0100 Subject: [PATCH 14/21] core: Reduce visibility of magnetostatics globals The `dipole` global variable and the associated enum values are no longer used by the magnetostatics Cython interface. 
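The coupling between the core and the Cython layer is thereby reduced to a few free functions; schematically (a recap of the declarations touched below, not a verbatim header excerpt):

    // electrostatics_magnetostatics/dipole.hpp, namespace Dipole:
    double get_Dprefactor();      // read access, replaces `dipole.prefactor`
    void set_Dprefactor(double);  // validates the prefactor (throws if < 0)
    void disable_method_local();  // replaces `dipole.method = DIPOLAR_NONE`

    // electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp:
    int mdds_get_n_replica();     // replaces the extern `mdds_n_replica`
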
--- src/core/actor/DipolarBarnesHut.cpp | 1 - src/core/actor/Mmm1dgpuForce.cpp | 10 +++++++ .../electrostatics_magnetostatics/dipole.cpp | 8 ++++-- .../electrostatics_magnetostatics/dipole.hpp | 3 ++ .../magnetic_non_p3m_methods.cpp | 10 +++++-- .../magnetic_non_p3m_methods.hpp | 4 +-- src/python/espressomd/magnetostatics.pxd | 20 ++----------- src/python/espressomd/magnetostatics.pyx | 28 +++++++++++-------- src/python/espressomd/scafacos.pyx | 2 +- 9 files changed, 47 insertions(+), 39 deletions(-) diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 85a6d5bbe6b..2b067dc2584 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -49,7 +49,6 @@ DipolarBarnesHut::DipolarBarnesHut(SystemInterface &s) { } void DipolarBarnesHut::activate() { - // also necessary on 1 CPU or GPU, does more than just broadcasting dipole.method = DIPOLAR_BH_GPU; mpi_bcast_coulomb_params(); diff --git a/src/core/actor/Mmm1dgpuForce.cpp b/src/core/actor/Mmm1dgpuForce.cpp index 7baedba8e19..827337d8c1a 100644 --- a/src/core/actor/Mmm1dgpuForce.cpp +++ b/src/core/actor/Mmm1dgpuForce.cpp @@ -21,6 +21,10 @@ #ifdef MMM1D_GPU #include "actor/Mmm1dgpuForce.hpp" + +#include "electrostatics_magnetostatics/common.hpp" +#include "electrostatics_magnetostatics/coulomb.hpp" + #include "cells.hpp" #include "energy.hpp" #include "forces.hpp" @@ -51,11 +55,17 @@ Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) { } void Mmm1dgpuForce::activate() { + coulomb.method = COULOMB_MMM1D_GPU; + mpi_bcast_coulomb_params(); + forceActors.push_back(this); energyActors.push_back(this); } void Mmm1dgpuForce::deactivate() { + coulomb.method = COULOMB_NONE; + mpi_bcast_coulomb_params(); + forceActors.remove(this); energyActors.remove(this); } diff --git a/src/core/electrostatics_magnetostatics/dipole.cpp b/src/core/electrostatics_magnetostatics/dipole.cpp index 77291e3bae0..4295ac62647 100644 --- a/src/core/electrostatics_magnetostatics/dipole.cpp +++ b/src/core/electrostatics_magnetostatics/dipole.cpp @@ -78,7 +78,7 @@ bool sanity_checks() { mdlc_sanity_checks(); // fall through case DIPOLAR_DS: - mdds_sanity_checks(mdds_n_replica); + mdds_sanity_checks(); break; default: break; @@ -287,13 +287,15 @@ void set_Dprefactor(double prefactor) { if (prefactor < 0.0) { throw std::invalid_argument("Dipolar prefactor has to be >= 0"); } - dipole.prefactor = prefactor; - mpi_bcast_coulomb_params(); } +double get_Dprefactor() { return dipole.prefactor; } + void set_method_local(DipolarInteraction method) { dipole.method = method; } +void disable_method_local() { dipole.method = DIPOLAR_NONE; } + } // namespace Dipole #endif // DIPOLES diff --git a/src/core/electrostatics_magnetostatics/dipole.hpp b/src/core/electrostatics_magnetostatics/dipole.hpp index 7ef92038ef3..a6cdb49ebf9 100644 --- a/src/core/electrostatics_magnetostatics/dipole.hpp +++ b/src/core/electrostatics_magnetostatics/dipole.hpp @@ -86,8 +86,11 @@ void bcast_params(const boost::mpi::communicator &comm); /** @brief Set the dipolar prefactor */ void set_Dprefactor(double prefactor); +/** @brief Get the dipolar prefactor */ +double get_Dprefactor(); void set_method_local(DipolarInteraction method); +void disable_method_local(); } // namespace Dipole #else // DIPOLES namespace Dipole { diff --git a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp index 52fbb901f8b..619af521368 100644 --- 
a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp +++ b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp @@ -137,9 +137,11 @@ double dawaanr_calculations(bool force_flag, bool energy_flag, ============================================================================= */ -int mdds_n_replica = 0; +static int mdds_n_replica = 0; -void mdds_sanity_checks(int n_replica) { +int mdds_get_n_replica() { return mdds_n_replica; } + +void sanity_checks(int n_replica) { if (box_geo.periodic(0) and box_geo.periodic(1) and box_geo.periodic(2) and n_replica == 0) { throw std::runtime_error("Dipolar direct sum with replica does not " @@ -147,6 +149,8 @@ void mdds_sanity_checks(int n_replica) { } } +void mdds_sanity_checks() { sanity_checks(mdds_n_replica); } + double magnetic_dipolar_direct_sum_calculations(bool force_flag, bool energy_flag, ParticleRange const &particles) { @@ -320,7 +324,7 @@ void mdds_set_params(int n_replica) { if (n_replica < 0) { throw std::runtime_error("Dipolar direct sum requires n_replica >= 0."); } - mdds_sanity_checks(n_replica); + sanity_checks(n_replica); if (n_replica == 0) { fprintf(stderr, "Careful: the number of extra replicas to take into " "account during the direct sum calculation is zero\n"); diff --git a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp index dd1664e9d93..7565f32cbe9 100644 --- a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp +++ b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp @@ -62,7 +62,7 @@ void dawaanr_set_params(); */ /** Sanity checks for the magnetic dipolar direct sum. */ -void mdds_sanity_checks(int n_replica); +void mdds_sanity_checks(); /** Core of the method: here you compute all the magnetic forces, torques and * the energy for the whole system using direct sum @@ -76,7 +76,7 @@ double magnetic_dipolar_direct_sum_calculations(bool force_flag, */ void mdds_set_params(int n_replica); -extern int mdds_n_replica; +int mdds_get_n_replica(); #endif /*of ifdef DIPOLES */ #endif /* of ifndef MAG_NON_P3M_H */ diff --git a/src/python/espressomd/magnetostatics.pxd b/src/python/espressomd/magnetostatics.pxd index d12f5bee383..fb6893f3ec6 100644 --- a/src/python/espressomd/magnetostatics.pxd +++ b/src/python/espressomd/magnetostatics.pxd @@ -31,29 +31,15 @@ IF DIPOLES == 1: cdef extern from "electrostatics_magnetostatics/common.hpp": void mpi_bcast_coulomb_params() - cdef extern from "electrostatics_magnetostatics/dipole.hpp": - ctypedef enum DipolarInteraction: - DIPOLAR_NONE = 0, - DIPOLAR_P3M, - DIPOLAR_MDLC_P3M, - DIPOLAR_ALL_WITH_ALL_AND_NO_REPLICA, - DIPOLAR_DS, - DIPOLAR_MDLC_DS, - DIPOLAR_SCAFACOS - - ctypedef struct Dipole_parameters: - double prefactor - DipolarInteraction method - - cdef extern Dipole_parameters dipole - cdef extern from "electrostatics_magnetostatics/dipole.hpp" namespace "Dipole": void set_Dprefactor(double prefactor) except + + double get_Dprefactor() + void disable_method_local() cdef extern from "electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp": void dawaanr_set_params() except + void mdds_set_params(int n_replica) except + - int mdds_n_replica + int mdds_get_n_replica() IF(CUDA == 1) and (ROTATION == 1): cdef extern from "actor/DipolarDirectSum.hpp": diff --git a/src/python/espressomd/magnetostatics.pyx b/src/python/espressomd/magnetostatics.pyx index f6b09376e51..3dc0571a3bf 100644 --- a/src/python/espressomd/magnetostatics.pyx +++ 
b/src/python/espressomd/magnetostatics.pyx @@ -45,6 +45,13 @@ IF DIPOLES == 1: if not self._params["prefactor"] >= 0: raise ValueError("prefactor should be a positive float") + def get_magnetostatics_prefactor(self): + """ + Get the magnetostatics prefactor + + """ + return get_Dprefactor() + def set_magnetostatics_prefactor(self): """ Set the magnetostatics prefactor @@ -54,12 +61,9 @@ IF DIPOLES == 1: # also necessary on 1 CPU or GPU, does more than just broadcasting mpi_bcast_coulomb_params() - def _get_active_method_from_es_core(self): - return dipole.method - def _deactivate_method(self): set_Dprefactor(0.0) - dipole.method = DIPOLAR_NONE + disable_method_local() mpi_bcast_coulomb_params() IF DP3M == 1: @@ -148,7 +152,7 @@ IF DP3M == 1: def _get_params_from_es_core(self): params = {} params.update(dp3m.params) - params["prefactor"] = dipole.prefactor + params["prefactor"] = self.get_magnetostatics_prefactor() params["tune"] = self._params["tune"] params["timings"] = self._params["timings"] return params @@ -220,7 +224,7 @@ IF DIPOLES == 1: return ("prefactor",) def _get_params_from_es_core(self): - return {"prefactor": dipole.prefactor} + return {"prefactor": self.get_magnetostatics_prefactor()} def _activate_method(self): self._set_params_in_es_core() @@ -258,7 +262,8 @@ IF DIPOLES == 1: return ("prefactor", "n_replica") def _get_params_from_es_core(self): - return {"prefactor": dipole.prefactor, "n_replica": mdds_n_replica} + return {"prefactor": self.get_magnetostatics_prefactor(), + "n_replica": mdds_get_n_replica()} def _activate_method(self): self._set_params_in_es_core() @@ -296,13 +301,12 @@ IF DIPOLES == 1: Actor.__init__(self, *args, **kwargs) def _activate_method(self): - set_Dprefactor(self._params["prefactor"]) + self.set_magnetostatics_prefactor() self._set_params_in_es_core() def _deactivate_method(self): - dipole.method = DIPOLAR_NONE scafacos.free_handle(self.dipolar) - mpi_bcast_coulomb_params() + super()._deactivate_method() def default_params(self): return {} @@ -343,7 +347,7 @@ IF DIPOLES == 1: return ("prefactor",) def _get_params_from_es_core(self): - return {"prefactor": dipole.prefactor} + return {"prefactor": self.get_magnetostatics_prefactor()} def _activate_method(self): self._set_params_in_es_core() @@ -384,7 +388,7 @@ IF DIPOLES == 1: return ("prefactor", "epssq", "itolsq") def _get_params_from_es_core(self): - return {"prefactor": dipole.prefactor} + return {"prefactor": self.get_magnetostatics_prefactor()} def _activate_method(self): self._set_params_in_es_core() diff --git a/src/python/espressomd/scafacos.pyx b/src/python/espressomd/scafacos.pyx index 07394e77e8c..04dd88496da 100644 --- a/src/python/espressomd/scafacos.pyx +++ b/src/python/espressomd/scafacos.pyx @@ -98,7 +98,7 @@ IF SCAFACOS == 1: # Re-add the prefactor to the parameter set if self.dipolar: IF DIPOLES == 1: - res["prefactor"] = magnetostatics.dipole.prefactor + res["prefactor"] = magnetostatics.get_Dprefactor() pass else: IF ELECTROSTATICS == 1: From 0e6c336c6f3934a87cbe84f0368a819b313cf270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 8 Dec 2021 16:38:41 +0100 Subject: [PATCH 15/21] core: Proper error propagation --- src/core/EspressoSystemInterface.hpp | 18 +++---- src/core/SystemInterface.hpp | 36 +++++++++---- src/core/actor/DipolarBarnesHut.cpp | 15 ++---- src/core/actor/DipolarDirectSum.cpp | 16 ++---- src/core/actor/Mmm1dgpuForce.cpp | 12 ++--- .../EspressoSystemInterface_test.cpp | 51 +++++++++---------- src/python/espressomd/magnetostatics.pxd | 
4 +- 7 files changed, 69 insertions(+), 83 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index feecf6bd53b..838520b0ae1 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -55,57 +55,52 @@ class EspressoSystemInterface : public SystemInterface { #ifdef CUDA float *rGpuBegin() override { return m_r_gpu_begin; }; bool hasRGpu() override { return true; }; - bool requestRGpu() override { + void requestRGpu() override { m_needsRGpu = hasRGpu(); m_splitParticleStructGpu |= m_needsRGpu; m_gpu |= m_needsRGpu; if (m_gpu) enableParticleCommunication(); - return m_needsRGpu; }; #ifdef DIPOLES float *dipGpuBegin() override { return m_dip_gpu_begin; }; bool hasDipGpu() override { return true; }; - bool requestDipGpu() override { + void requestDipGpu() override { m_needsDipGpu = hasDipGpu(); m_splitParticleStructGpu |= m_needsRGpu; m_gpu |= m_needsRGpu; if (m_gpu) enableParticleCommunication(); - return m_needsDipGpu; }; #endif #ifdef ELECTROSTATICS float *qGpuBegin() override { return m_q_gpu_begin; }; bool hasQGpu() override { return true; }; - bool requestQGpu() override { + void requestQGpu() override { m_needsQGpu = hasQGpu(); m_splitParticleStructGpu |= m_needsQGpu; m_gpu |= m_needsQGpu; if (m_gpu) enableParticleCommunication(); - return m_needsQGpu; }; #endif - bool requestParticleStructGpu() { + void requestParticleStructGpu() { m_needsParticleStructGpu = true; m_gpu |= m_needsParticleStructGpu; if (m_gpu) enableParticleCommunication(); - return true; }; float *fGpuBegin() override { return gpu_get_particle_force_pointer(); }; bool hasFGpu() override { return true; }; - bool requestFGpu() override { + void requestFGpu() override { m_needsFGpu = hasFGpu(); m_gpu |= m_needsFGpu; if (m_gpu) enableParticleCommunication(); - return m_needsFGpu; }; #ifdef ROTATION @@ -113,12 +108,11 @@ class EspressoSystemInterface : public SystemInterface { return gpu_get_particle_torque_pointer(); }; bool hasTorqueGpu() override { return true; }; - bool requestTorqueGpu() override { + void requestTorqueGpu() override { m_needsTorqueGpu = hasTorqueGpu(); m_gpu |= m_needsTorqueGpu; if (m_gpu) enableParticleCommunication(); - return m_needsTorqueGpu; }; #endif diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index 006232efab6..15b79825362 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -24,6 +24,8 @@ #include #include +#include +#include /** * Public interface for the ESPResSo system and device memory management. @@ -46,10 +48,11 @@ * * Since properties in the @ref Particle struct depend on the config file, * it is necessary to provide a means to check if it is even possible to - * carry out the split. This is achieved by the hasXGpu() methods. - * The same value is returned from requestXGpu() and it needs to - * be checked at the call site to throw an error. This class won't throw - * exceptions. + * carry out the split. This is achieved by the hasXGpu() methods, + * whose return value must be checked at the call site to raise a runtime + * error. This is important in MPI callbacks, since they have limited support + * for standard exceptions. Methods requestXGpu() will throw an + * exception if the particle property cannot be split on device memory. 
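 *
 * A hedged call-site sketch (editorial illustration, not part of this
 * patch): MPI callbacks test the hasXGpu() predicate and report
 * through the runtime error collector, while regular code paths call
 * requestXGpu() and let the exception propagate:
 * @code{.cpp}
 * if (!system.hasQGpu()) {
 *   runtimeErrorMsg() << "charges not available on GPU"; // callback-safe
 * }
 * system.requestQGpu(); // may throw std::runtime_error
 * @endcode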
*/ class SystemInterface { public: @@ -61,28 +64,43 @@ class SystemInterface { virtual float *rGpuBegin() { return nullptr; }; virtual bool hasRGpu() { return false; }; - virtual bool requestRGpu() { return false; } + virtual void requestRGpu() { + throw std::runtime_error(error_message("positions")); + } virtual float *dipGpuBegin() { return nullptr; }; virtual bool hasDipGpu() { return false; }; - virtual bool requestDipGpu() { return false; } + virtual void requestDipGpu() { + throw std::runtime_error(error_message("dipoles")); + } virtual float *torqueGpuBegin() { return nullptr; }; virtual bool hasTorqueGpu() { return false; }; - virtual bool requestTorqueGpu() { return false; } + virtual void requestTorqueGpu() { + throw std::runtime_error(error_message("torques")); + } virtual float *qGpuBegin() { return nullptr; }; virtual bool hasQGpu() { return false; }; - virtual bool requestQGpu() { return false; } + virtual void requestQGpu() { + throw std::runtime_error(error_message("charges")); + } virtual float *fGpuBegin() { return nullptr; }; virtual bool hasFGpu() { return false; }; - virtual bool requestFGpu() { return false; } + virtual void requestFGpu() { + throw std::runtime_error(error_message("forces")); + } virtual float *eGpu() { return nullptr; }; virtual std::size_t npart_gpu() const { return 0; }; virtual Utils::Vector3d box() const = 0; + +private: + std::string error_message(std::string property) const { + return "No GPU available or particle " + property + " not compiled in."; + } }; #endif diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 2b067dc2584..582d30827c3 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -35,17 +35,10 @@ #include "forces.hpp" DipolarBarnesHut::DipolarBarnesHut(SystemInterface &s) { - if (!s.requestFGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to forces on GPU!"; - - if (!s.requestTorqueGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to torques on GPU!"; - - if (!s.requestRGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to positions on GPU!"; - - if (!s.requestDipGpu()) - runtimeErrorMsg() << "DipolarBarnesHut needs access to dipoles on GPU!"; + s.requestFGpu(); + s.requestTorqueGpu(); + s.requestRGpu(); + s.requestDipGpu(); } void DipolarBarnesHut::activate() { diff --git a/src/core/actor/DipolarDirectSum.cpp b/src/core/actor/DipolarDirectSum.cpp index a6da6b4019d..95f4cd4dc6b 100644 --- a/src/core/actor/DipolarDirectSum.cpp +++ b/src/core/actor/DipolarDirectSum.cpp @@ -30,22 +30,14 @@ #include "SystemInterface.hpp" #include "cuda_interface.hpp" #include "energy.hpp" -#include "errorhandling.hpp" #include "forces.hpp" #include "grid.hpp" DipolarDirectSum::DipolarDirectSum(SystemInterface &s) { - if (!s.requestFGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to forces on GPU!"; - - if (!s.requestTorqueGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to torques on GPU!"; - - if (!s.requestRGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to positions on GPU!"; - - if (!s.requestDipGpu()) - runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!"; + s.requestFGpu(); + s.requestTorqueGpu(); + s.requestRGpu(); + s.requestDipGpu(); } void DipolarDirectSum::activate() { diff --git a/src/core/actor/Mmm1dgpuForce.cpp b/src/core/actor/Mmm1dgpuForce.cpp index 827337d8c1a..d5695ceede6 100644 --- a/src/core/actor/Mmm1dgpuForce.cpp +++ b/src/core/actor/Mmm1dgpuForce.cpp @@ -33,15 
+33,9 @@ #include Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s) { - // interface sanity checks - if (!s.requestFGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to forces on GPU!"); - - if (!s.requestRGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to positions on GPU!"); - - if (!s.requestQGpu()) - throw std::runtime_error("Mmm1dgpuForce needs access to charges on GPU!"); + s.requestFGpu(); + s.requestRGpu(); + s.requestQGpu(); // system sanity checks if (box_geo.periodic(0) || box_geo.periodic(1) || !box_geo.periodic(2)) { diff --git a/src/core/unit_tests/EspressoSystemInterface_test.cpp b/src/core/unit_tests/EspressoSystemInterface_test.cpp index 09fb492aea2..61eace5d331 100644 --- a/src/core/unit_tests/EspressoSystemInterface_test.cpp +++ b/src/core/unit_tests/EspressoSystemInterface_test.cpp @@ -68,20 +68,6 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { check_uninitialized_device_pointers(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - // features compiled in - auto has_feature_rotation = false; - auto has_feature_electrostatics = false; - auto has_feature_dipoles = false; -#ifdef ROTATION - has_feature_rotation = true; -#endif -#ifdef ELECTROSTATICS - has_feature_electrostatics = true; -#endif -#ifdef DIPOLES - has_feature_dipoles = true; -#endif - auto const pid = 1; place_particle(pid, {0., 0., 0.}); set_particle_type(pid, 0); @@ -89,44 +75,53 @@ BOOST_AUTO_TEST_CASE(check_with_gpu, *boost::unit_test::precondition(has_gpu)) { BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); espresso::system.update(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - BOOST_TEST(espresso::system.requestParticleStructGpu()); + espresso::system.requestParticleStructGpu(); espresso::system.update(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 1); // check position split BOOST_TEST(espresso::system.hasRGpu()); - BOOST_TEST(espresso::system.requestRGpu()); + espresso::system.requestRGpu(); espresso::system.update(); BOOST_TEST(espresso::system.rGpuBegin() != nullptr); // check force split BOOST_TEST(espresso::system.hasFGpu()); - BOOST_TEST(espresso::system.requestFGpu()); + espresso::system.requestFGpu(); espresso::system.update(); BOOST_TEST(espresso::system.fGpuBegin() != nullptr); // check torque split - BOOST_CHECK_EQUAL(espresso::system.hasTorqueGpu(), has_feature_rotation); - BOOST_CHECK_EQUAL(espresso::system.requestTorqueGpu(), has_feature_rotation); #ifdef ROTATION + BOOST_CHECK(espresso::system.hasTorqueGpu()); + espresso::system.requestTorqueGpu(); espresso::system.update(); BOOST_TEST(espresso::system.torqueGpuBegin() != nullptr); +#else + BOOST_CHECK(!espresso::system.hasTorqueGpu()); + BOOST_CHECK_THROW(espresso::system.requestTorqueGpu(), std::runtime_error); #endif // check charge split - BOOST_CHECK_EQUAL(espresso::system.hasQGpu(), has_feature_electrostatics); - BOOST_CHECK_EQUAL(espresso::system.requestQGpu(), has_feature_electrostatics); #ifdef ELECTROSTATICS + BOOST_CHECK(espresso::system.hasQGpu()); + espresso::system.requestQGpu(); espresso::system.update(); BOOST_TEST(espresso::system.qGpuBegin() != nullptr); +#else + BOOST_CHECK(!espresso::system.hasQGpu()); + BOOST_CHECK_THROW(espresso::system.requestQGpu(), std::runtime_error); #endif // check dipole split - BOOST_CHECK_EQUAL(espresso::system.hasDipGpu(), has_feature_dipoles); - BOOST_CHECK_EQUAL(espresso::system.requestDipGpu(), has_feature_dipoles); #ifdef DIPOLES + BOOST_CHECK(espresso::system.hasDipGpu()); + 
espresso::system.requestDipGpu(); espresso::system.update(); BOOST_TEST(espresso::system.dipGpuBegin() != nullptr); +#else + BOOST_CHECK(!espresso::system.hasDipGpu()); + BOOST_CHECK_THROW(espresso::system.requestDipGpu(), std::runtime_error); #endif // clear device memory @@ -141,15 +136,15 @@ BOOST_AUTO_TEST_CASE(check_without_cuda) { check_uninitialized_device_pointers(); BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); BOOST_TEST(!espresso::system.hasRGpu()); - BOOST_TEST(!espresso::system.requestRGpu()); BOOST_TEST(!espresso::system.hasDipGpu()); - BOOST_TEST(!espresso::system.requestDipGpu()); BOOST_TEST(!espresso::system.hasFGpu()); - BOOST_TEST(!espresso::system.requestFGpu()); BOOST_TEST(!espresso::system.hasTorqueGpu()); - BOOST_TEST(!espresso::system.requestTorqueGpu()); BOOST_TEST(!espresso::system.hasQGpu()); - BOOST_TEST(!espresso::system.requestQGpu()); + BOOST_CHECK_THROW(espresso::system.requestRGpu(), std::runtime_error); + BOOST_CHECK_THROW(espresso::system.requestDipGpu(), std::runtime_error); + BOOST_CHECK_THROW(espresso::system.requestFGpu(), std::runtime_error); + BOOST_CHECK_THROW(espresso::system.requestTorqueGpu(), std::runtime_error); + BOOST_CHECK_THROW(espresso::system.requestQGpu(), std::runtime_error); } #endif // CUDA diff --git a/src/python/espressomd/magnetostatics.pxd b/src/python/espressomd/magnetostatics.pxd index fb6893f3ec6..45df8411b2f 100644 --- a/src/python/espressomd/magnetostatics.pxd +++ b/src/python/espressomd/magnetostatics.pxd @@ -44,7 +44,7 @@ IF DIPOLES == 1: IF(CUDA == 1) and (ROTATION == 1): cdef extern from "actor/DipolarDirectSum.hpp": cdef cppclass DipolarDirectSum: - DipolarDirectSum(SystemInterface & s) + DipolarDirectSum(SystemInterface & s) except + void set_params() void activate() void deactivate() @@ -52,7 +52,7 @@ IF DIPOLES == 1: IF(DIPOLAR_BARNES_HUT == 1): cdef extern from "actor/DipolarBarnesHut.hpp": cdef cppclass DipolarBarnesHut: - DipolarBarnesHut(SystemInterface & s) + DipolarBarnesHut(SystemInterface & s) except + void set_params(float epssq, float itolsq) void activate() void deactivate() From d29e63302c1c112acd4e717055fc5c2c6034fef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 9 Dec 2021 09:39:04 +0100 Subject: [PATCH 16/21] core: Remove code duplication --- src/core/actor/DipolarBarnesHut.cpp | 36 +++++++++++++---------------- src/core/actor/DipolarBarnesHut.hpp | 1 + 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp index 582d30827c3..67d82ad2960 100644 --- a/src/core/actor/DipolarBarnesHut.cpp +++ b/src/core/actor/DipolarBarnesHut.cpp @@ -64,12 +64,12 @@ void DipolarBarnesHut::set_params(float epssq, float itolsq) { setBHPrecision(m_epssq, m_itolsq); } -void DipolarBarnesHut::computeForces(SystemInterface &s) { +int DipolarBarnesHut::initialize_data_structure(SystemInterface &s) { try { allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); } catch (cuda_runtime_error const &err) { runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); - return; + return ES_ERROR; } fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); @@ -78,28 +78,24 @@ void DipolarBarnesHut::computeForces(SystemInterface &s) { buildTreeBH(m_bh_data.blocks); summarizeBH(m_bh_data.blocks); sortBH(m_bh_data.blocks); - if (forceBH(&m_bh_data, m_prefactor, s.fGpuBegin(), s.torqueGpuBegin())) { - runtimeErrorMsg() << "kernels encountered a functional error"; - } + + return ES_OK; } -void 
DipolarBarnesHut::computeEnergy(SystemInterface &s) { - try { - allocBHmemCopy(static_cast(s.npart_gpu()), &m_bh_data); - } catch (cuda_runtime_error const &err) { - runtimeErrorMsg() << "DipolarBarnesHut: " << err.what(); - return; +void DipolarBarnesHut::computeForces(SystemInterface &s) { + if (initialize_data_structure(s) == ES_OK) { + if (forceBH(&m_bh_data, m_prefactor, s.fGpuBegin(), s.torqueGpuBegin())) { + runtimeErrorMsg() << "kernels encountered a functional error"; + } } +} - fill_bh_data(s.rGpuBegin(), s.dipGpuBegin(), &m_bh_data); - initBHgpu(m_bh_data.blocks); - buildBoxBH(m_bh_data.blocks); - buildTreeBH(m_bh_data.blocks); - summarizeBH(m_bh_data.blocks); - sortBH(m_bh_data.blocks); - auto energy = reinterpret_cast(s.eGpu()); - if (energyBH(&m_bh_data, m_prefactor, &(energy->dipolar))) { - runtimeErrorMsg() << "kernels encountered a functional error"; +void DipolarBarnesHut::computeEnergy(SystemInterface &s) { + if (initialize_data_structure(s) == ES_OK) { + auto energy = reinterpret_cast(s.eGpu()); + if (energyBH(&m_bh_data, m_prefactor, &(energy->dipolar))) { + runtimeErrorMsg() << "kernels encountered a functional error"; + } } } diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index 4cf76ecd5ec..4c380b39d63 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -48,6 +48,7 @@ class DipolarBarnesHut : public Actor { BHData m_bh_data = {0, 0, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; + int initialize_data_structure(SystemInterface &s); }; #endif // DIPOLAR_BARNES_HUT From 799aee08771866d28a45de760fbd80c6b0624f72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 9 Dec 2021 10:00:04 +0100 Subject: [PATCH 17/21] core: Remove unused GPU kernels --- src/core/grid_based_algorithms/lbgpu_cuda.cu | 22 ------------------- .../virtual_sites/lb_inertialess_tracers.hpp | 10 +-------- .../lb_inertialess_tracers_cuda.cu | 11 ---------- 3 files changed, 1 insertion(+), 42 deletions(-) diff --git a/src/core/grid_based_algorithms/lbgpu_cuda.cu b/src/core/grid_based_algorithms/lbgpu_cuda.cu index cc80bec200d..becd3ec8ba0 100644 --- a/src/core/grid_based_algorithms/lbgpu_cuda.cu +++ b/src/core/grid_based_algorithms/lbgpu_cuda.cu @@ -1035,28 +1035,6 @@ __device__ void calc_values_from_m(Utils::Array const &mode_single, pi_out = stress_from_stress_modes(stress_modes(d_v_single, mode_single)); } -/** Calculate temperature of the fluid kernel - * @param[out] cpu_jsquared Result - * @param[in] n_a Local node residing in array a - * @param[out] number_of_non_boundary_nodes Local node residing in array a - */ -__global__ void temperature(LB_nodes_gpu n_a, float *cpu_jsquared, - int *number_of_non_boundary_nodes) { - Utils::Array mode; - float jsquared = 0.0f; - unsigned int index = blockIdx.y * gridDim.x * blockDim.x + - blockDim.x * blockIdx.x + threadIdx.x; - - if (index < para->number_of_nodes) { - if (!n_a.boundary[index]) { - calc_mass_and_momentum_mode(mode, n_a, index); - jsquared = mode[1] * mode[1] + mode[2] * mode[2] + mode[3] * mode[3]; - atomicAdd(cpu_jsquared, jsquared); - atomicAdd(number_of_non_boundary_nodes, 1); - } - } -} - /** Interpolation kernel. 
* See @cite duenweg09a * @param u Distance to grid point in units of agrid diff --git a/src/core/virtual_sites/lb_inertialess_tracers.hpp b/src/core/virtual_sites/lb_inertialess_tracers.hpp index dbdf1b30de6..8377e8d33d4 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers.hpp +++ b/src/core/virtual_sites/lb_inertialess_tracers.hpp @@ -28,18 +28,10 @@ #include "ParticleRange.hpp" -// Main functions for CPU & GPU void IBM_UpdateParticlePositions(ParticleRange const &particles, double time_step); - -// Main functions for CPU implementation - called from integrate.cpp void IBM_ForcesIntoFluid_CPU(); - -// Main functions for GPU implementation - called from integrate.cpp -// These are in ibm_cuda.cu void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles); -void IBM_ResetLBForces_GPU(); - -#endif +#endif // VIRTUAL_SITES_INERTIALESS_TRACERS #endif diff --git a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu index 021f03373af..8fa0363d6a0 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu +++ b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu @@ -280,17 +280,6 @@ __global__ void ResetLBForces_Kernel(LB_node_force_density_gpu node_f, } } -/** Call a kernel to reset the forces on the LB nodes to the external force. */ -void IBM_ResetLBForces_GPU() { - if (this_node == 0) { - dim3 dim_grid = - calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block); - - KERNELCALL(ResetLBForces_Kernel, dim_grid, threads_per_block, node_f, - para_gpu); - } -} - /** Transfer particle forces into the LB fluid. * Called from @ref integrate. * This must be the first CUDA-IBM function to be called because it also does From a8f1403035b5beb849232a1fcd850ca69ebbd011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 9 Dec 2021 10:33:12 +0100 Subject: [PATCH 18/21] core: Hide GPU interface implementation details --- src/core/EspressoSystemInterface.cpp | 12 ++++++++++++ src/core/EspressoSystemInterface.hpp | 26 +++++++------------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp index fc09ddb1994..559b4d51830 100644 --- a/src/core/EspressoSystemInterface.cpp +++ b/src/core/EspressoSystemInterface.cpp @@ -40,6 +40,18 @@ void EspressoSystemInterface::gatherParticles() { #endif } +#ifdef CUDA +void EspressoSystemInterface::enableParticleCommunication() { + if (m_gpu) { + if (!gpu_get_global_particle_vars_pointer_host()->communication_enabled) { + gpu_init_particle_comm(); + cuda_bcast_global_part_params(); + reallocDeviceMemory(gpu_get_particle_pointer().size()); + } + } +} +#endif + void EspressoSystemInterface::init() { gatherParticles(); } void EspressoSystemInterface::update() { gatherParticles(); } diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 838520b0ae1..136ff5d0180 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -59,8 +59,7 @@ class EspressoSystemInterface : public SystemInterface { m_needsRGpu = hasRGpu(); m_splitParticleStructGpu |= m_needsRGpu; m_gpu |= m_needsRGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; #ifdef DIPOLES @@ -70,8 +69,7 @@ class EspressoSystemInterface : public SystemInterface { m_needsDipGpu = hasDipGpu(); m_splitParticleStructGpu |= m_needsRGpu; m_gpu |= m_needsRGpu; - if (m_gpu) - enableParticleCommunication(); + 
enableParticleCommunication(); }; #endif @@ -82,16 +80,14 @@ class EspressoSystemInterface : public SystemInterface { m_needsQGpu = hasQGpu(); m_splitParticleStructGpu |= m_needsQGpu; m_gpu |= m_needsQGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; #endif void requestParticleStructGpu() { m_needsParticleStructGpu = true; m_gpu |= m_needsParticleStructGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; float *fGpuBegin() override { return gpu_get_particle_force_pointer(); }; @@ -99,8 +95,7 @@ class EspressoSystemInterface : public SystemInterface { void requestFGpu() override { m_needsFGpu = hasFGpu(); m_gpu |= m_needsFGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; #ifdef ROTATION @@ -111,8 +106,7 @@ class EspressoSystemInterface : public SystemInterface { void requestTorqueGpu() override { m_needsTorqueGpu = hasTorqueGpu(); m_gpu |= m_needsTorqueGpu; - if (m_gpu) - enableParticleCommunication(); + enableParticleCommunication(); }; #endif @@ -140,13 +134,7 @@ class EspressoSystemInterface : public SystemInterface { void gatherParticles(); void split_particle_struct(); #ifdef CUDA - void enableParticleCommunication() { - if (!gpu_get_global_particle_vars_pointer_host()->communication_enabled) { - gpu_init_particle_comm(); - cuda_bcast_global_part_params(); - reallocDeviceMemory(gpu_get_particle_pointer().size()); - } - }; + void enableParticleCommunication(); void reallocDeviceMemory(std::size_t n); private: From cc7c4d6e1e5824c03edbd6a912735dc3e60e7b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 9 Dec 2021 17:50:24 +0100 Subject: [PATCH 19/21] core: Pass global variable this_node by value --- src/core/EspressoSystemInterface.cpp | 4 +- src/core/cuda_common_cuda.cu | 14 +-- src/core/cuda_interface.hpp | 6 +- .../p3m-common.cpp | 4 +- src/core/forces.cpp | 2 +- src/core/grid_based_algorithms/lbgpu.cpp | 2 +- src/core/grid_based_algorithms/lbgpu_cuda.cu | 4 - .../VirtualSitesInertialessTracers.cpp | 8 +- .../virtual_sites/lb_inertialess_tracers.cpp | 4 +- .../virtual_sites/lb_inertialess_tracers.hpp | 4 +- .../lb_inertialess_tracers_cuda.cu | 109 ++++++++---------- .../lb_inertialess_tracers_cuda_interface.hpp | 3 +- 12 files changed, 76 insertions(+), 88 deletions(-) diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp index 559b4d51830..17c7015b2ad 100644 --- a/src/core/EspressoSystemInterface.cpp +++ b/src/core/EspressoSystemInterface.cpp @@ -31,7 +31,7 @@ void EspressoSystemInterface::gatherParticles() { #ifdef CUDA if (m_gpu) { if (gpu_get_global_particle_vars_pointer_host()->communication_enabled) { - copy_part_data_to_gpu(cell_structure.local_particles()); + copy_part_data_to_gpu(cell_structure.local_particles(), this_node); reallocDeviceMemory(gpu_get_particle_pointer().size()); if (m_splitParticleStructGpu && (this_node == 0)) split_particle_struct(); @@ -44,7 +44,7 @@ void EspressoSystemInterface::gatherParticles() { void EspressoSystemInterface::enableParticleCommunication() { if (m_gpu) { if (!gpu_get_global_particle_vars_pointer_host()->communication_enabled) { - gpu_init_particle_comm(); + gpu_init_particle_comm(this_node); cuda_bcast_global_part_params(); reallocDeviceMemory(gpu_get_particle_pointer().size()); } diff --git a/src/core/cuda_common_cuda.cu b/src/core/cuda_common_cuda.cu index 2a084138012..b0e9bf38148 100644 --- a/src/core/cuda_common_cuda.cu +++ b/src/core/cuda_common_cuda.cu @@ -36,8 
+36,6 @@ #include #include -extern int this_node; - template using device_vector = thrust::device_vector>; @@ -76,10 +74,10 @@ void cuda_check_errors_exit(const dim3 &block, const dim3 &grid, cudaError_t CU_err = cudaGetLastError(); if (CU_err != cudaSuccess) { fprintf(stderr, - "%d: error \"%s\" calling %s with dim %d %d %d, grid %d %d " + "error \"%s\" calling %s with dim %d %d %d, grid %d %d " "%d in %s:%u\n", - this_node, cudaGetErrorString(CU_err), function, block.x, block.y, - block.z, grid.x, grid.y, grid.z, file, line); + cudaGetErrorString(CU_err), function, block.x, block.y, block.z, + grid.x, grid.y, grid.z, file, line); errexit(); } } @@ -131,7 +129,7 @@ void resize_buffers(std::size_t number_of_particles) { * sizeof(CUDA_global_part_vars), MPI_BYTE, 0, comm_cart)` (when executed * on all nodes) */ -void gpu_init_particle_comm() { +void gpu_init_particle_comm(int this_node) { if (this_node == 0 && global_part_vars_host.communication_enabled == 0) { try { cuda_check_device(); @@ -159,7 +157,7 @@ float *gpu_get_particle_torque_pointer() { #endif CUDA_energy *gpu_get_energy_pointer() { return energy_device; } -void copy_part_data_to_gpu(ParticleRange particles) { +void copy_part_data_to_gpu(ParticleRange particles, int this_node) { if (global_part_vars_host.communication_enabled == 1) { cuda_mpi_get_particles(particles, particle_data_host); @@ -182,7 +180,7 @@ void copy_part_data_to_gpu(ParticleRange particles) { /** setup and call kernel to copy particle forces to host */ -void copy_forces_from_GPU(ParticleRange &particles) { +void copy_forces_from_GPU(ParticleRange &particles, int this_node) { if (global_part_vars_host.communication_enabled == 1) { /* Copy result from device memory to host memory*/ if (this_node == 0 && (not particle_forces_device.empty())) { diff --git a/src/core/cuda_interface.hpp b/src/core/cuda_interface.hpp index 7844c1db3d8..af9eda408b3 100644 --- a/src/core/cuda_interface.hpp +++ b/src/core/cuda_interface.hpp @@ -102,7 +102,7 @@ struct CUDA_global_part_vars { unsigned int communication_enabled; }; -void copy_forces_from_GPU(ParticleRange &particles); +void copy_forces_from_GPU(ParticleRange &particles, int this_node); CUDA_energy copy_energy_from_GPU(); void clear_energy_on_GPU(); @@ -114,12 +114,12 @@ float *gpu_get_particle_torque_pointer(); #endif CUDA_energy *gpu_get_energy_pointer(); -void gpu_init_particle_comm(); +void gpu_init_particle_comm(int this_node); void cuda_mpi_get_particles( const ParticleRange &particles, pinned_vector &particle_data_host); -void copy_part_data_to_gpu(ParticleRange particles); +void copy_part_data_to_gpu(ParticleRange particles, int this_node); /** * @brief Distribute forces to the worker nodes, and add them to the particles. 
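// Editorial note (hedged sketch; the "after" caller is the one visible in
// the forces.cpp hunk below): passing this_node by value turns a hidden
// dependency on a global owned by communication.hpp into an explicit
// function input that is trivial to stub in unit tests:
//
//   extern int this_node;                               // before
//   void copy_forces_from_GPU(ParticleRange &particles);
//
//   void copy_forces_from_GPU(ParticleRange &particles, // after
//                             int this_node);
//   copy_forces_from_GPU(particles, this_node);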
diff --git a/src/core/electrostatics_magnetostatics/p3m-common.cpp b/src/core/electrostatics_magnetostatics/p3m-common.cpp index 6ef78228bd0..48bbcdbf376 100644 --- a/src/core/electrostatics_magnetostatics/p3m-common.cpp +++ b/src/core/electrostatics_magnetostatics/p3m-common.cpp @@ -24,6 +24,7 @@ #include "p3m-common.hpp" #if defined(P3M) || defined(DP3M) +#include "communication.hpp" #include "errorhandling.hpp" #include @@ -33,9 +34,6 @@ #include #include -/* For debug messages */ -extern int this_node; - void p3m_add_block(double const *in, double *out, int const start[3], int const size[3], int const dim[3]) { /* fast,mid and slow changing indices */ diff --git a/src/core/forces.cpp b/src/core/forces.cpp index 8dfefee3dfe..9c050768dfe 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -156,7 +156,7 @@ void force_calc(CellStructure &cell_structure, double time_step, double kT) { ghost_particles, time_step); #ifdef CUDA - copy_forces_from_GPU(particles); + copy_forces_from_GPU(particles, this_node); #endif // VIRTUAL_SITES distribute forces diff --git a/src/core/grid_based_algorithms/lbgpu.cpp b/src/core/grid_based_algorithms/lbgpu.cpp index c5de7c8d7d6..3d01fecb276 100644 --- a/src/core/grid_based_algorithms/lbgpu.cpp +++ b/src/core/grid_based_algorithms/lbgpu.cpp @@ -164,7 +164,7 @@ void lb_init_gpu() { lb_init_GPU(lbpar_gpu); - gpu_init_particle_comm(); + gpu_init_particle_comm(this_node); cuda_bcast_global_part_params(); } diff --git a/src/core/grid_based_algorithms/lbgpu_cuda.cu b/src/core/grid_based_algorithms/lbgpu_cuda.cu index becd3ec8ba0..3d80fd71a52 100644 --- a/src/core/grid_based_algorithms/lbgpu_cuda.cu +++ b/src/core/grid_based_algorithms/lbgpu_cuda.cu @@ -59,8 +59,6 @@ #include #include -extern int this_node; - /** struct for hydrodynamic fields: this is for internal use * (i.e. 
stores values in LB units) and should not be used for * printing values @@ -2151,8 +2149,6 @@ void lb_init_boundaries_GPU(std::size_t host_n_lb_boundaries, int *host_boundary_node_list, int *host_boundary_index_list, float *host_lb_boundary_velocity) { - if (this_node != 0) - return; float *boundary_velocity = nullptr; int *boundary_node_list = nullptr; diff --git a/src/core/virtual_sites/VirtualSitesInertialessTracers.cpp b/src/core/virtual_sites/VirtualSitesInertialessTracers.cpp index 567d8715f1b..fcd2396c148 100644 --- a/src/core/virtual_sites/VirtualSitesInertialessTracers.cpp +++ b/src/core/virtual_sites/VirtualSitesInertialessTracers.cpp @@ -21,6 +21,7 @@ #ifdef VIRTUAL_SITES_INERTIALESS_TRACERS #include "VirtualSitesInertialessTracers.hpp" #include "cells.hpp" +#include "communication.hpp" #include "errorhandling.hpp" #include "grid_based_algorithms/lb_interface.hpp" #include "virtual_sites/lb_inertialess_tracers.hpp" @@ -34,7 +35,7 @@ void VirtualSitesInertialessTracers::after_force_calc() { } #ifdef CUDA if (lattice_switch == ActiveLB::GPU) { - IBM_ForcesIntoFluid_GPU(cell_structure.local_particles()); + IBM_ForcesIntoFluid_GPU(cell_structure.local_particles(), this_node); return; } #endif @@ -49,7 +50,8 @@ void VirtualSitesInertialessTracers::after_force_calc() { void VirtualSitesInertialessTracers::after_lb_propagation(double time_step) { #ifdef VIRTUAL_SITES_INERTIALESS_TRACERS - IBM_UpdateParticlePositions(cell_structure.local_particles(), time_step); -#endif // VS inertialess tracers + IBM_UpdateParticlePositions(cell_structure.local_particles(), time_step, + this_node); +#endif } #endif diff --git a/src/core/virtual_sites/lb_inertialess_tracers.cpp b/src/core/virtual_sites/lb_inertialess_tracers.cpp index bce83e17163..5d006a60f78 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers.cpp +++ b/src/core/virtual_sites/lb_inertialess_tracers.cpp @@ -91,13 +91,13 @@ void IBM_ForcesIntoFluid_CPU() { * Called from the integration loop right after the LB update. 
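 * Velocities are first fetched from the active lattice
 * (ParticleVelocitiesFromLB_CPU() or ParticleVelocitiesFromLB_GPU()),
 * then the tracer positions are advanced with an explicit Euler step.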
*/ void IBM_UpdateParticlePositions(ParticleRange const &particles, - double time_step) { + double time_step, int this_node) { // Get velocities if (lattice_switch == ActiveLB::CPU) ParticleVelocitiesFromLB_CPU(); #ifdef CUDA if (lattice_switch == ActiveLB::GPU) - ParticleVelocitiesFromLB_GPU(particles); + ParticleVelocitiesFromLB_GPU(particles, this_node); #endif // Do update: Euler diff --git a/src/core/virtual_sites/lb_inertialess_tracers.hpp b/src/core/virtual_sites/lb_inertialess_tracers.hpp index 8377e8d33d4..876774553a9 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers.hpp +++ b/src/core/virtual_sites/lb_inertialess_tracers.hpp @@ -29,9 +29,9 @@ #include "ParticleRange.hpp" void IBM_UpdateParticlePositions(ParticleRange const &particles, - double time_step); + double time_step, int this_node); void IBM_ForcesIntoFluid_CPU(); -void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles); +void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles, int this_node); #endif // VIRTUAL_SITES_INERTIALESS_TRACERS #endif diff --git a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu index 8fa0363d6a0..03f4cf9476a 100644 --- a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu +++ b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu @@ -39,9 +39,6 @@ #include -// To avoid including communication.hpp -extern int this_node; - // Other functions for internal use void InitCUDA_IBM(std::size_t numParticles); @@ -285,7 +282,7 @@ __global__ void ResetLBForces_Kernel(LB_node_force_density_gpu node_f, * This must be the first CUDA-IBM function to be called because it also does * some initialization. */ -void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles) { +void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles, int this_node) { // This function does // (1) Gather forces from all particles via MPI // (2) Copy forces to the GPU @@ -294,8 +291,9 @@ void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles) { auto const numParticles = gpu_get_particle_pointer().size(); // Storage only needed on head node - if (IBM_ParticleDataInput_host.empty() || !IBM_initialized || - numParticles != IBM_numParticlesCache) + if (this_node == 0 && + (IBM_ParticleDataInput_host.empty() || !IBM_initialized || + numParticles != IBM_numParticlesCache)) InitCUDA_IBM(numParticles); // We gather particle positions and forces from all nodes @@ -320,70 +318,65 @@ void IBM_ForcesIntoFluid_GPU(ParticleRange const &particles) { void InitCUDA_IBM(std::size_t const numParticles) { - // GPU only on head node - if (this_node == 0) { - - // Check if we have to delete - if (!IBM_ParticleDataInput_host.empty()) { - IBM_ParticleDataInput_host.clear(); - IBM_ParticleDataOutput_host.clear(); - cuda_safe_mem(cudaFree(IBM_ParticleDataInput_device)); - cuda_safe_mem(cudaFree(IBM_ParticleDataOutput_device)); - cuda_safe_mem(cudaFree(lb_boundary_velocity_IBM)); - } + // Check if we have to delete + if (!IBM_ParticleDataInput_host.empty()) { + IBM_ParticleDataInput_host.clear(); + IBM_ParticleDataOutput_host.clear(); + cuda_safe_mem(cudaFree(IBM_ParticleDataInput_device)); + cuda_safe_mem(cudaFree(IBM_ParticleDataOutput_device)); + cuda_safe_mem(cudaFree(lb_boundary_velocity_IBM)); + } + + // Back and forth communication of positions and velocities + IBM_ParticleDataInput_host.resize(numParticles); + IBM_ParticleDataOutput_host.resize(numParticles); + cuda_safe_mem(cudaMalloc((void **)&IBM_ParticleDataInput_device, + numParticles * 
sizeof(IBM_CUDA_ParticleDataInput))); + cuda_safe_mem(cudaMalloc((void **)&IBM_ParticleDataOutput_device, + numParticles * sizeof(IBM_CUDA_ParticleDataOutput))); - // Back and forth communication of positions and velocities - IBM_ParticleDataInput_host.resize(numParticles); - IBM_ParticleDataOutput_host.resize(numParticles); - cuda_safe_mem( - cudaMalloc((void **)&IBM_ParticleDataInput_device, - numParticles * sizeof(IBM_CUDA_ParticleDataInput))); - cuda_safe_mem( - cudaMalloc((void **)&IBM_ParticleDataOutput_device, - numParticles * sizeof(IBM_CUDA_ParticleDataOutput))); - - // Use LB parameters - lb_get_para_pointer(¶_gpu); - - // Copy boundary velocities to the GPU - // First put them into correct format + // Use LB parameters + lb_get_para_pointer(¶_gpu); + + // Copy boundary velocities to the GPU + // First put them into correct format #ifdef LB_BOUNDARIES_GPU - auto *host_lb_boundary_velocity = - new float[3 * (LBBoundaries::lbboundaries.size() + 1)]; - - for (int n = 0; n < LBBoundaries::lbboundaries.size(); n++) { - host_lb_boundary_velocity[3 * n + 0] = - static_cast(LBBoundaries::lbboundaries[n]->velocity()[0]); - host_lb_boundary_velocity[3 * n + 1] = - static_cast(LBBoundaries::lbboundaries[n]->velocity()[1]); - host_lb_boundary_velocity[3 * n + 2] = - static_cast(LBBoundaries::lbboundaries[n]->velocity()[2]); - } + auto *host_lb_boundary_velocity = + new float[3 * (LBBoundaries::lbboundaries.size() + 1)]; + + for (int n = 0; n < LBBoundaries::lbboundaries.size(); n++) { + host_lb_boundary_velocity[3 * n + 0] = + static_cast(LBBoundaries::lbboundaries[n]->velocity()[0]); + host_lb_boundary_velocity[3 * n + 1] = + static_cast(LBBoundaries::lbboundaries[n]->velocity()[1]); + host_lb_boundary_velocity[3 * n + 2] = + static_cast(LBBoundaries::lbboundaries[n]->velocity()[2]); + } - host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 0] = 0.0f; - host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 1] = 0.0f; - host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 2] = 0.0f; + host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 0] = 0.0f; + host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 1] = 0.0f; + host_lb_boundary_velocity[3 * LBBoundaries::lbboundaries.size() + 2] = 0.0f; - cuda_safe_mem( - cudaMalloc((void **)&lb_boundary_velocity_IBM, - 3 * LBBoundaries::lbboundaries.size() * sizeof(float))); - cuda_safe_mem( - cudaMemcpy(lb_boundary_velocity_IBM, host_lb_boundary_velocity, - 3 * LBBoundaries::lbboundaries.size() * sizeof(float), - cudaMemcpyHostToDevice)); + cuda_safe_mem( + cudaMalloc((void **)&lb_boundary_velocity_IBM, + 3 * LBBoundaries::lbboundaries.size() * sizeof(float))); + cuda_safe_mem( + cudaMemcpy(lb_boundary_velocity_IBM, host_lb_boundary_velocity, + 3 * LBBoundaries::lbboundaries.size() * sizeof(float), + cudaMemcpyHostToDevice)); - delete[] host_lb_boundary_velocity; + delete[] host_lb_boundary_velocity; #endif - IBM_numParticlesCache = numParticles; - IBM_initialized = true; - } + IBM_numParticlesCache = numParticles; + IBM_initialized = true; } /** Call a kernel function to interpolate the velocity at each IBM particle's * position. Store velocity in the particle data structure. 
 */
-void ParticleVelocitiesFromLB_GPU(ParticleRange const &particles) {
+void ParticleVelocitiesFromLB_GPU(ParticleRange const &particles,
+                                  int this_node) {
   // This function performs three steps:
   // (1) interpolate velocities on GPU
   // (2) transfer velocities back to CPU
diff --git a/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp b/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp
index 6baecd4d2d5..59afe0dd838 100644
--- a/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp
+++ b/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp
@@ -38,7 +38,8 @@ void IBM_cuda_mpi_send_velocities(ParticleRange const &particles);

 void IBM_cuda_mpi_get_particles(ParticleRange const &particles);

-void ParticleVelocitiesFromLB_GPU(ParticleRange const &particles);
+void ParticleVelocitiesFromLB_GPU(ParticleRange const &particles,
+                                  int this_node);

 // ******** data types for CUDA and MPI communication ******
 struct IBM_CUDA_ParticleDataInput {

From e38486cb995adeb49d81d18ac7596fd9d18aa5b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?=
Date: Fri, 10 Dec 2021 12:26:16 +0100
Subject: [PATCH 20/21] core: Improve separation of concerns

- decouple thermostats from long-range GPU methods
- narrow includes of thermostat.hpp, cells.hpp, rotation.hpp
- break cyclic dependency in bonded interactions
- move functions to different files as necessary
- pass cell_structure global as function argument as necessary
---
 .../bonded_interaction_data.hpp               |  1 -
 .../bonded_interactions/thermalized_bond.hpp  | 57 ------------
 .../thermalized_bond_kernel.hpp               | 92 +++++++++++++++++++
 .../electrostatics_magnetostatics/coulomb.cpp |  2 +-
 .../electrostatics_magnetostatics/icc.cpp     | 40 ++++----
 .../electrostatics_magnetostatics/icc.hpp     |  8 +-
 src/core/energy_inline.hpp                    |  2 +-
 src/core/forces.cpp                           | 65 ++++++++++++-
 src/core/forces.hpp                           |  7 +-
 src/core/forces_inline.hpp                    | 53 +----------
 src/core/integrate.cpp                        |  1 +
 src/core/rattle.cpp                           |  4 +
 src/core/rattle.hpp                           |  5 +-
 src/python/espressomd/particle_data.pxd       |  2 +-
 14 files changed, 192 insertions(+), 147 deletions(-)
 create mode 100644 src/core/bonded_interactions/thermalized_bond_kernel.hpp

diff --git a/src/core/bonded_interactions/bonded_interaction_data.hpp b/src/core/bonded_interactions/bonded_interaction_data.hpp
index f243eea4ba0..0b864ad8fa7 100644
--- a/src/core/bonded_interactions/bonded_interaction_data.hpp
+++ b/src/core/bonded_interactions/bonded_interaction_data.hpp
@@ -39,7 +39,6 @@
 #include "object-in-fluid/oif_global_forces_params.hpp"
 #include "object-in-fluid/oif_local_forces.hpp"
 #include "quartic.hpp"
-#include "rattle.hpp"
 #include "rigid_bond.hpp"
 #include "thermalized_bond.hpp"

diff --git a/src/core/bonded_interactions/thermalized_bond.hpp b/src/core/bonded_interactions/thermalized_bond.hpp
index e838231b84e..10c12041e85 100644
--- a/src/core/bonded_interactions/thermalized_bond.hpp
+++ b/src/core/bonded_interactions/thermalized_bond.hpp
@@ -32,14 +32,11 @@ extern int n_thermalized_bonds;

 #include "Particle.hpp"
-#include "random.hpp"
-#include "thermostat.hpp"

 #include
 #include
-#include
 #include

 /** Parameters for Thermalized bond */
 struct ThermalizedBond {
@@ -80,58 +77,4 @@ struct ThermalizedBond {
   }
 };

-/** Separately thermalizes the com and distance of a particle pair.
- * @param[in] p1 First particle.
- * @param[in] p2 Second particle.
- * @param[in] dx %Distance between the particles.
- * @return the forces on @p p1 and @p p2 - */ -inline boost::optional> -ThermalizedBond::forces(Particle const &p1, Particle const &p2, - Utils::Vector3d const &dx) const { - // Bond broke? - if (r_cut > 0.0 && dx.norm() > r_cut) { - return {}; - } - - auto const mass_tot = p1.p.mass + p2.p.mass; - auto const mass_tot_inv = 1.0 / mass_tot; - auto const sqrt_mass_tot = sqrt(mass_tot); - auto const sqrt_mass_red = sqrt(p1.p.mass * p2.p.mass / mass_tot); - auto const com_vel = mass_tot_inv * (p1.p.mass * p1.m.v + p2.p.mass * p2.m.v); - auto const dist_vel = p2.m.v - p1.m.v; - - extern ThermalizedBondThermostat thermalized_bond; - Utils::Vector3d force1{}; - Utils::Vector3d force2{}; - auto const noise = Random::noise_uniform( - thermalized_bond.rng_counter(), thermalized_bond.rng_seed(), - p1.p.identity, p2.p.identity); - - for (int i = 0; i < 3; i++) { - double force_lv_com, force_lv_dist; - - // Langevin thermostat for center of mass - if (pref2_com > 0.0) { - force_lv_com = - -pref1_com * com_vel[i] + sqrt_mass_tot * pref2_com * noise[i]; - } else { - force_lv_com = -pref1_com * com_vel[i]; - } - - // Langevin thermostat for distance p1->p2 - if (pref2_dist > 0.0) { - force_lv_dist = - -pref1_dist * dist_vel[i] + sqrt_mass_red * pref2_dist * noise[i]; - } else { - force_lv_dist = -pref1_dist * dist_vel[i]; - } - // Add forces - force1[i] = p1.p.mass * mass_tot_inv * force_lv_com - force_lv_dist; - force2[i] = p2.p.mass * mass_tot_inv * force_lv_com + force_lv_dist; - } - - return std::make_tuple(force1, force2); -} - #endif diff --git a/src/core/bonded_interactions/thermalized_bond_kernel.hpp b/src/core/bonded_interactions/thermalized_bond_kernel.hpp new file mode 100644 index 00000000000..47cc82adc49 --- /dev/null +++ b/src/core/bonded_interactions/thermalized_bond_kernel.hpp @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2010-2019 The ESPResSo project + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 + * Max-Planck-Institute for Polymer Research, Theory Group + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef THERMALIZED_DIST_KERNEL_H +#define THERMALIZED_DIST_KERNEL_H + +#include "thermalized_bond.hpp" + +#include "Particle.hpp" +#include "random.hpp" +#include "thermostat.hpp" + +#include + +#include + +#include +#include + +/** Separately thermalizes the com and distance of a particle pair. + * @param[in] p1 First particle. + * @param[in] p2 Second particle. + * @param[in] dx %Distance between the particles. + * @return the forces on @p p1 and @p p2 + */ +inline boost::optional> +ThermalizedBond::forces(Particle const &p1, Particle const &p2, + Utils::Vector3d const &dx) const { + // Bond broke? 
+ if (r_cut > 0.0 && dx.norm() > r_cut) { + return {}; + } + + auto const mass_tot = p1.p.mass + p2.p.mass; + auto const mass_tot_inv = 1.0 / mass_tot; + auto const sqrt_mass_tot = sqrt(mass_tot); + auto const sqrt_mass_red = sqrt(p1.p.mass * p2.p.mass / mass_tot); + auto const com_vel = mass_tot_inv * (p1.p.mass * p1.m.v + p2.p.mass * p2.m.v); + auto const dist_vel = p2.m.v - p1.m.v; + + extern ThermalizedBondThermostat thermalized_bond; + Utils::Vector3d force1{}; + Utils::Vector3d force2{}; + auto const noise = Random::noise_uniform( + thermalized_bond.rng_counter(), thermalized_bond.rng_seed(), + p1.p.identity, p2.p.identity); + + for (int i = 0; i < 3; i++) { + double force_lv_com, force_lv_dist; + + // Langevin thermostat for center of mass + if (pref2_com > 0.0) { + force_lv_com = + -pref1_com * com_vel[i] + sqrt_mass_tot * pref2_com * noise[i]; + } else { + force_lv_com = -pref1_com * com_vel[i]; + } + + // Langevin thermostat for distance p1->p2 + if (pref2_dist > 0.0) { + force_lv_dist = + -pref1_dist * dist_vel[i] + sqrt_mass_red * pref2_dist * noise[i]; + } else { + force_lv_dist = -pref1_dist * dist_vel[i]; + } + // Add forces + force1[i] = p1.p.mass * mass_tot_inv * force_lv_com - force_lv_dist; + force2[i] = p2.p.mass * mass_tot_inv * force_lv_com + force_lv_dist; + } + + return std::make_tuple(force1, force2); +} + +#endif diff --git a/src/core/electrostatics_magnetostatics/coulomb.cpp b/src/core/electrostatics_magnetostatics/coulomb.cpp index 525bca1eefa..f2adfd61209 100644 --- a/src/core/electrostatics_magnetostatics/coulomb.cpp +++ b/src/core/electrostatics_magnetostatics/coulomb.cpp @@ -160,7 +160,7 @@ void deactivate() { } void update_dependent_particles() { - icc_iteration(cell_structure.local_particles(), + icc_iteration(cell_structure, cell_structure.local_particles(), cell_structure.ghost_particles()); } diff --git a/src/core/electrostatics_magnetostatics/icc.cpp b/src/core/electrostatics_magnetostatics/icc.cpp index 67de472a2b4..ea2cbcabf0c 100644 --- a/src/core/electrostatics_magnetostatics/icc.cpp +++ b/src/core/electrostatics_magnetostatics/icc.cpp @@ -32,6 +32,7 @@ #include "icc.hpp" +#include "CellStructure.hpp" #include "Particle.hpp" #include "ParticleRange.hpp" #include "cells.hpp" @@ -46,6 +47,7 @@ #include +#include #include #include #include @@ -54,16 +56,14 @@ icc_struct icc_cfg; -void init_forces_icc(const ParticleRange &particles, - const ParticleRange &ghosts_particles); - /** Calculate the electrostatic forces between source charges (= real charges) * and wall charges. For each electrostatic method, the proper functions * for short- and long-range parts are called. Long-range parts are calculated * directly, short-range parts need helper functions according to the particle * data organisation. This is a modified version of \ref force_calc. 
*/ -void force_calc_icc(const ParticleRange &particles, +void force_calc_icc(CellStructure &cell_structure, + const ParticleRange &particles, const ParticleRange &ghost_particles); /** Variant of @ref add_non_bonded_pair_force where only %Coulomb @@ -81,7 +81,9 @@ inline void add_non_bonded_pair_force_icc(Particle &p1, Particle &p2, p2.f.f += std::get<2>(forces); #endif } -void icc_iteration(const ParticleRange &particles, + +void icc_iteration(CellStructure &cell_structure, + const ParticleRange &particles, const ParticleRange &ghost_particles) { if (icc_cfg.n_icc == 0) return; @@ -96,8 +98,8 @@ void icc_iteration(const ParticleRange &particles, for (int j = 0; j < icc_cfg.num_iteration; j++) { double charge_density_max = 0.; - force_calc_icc(particles, ghost_particles); /* Calculate electrostatic - forces (SR+LR) excluding source source interaction*/ + // calculate electrostatic forces (SR+LR) excluding self-interactions + force_calc_icc(cell_structure, particles, ghost_particles); cell_structure.ghosts_reduce_forces(); double diff = 0; @@ -177,30 +179,26 @@ void icc_iteration(const ParticleRange &particles, on_particle_charge_change(); } -void force_calc_icc(const ParticleRange &particles, +void force_calc_icc(CellStructure &cell_structure, + const ParticleRange &particles, const ParticleRange &ghost_particles) { - init_forces_icc(particles, ghost_particles); + // reset forces + for (auto &p : particles) { + p.f.f = {}; + } + for (auto &p : ghost_particles) { + p.f.f = {}; + } + // calc ICC forces cell_structure.non_bonded_loop( [](Particle &p1, Particle &p2, Distance const &d) { - /* calc non-bonded interactions */ add_non_bonded_pair_force_icc(p1, p2, d.vec21, sqrt(d.dist2), d.dist2); }); Coulomb::calc_long_range_force(particles); } -void init_forces_icc(const ParticleRange &particles, - const ParticleRange &ghosts_particles) { - for (auto &p : particles) { - p.f.f = {}; - } - - for (auto &p : ghosts_particles) { - p.f.f = {}; - } -} - void mpi_icc_init_local(const icc_struct &icc_cfg_) { icc_cfg = icc_cfg_; diff --git a/src/core/electrostatics_magnetostatics/icc.hpp b/src/core/electrostatics_magnetostatics/icc.hpp index 358cbb46586..05c351ccc6f 100644 --- a/src/core/electrostatics_magnetostatics/icc.hpp +++ b/src/core/electrostatics_magnetostatics/icc.hpp @@ -48,10 +48,11 @@ #if defined(ELECTROSTATICS) -#include +#include "CellStructure.hpp" +#include "ParticleRange.hpp" + #include -#include #include /** ICC data structure */ @@ -104,7 +105,8 @@ extern icc_struct icc_cfg; /** The main iterative scheme, where the surface element charges are calculated * self-consistently. */ -void icc_iteration(const ParticleRange &particles, +void icc_iteration(CellStructure &cell_structure, + const ParticleRange &particles, const ParticleRange &ghost_particles); /** Perform ICC initialization. 
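// Editorial note (hedged; the call below is quoted from the coulomb.cpp
// hunk above): passing the cell structure explicitly replaces the implicit
// coupling to a global and makes the data flow visible at the call site:
//
//   icc_iteration(cell_structure, cell_structure.local_particles(),
//                 cell_structure.ghost_particles());
//
// The same dependency-injection pattern is applied to force_calc_icc().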
diff --git a/src/core/energy_inline.hpp b/src/core/energy_inline.hpp index cd1026ac61d..41cc2fa565b 100644 --- a/src/core/energy_inline.hpp +++ b/src/core/energy_inline.hpp @@ -57,7 +57,7 @@ #include "Observable_stat.hpp" #include "Particle.hpp" -#include "cells.hpp" +#include "bond_error.hpp" #include "errorhandling.hpp" #include "exclusions.hpp" diff --git a/src/core/forces.cpp b/src/core/forces.cpp index 9c050768dfe..d0758d36e2a 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -27,6 +27,7 @@ #include "EspressoSystemInterface.hpp" +#include "cells.hpp" #include "collision.hpp" #include "comfixed_global.hpp" #include "communication.hpp" @@ -45,7 +46,10 @@ #include "nonbonded_interactions/VerletCriterion.hpp" #include "nonbonded_interactions/nonbonded_interaction_data.hpp" #include "npt.hpp" +#include "rotation.hpp" #include "short_range_loop.hpp" +#include "thermostat.hpp" +#include "thermostats/langevin_inline.hpp" #include "virtual_sites.hpp" #include @@ -54,7 +58,58 @@ ActorList forceActors; -void init_forces(const ParticleRange &particles, double time_step, double kT) { +/** Initialize the forces for a ghost particle */ +inline ParticleForce init_ghost_force(Particle const &) { return {}; } + +/** External particle forces */ +inline ParticleForce external_force(Particle const &p) { + ParticleForce f = {}; + +#ifdef EXTERNAL_FORCES + f.f += p.p.ext_force; +#ifdef ROTATION + f.torque += p.p.ext_torque; +#endif +#endif + +#ifdef ENGINE + // apply a swimming force in the direction of + // the particle's orientation axis + if (p.p.swim.swimming) { + f.f += p.p.swim.f_swim * p.r.calc_director(); + } +#endif + + return f; +} + +inline ParticleForce thermostat_force(Particle const &p, double time_step, + double kT) { + extern LangevinThermostat langevin; + if (!(thermo_switch & THERMO_LANGEVIN)) { + return {}; + } + +#ifdef ROTATION + return {friction_thermo_langevin(langevin, p, time_step, kT), + p.p.rotation ? 
convert_vector_body_to_space( + p, friction_thermo_langevin_rotation( + langevin, p, time_step, kT)) + : Utils::Vector3d{}}; +#else + return friction_thermo_langevin(langevin, p, time_step, kT); +#endif +} + +/** Initialize the forces for a real particle */ +inline ParticleForce init_real_particle_force(Particle const &p, + double time_step, double kT) { + return thermostat_force(p, time_step, kT) + external_force(p); +} + +static void init_forces(const ParticleRange &particles, + const ParticleRange &ghost_particles, double time_step, + double kT) { ESPRESSO_PROFILER_CXX_MARK_FUNCTION; /* The force initialization depends on the used thermostat and the thermodynamic ensemble */ @@ -68,13 +123,13 @@ void init_forces(const ParticleRange &particles, double time_step, double kT) { set torque to zero for all and rescale quaternions */ for (auto &p : particles) { - p.f = init_local_particle_force(p, time_step, kT); + p.f = init_real_particle_force(p, time_step, kT); } /* initialize ghost forces with zero set torque to zero for all and rescale quaternions */ - for (auto &p : cell_structure.ghost_particles()) { + for (auto &p : ghost_particles) { p.f = init_ghost_force(p); } } @@ -98,9 +153,9 @@ void force_calc(CellStructure &cell_structure, double time_step, double kT) { auto particles = cell_structure.local_particles(); auto ghost_particles = cell_structure.ghost_particles(); #ifdef ELECTROSTATICS - icc_iteration(particles, cell_structure.ghost_particles()); + icc_iteration(cell_structure, particles, ghost_particles); #endif - init_forces(particles, time_step, kT); + init_forces(particles, ghost_particles, time_step, kT); for (auto &forceActor : forceActors) { forceActor->computeForces(espresso_system); diff --git a/src/core/forces.hpp b/src/core/forces.hpp index 299456f5027..21166dff9e5 100644 --- a/src/core/forces.hpp +++ b/src/core/forces.hpp @@ -30,11 +30,12 @@ * Implementation in forces.cpp. 
*/ +#include "CellStructure.hpp" +#include "ParticleRange.hpp" #include "actor/Actor.hpp" #include "actor/ActorList.hpp" -#include "bonded_interactions/bonded_interaction_data.hpp" -#include "cells.hpp" -#include "nonbonded_interactions/nonbonded_interaction_data.hpp" + +#include extern ActorList forceActors; diff --git a/src/core/forces_inline.hpp b/src/core/forces_inline.hpp index 815424897e5..d5758e7b90b 100644 --- a/src/core/forces_inline.hpp +++ b/src/core/forces_inline.hpp @@ -26,6 +26,7 @@ #include "forces.hpp" #include "bonded_interactions/bonded_interaction_data.hpp" +#include "bonded_interactions/thermalized_bond_kernel.hpp" #include "immersed_boundary/ibm_tribend.hpp" #include "immersed_boundary/ibm_triel.hpp" #include "nonbonded_interactions/bmhtf-nacl.hpp" @@ -61,11 +62,10 @@ #endif #include "Particle.hpp" +#include "bond_error.hpp" #include "errorhandling.hpp" #include "exclusions.hpp" -#include "rotation.hpp" #include "thermostat.hpp" -#include "thermostats/langevin_inline.hpp" #include #include @@ -75,55 +75,6 @@ #include -/** Initialize the forces for a ghost particle */ -inline ParticleForce init_ghost_force(Particle const &) { return {}; } - -/** External particle forces */ -inline ParticleForce external_force(Particle const &p) { - ParticleForce f = {}; - -#ifdef EXTERNAL_FORCES - f.f += p.p.ext_force; -#ifdef ROTATION - f.torque += p.p.ext_torque; -#endif -#endif - -#ifdef ENGINE - // apply a swimming force in the direction of - // the particle's orientation axis - if (p.p.swim.swimming) { - f.f += p.p.swim.f_swim * p.r.calc_director(); - } -#endif - - return f; -} - -inline ParticleForce thermostat_force(Particle const &p, double time_step, - double kT) { - extern LangevinThermostat langevin; - if (!(thermo_switch & THERMO_LANGEVIN)) { - return {}; - } - -#ifdef ROTATION - return {friction_thermo_langevin(langevin, p, time_step, kT), - p.p.rotation ? convert_vector_body_to_space( - p, friction_thermo_langevin_rotation( - langevin, p, time_step, kT)) - : Utils::Vector3d{}}; -#else - return friction_thermo_langevin(langevin, p, time_step, kT); -#endif -} - -/** Initialize the forces for a real particle */ -inline ParticleForce init_local_particle_force(Particle const &part, - double time_step, double kT) { - return thermostat_force(part, time_step, kT) + external_force(part); -} - inline ParticleForce calc_non_bonded_pair_force(Particle const &p1, Particle const &p2, IA_parameters const &ia_params, diff --git a/src/core/integrate.cpp b/src/core/integrate.cpp index 011ae63ec9b..799f9285be6 100644 --- a/src/core/integrate.cpp +++ b/src/core/integrate.cpp @@ -35,6 +35,7 @@ #include "ParticleRange.hpp" #include "accumulators.hpp" +#include "bonded_interactions/rigid_bond.hpp" #include "cells.hpp" #include "collision.hpp" #include "communication.hpp" diff --git a/src/core/rattle.cpp b/src/core/rattle.cpp index 497cdc52093..e141405167c 100644 --- a/src/core/rattle.cpp +++ b/src/core/rattle.cpp @@ -25,6 +25,10 @@ #include "CellStructure.hpp" #include "Particle.hpp" +#include "ParticleRange.hpp" +#include "bonded_interactions/bonded_interaction_data.hpp" +#include "bonded_interactions/rigid_bond.hpp" +#include "cells.hpp" #include "communication.hpp" #include "errorhandling.hpp" #include "grid.hpp" diff --git a/src/core/rattle.hpp b/src/core/rattle.hpp index b35d9dca9dd..a5dd83b2e62 100644 --- a/src/core/rattle.hpp +++ b/src/core/rattle.hpp @@ -27,13 +27,12 @@ * For more information see \ref rattle.cpp. 
 */
-#include "bonded_interactions/bonded_interaction_data.hpp"
-#include "bonded_interactions/rigid_bond.hpp"
-#include "cells.hpp"
 #include "config.hpp"
 
 #ifdef BOND_CONSTRAINT
 
+#include "CellStructure.hpp"
+
 /** Transfer the current particle positions from @ref ParticlePosition::p
  *  "Particle::r::p" to @ref ParticlePosition::p_last_timestep
  *  "Particle::r::p_last_timestep"
diff --git a/src/python/espressomd/particle_data.pxd b/src/python/espressomd/particle_data.pxd
index 7e1785bf9bb..0b095a27063 100644
--- a/src/python/espressomd/particle_data.pxd
+++ b/src/python/espressomd/particle_data.pxd
@@ -202,7 +202,7 @@ cdef extern from "rotation.hpp":
     Vector3d convert_vector_space_to_body(const particle & p, const Vector3d & v)
     void rotate_particle(int pid, const Vector3d & axis, double angle)
 
-cdef extern from "rattle.hpp":
+cdef extern from "bonded_interactions/rigid_bond.hpp":
     extern int n_rigidbonds
 
 cdef class ParticleHandle:

From aae26c92c2a694b09ea4c841a2cefd8fe35930ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?=
Date: Thu, 23 Dec 2021 14:20:14 +0100
Subject: [PATCH 21/21] core: Apply suggested changes

Co-authored-by: Alexander Reinauer <38552369+reinaual@users.noreply.github.com>
---
 src/core/EspressoSystemInterface.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp
index 136ff5d0180..ef5b27e973d 100644
--- a/src/core/EspressoSystemInterface.hpp
+++ b/src/core/EspressoSystemInterface.hpp
@@ -111,7 +111,7 @@ class EspressoSystemInterface : public SystemInterface {
 #endif
 
   float *eGpu() override {
-    // cast pointer to struct of floats to array of floats
+    // cast pointer from struct of floats to array of floats
     // https://stackoverflow.com/a/29278260
     return reinterpret_cast<float *>(gpu_get_energy_pointer());
   };
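As a closing remark on the last hunk: the corrected comment in eGpu() documents the reinterpretation of a struct whose members are all floats as a contiguous float array (see the linked Stack Overflow answer). A minimal sketch of that pattern follows; the EnergyValues struct and its field names are invented for illustration and do not reflect the actual GPU energy struct.

#include <type_traits>

// Illustrative stand-in for a struct of float energy contributions.
struct EnergyValues {
  float bonded, non_bonded, coulomb, dipolar;
};

static_assert(std::is_standard_layout<EnergyValues>::value,
              "the cast requires a standard-layout type");
static_assert(sizeof(EnergyValues) == 4 * sizeof(float),
              "the cast requires no padding between the float members");

// View the struct as a float array, as eGpu() does:
// https://stackoverflow.com/a/29278260
inline float *as_float_array(EnergyValues *e) {
  return reinterpret_cast<float *>(e);
}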