From a9776300a4e22b49d5f8db9cab0b81b876319048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Mon, 19 Jul 2021 16:20:35 +0200 Subject: [PATCH 1/6] core: Enable and test the Chebychev series --- doc/sphinx/installation.rst | 8 +++++++- maintainer/configs/no_rotation.hpp | 1 + src/config/features.def | 1 + src/core/electrostatics_magnetostatics/mmm1d.cpp | 8 +++----- src/core/electrostatics_magnetostatics/specfunc.hpp | 10 +++++----- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/doc/sphinx/installation.rst b/doc/sphinx/installation.rst index 6d7c8c02d16..4187f9e42b1 100644 --- a/doc/sphinx/installation.rst +++ b/doc/sphinx/installation.rst @@ -294,7 +294,13 @@ General features .. seealso:: :ref:`Electrostatics` -- ``MMM1D_GPU`` +- ``MMM1D_GPU``: This enables MMM1D on GPU. It is faster than the CPU version + by several orders of magnitude, but has float precision instead of double + precision. + +- ``MMM1D_MACHINE_PREC``: This enables high-precision Bessel functions + for MMM1D on CPU. Comes with a 60% slow-down penalty. The low-precision + functions are in most cases precise enough and are enabled by default. - ``DIPOLES`` This activates the dipole-moment property of particles; In addition, the various magnetostatics algorithms, such as P3M are switched on. diff --git a/maintainer/configs/no_rotation.hpp b/maintainer/configs/no_rotation.hpp index 5f88f292a1e..d7cbf83640b 100644 --- a/maintainer/configs/no_rotation.hpp +++ b/maintainer/configs/no_rotation.hpp @@ -36,6 +36,7 @@ // Charges and dipoles #define ELECTROSTATICS +#define MMM1D_MACHINE_PREC #ifdef CUDA #define MMM1D_GPU #endif diff --git a/src/config/features.def b/src/config/features.def index 7a2aa60bd6f..36e77376fdc 100644 --- a/src/config/features.def +++ b/src/config/features.def @@ -33,6 +33,7 @@ ROTATIONAL_INERTIA implies ROTATION ELECTROSTATICS P3M equals ELECTROSTATICS and FFTW MMM1D_GPU requires CUDA and ELECTROSTATICS +MMM1D_MACHINE_PREC requires ELECTROSTATICS /* Magnetostatics */ DIPOLES implies ROTATION diff --git a/src/core/electrostatics_magnetostatics/mmm1d.cpp b/src/core/electrostatics_magnetostatics/mmm1d.cpp index 8cdc61c3a5a..4750b08a31e 100644 --- a/src/core/electrostatics_magnetostatics/mmm1d.cpp +++ b/src/core/electrostatics_magnetostatics/mmm1d.cpp @@ -60,12 +60,10 @@ /** Minimal radius for the far formula in multiples of box_l[2] */ #define MIN_RAD 0.01 -/* if you define this, the Bessel functions are calculated up +/* if you define this feature, the Bessel functions are calculated up * to machine precision, otherwise 10^-14, which should be * definitely enough for daily life. */ -#undef BESSEL_MACHINE_PREC - -#ifndef BESSEL_MACHINE_PREC +#ifndef MMM1D_MACHINE_PREC #define K0 LPK0 #define K1 LPK1 #endif @@ -250,7 +248,7 @@ void add_mmm1d_coulomb_pair_force(double chpref, Utils::Vector3d const &d, auto const fq = c_2pi * bp; double k0, k1; -#ifdef BESSEL_MACHINE_PREC +#ifdef MMM1D_MACHINE_PREC k0 = K0(fq * rxy_d); k1 = K1(fq * rxy_d); #else diff --git a/src/core/electrostatics_magnetostatics/specfunc.hpp b/src/core/electrostatics_magnetostatics/specfunc.hpp index 72afd0ec603..b9a0e7fb4e5 100644 --- a/src/core/electrostatics_magnetostatics/specfunc.hpp +++ b/src/core/electrostatics_magnetostatics/specfunc.hpp @@ -45,31 +45,31 @@ double hzeta(double order, double x); /** Modified Bessel function of second kind, order 0. This function was taken * from the GSL code. Precise roughly up to machine precision. 
- * If @c BESSEL_MACHINE_PREC is not defined, @ref LPK0 is used instead.
+ * If @c MMM1D_MACHINE_PREC is not defined, @ref LPK0 is used instead.
  */
 double K0(double x);
 
 /** Modified Bessel function of second kind, order 1. This function was taken
  * from the GSL code. Precise roughly up to machine precision.
- * If @c BESSEL_MACHINE_PREC is not defined, @ref LPK1 is used instead.
+ * If @c MMM1D_MACHINE_PREC is not defined, @ref LPK1 is used instead.
  */
 double K1(double x);
 
-/** Bessel function K0 at x.
+/** Bessel function of the second kind, order 0, low precision.
  * The implementation has an absolute precision of around 10^(-14), which is
  * comparable to the relative precision sqrt implementation of current
  * hardware.
  */
 double LPK0(double x);
 
-/** Bessel function K1 at x.
+/** Bessel function of the second kind, order 1, low precision.
  * The implementation has an absolute precision of around 10^(-14), which is
  * comparable to the relative precision sqrt implementation of current
  * hardware.
  */
 double LPK1(double x);
 
-/** Bessel functions K0 and K1 at x.
+/** Bessel functions of the second kind, order 0 and order 1, low precision.
  * The implementation has an absolute precision of around 10^(-14), which is
  * comparable to the relative precision sqrt implementation of current
  * hardware.

From 304617cbf2c9280cd3153c3bc80c8640863fc709 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?=
Date: Mon, 19 Jul 2021 16:52:53 +0200
Subject: [PATCH 2/6] core: Modernize MMM1D code

---
 .../electrostatics_magnetostatics/mmm1d.cpp   | 40 +++++-----
 .../specfunc.cpp                              | 73 +++++++++----------
 .../specfunc.hpp                              |  4 +-
 3 files changed, 57 insertions(+), 60 deletions(-)

diff --git a/src/core/electrostatics_magnetostatics/mmm1d.cpp b/src/core/electrostatics_magnetostatics/mmm1d.cpp
index 4750b08a31e..d4eadc66121 100644
--- a/src/core/electrostatics_magnetostatics/mmm1d.cpp
+++ b/src/core/electrostatics_magnetostatics/mmm1d.cpp
@@ -52,13 +52,10 @@
 #include
 #include
 
-/** Largest numerically stable cutoff for Bessel function. Don't
- * change without improving the formulas.
+/** Largest numerically stable cutoff for the Bessel function.
+ * Don't change without improving the formulas.
  */
-#define MAXIMAL_B_CUT 30
-
-/** Minimal radius for the far formula in multiples of box_l[2] */
-#define MIN_RAD 0.01
+constexpr int MAXIMAL_B_CUT = 30;
 
 /* if you define this feature, the Bessel functions are calculated up
  * to machine precision, otherwise 10^-14, which should be
@@ -74,9 +71,10 @@ static double uz2, prefuz2, prefL3_i;
 /**@}*/
 
 MMM1D_struct mmm1d_params = {0.05, 1e-5, 0};
-/** From which distance a certain Bessel cutoff is valid. Can't be part of the
-    params since these get broadcasted. */
-static std::vector<double> bessel_radii;
+/** From which distance a certain Bessel cutoff is valid.
+ * Can't be part of the params since these get broadcast.
+ */
+static std::array<double, MAXIMAL_B_CUT> bessel_radii;
 
 static double far_error(int P, double minrad) {
   // this uses an upper bound to all force components and the potential
@@ -89,7 +87,8 @@ static double far_error(int P, double minrad) {
 
 static double determine_minrad(double maxPWerror, int P) {
   // bisection to search for where the error is maxPWerror
-  double const rgranularity = MIN_RAD * box_geo.length()[2];
+  constexpr auto min_rad = 0.01;
+  double const rgranularity = min_rad * box_geo.length()[2];
   double rmin = rgranularity;
   double rmax = std::min(box_geo.length()[0], box_geo.length()[1]);
   double const errmin = far_error(P, rmin);
@@ -115,10 +114,9 @@ static double determine_minrad(double maxPWerror, int P) {
   return 0.5 * (rmin + rmax);
 }
 
-static void determine_bessel_radii(double maxPWerror, int maxP) {
-  bessel_radii.resize(maxP);
-  for (int P = 1; P <= maxP; ++P) {
-    bessel_radii[P - 1] = determine_minrad(maxPWerror, P);
+static void determine_bessel_radii(double maxPWerror) {
+  for (int i = 0; i < MAXIMAL_B_CUT; ++i) {
+    bessel_radii[i] = determine_minrad(maxPWerror, i + 1);
   }
 }
 
@@ -171,7 +169,7 @@ int MMM1D_init() {
   prefuz2 = coulomb.prefactor * uz2;
   prefL3_i = prefuz2 * box_geo.length_inv()[2];
 
-  determine_bessel_radii(mmm1d_params.maxPWerror, MAXIMAL_B_CUT);
+  determine_bessel_radii(mmm1d_params.maxPWerror);
   prepare_polygamma_series(mmm1d_params.maxPWerror,
                            mmm1d_params.far_switch_radius_2);
   return ES_OK;
@@ -333,17 +331,17 @@ double mmm1d_coulomb_pair_energy(double const chpref, Utils::Vector3d const &d,
 int mmm1d_tune(int timings, bool verbose) {
   if (MMM1D_sanity_checks())
     return ES_ERROR;
-  double min_time = std::numeric_limits<double>::infinity();
-  double min_rad = -1;
-  auto const maxrad = box_geo.length()[2];
-  double switch_radius;
 
   if (mmm1d_params.far_switch_radius_2 < 0) {
+    auto const maxrad = box_geo.length()[2];
+    auto min_time = std::numeric_limits<double>::infinity();
+    auto min_rad = -1.;
+    double switch_radius;
     /* determine besselcutoff and optimal switching radius. Should be around
      * 0.33 */
     for (switch_radius = 0.2 * maxrad; switch_radius < 0.4 * maxrad;
          switch_radius += 0.025 * maxrad) {
-      if (switch_radius <= bessel_radii[MAXIMAL_B_CUT - 1]) {
+      if (switch_radius <= bessel_radii.back()) {
         // this switching radius is too small for our Bessel series
         continue;
       }
@@ -377,7 +375,7 @@ int mmm1d_tune(int timings, bool verbose) {
     switch_radius = min_rad;
     mmm1d_params.far_switch_radius_2 = Utils::sqr(switch_radius);
   } else if (mmm1d_params.far_switch_radius_2 <=
-             Utils::sqr(bessel_radii[MAXIMAL_B_CUT - 1])) {
+             Utils::sqr(bessel_radii.back())) {
     // this switching radius is too small for our Bessel series
     runtimeErrorMsg() << "could not find reasonable bessel cutoff";
     return ES_ERROR;
diff --git a/src/core/electrostatics_magnetostatics/specfunc.cpp b/src/core/electrostatics_magnetostatics/specfunc.cpp
index 5b5987394fe..027e820a6cd 100644
--- a/src/core/electrostatics_magnetostatics/specfunc.cpp
+++ b/src/core/electrostatics_magnetostatics/specfunc.cpp
@@ -47,8 +47,11 @@
 #include "specfunc.hpp"
 
 #include
+#include
 
 #include
+#include
+#include
 
 /************************************************
  * chebychev expansions
@@ -209,8 +212,9 @@ static double const hzeta_c[15] = {
     -8.9535174270375468504026113181e-23};
 
 double hzeta(double s, double q) {
-  double max_bits = 54.0;
-  int const jmax = 12, kmax = 10;
+  constexpr auto max_bits = 54.0;
+  constexpr auto jmax = 12;
+  constexpr auto kmax = 10;
 
   if ((s > max_bits && q < 1.0) || (s > 0.5 * max_bits && q < 0.25))
     return pow(q, -s);
@@ -223,19 +227,20 @@ double hzeta(double s, double q) {
   /** Euler-Maclaurin summation formula from @cite moshier89a p. 400, with
    * several typo corrections.
    */
-  auto const pmax = pow(kmax + q, -s);
+  auto const kmax_q = static_cast<double>(kmax) + q;
+  auto const pmax = pow(kmax_q, -s);
   auto scp = s;
-  auto pcp = pmax / (kmax + q);
-  auto ans = pmax * ((kmax + q) / (s - 1.0) + 0.5);
+  auto pcp = pmax / kmax_q;
+  auto ans = pmax * (kmax_q / (s - 1.0) + 0.5);
 
   for (int k = 0; k < kmax; k++)
-    ans += pow(k + q, -s);
+    ans += pow(static_cast<double>(k) + q, -s);
 
   for (int j = 0; j <= jmax; j++) {
     auto const delta = hzeta_c[j + 1] * scp * pcp;
     ans += delta;
     scp *= (s + 2 * j + 1) * (s + 2 * j + 2);
-    pcp /= (kmax + q) * (kmax + q);
+    pcp /= Utils::sqr(static_cast<double>(kmax) + q);
   }
 
   return ans;
@@ -289,9 +294,8 @@ double LPK0(double x) {
     return tmp * (xx * ak0_cs[1] + 0.5 * ak0_cs[0]);
   }
   if (x > 2) {
-    int j = ak01_orders[((int)x) - 2];
+    int j = ak01_orders[static_cast<int>(x) - 2];
     double x2;
-    double dd0, d0;
     double *s0;
     if (x <= 8) {
       s0 = ak0_cs;
@@ -300,8 +304,8 @@ double LPK0(double x) {
       s0 = ak02_cs;
       x2 = (2. * 16.) / x - 2.;
     }
-    dd0 = s0[j];
-    d0 = x2 * dd0 + s0[j - 1];
+    auto dd0 = s0[j];
+    auto d0 = x2 * dd0 + s0[j - 1];
     for (j -= 2; j >= 1; j--) {
      auto const tmp0 = d0;
       d0 = x2 * d0 - dd0 + s0[j];
@@ -314,17 +318,16 @@ double LPK0(double x) {
   {
     /* I0/1 series */
     int j = 10;
-    double ret, x2 = (2. / 4.5) * x * x - 2.;
-    double dd0, d0;
-    dd0 = bi0_cs[j];
-    d0 = x2 * dd0 + bi0_cs[j - 1];
+    auto x2 = (2. / 4.5) * x * x - 2.;
+    auto dd0 = bi0_cs[j];
+    auto d0 = x2 * dd0 + bi0_cs[j - 1];
     for (j -= 2; j >= 1; j--) {
-      double tmp0 = d0;
+      auto const tmp0 = d0;
       d0 = x2 * d0 - dd0 + bi0_cs[j];
       dd0 = tmp0;
     }
     auto const tmp = log(x) - Utils::ln_2();
-    ret = -tmp * (0.5 * (bi0_cs[0] + x2 * d0) - dd0);
+    auto const ret = -tmp * (0.5 * (bi0_cs[0] + x2 * d0) - dd0);
 
     /* K0/K1 correction */
     j = 9;
@@ -352,7 +355,6 @@ double LPK1(double x) {
   if (x > 2) {
     int j = ak01_orders[((int)x) - 2];
     double x2;
-    double dd1, d1;
     double *s1;
     if (x <= 8) {
       s1 = ak1_cs;
@@ -361,8 +363,8 @@ double LPK1(double x) {
       s1 = ak12_cs;
       x2 = (2. * 16.) / x - 2.;
     }
-    dd1 = s1[j];
-    d1 = x2 * dd1 + s1[j - 1];
+    auto dd1 = s1[j];
+    auto d1 = x2 * dd1 + s1[j - 1];
     for (j -= 2; j >= 1; j--) {
       auto const tmp1 = d1;
       d1 = x2 * d1 - dd1 + s1[j];
@@ -375,17 +377,16 @@ double LPK1(double x) {
   {
     /* I0/1 series */
     int j = 10;
-    double ret, x2 = (2. / 4.5) * x * x - 2.;
-    double dd1, d1;
-    dd1 = bi1_cs[j];
-    d1 = x2 * dd1 + bi1_cs[j - 1];
+    auto x2 = (2. / 4.5) * x * x - 2.;
+    auto dd1 = bi1_cs[j];
+    auto d1 = x2 * dd1 + bi1_cs[j - 1];
     for (j -= 2; j >= 1; j--) {
       auto const tmp1 = d1;
       d1 = x2 * d1 - dd1 + bi1_cs[j];
       dd1 = tmp1;
     }
     auto const tmp = log(x) - Utils::ln_2();
-    ret = x * tmp * (0.5 * (bi1_cs[0] + x2 * d1) - dd1);
+    auto const ret = x * tmp * (0.5 * (bi1_cs[0] + x2 * d1) - dd1);
 
     /* K0/K1 correction */
     j = 9;
@@ -401,7 +402,7 @@ double LPK1(double x) {
   }
 }
 
-std::tuple<double, double> LPK01(double x) {
+std::pair<double, double> LPK01(double x) {
   if (x >= 27.) {
     auto const tmp = .5 * exp(-x) / sqrt(x);
     auto const K0 = tmp * ak0_cs[0];
@@ -417,7 +418,6 @@ std::tuple<double, double> LPK01(double x) {
   if (x > 2) {
     int j = ak01_orders[((int)x) - 2];
     double x2;
-    double dd0, dd1, d0, d1;
     double *s0, *s1;
     if (x <= 8) {
       s0 = ak0_cs;
@@ -428,10 +428,10 @@ std::tuple<double, double> LPK01(double x) {
       s1 = ak12_cs;
       x2 = (2. * 16.) / x - 2.;
     }
-    dd0 = s0[j];
-    dd1 = s1[j];
-    d0 = x2 * dd0 + s0[j - 1];
-    d1 = x2 * dd1 + s1[j - 1];
+    auto dd0 = s0[j];
+    auto dd1 = s1[j];
+    auto d0 = x2 * dd0 + s0[j - 1];
+    auto d1 = x2 * dd1 + s1[j - 1];
     for (j -= 2; j >= 1; j--) {
       auto const tmp0 = d0, tmp1 = d1;
       d0 = x2 * d0 - dd0 + s0[j];
       d1 = x2 * d1 - dd1 + s1[j];
@@ -448,12 +448,11 @@ std::tuple<double, double> LPK01(double x) {
   {
     /* I0/1 series */
     int j = 10;
-    double x2 = (2. / 4.5) * x * x - 2.;
-    double dd0, dd1, d0, d1;
-    dd0 = bi0_cs[j];
-    dd1 = bi1_cs[j];
-    d0 = x2 * dd0 + bi0_cs[j - 1];
-    d1 = x2 * dd1 + bi1_cs[j - 1];
+    auto x2 = (2. / 4.5) * x * x - 2.;
+    auto dd0 = bi0_cs[j];
+    auto dd1 = bi1_cs[j];
+    auto d0 = x2 * dd0 + bi0_cs[j - 1];
+    auto d1 = x2 * dd1 + bi1_cs[j - 1];
     for (j -= 2; j >= 1; j--) {
       auto const tmp0 = d0, tmp1 = d1;
       d0 = x2 * d0 - dd0 + bi0_cs[j];
diff --git a/src/core/electrostatics_magnetostatics/specfunc.hpp b/src/core/electrostatics_magnetostatics/specfunc.hpp
index b9a0e7fb4e5..3df2ae621b2 100644
--- a/src/core/electrostatics_magnetostatics/specfunc.hpp
+++ b/src/core/electrostatics_magnetostatics/specfunc.hpp
@@ -38,7 +38,7 @@
 #include
 #include
 
-#include <tuple>
+#include <utility>
 
 /** Hurwitz zeta function. This function was taken from the GSL code. */
 double hzeta(double order, double x);
@@ -74,7 +74,7 @@ double LPK1(double x);
  * comparable to the relative precision sqrt implementation of current
  * hardware.
  */
-std::tuple<double, double> LPK01(double x);
+std::pair<double, double> LPK01(double x);
 
 /** Evaluate the polynomial interpreted as a Taylor series via the
  * Horner scheme.
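A remark on the LPK01() change in the hunks above: the return type is
narrowed from std::tuple to std::pair because the function computes exactly
two values, K0 and K1, sharing the argument-dependent setup between them.
A minimal sketch of a call site (illustrative only, not part of the patch:
the wrapper function is hypothetical and a C++17 structured binding is
assumed; fq and rxy_d are the quantities visible in the patch 1 hunk of
mmm1d.cpp):

    #include "electrostatics_magnetostatics/specfunc.hpp"

    #include <utility>

    double far_formula_term(double fq, double rxy_d) {
      // one call produces both low-precision Bessel values K0 and K1;
      // with std::pair, a structured binding replaces the std::tie() pattern
      auto const [k0, k1] = LPK01(fq * rxy_d);
      return k0 + k1; // placeholder combination; the real force/energy sums differ
    }
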
From 88a2afe08e80ceb7ca5c079c8942917ee925c648 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Mon, 19 Jul 2021 18:19:45 +0200 Subject: [PATCH 3/6] core: Modernize MMM1D GPU code --- src/core/EspressoSystemInterface.hpp | 2 +- src/core/SystemInterface.hpp | 2 +- src/core/actor/Mmm1dgpuForce.hpp | 2 +- src/core/actor/Mmm1dgpuForce_cuda.cu | 139 ++++++++++++--------------- src/core/actor/specfunc_cuda.hpp | 10 +- 5 files changed, 72 insertions(+), 83 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 47413ba70af..f7f25b59464 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -148,7 +148,7 @@ class EspressoSystemInterface : public SystemInterface { Utils::Vector3d box() const override; - unsigned int npart_gpu() override { + unsigned int npart_gpu() const override { #ifdef CUDA return gpu_get_particle_pointer().size(); #else diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index d2dbe1009bf..d09014a25ea 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -93,7 +93,7 @@ class SystemInterface { return m_needsDirectorGpu; } - virtual unsigned int npart_gpu() { return 0; }; + virtual unsigned int npart_gpu() const { return 0; }; virtual Vector3 box() const = 0; virtual bool needsRGpu() { return m_needsRGpu; }; diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp index 24a8c2fc9d8..4941839a823 100644 --- a/src/core/actor/Mmm1dgpuForce.hpp +++ b/src/core/actor/Mmm1dgpuForce.hpp @@ -48,7 +48,7 @@ class Mmm1dgpuForce : public Actor { private: // CUDA parameters unsigned int numThreads; - unsigned int numBlocks(SystemInterface &s); + unsigned int numBlocks(SystemInterface const &s) const; // the box length currently set on the GPU // Needed to make sure it hasn't been modified after inter coulomb was used. diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index f2b138c2af1..bd2c2e519f3 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -34,6 +34,7 @@ #include "electrostatics_magnetostatics/mmm1d.hpp" #include +#include #include @@ -45,7 +46,7 @@ #endif // the code is mostly multi-GPU capable, but ESPResSo is not yet -const int deviceCount = 1; +constexpr int deviceCount = 1; #undef cudaSetDevice #define cudaSetDevice(d) @@ -57,12 +58,11 @@ __constant__ float coulomb_prefactor[1] = {1.0f}; __constant__ int bessel_cutoff[1] = {5}; __constant__ float maxPWerror[1] = {1e-5f}; -// order hardcoded. mmm1d_recalcTables() typically does order less than 30. // As the coefficients are stored in __constant__ memory, the array needs to be // sized in advance. We don't know exactly how many coefficients per order, so // we size plentiful. 
-const int modpsi_order = 30; -const int modpsi_constant_size = modpsi_order * modpsi_order * 2; +constexpr int modpsi_order = 30; +constexpr int modpsi_constant_size = modpsi_order * modpsi_order * 2; // linearized array on device __constant__ int device_n_modPsi[1] = {0}; @@ -108,11 +108,11 @@ int modpsi_init() { cudaSetDevice(d); // copy to GPU - int linModPsiSize = linModPsi_offsets[modPsi.size() - 1] + - linModPsi_lengths[modPsi.size() - 1]; + auto const linModPsiSize = linModPsi_offsets[modPsi.size() - 1] + + linModPsi_lengths[modPsi.size() - 1]; if (linModPsiSize > modpsi_constant_size) { - printf("ERROR: __constant__ device_linModPsi[] is not large enough\n"); - std::abort(); + throw std::runtime_error( + "__constant__ device_linModPsi[] is not large enough"); } cuda_safe_mem(cudaMemcpyToSymbol(device_linModPsi_offsets, linModPsi_offsets.data(), @@ -155,7 +155,7 @@ Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, float _coulomb_prefactor, void Mmm1dgpuForce::setup(SystemInterface &s) { if (s.box()[2] <= 0) { throw std::runtime_error( - "Error: Please set box length before initializing MMM1D!"); + "Please set box length before initializing MMM1D!"); } if (need_tune && s.npart_gpu() > 0) { set_params(static_cast(s.box()[2]), @@ -166,24 +166,22 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { if (s.box()[2] != host_boxz) { set_params(static_cast(s.box()[2]), 0, -1, -1, -1); } - if (s.npart_gpu() == host_npart) // unchanged - { + if (s.npart_gpu() == host_npart) { // unchanged return; } // For all but the largest systems, it is faster to store force pairs and then // sum them up. Atomics are just so slow: so unless we're limited by memory, // do the latter. + auto const part_mem_size = 3 * Utils::sqr(s.npart_gpu()) * sizeof(float); pairs = 2; for (int d = 0; d < deviceCount; d++) { cudaSetDevice(d); size_t freeMem, totalMem; cudaMemGetInfo(&freeMem, &totalMem); - if (freeMem / 2 < - 3 * s.npart_gpu() * s.npart_gpu() * - sizeof(float)) // don't use more than half the device's memory - { + if (freeMem / 2 < part_mem_size) { + // don't use more than half the device's memory std::cerr << "Switching to atomicAdd due to memory constraints." 
<< std::endl; pairs = 0; @@ -192,11 +190,9 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { } if (dev_forcePairs) cudaFree(dev_forcePairs); - if (pairs) // we need memory to store force pairs - { - cuda_safe_mem( - cudaMalloc((void **)&dev_forcePairs, - 3 * s.npart_gpu() * s.npart_gpu() * sizeof(float))); + if (pairs) { + // we need memory to store force pairs + cuda_safe_mem(cudaMalloc((void **)&dev_forcePairs, part_mem_size)); } if (dev_energyBlocks) cudaFree(dev_energyBlocks); @@ -205,7 +201,7 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { host_npart = static_cast(s.npart_gpu()); } -unsigned int Mmm1dgpuForce::numBlocks(SystemInterface &s) { +unsigned int Mmm1dgpuForce::numBlocks(SystemInterface const &s) const { auto b = static_cast(s.npart_gpu() * s.npart_gpu() / numThreads) + 1; if (b > 65535) b = 65535; @@ -233,8 +229,8 @@ __global__ void sumKernel(float *data, int N) { extern __shared__ float partialsums[]; if (blockIdx.x != 0) return; - auto tid = static_cast(threadIdx.x); - float result = 0; + auto const tid = static_cast(threadIdx.x); + auto result = 0.f; for (int i = 0; i < N; i += static_cast(blockDim.x)) { if (i + tid >= N) @@ -274,9 +270,8 @@ void Mmm1dgpuForce::tune(SystemInterface &s, float _maxPWerror, int bessel_cutoff = _bessel_cutoff; float maxrad = host_boxz; - if (_far_switch_radius < 0 && _bessel_cutoff < 0) - // autodetermine switching radius and Bessel cutoff - { + if (_far_switch_radius < 0 && _bessel_cutoff < 0) { + // autodetermine switching radius and Bessel cutoff float bestrad = 0, besttime = INFINITY; // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter) @@ -284,7 +279,7 @@ void Mmm1dgpuForce::tune(SystemInterface &s, float _maxPWerror, far_switch_radius += 0.05f * maxrad) { set_params(0, 0, _maxPWerror, far_switch_radius, bessel_cutoff); tune(s, _maxPWerror, far_switch_radius, -2); // tune Bessel cutoff - auto runtime = force_benchmark(s); + auto const runtime = force_benchmark(s); if (runtime < besttime) { besttime = runtime; bestrad = far_switch_radius; @@ -294,24 +289,19 @@ void Mmm1dgpuForce::tune(SystemInterface &s, float _maxPWerror, set_params(0, 0, _maxPWerror, far_switch_radius, bessel_cutoff); tune(s, _maxPWerror, far_switch_radius, -2); // tune Bessel cutoff - } - - else if (_bessel_cutoff < 0) - // autodetermine Bessel cutoff - { + } else if (_bessel_cutoff < 0) { + // autodetermine Bessel cutoff int *dev_cutoff; - int maxCut = 30; + constexpr auto maxCut = 30; cuda_safe_mem(cudaMalloc((void **)&dev_cutoff, sizeof(int))); besselTuneKernel<<>>( dev_cutoff, far_switch_radius, maxCut); cuda_safe_mem(cudaMemcpy(&bessel_cutoff, dev_cutoff, sizeof(int), cudaMemcpyDeviceToHost)); cudaFree(dev_cutoff); - if (_bessel_cutoff != -2 && - bessel_cutoff >= - maxCut) // we already have our switching radius and only need to - // determine the cutoff, i.e. this is the final tuning round - { + if (_bessel_cutoff != -2 && bessel_cutoff >= maxCut) { + // we already had our switching radius and only needed to + // determine the cutoff, i.e. 
this was the final tuning round throw std::runtime_error( "No reasonable Bessel cutoff could be determined."); } @@ -327,8 +317,8 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, throw std::runtime_error( "switching radius must not be larger than box length"); } - float _far_switch_radius_2 = _far_switch_radius * _far_switch_radius; - float _uz = 1.0f / _boxz; + auto const _far_switch_radius_2 = Utils::sqr(_far_switch_radius); + auto const _uz = 1.0f / _boxz; for (int d = 0; d < deviceCount; d++) { // double colons are needed to access the constant memory variables because // they are file globals and we have identically named class variables @@ -339,8 +329,7 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, bessel_cutoff = _bessel_cutoff; } if (_far_switch_radius >= 0) { - mmm1d_params.far_switch_radius_2 = - _far_switch_radius * _far_switch_radius; + mmm1d_params.far_switch_radius_2 = _far_switch_radius_2; cuda_safe_mem(cudaMemcpyToSymbol(::far_switch_radius_2, &_far_switch_radius_2, sizeof(float))); far_switch_radius = _far_switch_radius; @@ -370,12 +359,10 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, } need_tune = true; - // The changed parameters in mmm1d_params do not need to be broadcast: they - // are only accessed by the TCL print function (on node 0) when you call inter - // coulomb. The CUDA code only runs on node 0, so other nodes do not need the - // parameters. We couldn't broadcast from here anyway because set_params() - // might be called from inside computeForces() which is not a time at which - // the MPI loop on the slave nodes is waiting for broadcasts. + // The changed parameters in @ref mmm1d_params do not need to be broadcast: + // they are only accessed by the CUDA code which only runs on the head node. + // We couldn't broadcast from here anyway because @ref set_params() is + // called from the @ref computeForces() and @ref tune() functions. 
} __global__ void forcesKernel(const float *__restrict__ r, @@ -385,17 +372,19 @@ __global__ void forcesKernel(const float *__restrict__ r, if (tStop < 0) tStop = N * N; - const float c_2pif = 2 * Utils::pi(); + auto const c_2pif = 2 * Utils::pi(); for (int tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { - int p1 = tid % N, p2 = tid / N; - float x = r[3 * p2] - r[3 * p1], y = r[3 * p2 + 1] - r[3 * p1 + 1], - z = r[3 * p2 + 2] - r[3 * p1 + 2]; - float rxy2 = sqpow(x) + sqpow(y); - float rxy = sqrt(rxy2); - float sum_r = 0, sum_z = 0; + auto const p1 = tid % N, p2 = tid / N; + auto x = r[3 * p2] - r[3 * p1]; + auto y = r[3 * p2 + 1] - r[3 * p1 + 1]; + auto z = r[3 * p2 + 2] - r[3 * p1 + 2]; + auto const rxy2 = sqpow(x) + sqpow(y); + auto rxy = sqrt(rxy2); + auto sum_r = 0.f; + auto sum_z = 0.f; // if (*boxz <= 0.0) return; // in case we are not initialized yet @@ -408,14 +397,14 @@ __global__ void forcesKernel(const float *__restrict__ r, // anyway) } else if (rxy2 <= *far_switch_radius_2) // near formula { - float uzz = *uz * z; - float uzr = *uz * rxy; + auto const uzz = *uz * z; + auto const uzr = *uz * rxy; sum_z = dev_mod_psi_odd(0, uzz); - float uzrpow = uzr; + auto uzrpow = uzr; for (int n = 1; n < *device_n_modPsi; n++) { - float sum_r_old = sum_r; - float mpe = dev_mod_psi_even(n, uzz); - float mpo = dev_mod_psi_odd(n, uzz); + auto const sum_r_old = sum_r; + auto const mpe = dev_mod_psi_even(n, uzz); + auto const mpo = dev_mod_psi_odd(n, uzz); sum_r += 2 * static_cast(n) * mpe * uzrpow; uzrpow *= uzr; @@ -455,7 +444,7 @@ __global__ void forcesKernel(const float *__restrict__ r, sum_r += 2 * *uz / rxy; } - float pref = *coulomb_prefactor * q[p1] * q[p2]; + auto const pref = *coulomb_prefactor * q[p1] * q[p2]; if (pairs) { force[3 * (p1 + p2 * N - tStart)] = pref * sum_r / rxy * x; force[3 * (p1 + p2 * N - tStart) + 1] = pref * sum_r / rxy * y; @@ -486,12 +475,12 @@ __global__ void energiesKernel(const float *__restrict__ r, for (int tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { - int p1 = tid % N, p2 = tid / N; - float z = r[3 * p2 + 2] - r[3 * p1 + 2]; - float rxy2 = + auto const p1 = tid % N, p2 = tid / N; + auto z = r[3 * p2 + 2] - r[3 * p1 + 2]; + auto const rxy2 = sqpow(r[3 * p2] - r[3 * p1]) + sqpow(r[3 * p2 + 1] - r[3 * p1 + 1]); - float rxy = sqrt(rxy2); - float sum_e = 0; + auto rxy = sqrt(rxy2); + auto sum_e = 0.f; // if (*boxz <= 0.0) return; // in case we are not initialized yet @@ -502,13 +491,13 @@ __global__ void energiesKernel(const float *__restrict__ r, { } else if (rxy2 <= *far_switch_radius_2) // near formula { - float uzz = *uz * z; - float uzr2 = sqpow(*uz * rxy); - float uzrpow = uzr2; + auto const uzz = *uz * z; + auto const uzr2 = sqpow(*uz * rxy); + auto uzrpow = uzr2; sum_e = dev_mod_psi_even(0, uzz); for (int n = 1; n < *device_n_modPsi; n++) { - float sum_e_old = sum_e; - float mpe = dev_mod_psi_even(n, uzz); + auto const sum_e_old = sum_e; + auto const mpe = dev_mod_psi_even(n, uzz); sum_e += mpe * uzrpow; uzrpow *= uzr2; @@ -525,7 +514,7 @@ __global__ void energiesKernel(const float *__restrict__ r, { sum_e = -(log(rxy * *uz / 2) + c_gammaf) / 2; for (int p = 1; p < *bessel_cutoff; p++) { - float arg = c_2pif * *uz * static_cast(p); + auto const arg = c_2pif * *uz * static_cast(p); sum_e += dev_K0(arg * rxy) * cos(arg * z); } sum_e *= *uz * 4; @@ -549,7 +538,7 @@ __global__ void 
vectorReductionKernel(float const *src, float *dst, int N, for (auto tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x); tid < N; tid += static_cast(blockDim.x * gridDim.x)) { - int offset = ((tid + (tStart % N)) % N); + auto const offset = (tid + (tStart % N)) % N; for (int i = 0; tid + i * N < (tStop - tStart); i++) { #pragma unroll 3 @@ -604,7 +593,7 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { if (pairs < 0) { throw std::runtime_error("MMM1D was not initialized correctly"); } - auto shared = static_cast(numThreads * sizeof(float)); + auto const shared = numThreads * static_cast(sizeof(float)); KERNELCALL_shared(energiesKernel, numBlocks(s), numThreads, shared, s.rGpuBegin(), s.qGpuBegin(), dev_energyBlocks, diff --git a/src/core/actor/specfunc_cuda.hpp b/src/core/actor/specfunc_cuda.hpp index 1664d4e48e2..3ff410e165b 100644 --- a/src/core/actor/specfunc_cuda.hpp +++ b/src/core/actor/specfunc_cuda.hpp @@ -132,11 +132,11 @@ __constant__ static int bi1_size = 11; /**@}*/ __device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) { - float x2 = 2 * x; - float dd = c[n - 1]; - float d = x2 * dd + c[n - 2]; + auto const x2 = 2 * x; + auto dd = c[n - 1]; + auto d = x2 * dd + c[n - 2]; for (int j = n - 3; j >= 1; j--) { - float tmp = d; + auto const tmp = d; d = x2 * d - dd + c[j]; dd = tmp; } @@ -145,7 +145,7 @@ __device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) { __device__ float evaluateAsTaylorSeriesAt(float const *c, int n, float x) { int cnt = n - 1; - float r = c[cnt]; + auto r = c[cnt]; while (--cnt >= 0) r = r * x + c[cnt]; return r; From 268f52872beeb8780aec86535be9d78c38a598d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 23 Jul 2021 13:26:19 +0200 Subject: [PATCH 4/6] core: Remove unused argument in CUDA kernels --- src/core/actor/Mmm1dgpuForce_cuda.cu | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index bd2c2e519f3..46a5b0d3f2d 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -368,11 +368,10 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, __global__ void forcesKernel(const float *__restrict__ r, const float *__restrict__ q, float *__restrict__ force, int N, int pairs, - int tStart, int tStop) { - if (tStop < 0) - tStop = N * N; + int tStart) { auto const c_2pif = 2 * Utils::pi(); + auto const tStop = Utils::sqr(N); for (int tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; @@ -460,12 +459,11 @@ __global__ void forcesKernel(const float *__restrict__ r, __global__ void energiesKernel(const float *__restrict__ r, const float *__restrict__ q, float *__restrict__ energy, int N, int pairs, - int tStart, int tStop) { - if (tStop < 0) - tStop = N * N; + int tStart) { auto const c_2pif = 2 * Utils::pi(); auto const c_gammaf = Utils::gamma(); + auto const tStop = Utils::sqr(N); extern __shared__ float partialsums[]; if (!pairs) { @@ -532,9 +530,9 @@ __global__ void energiesKernel(const float *__restrict__ r, } __global__ void vectorReductionKernel(float const *src, float *dst, int N, - int tStart, int tStop) { - if (tStop < 0) - tStop = N * N; + int tStart) { + + auto const tStop = Utils::sqr(N); for (auto tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x); tid < N; tid += static_cast(blockDim.x * gridDim.x)) { @@ -565,12 +563,12 @@ void Mmm1dgpuForce::computeForces(SystemInterface 
&s) { { auto blocksRed = static_cast(s.npart_gpu() / numThreads) + 1; KERNELCALL(forcesKernel, numBlocks(s), numThreads, s.rGpuBegin(), - s.qGpuBegin(), dev_forcePairs, s.npart_gpu(), pairs, 0, -1) + s.qGpuBegin(), dev_forcePairs, s.npart_gpu(), pairs, 0) KERNELCALL(vectorReductionKernel, blocksRed, numThreads, dev_forcePairs, - s.fGpuBegin(), s.npart_gpu(), 0, -1) + s.fGpuBegin(), s.npart_gpu(), 0) } else { KERNELCALL(forcesKernel, numBlocks(s), numThreads, s.rGpuBegin(), - s.qGpuBegin(), s.fGpuBegin(), s.npart_gpu(), pairs, 0, -1) + s.qGpuBegin(), s.fGpuBegin(), s.npart_gpu(), pairs, 0) } } @@ -597,7 +595,7 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { KERNELCALL_shared(energiesKernel, numBlocks(s), numThreads, shared, s.rGpuBegin(), s.qGpuBegin(), dev_energyBlocks, - s.npart_gpu(), 0, 0, -1); + s.npart_gpu(), 0, 0); KERNELCALL_shared(sumKernel, 1, numThreads, shared, dev_energyBlocks, numBlocks(s)); KERNELCALL(scaleAndAddKernel, 1, 1, &(((CUDA_energy *)s.eGpu())->coulomb), @@ -617,7 +615,7 @@ float Mmm1dgpuForce::force_benchmark(SystemInterface &s) { cuda_safe_mem(cudaEventCreate(&eventStop)); cuda_safe_mem(cudaEventRecord(eventStart, stream[0])); KERNELCALL(forcesKernel, numBlocks(s), numThreads, s.rGpuBegin(), - s.qGpuBegin(), dev_f_benchmark, s.npart_gpu(), 0, 0, -1) + s.qGpuBegin(), dev_f_benchmark, s.npart_gpu(), 0, 0) cuda_safe_mem(cudaEventRecord(eventStop, stream[0])); cuda_safe_mem(cudaEventSynchronize(eventStop)); cuda_safe_mem(cudaEventElapsedTime(&elapsedTime, eventStart, eventStop)); From c38907abd0b01df0fa8e4c9d17cb02a5a2c33030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 23 Jul 2021 14:20:21 +0200 Subject: [PATCH 5/6] core: Fix -Wconversion warnings in MMM1D code --- src/core/EspressoSystemInterface.hpp | 18 ++- src/core/EspressoSystemInterface_cuda.cu | 35 +++-- src/core/SystemInterface.hpp | 5 +- src/core/actor/DipolarBarnesHut.hpp | 3 +- src/core/actor/DipolarDirectSum.hpp | 2 +- src/core/actor/Mmm1dgpuForce.hpp | 2 +- src/core/actor/Mmm1dgpuForce_cuda.cu | 133 +++++++++--------- src/core/actor/specfunc_cuda.hpp | 34 ++--- .../electrostatics_magnetostatics/mmm1d.cpp | 24 ++-- .../p3m-common.hpp | 2 +- src/python/espressomd/electrostatics.pxd | 2 +- 11 files changed, 138 insertions(+), 122 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index f7f25b59464..fa01a3aa067 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -22,6 +22,8 @@ #include "SystemInterface.hpp" #include "cuda_interface.hpp" +#include + /* Syntactic sugar */ #define espressoSystemInterface EspressoSystemInterface::Instance() @@ -117,12 +119,16 @@ class EspressoSystemInterface : public SystemInterface { float *fGpuEnd() override { return gpu_get_particle_force_pointer() + 3 * m_gpu_npart; }; - float *eGpu() override { return (float *)gpu_get_energy_pointer(); }; + float *eGpu() override { + // cast struct of floats to array of floats + // https://stackoverflow.com/a/29278260 + return reinterpret_cast(gpu_get_energy_pointer()); + }; float *torqueGpuBegin() override { - return (float *)gpu_get_particle_torque_pointer(); + return gpu_get_particle_torque_pointer(); }; float *torqueGpuEnd() override { - return (float *)(gpu_get_particle_torque_pointer()) + 3 * m_gpu_npart; + return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart; }; bool hasFGpu() override { return true; }; bool requestFGpu() override { @@ -148,7 +154,7 @@ class EspressoSystemInterface : public 
SystemInterface { Utils::Vector3d box() const override; - unsigned int npart_gpu() const override { + std::size_t npart_gpu() const override { #ifdef CUDA return gpu_get_particle_pointer().size(); #else @@ -177,10 +183,10 @@ class EspressoSystemInterface : public SystemInterface { reallocDeviceMemory(gpu_get_particle_pointer().size()); } }; - void reallocDeviceMemory(int n); + void reallocDeviceMemory(std::size_t n); #endif - int m_gpu_npart; + std::size_t m_gpu_npart; bool m_gpu; float *m_r_gpu_begin; diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 81948ad920b..2904f2994fb 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ b/src/core/EspressoSystemInterface_cuda.cu @@ -22,6 +22,8 @@ #include "cuda_utils.cuh" #include "errorhandling.hpp" +#include + #include #if defined(OMPI_MPI_H) || defined(_MPI_H) @@ -33,8 +35,8 @@ // Position and charge __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r, - float *q, int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + float *q, unsigned int n) { + auto const idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -49,8 +51,9 @@ __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r, } // Charge only -__global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); +__global__ void split_kernel_q(CUDA_particle_data *particles, float *q, + unsigned int n) { + auto const idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -62,8 +65,9 @@ __global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) { } // Position only -__global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); +__global__ void split_kernel_r(CUDA_particle_data *particles, float *r, + unsigned int n) { + auto idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -78,8 +82,9 @@ __global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) { #ifdef CUDA // Velocity -__global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); +__global__ void split_kernel_v(CUDA_particle_data *particles, float *v, + unsigned int n) { + auto idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -96,8 +101,8 @@ __global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) { #ifdef DIPOLES // Dipole moment __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip, - int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + unsigned int n) { + auto idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -112,9 +117,9 @@ __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip, #endif __global__ void split_kernel_director(CUDA_particle_data *particles, - float *director, int n) { + float *director, unsigned int n) { #ifdef ROTATION - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + auto idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -128,7 +133,7 @@ __global__ void split_kernel_director(CUDA_particle_data *particles, #endif } -void EspressoSystemInterface::reallocDeviceMemory(int n) { +void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { if (m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == nullptr))) { if 
(m_r_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_r_gpu_begin)); @@ -169,8 +174,8 @@ void EspressoSystemInterface::reallocDeviceMemory(int n) { } void EspressoSystemInterface::split_particle_struct() { - auto device_particles = gpu_get_particle_pointer(); - int n = device_particles.size(); + auto const device_particles = gpu_get_particle_pointer(); + auto const n = static_cast(device_particles.size()); if (n == 0) return; diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index d09014a25ea..5f78a98581b 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -20,8 +20,11 @@ #define SYSTEMINTERFACE_H #include "config.hpp" + #include +#include + /** @todo: Turn needsXY in getter/setter **/ class SystemInterface { @@ -93,7 +96,7 @@ class SystemInterface { return m_needsDirectorGpu; } - virtual unsigned int npart_gpu() const { return 0; }; + virtual std::size_t npart_gpu() const { return 0; }; virtual Vector3 box() const = 0; virtual bool needsRGpu() { return m_needsRGpu; }; diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index f4019c34d29..8631e8c9eb6 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -82,7 +82,8 @@ class DipolarBarnesHut : public Actor { buildTreeBH(m_bh_data.blocks); summarizeBH(m_bh_data.blocks); sortBH(m_bh_data.blocks); - if (energyBH(&m_bh_data, m_k, (&(((CUDA_energy *)s.eGpu())->dipolar)))) { + if (energyBH(&m_bh_data, m_k, + &(reinterpret_cast(s.eGpu())->dipolar))) { runtimeErrorMsg() << "kernels encountered a functional error"; } }; diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 809bec5b828..0e9dd8d505b 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -74,7 +74,7 @@ class DipolarDirectSum : public Actor { } DipolarDirectSum_kernel_wrapper_energy( k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), box, - per, (&(((CUDA_energy *)s.eGpu())->dipolar))); + per, &(reinterpret_cast(s.eGpu())->dipolar)); }; private: diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp index 4941839a823..a767a38cad5 100644 --- a/src/core/actor/Mmm1dgpuForce.hpp +++ b/src/core/actor/Mmm1dgpuForce.hpp @@ -55,7 +55,7 @@ class Mmm1dgpuForce : public Actor { float host_boxz; // the number of particles we had during the last run. 
Needed to check if we // have to realloc dev_forcePairs - int host_npart; + unsigned int host_npart; bool need_tune; // pairs==0: return forces using atomicAdd diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index 46a5b0d3f2d..e4c389981e6 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -66,40 +66,40 @@ constexpr int modpsi_constant_size = modpsi_order * modpsi_order * 2; // linearized array on device __constant__ int device_n_modPsi[1] = {0}; -__constant__ int device_linModPsi_offsets[2 * modpsi_order], - device_linModPsi_lengths[2 * modpsi_order]; +__constant__ unsigned int device_linModPsi_offsets[2 * modpsi_order]; +__constant__ unsigned int device_linModPsi_lengths[2 * modpsi_order]; __constant__ float device_linModPsi[modpsi_constant_size]; __device__ float dev_mod_psi_even(int n, float x) { return evaluateAsTaylorSeriesAt( &device_linModPsi[device_linModPsi_offsets[2 * n]], - device_linModPsi_lengths[2 * n], x * x); + static_cast(device_linModPsi_lengths[2 * n]), x * x); } __device__ float dev_mod_psi_odd(int n, float x) { return x * evaluateAsTaylorSeriesAt( &device_linModPsi[device_linModPsi_offsets[2 * n + 1]], - device_linModPsi_lengths[2 * n + 1], x * x); + static_cast(device_linModPsi_lengths[2 * n + 1]), x * x); } int modpsi_init() { create_mod_psi_up_to(modpsi_order); // linearized array on host - std::vector linModPsi_offsets(modPsi.size()); - std::vector linModPsi_lengths(modPsi.size()); - for (size_t i = 0; i < modPsi.size(); i++) { + std::vector linModPsi_offsets(modPsi.size()); + std::vector linModPsi_lengths(modPsi.size()); + for (std::size_t i = 0; i < modPsi.size(); i++) { if (i) linModPsi_offsets[i] = linModPsi_offsets[i - 1] + linModPsi_lengths[i - 1]; - linModPsi_lengths[i] = modPsi[i].size(); + linModPsi_lengths[i] = static_cast(modPsi[i].size()); } // linearize the coefficients array std::vector linModPsi(linModPsi_offsets[modPsi.size() - 1] + linModPsi_lengths[modPsi.size() - 1]); - for (size_t i = 0; i < modPsi.size(); i++) { - for (size_t j = 0; j < modPsi[i].size(); j++) { + for (std::size_t i = 0; i < modPsi.size(); i++) { + for (std::size_t j = 0; j < modPsi[i].size(); j++) { linModPsi[linModPsi_offsets[i] + j] = static_cast(modPsi[i][j]); } } @@ -110,7 +110,7 @@ int modpsi_init() { // copy to GPU auto const linModPsiSize = linModPsi_offsets[modPsi.size() - 1] + linModPsi_lengths[modPsi.size() - 1]; - if (linModPsiSize > modpsi_constant_size) { + if (linModPsiSize > static_cast(modpsi_constant_size)) { throw std::runtime_error( "__constant__ device_linModPsi[] is not large enough"); } @@ -153,18 +153,18 @@ Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, float _coulomb_prefactor, } void Mmm1dgpuForce::setup(SystemInterface &s) { - if (s.box()[2] <= 0) { + auto const box_z = static_cast(s.box()[2]); + if (box_z <= 0) { throw std::runtime_error( "Please set box length before initializing MMM1D!"); } if (need_tune && s.npart_gpu() > 0) { - set_params(static_cast(s.box()[2]), - static_cast(coulomb.prefactor), maxPWerror, + set_params(box_z, static_cast(coulomb.prefactor), maxPWerror, far_switch_radius, bessel_cutoff); tune(s, maxPWerror, far_switch_radius, bessel_cutoff); } - if (s.box()[2] != host_boxz) { - set_params(static_cast(s.box()[2]), 0, -1, -1, -1); + if (box_z != host_boxz) { + set_params(box_z, 0, -1, -1, -1); } if (s.npart_gpu() == host_npart) { // unchanged return; @@ -178,7 +178,7 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { for (int d = 0; d < 
deviceCount; d++) { cudaSetDevice(d); - size_t freeMem, totalMem; + std::size_t freeMem, totalMem; cudaMemGetInfo(&freeMem, &totalMem); if (freeMem / 2 < part_mem_size) { // don't use more than half the device's memory @@ -198,11 +198,12 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { cudaFree(dev_energyBlocks); cuda_safe_mem( cudaMalloc((void **)&dev_energyBlocks, numBlocks(s) * sizeof(float))); - host_npart = static_cast(s.npart_gpu()); + host_npart = static_cast(s.npart_gpu()); } unsigned int Mmm1dgpuForce::numBlocks(SystemInterface const &s) const { - auto b = static_cast(s.npart_gpu() * s.npart_gpu() / numThreads) + 1; + auto b = 1 + static_cast(Utils::sqr(s.npart_gpu()) / + static_cast(numThreads)); if (b > 65535) b = 65535; return b; @@ -214,8 +215,8 @@ __forceinline__ __device__ float sqpow(float x) { return x * x; } __forceinline__ __device__ float cbpow(float x) { return x * x * x; } __device__ void sumReduction(float *input, float *sum) { - auto tid = static_cast(threadIdx.x); - for (auto i = static_cast(blockDim.x) / 2; i > 0; i /= 2) { + auto const tid = threadIdx.x; + for (auto i = blockDim.x / 2; i > 0; i /= 2) { __syncthreads(); if (tid < i) input[tid] += input[i + tid]; @@ -225,23 +226,23 @@ __device__ void sumReduction(float *input, float *sum) { sum[0] = input[0]; } -__global__ void sumKernel(float *data, int N) { +__global__ void sumKernel(float *data, std::size_t N) { extern __shared__ float partialsums[]; if (blockIdx.x != 0) return; - auto const tid = static_cast(threadIdx.x); + std::size_t const tid = threadIdx.x; auto result = 0.f; - for (int i = 0; i < N; i += static_cast(blockDim.x)) { + for (std::size_t i = 0; i < N; i += blockDim.x) { if (i + tid >= N) - partialsums[tid] = 0; + partialsums[tid] = 0.f; else partialsums[tid] = data[i + tid]; sumReduction(partialsums, &result); if (tid == 0) { if (i == 0) - data[0] = 0; + data[0] = 0.f; data[0] += result; } } @@ -249,9 +250,9 @@ __global__ void sumKernel(float *data, int N) { __global__ void besselTuneKernel(int *result, float far_switch_radius, int maxCut) { - const float c_2pif = 2 * Utils::pi(); - float arg = c_2pif * *uz * far_switch_radius; - float pref = 4 * *uz * max(1.0f, c_2pif * *uz); + constexpr auto c_2pif = 2 * Utils::pi(); + auto const arg = c_2pif * *uz * far_switch_radius; + auto const pref = 4 * *uz * max(1.0f, c_2pif * *uz); float err; int P = 1; do { @@ -367,17 +368,16 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, __global__ void forcesKernel(const float *__restrict__ r, const float *__restrict__ q, - float *__restrict__ force, int N, int pairs, - int tStart) { + float *__restrict__ force, std::size_t N, + int pairs, std::size_t tStart) { - auto const c_2pif = 2 * Utils::pi(); + constexpr auto c_2pif = 2 * Utils::pi(); auto const tStop = Utils::sqr(N); - for (int tid = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; - tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { + for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x + tStart; + tid < tStop; tid += blockDim.x * gridDim.x) { auto const p1 = tid % N, p2 = tid / N; - auto x = r[3 * p2] - r[3 * p1]; + auto x = r[3 * p2 + 0] - r[3 * p1 + 0]; auto y = r[3 * p2 + 1] - r[3 * p1 + 1]; auto z = r[3 * p2 + 2] - r[3 * p1 + 2]; auto const rxy2 = sqpow(x) + sqpow(y); @@ -392,8 +392,8 @@ __global__ void forcesKernel(const float *__restrict__ r, if (p1 == p2) // particle exerts no force on itself { - rxy = 1; // so the division at the end doesn't fail with NaN (sum_r is 0 - // anyway) + rxy = 
1.f; // so the division at the end doesn't fail with NaN + // (sum_r is 0 anyway) } else if (rxy2 <= *far_switch_radius_2) // near formula { auto const uzz = *uz * z; @@ -428,8 +428,8 @@ __global__ void forcesKernel(const float *__restrict__ r, if (rxy == 0) // particles at the same radial position only exert a force // in z direction { - rxy = 1; // so the division at the end doesn't fail with NaN (sum_r is 0 - // anyway) + rxy = 1.f; // so the division at the end doesn't fail with NaN + // (sum_r is 0 anyway) } } else // far formula { @@ -445,11 +445,11 @@ __global__ void forcesKernel(const float *__restrict__ r, auto const pref = *coulomb_prefactor * q[p1] * q[p2]; if (pairs) { - force[3 * (p1 + p2 * N - tStart)] = pref * sum_r / rxy * x; + force[3 * (p1 + p2 * N - tStart) + 0] = pref * sum_r / rxy * x; force[3 * (p1 + p2 * N - tStart) + 1] = pref * sum_r / rxy * y; force[3 * (p1 + p2 * N - tStart) + 2] = pref * sum_z; } else { - atomicAdd(&force[3 * p2], pref * sum_r / rxy * x); + atomicAdd(&force[3 * p2 + 0], pref * sum_r / rxy * x); atomicAdd(&force[3 * p2 + 1], pref * sum_r / rxy * y); atomicAdd(&force[3 * p2 + 2], pref * sum_z); } @@ -458,11 +458,11 @@ __global__ void forcesKernel(const float *__restrict__ r, __global__ void energiesKernel(const float *__restrict__ r, const float *__restrict__ q, - float *__restrict__ energy, int N, int pairs, - int tStart) { + float *__restrict__ energy, std::size_t N, + int pairs, std::size_t tStart) { - auto const c_2pif = 2 * Utils::pi(); - auto const c_gammaf = Utils::gamma(); + constexpr auto c_2pif = 2 * Utils::pi(); + constexpr auto c_gammaf = Utils::gamma(); auto const tStop = Utils::sqr(N); extern __shared__ float partialsums[]; @@ -470,13 +470,12 @@ __global__ void energiesKernel(const float *__restrict__ r, partialsums[threadIdx.x] = 0; __syncthreads(); } - for (int tid = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; - tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { + for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x + tStart; + tid < tStop; tid += blockDim.x * gridDim.x) { auto const p1 = tid % N, p2 = tid / N; auto z = r[3 * p2 + 2] - r[3 * p1 + 2]; - auto const rxy2 = - sqpow(r[3 * p2] - r[3 * p1]) + sqpow(r[3 * p2 + 1] - r[3 * p1 + 1]); + auto const rxy2 = sqpow(r[3 * p2 + 0] - r[3 * p1 + 0]) + + sqpow(r[3 * p2 + 1] - r[3 * p1 + 1]); auto rxy = sqrt(rxy2); auto sum_e = 0.f; @@ -529,18 +528,17 @@ __global__ void energiesKernel(const float *__restrict__ r, } } -__global__ void vectorReductionKernel(float const *src, float *dst, int N, - int tStart) { +__global__ void vectorReductionKernel(float const *src, float *dst, + std::size_t N, std::size_t tStart) { auto const tStop = Utils::sqr(N); - for (auto tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - tid < N; tid += static_cast(blockDim.x * gridDim.x)) { + for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; + tid += blockDim.x * gridDim.x) { auto const offset = (tid + (tStart % N)) % N; - - for (int i = 0; tid + i * N < (tStop - tStart); i++) { + for (std::size_t i = 0; tid + i * N < (tStop - tStart); i++) { #pragma unroll 3 - for (int d = 0; d < 3; d++) { + for (std::size_t d = 0; d < 3; d++) { dst[3 * offset + d] -= src[3 * (tid + i * N) + d]; } } @@ -561,7 +559,9 @@ void Mmm1dgpuForce::computeForces(SystemInterface &s) { if (pairs) // if we calculate force pairs, we need to reduce them to forces { - auto blocksRed = static_cast(s.npart_gpu() / numThreads) + 1; + auto const blocksRed = + 1 + 
static_cast(s.npart_gpu() / + static_cast(numThreads)); KERNELCALL(forcesKernel, numBlocks(s), numThreads, s.rGpuBegin(), s.qGpuBegin(), dev_forcePairs, s.npart_gpu(), pairs, 0) KERNELCALL(vectorReductionKernel, blocksRed, numThreads, dev_forcePairs, @@ -572,10 +572,10 @@ void Mmm1dgpuForce::computeForces(SystemInterface &s) { } } -__global__ void scaleAndAddKernel(float *dst, float const *src, int N, +__global__ void scaleAndAddKernel(float *dst, float const *src, std::size_t N, float factor) { - for (auto tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - tid < N; tid += static_cast(blockDim.x * gridDim.x)) { + for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; + tid += blockDim.x * gridDim.x) { dst[tid] += src[tid] * factor; } } @@ -591,17 +591,18 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { if (pairs < 0) { throw std::runtime_error("MMM1D was not initialized correctly"); } - auto const shared = numThreads * static_cast(sizeof(float)); + auto const shared = numThreads * static_cast(sizeof(float)); KERNELCALL_shared(energiesKernel, numBlocks(s), numThreads, shared, s.rGpuBegin(), s.qGpuBegin(), dev_energyBlocks, s.npart_gpu(), 0, 0); KERNELCALL_shared(sumKernel, 1, numThreads, shared, dev_energyBlocks, numBlocks(s)); - KERNELCALL(scaleAndAddKernel, 1, 1, &(((CUDA_energy *)s.eGpu())->coulomb), + KERNELCALL(scaleAndAddKernel, 1, 1, + &(reinterpret_cast(s.eGpu())->coulomb), &dev_energyBlocks[0], 1, - 0.5); // we have counted every interaction twice, so halve the - // total energy + 0.5f); // we have counted every interaction twice, so halve the + // total energy } float Mmm1dgpuForce::force_benchmark(SystemInterface &s) { diff --git a/src/core/actor/specfunc_cuda.hpp b/src/core/actor/specfunc_cuda.hpp index 3ff410e165b..ca5770cdb1a 100644 --- a/src/core/actor/specfunc_cuda.hpp +++ b/src/core/actor/specfunc_cuda.hpp @@ -132,7 +132,7 @@ __constant__ static int bi1_size = 11; /**@}*/ __device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) { - auto const x2 = 2 * x; + auto const x2 = 2.0f * x; auto dd = c[n - 1]; auto d = x2 * dd + c[n - 2]; for (int j = n - 3; j >= 1; j--) { @@ -140,7 +140,7 @@ __device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) { d = x2 * d - dd + c[j]; dd = tmp; } - return x * d - dd + c[0] / 2; + return x * d - dd + c[0] / 2.0f; } __device__ float evaluateAsTaylorSeriesAt(float const *c, int n, float x) { @@ -152,13 +152,13 @@ __device__ float evaluateAsTaylorSeriesAt(float const *c, int n, float x) { } __device__ float dev_K0(float x) { - float c = evaluateAsChebychevSeriesAt( - x <= 2 ? bk0_data : x <= 8 ? ak0_data : ak02_data, - x <= 2 ? bk0_size : x <= 8 ? ak0_size : ak02_size, - x <= 2 ? x * x / 2 - 1.0f - : x <= 8 ? (16 / x - 5.0f) / 3.0f : (16 / x - 1.0f)); - if (x <= 2) { - float I0 = + auto const c = evaluateAsChebychevSeriesAt( + x <= 2.0f ? bk0_data : x <= 8.0f ? ak0_data : ak02_data, + x <= 2.0f ? bk0_size : x <= 8.0f ? ak0_size : ak02_size, + x <= 2.0f ? x * x / 2.0f - 1.0f + : x <= 8.0f ? (16.0f / x - 5.0f) / 3.0f : (16.0f / x - 1.0f)); + if (x <= 2.0f) { + auto const I0 = evaluateAsChebychevSeriesAt(bi0_data, bi0_size, x * x / 4.5f - 1.0f); return (-log(x) + Utils::ln_2()) * I0 + c; } @@ -166,14 +166,14 @@ __device__ float dev_K0(float x) { } __device__ float dev_K1(float x) { - float c = evaluateAsChebychevSeriesAt( - x <= 2 ? bk1_data : x <= 8 ? ak1_data : ak12_data, - x <= 2 ? bk1_size : x <= 8 ? ak1_size : ak12_size, - x <= 2 ? x * x / 2 - 1.0f - : x <= 8 ? 
(16 / x - 5.0f) / 3.0f : (16 / x - 1.0f)); - if (x <= 2) { - float I1 = x * evaluateAsChebychevSeriesAt(bi1_data, bi1_size, - x * x / 4.5f - 1.0f); + auto const c = evaluateAsChebychevSeriesAt( + x <= 2.0f ? bk1_data : x <= 8.0f ? ak1_data : ak12_data, + x <= 2.0f ? bk1_size : x <= 8.0f ? ak1_size : ak12_size, + x <= 2.0f ? x * x / 2.0f - 1.0f + : x <= 8.0f ? (16.0f / x - 5.0f) / 3.0f : (16.0f / x - 1.0f)); + if (x <= 2.0f) { + auto const I1 = x * evaluateAsChebychevSeriesAt(bi1_data, bi1_size, + x * x / 4.5f - 1.0f); return (log(x) - Utils::ln_2()) * I1 + c / x; } return exp(-x) * c * rsqrt(x); diff --git a/src/core/electrostatics_magnetostatics/mmm1d.cpp b/src/core/electrostatics_magnetostatics/mmm1d.cpp index d4eadc66121..db6a3e5798d 100644 --- a/src/core/electrostatics_magnetostatics/mmm1d.cpp +++ b/src/core/electrostatics_magnetostatics/mmm1d.cpp @@ -77,10 +77,10 @@ MMM1D_struct mmm1d_params = {0.05, 1e-5, 0}; static std::array bessel_radii; static double far_error(int P, double minrad) { + auto const wavenumber = 2 * Utils::pi() * box_geo.length_inv()[2]; // this uses an upper bound to all force components and the potential - auto const rhores = 2 * Utils::pi() * box_geo.length_inv()[2] * minrad; - auto const pref = 4 * box_geo.length_inv()[2] * - std::max(1.0, 2 * Utils::pi() * box_geo.length_inv()[2]); + auto const rhores = wavenumber * minrad; + auto const pref = 4 * box_geo.length_inv()[2] * std::max(1.0, wavenumber); return pref * K1(rhores * P) * exp(rhores) / rhores * (P - 1 + 1 / rhores); } @@ -177,7 +177,7 @@ int MMM1D_init() { void add_mmm1d_coulomb_pair_force(double chpref, Utils::Vector3d const &d, double r, Utils::Vector3d &force) { - constexpr double c_2pi = 2 * Utils::pi(); + constexpr auto c_2pi = 2 * Utils::pi(); auto const n_modPsi = static_cast(modPsi.size() >> 1); auto const rxy2 = d[0] * d[0] + d[1] * d[1]; auto const rxy2_d = rxy2 * uz2; @@ -186,9 +186,9 @@ void add_mmm1d_coulomb_pair_force(double chpref, Utils::Vector3d const &d, if (rxy2 <= mmm1d_params.far_switch_radius_2) { /* polygamma summation */ - double sr = 0; - double sz = mod_psi_odd(0, z_d); - double r2nm1 = 1.0; + auto sr = 0.; + auto sz = mod_psi_odd(0, z_d); + auto r2nm1 = 1.0; for (int n = 1; n < n_modPsi; n++) { auto const deriv = static_cast(2 * n); auto const mpe = mod_psi_even(n, z_d); @@ -238,7 +238,7 @@ void add_mmm1d_coulomb_pair_force(double chpref, Utils::Vector3d const &d, /* far range formula */ auto const rxy = sqrt(rxy2); auto const rxy_d = rxy * box_geo.length_inv()[2]; - double sr = 0, sz = 0; + auto sr = 0., sz = 0.; for (int bp = 1; bp < MAXIMAL_B_CUT; bp++) { if (bessel_radii[bp - 1] < rxy) @@ -271,7 +271,7 @@ double mmm1d_coulomb_pair_energy(double const chpref, Utils::Vector3d const &d, if (chpref == 0) return 0; - constexpr double c_2pi = 2 * Utils::pi(); + constexpr auto c_2pi = 2 * Utils::pi(); auto const n_modPsi = static_cast(modPsi.size() >> 1); auto const rxy2 = d[0] * d[0] + d[1] * d[1]; auto const rxy2_d = rxy2 * uz2; @@ -299,15 +299,15 @@ double mmm1d_coulomb_pair_energy(double const chpref, Utils::Vector3d const &d, double rt, shift_z; - E += 1 / r; + E += 1. / r; shift_z = d[2] + box_geo.length()[2]; rt = sqrt(rxy2 + shift_z * shift_z); - E += 1 / rt; + E += 1. / rt; shift_z = d[2] - box_geo.length()[2]; rt = sqrt(rxy2 + shift_z * shift_z); - E += 1 / rt; + E += 1. 
/ rt; } else { /* far range formula */ auto const rxy = sqrt(rxy2); diff --git a/src/core/electrostatics_magnetostatics/p3m-common.hpp b/src/core/electrostatics_magnetostatics/p3m-common.hpp index 26e9fbaf3e5..e8fe39d78cc 100644 --- a/src/core/electrostatics_magnetostatics/p3m-common.hpp +++ b/src/core/electrostatics_magnetostatics/p3m-common.hpp @@ -210,7 +210,7 @@ std::array, 3> inline calc_meshift( std::array const &mesh_size, bool zero_out_midpoint = false) { std::array, 3> ret{}; - for (size_t i = 0; i < 3; i++) { + for (std::size_t i = 0; i < 3; i++) { ret[i] = std::vector(mesh_size[i]); for (int j = 1; j <= mesh_size[i] / 2; j++) { diff --git a/src/python/espressomd/electrostatics.pxd b/src/python/espressomd/electrostatics.pxd index 3acec46b13d..88f26489159 100644 --- a/src/python/espressomd/electrostatics.pxd +++ b/src/python/espressomd/electrostatics.pxd @@ -151,7 +151,7 @@ IF ELECTROSTATICS and MMM1D_GPU: unsigned int numBlocks(SystemInterface & s) float host_boxz - int host_npart + unsigned int host_npart bool need_tune int pairs From 66e54eaeff8210edcf758820e1c527834f93aa27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 23 Jul 2021 14:30:34 +0200 Subject: [PATCH 6/6] core: Update exception mechanism There can only be one MMM actor in an ESPResSo system, so coulomb.method checks can be converted to assertions. system.box_l cannot be <= 0. Remove the iostream global. --- src/core/actor/Mmm1dgpuForce_cuda.cu | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index e4c389981e6..5a4c037a5dc 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -38,8 +38,11 @@ #include +#include #include -#include +#include +#include +#include #if defined(OMPI_MPI_H) || defined(_MPI_H) #error CU-file includes mpi.h! This should not happen! @@ -154,10 +157,6 @@ Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, float _coulomb_prefactor, void Mmm1dgpuForce::setup(SystemInterface &s) { auto const box_z = static_cast(s.box()[2]); - if (box_z <= 0) { - throw std::runtime_error( - "Please set box length before initializing MMM1D!"); - } if (need_tune && s.npart_gpu() > 0) { set_params(box_z, static_cast(coulomb.prefactor), maxPWerror, far_switch_radius, bessel_cutoff); @@ -182,8 +181,7 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { cudaMemGetInfo(&freeMem, &totalMem); if (freeMem / 2 < part_mem_size) { // don't use more than half the device's memory - std::cerr << "Switching to atomicAdd due to memory constraints." - << std::endl; + fprintf(stderr, "Switching to atomicAdd due to memory constraints.\n"); pairs = 0; break; } @@ -546,11 +544,7 @@ __global__ void vectorReductionKernel(float const *src, float *dst, } void Mmm1dgpuForce::computeForces(SystemInterface &s) { - if (coulomb.method != COULOMB_MMM1D_GPU) { - std::cerr << "MMM1D: coulomb.method has been changed, skipping calculation" - << std::endl; - return; - } + assert(coulomb.method == COULOMB_MMM1D_GPU); setup(s); if (pairs < 0) { @@ -581,11 +575,7 @@ __global__ void scaleAndAddKernel(float *dst, float const *src, std::size_t N, } void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { - if (coulomb.method != COULOMB_MMM1D_GPU) { - std::cerr << "MMM1D: coulomb.method has been changed, skipping calculation" - << std::endl; - return; - } + assert(coulomb.method == COULOMB_MMM1D_GPU); setup(s); if (pairs < 0) {
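
A closing remark on the series evaluators touched throughout this patch set:
LPK0() and LPK1() on the CPU and evaluateAsChebychevSeriesAt() on the GPU all
rely on the same backward (Clenshaw) recurrence for Chebyshev expansions.
A self-contained sketch of the pattern (illustrative only: the function name
and the n >= 3 precondition are assumptions, and a real coefficient table
such as ak0_cs must be supplied by the caller):

    #include <cassert>

    // Evaluates c[0] / 2 + sum_{j=1}^{n-1} c[j] * T_j(x) via the backward
    // (Clenshaw) recurrence b_j = 2 * x * b_{j+1} - b_{j+2} + c[j], the same
    // loop structure as in specfunc.cpp and specfunc_cuda.hpp.
    double chebychev_eval(double const *c, int n, double x) {
      assert(n >= 3);
      auto const x2 = 2. * x;
      auto dd = c[n - 1];
      auto d = x2 * dd + c[n - 2];
      for (int j = n - 3; j >= 1; j--) {
        auto const tmp = d;
        d = x2 * d - dd + c[j];
        dd = tmp;
      }
      return x * d - dd + c[0] / 2.;
    }

The GPU variant performs the same loop in single precision; patch 5 makes the
literals explicitly float (2.0f * x, c[0] / 2.0f) so that no intermediate is
silently promoted to double.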