espressomd · kodiakhq · Jul 27, 2021 · Jul 19, 2021 · Jul 19, 2021 · Jul 19, 2021
diff --git a/doc/sphinx/installation.rst b/doc/sphinx/installation.rst
@@ -294,7 +294,13 @@ General features
 
    .. seealso:: :ref:`Electrostatics`
 
--  ``MMM1D_GPU``
+-  ``MMM1D_GPU``: This enables MMM1D on GPU. It is faster than the CPU version
+   by several orders of magnitude, but has float precision instead of double
+   precision.
+
+-  ``MMM1D_MACHINE_PREC``: This enables high-precision Bessel functions
+   for MMM1D on CPU. Comes with a 60% slow-down penalty. The low-precision
+   functions are in most cases precise enough and are enabled by default.
 
 -  ``DIPOLES`` This activates the dipole-moment property of particles; In addition,
    the various magnetostatics algorithms, such as P3M are switched on.

diff --git a/maintainer/configs/no_rotation.hpp b/maintainer/configs/no_rotation.hpp
@@ -36,6 +36,7 @@
 
 // Charges and dipoles
 #define ELECTROSTATICS
+#define MMM1D_MACHINE_PREC
 #ifdef CUDA
 #define MMM1D_GPU
 #endif

diff --git a/src/config/features.def b/src/config/features.def
@@ -33,6 +33,7 @@ ROTATIONAL_INERTIA              implies ROTATION
 ELECTROSTATICS
 P3M                             equals ELECTROSTATICS and FFTW
 MMM1D_GPU                       requires CUDA and ELECTROSTATICS
+MMM1D_MACHINE_PREC              requires ELECTROSTATICS
 
 /* Magnetostatics */
 DIPOLES                         implies ROTATION

diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp
@@ -22,6 +22,8 @@
 #include "SystemInterface.hpp"
 #include "cuda_interface.hpp"
 
+#include <cstddef>
+
 /* Syntactic sugar */
 #define espressoSystemInterface EspressoSystemInterface::Instance()
 
@@ -117,12 +119,16 @@ class EspressoSystemInterface : public SystemInterface {
   float *fGpuEnd() override {
     return gpu_get_particle_force_pointer() + 3 * m_gpu_npart;
   };
-  float *eGpu() override { return (float *)gpu_get_energy_pointer(); };
+  float *eGpu() override {
+    // cast struct of floats to array of floats
+    // https://stackoverflow.com/a/29278260
+    return reinterpret_cast<float *>(gpu_get_energy_pointer());
+  };
   float *torqueGpuBegin() override {
-    return (float *)gpu_get_particle_torque_pointer();
+    return gpu_get_particle_torque_pointer();
   };
   float *torqueGpuEnd() override {
-    return (float *)(gpu_get_particle_torque_pointer()) + 3 * m_gpu_npart;
+    return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart;
   };
   bool hasFGpu() override { return true; };
   bool requestFGpu() override {
@@ -148,7 +154,7 @@ class EspressoSystemInterface : public SystemInterface {
 
   Utils::Vector3d box() const override;
 
-  unsigned int npart_gpu() override {
+  std::size_t npart_gpu() const override {
 #ifdef CUDA
     return gpu_get_particle_pointer().size();
 #else
@@ -177,10 +183,10 @@ class EspressoSystemInterface : public SystemInterface {
       reallocDeviceMemory(gpu_get_particle_pointer().size());
     }
   };
-  void reallocDeviceMemory(int n);
+  void reallocDeviceMemory(std::size_t n);
 #endif
 
-  int m_gpu_npart;
+  std::size_t m_gpu_npart;
   bool m_gpu;
 
   float *m_r_gpu_begin;

diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu
@@ -22,6 +22,8 @@
 #include "cuda_utils.cuh"
 #include "errorhandling.hpp"
 
+#include <cstddef>
+
 #include <cuda.h>
 
 #if defined(OMPI_MPI_H) || defined(_MPI_H)
@@ -33,8 +35,8 @@
 
 // Position and charge
 __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r,
-                                float *q, int n) {
-  auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
+                                float *q, unsigned int n) {
+  auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx >= n)
     return;
 
@@ -49,8 +51,9 @@ __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r,
 }
 
 // Charge only
-__global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) {
-  auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
+__global__ void split_kernel_q(CUDA_particle_data *particles, float *q,
+                               unsigned int n) {
+  auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx >= n)
     return;
 
@@ -62,8 +65,9 @@ __global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) {
 }
 
 // Position only
-__global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) {
-  auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
+__global__ void split_kernel_r(CUDA_particle_data *particles, float *r,
+                               unsigned int n) {
+  auto idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx >= n)
     return;
 
@@ -78,8 +82,9 @@ __global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) {
 
 #ifdef CUDA
 // Velocity
-__global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) {
-  auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
+__global__ void split_kernel_v(CUDA_particle_data *particles, float *v,
+                               unsigned int n) {
+  auto idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx >= n)
     return;
 
@@ -96,8 +101,8 @@ __global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) {
 #ifdef DIPOLES
 // Dipole moment
 __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip,
-                                 int n) {
-  auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
+                                 unsigned int n) {
+  auto idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx >= n)
     return;
 
@@ -112,9 +117,9 @@ __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip,
 #endif
 
 __global__ void split_kernel_director(CUDA_particle_data *particles,
-                                      float *director, int n) {
+                                      float *director, unsigned int n) {
 #ifdef ROTATION
-  auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
+  auto idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx >= n)
     return;
 
@@ -128,7 +133,7 @@ __global__ void split_kernel_director(CUDA_particle_data *particles,
 #endif
 }
 
-void EspressoSystemInterface::reallocDeviceMemory(int n) {
+void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) {
   if (m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == nullptr))) {
     if (m_r_gpu_begin != nullptr)
       cuda_safe_mem(cudaFree(m_r_gpu_begin));
@@ -169,8 +174,8 @@ void EspressoSystemInterface::reallocDeviceMemory(int n) {
 }
 
 void EspressoSystemInterface::split_particle_struct() {
-  auto device_particles = gpu_get_particle_pointer();
-  int n = device_particles.size();
+  auto const device_particles = gpu_get_particle_pointer();
+  auto const n = static_cast<unsigned int>(device_particles.size());
   if (n == 0)
     return;
 

diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp
@@ -20,8 +20,11 @@
 #define SYSTEMINTERFACE_H
 
 #include "config.hpp"
+
 #include <utils/Vector.hpp>
 
+#include <cstddef>
+
 /** @todo: Turn needsXY in getter/setter **/
 
 class SystemInterface {
@@ -93,7 +96,7 @@ class SystemInterface {
     return m_needsDirectorGpu;
   }
 
-  virtual unsigned int npart_gpu() { return 0; };
+  virtual std::size_t npart_gpu() const { return 0; };
   virtual Vector3 box() const = 0;
 
   virtual bool needsRGpu() { return m_needsRGpu; };

diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp
@@ -82,7 +82,8 @@ class DipolarBarnesHut : public Actor {
     buildTreeBH(m_bh_data.blocks);
     summarizeBH(m_bh_data.blocks);
     sortBH(m_bh_data.blocks);
-    if (energyBH(&m_bh_data, m_k, (&(((CUDA_energy *)s.eGpu())->dipolar)))) {
+    if (energyBH(&m_bh_data, m_k,
+                 &(reinterpret_cast<CUDA_energy *>(s.eGpu())->dipolar))) {
       runtimeErrorMsg() << "kernels encountered a functional error";
     }
   };

diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp
@@ -74,7 +74,7 @@ class DipolarDirectSum : public Actor {
     }
     DipolarDirectSum_kernel_wrapper_energy(
         k, static_cast<int>(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), box,
-        per, (&(((CUDA_energy *)s.eGpu())->dipolar)));
+        per, &(reinterpret_cast<CUDA_energy *>(s.eGpu())->dipolar));
   };
 
 private:

diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp
@@ -48,14 +48,14 @@ class Mmm1dgpuForce : public Actor {
 private:
   // CUDA parameters
   unsigned int numThreads;
-  unsigned int numBlocks(SystemInterface &s);
+  unsigned int numBlocks(SystemInterface const &s) const;
 
   // the box length currently set on the GPU
   // Needed to make sure it hasn't been modified after inter coulomb was used.
   float host_boxz;
   // the number of particles we had during the last run. Needed to check if we
   // have to realloc dev_forcePairs
-  int host_npart;
+  unsigned int host_npart;
   bool need_tune;
 
   // pairs==0: return forces using atomicAdd