From a9776300a4e22b49d5f8db9cab0b81b876319048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Mon, 19 Jul 2021 16:20:35 +0200 Subject: [PATCH 1/6] core: Enable and test the Chebychev series --- doc/sphinx/installation.rst | 8 +++++++- maintainer/configs/no_rotation.hpp | 1 + src/config/features.def | 1 + src/core/electrostatics_magnetostatics/mmm1d.cpp | 8 +++----- src/core/electrostatics_magnetostatics/specfunc.hpp | 10 +++++----- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/doc/sphinx/installation.rst b/doc/sphinx/installation.rst index 6d7c8c02d16..4187f9e42b1 100644 --- a/doc/sphinx/installation.rst +++ b/doc/sphinx/installation.rst @@ -294,7 +294,13 @@ General features .. seealso:: :ref:`Electrostatics` -- ``MMM1D_GPU`` +- ``MMM1D_GPU``: This enables MMM1D on GPU. It is faster than the CPU version + by several orders of magnitude, but has float precision instead of double + precision. + +- ``MMM1D_MACHINE_PREC``: This enables high-precision Bessel functions + for MMM1D on CPU. Comes with a 60% slow-down penalty. The low-precision + functions are in most cases precise enough and are enabled by default. - ``DIPOLES`` This activates the dipole-moment property of particles; In addition, the various magnetostatics algorithms, such as P3M are switched on. diff --git a/maintainer/configs/no_rotation.hpp b/maintainer/configs/no_rotation.hpp index 5f88f292a1e..d7cbf83640b 100644 --- a/maintainer/configs/no_rotation.hpp +++ b/maintainer/configs/no_rotation.hpp @@ -36,6 +36,7 @@ // Charges and dipoles #define ELECTROSTATICS +#define MMM1D_MACHINE_PREC #ifdef CUDA #define MMM1D_GPU #endif diff --git a/src/config/features.def b/src/config/features.def index 7a2aa60bd6f..36e77376fdc 100644 --- a/src/config/features.def +++ b/src/config/features.def @@ -33,6 +33,7 @@ ROTATIONAL_INERTIA implies ROTATION ELECTROSTATICS P3M equals ELECTROSTATICS and FFTW MMM1D_GPU requires CUDA and ELECTROSTATICS +MMM1D_MACHINE_PREC requires ELECTROSTATICS /* Magnetostatics */ DIPOLES implies ROTATION diff --git a/src/core/electrostatics_magnetostatics/mmm1d.cpp b/src/core/electrostatics_magnetostatics/mmm1d.cpp index 8cdc61c3a5a..4750b08a31e 100644 --- a/src/core/electrostatics_magnetostatics/mmm1d.cpp +++ b/src/core/electrostatics_magnetostatics/mmm1d.cpp @@ -60,12 +60,10 @@ /** Minimal radius for the far formula in multiples of box_l[2] */ #define MIN_RAD 0.01 -/* if you define this, the Bessel functions are calculated up +/* if you define this feature, the Bessel functions are calculated up * to machine precision, otherwise 10^-14, which should be * definitely enough for daily life. */ -#undef BESSEL_MACHINE_PREC - -#ifndef BESSEL_MACHINE_PREC +#ifndef MMM1D_MACHINE_PREC #define K0 LPK0 #define K1 LPK1 #endif @@ -250,7 +248,7 @@ void add_mmm1d_coulomb_pair_force(double chpref, Utils::Vector3d const &d, auto const fq = c_2pi * bp; double k0, k1; -#ifdef BESSEL_MACHINE_PREC +#ifdef MMM1D_MACHINE_PREC k0 = K0(fq * rxy_d); k1 = K1(fq * rxy_d); #else diff --git a/src/core/electrostatics_magnetostatics/specfunc.hpp b/src/core/electrostatics_magnetostatics/specfunc.hpp index 72afd0ec603..b9a0e7fb4e5 100644 --- a/src/core/electrostatics_magnetostatics/specfunc.hpp +++ b/src/core/electrostatics_magnetostatics/specfunc.hpp @@ -45,31 +45,31 @@ double hzeta(double order, double x); /** Modified Bessel function of second kind, order 0. This function was taken * from the GSL code. Precise roughly up to machine precision. 
- * If @c BESSEL_MACHINE_PREC is not defined, @ref LPK0 is used instead.
+ * If @c MMM1D_MACHINE_PREC is not defined, @ref LPK0 is used instead.
  */
 double K0(double x);
 
 /** Modified Bessel function of second kind, order 1. This function was taken
  * from the GSL code. Precise roughly up to machine precision.
- * If @c BESSEL_MACHINE_PREC is not defined, @ref LPK1 is used instead.
+ * If @c MMM1D_MACHINE_PREC is not defined, @ref LPK1 is used instead.
  */
 double K1(double x);
 
-/** Bessel function K0 at x.
+/** Bessel function of the second kind, order 0, low precision.
  * The implementation has an absolute precision of around 10^(-14), which is
  * comparable to the relative precision sqrt implementation of current
  * hardware.
  */
 double LPK0(double x);
 
-/** Bessel function K1 at x.
+/** Bessel function of the second kind, order 1, low precision.
  * The implementation has an absolute precision of around 10^(-14), which is
  * comparable to the relative precision sqrt implementation of current
  * hardware.
  */
 double LPK1(double x);
 
-/** Bessel functions K0 and K1 at x.
+/** Bessel functions of the second kind, order 0 and order 1, low precision.
  * The implementation has an absolute precision of around 10^(-14), which is
  * comparable to the relative precision sqrt implementation of current
  * hardware.

From 304617cbf2c9280cd3153c3bc80c8640863fc709 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?=
Date: Mon, 19 Jul 2021 16:52:53 +0200
Subject: [PATCH 2/6] core: Modernize MMM1D code

---
 .../electrostatics_magnetostatics/mmm1d.cpp   | 40 +++++-----
 .../specfunc.cpp                              | 73 +++++++++----------
 .../specfunc.hpp                              |  4 +-
 3 files changed, 57 insertions(+), 60 deletions(-)

diff --git a/src/core/electrostatics_magnetostatics/mmm1d.cpp b/src/core/electrostatics_magnetostatics/mmm1d.cpp
index 4750b08a31e..d4eadc66121 100644
--- a/src/core/electrostatics_magnetostatics/mmm1d.cpp
+++ b/src/core/electrostatics_magnetostatics/mmm1d.cpp
@@ -52,13 +52,10 @@
 #include
 #include
 
-/** Largest numerically stable cutoff for Bessel function. Don't
- * change without improving the formulas.
+/** Largest numerically stable cutoff for the Bessel function.
+ * Don't change without improving the formulas.
  */
-#define MAXIMAL_B_CUT 30
-
-/** Minimal radius for the far formula in multiples of box_l[2] */
-#define MIN_RAD 0.01
+constexpr int MAXIMAL_B_CUT = 30;
 
 /* if you define this feature, the Bessel functions are calculated up
  * to machine precision, otherwise 10^-14, which should be
@@ -74,9 +71,10 @@ static double uz2, prefuz2, prefL3_i;
 /**@}*/
 
 MMM1D_struct mmm1d_params = {0.05, 1e-5, 0};
-/** From which distance a certain Bessel cutoff is valid. Can't be part of the
-    params since these get broadcasted. */
-static std::vector<double> bessel_radii;
+/** From which distance a certain Bessel cutoff is valid.
+ * Can't be part of the params since these get broadcast.
+ */
+static std::array<double, MAXIMAL_B_CUT> bessel_radii;
 
 static double far_error(int P, double minrad) {
   // this uses an upper bound to all force components and the potential
@@ -89,7 +87,8 @@ static double far_error(int P, double minrad) {
 
 static double determine_minrad(double maxPWerror, int P) {
   // bisection to search for where the error is maxPWerror
-  double const rgranularity = MIN_RAD * box_geo.length()[2];
+  constexpr auto min_rad = 0.01;
+  double const rgranularity = min_rad * box_geo.length()[2];
   double rmin = rgranularity;
   double rmax = std::min(box_geo.length()[0], box_geo.length()[1]);
   double const errmin = far_error(P, rmin);
@@ -115,10 +114,9 @@ static double determine_minrad(double maxPWerror, int P) {
   return 0.5 * (rmin + rmax);
 }
 
-static void determine_bessel_radii(double maxPWerror, int maxP) {
-  bessel_radii.resize(maxP);
-  for (int P = 1; P <= maxP; ++P) {
-    bessel_radii[P - 1] = determine_minrad(maxPWerror, P);
+static void determine_bessel_radii(double maxPWerror) {
+  for (int i = 0; i < MAXIMAL_B_CUT; ++i) {
+    bessel_radii[i] = determine_minrad(maxPWerror, i + 1);
   }
 }
 
@@ -171,7 +169,7 @@ int MMM1D_init() {
   prefuz2 = coulomb.prefactor * uz2;
   prefL3_i = prefuz2 * box_geo.length_inv()[2];
 
-  determine_bessel_radii(mmm1d_params.maxPWerror, MAXIMAL_B_CUT);
+  determine_bessel_radii(mmm1d_params.maxPWerror);
   prepare_polygamma_series(mmm1d_params.maxPWerror,
                            mmm1d_params.far_switch_radius_2);
   return ES_OK;
@@ -333,17 +331,17 @@ double mmm1d_coulomb_pair_energy(double const chpref, Utils::Vector3d const &d,
 int mmm1d_tune(int timings, bool verbose) {
   if (MMM1D_sanity_checks())
     return ES_ERROR;
-  double min_time = std::numeric_limits<double>::infinity();
-  double min_rad = -1;
-  auto const maxrad = box_geo.length()[2];
-  double switch_radius;
 
   if (mmm1d_params.far_switch_radius_2 < 0) {
+    auto const maxrad = box_geo.length()[2];
+    auto min_time = std::numeric_limits<double>::infinity();
+    auto min_rad = -1.;
+    double switch_radius;
     /* determine besselcutoff and optimal switching radius. Should be around
      * 0.33 */
     for (switch_radius = 0.2 * maxrad; switch_radius < 0.4 * maxrad;
          switch_radius += 0.025 * maxrad) {
-      if (switch_radius <= bessel_radii[MAXIMAL_B_CUT - 1]) {
+      if (switch_radius <= bessel_radii.back()) {
         // this switching radius is too small for our Bessel series
         continue;
       }
@@ -377,7 +375,7 @@ int mmm1d_tune(int timings, bool verbose) {
     switch_radius = min_rad;
     mmm1d_params.far_switch_radius_2 = Utils::sqr(switch_radius);
   } else if (mmm1d_params.far_switch_radius_2 <=
-             Utils::sqr(bessel_radii[MAXIMAL_B_CUT - 1])) {
+             Utils::sqr(bessel_radii.back())) {
     // this switching radius is too small for our Bessel series
     runtimeErrorMsg() << "could not find reasonable bessel cutoff";
     return ES_ERROR;
diff --git a/src/core/electrostatics_magnetostatics/specfunc.cpp b/src/core/electrostatics_magnetostatics/specfunc.cpp
index 5b5987394fe..027e820a6cd 100644
--- a/src/core/electrostatics_magnetostatics/specfunc.cpp
+++ b/src/core/electrostatics_magnetostatics/specfunc.cpp
@@ -47,8 +47,11 @@
 #include "specfunc.hpp"
 
 #include
+#include
 
 #include
+#include
+#include
 
 /************************************************
  * chebychev expansions
@@ -209,8 +212,9 @@ static double const hzeta_c[15] = {
     -8.9535174270375468504026113181e-23};
 
 double hzeta(double s, double q) {
-  double max_bits = 54.0;
-  int const jmax = 12, kmax = 10;
+  constexpr auto max_bits = 54.0;
+  constexpr auto jmax = 12;
+  constexpr auto kmax = 10;
 
   if ((s > max_bits && q < 1.0) || (s > 0.5 * max_bits && q < 0.25))
     return pow(q, -s);
@@ -223,19 +227,20 @@ double hzeta(double s, double q) {
   /** Euler-Maclaurin summation formula from @cite moshier89a p. 400, with
    * several typo corrections.
    */
-  auto const pmax = pow(kmax + q, -s);
+  auto const kmax_q = static_cast<double>(kmax) + q;
+  auto const pmax = pow(kmax_q, -s);
   auto scp = s;
-  auto pcp = pmax / (kmax + q);
-  auto ans = pmax * ((kmax + q) / (s - 1.0) + 0.5);
+  auto pcp = pmax / kmax_q;
+  auto ans = pmax * (kmax_q / (s - 1.0) + 0.5);
 
   for (int k = 0; k < kmax; k++)
-    ans += pow(k + q, -s);
+    ans += pow(static_cast<double>(k) + q, -s);
 
   for (int j = 0; j <= jmax; j++) {
     auto const delta = hzeta_c[j + 1] * scp * pcp;
     ans += delta;
     scp *= (s + 2 * j + 1) * (s + 2 * j + 2);
-    pcp /= (kmax + q) * (kmax + q);
+    pcp /= Utils::sqr(static_cast<double>(kmax) + q);
   }
 
   return ans;
@@ -289,9 +294,8 @@ double LPK0(double x) {
     return tmp * (xx * ak0_cs[1] + 0.5 * ak0_cs[0]);
   }
   if (x > 2) {
-    int j = ak01_orders[((int)x) - 2];
+    int j = ak01_orders[static_cast<int>(x) - 2];
     double x2;
-    double dd0, d0;
     double *s0;
     if (x <= 8) {
       s0 = ak0_cs;
@@ -300,8 +304,8 @@ double LPK0(double x) {
       s0 = ak02_cs;
       x2 = (2. * 16.) / x - 2.;
     }
-    dd0 = s0[j];
-    d0 = x2 * dd0 + s0[j - 1];
+    auto dd0 = s0[j];
+    auto d0 = x2 * dd0 + s0[j - 1];
     for (j -= 2; j >= 1; j--) {
      auto const tmp0 = d0;
       d0 = x2 * d0 - dd0 + s0[j];
@@ -314,17 +318,16 @@ double LPK0(double x) {
   {
     /* I0/1 series */
     int j = 10;
-    double ret, x2 = (2. / 4.5) * x * x - 2.;
-    double dd0, d0;
-    dd0 = bi0_cs[j];
-    d0 = x2 * dd0 + bi0_cs[j - 1];
+    auto x2 = (2. / 4.5) * x * x - 2.;
+    auto dd0 = bi0_cs[j];
+    auto d0 = x2 * dd0 + bi0_cs[j - 1];
     for (j -= 2; j >= 1; j--) {
-      double tmp0 = d0;
+      auto const tmp0 = d0;
       d0 = x2 * d0 - dd0 + bi0_cs[j];
       dd0 = tmp0;
     }
     auto const tmp = log(x) - Utils::ln_2();
-    ret = -tmp * (0.5 * (bi0_cs[0] + x2 * d0) - dd0);
+    auto const ret = -tmp * (0.5 * (bi0_cs[0] + x2 * d0) - dd0);
 
     /* K0/K1 correction */
     j = 9;
@@ -352,7 +355,6 @@ double LPK1(double x) {
   if (x > 2) {
     int j = ak01_orders[((int)x) - 2];
     double x2;
-    double dd1, d1;
     double *s1;
     if (x <= 8) {
       s1 = ak1_cs;
@@ -361,8 +363,8 @@ double LPK1(double x) {
       s1 = ak12_cs;
       x2 = (2. * 16.) / x - 2.;
     }
-    dd1 = s1[j];
-    d1 = x2 * dd1 + s1[j - 1];
+    auto dd1 = s1[j];
+    auto d1 = x2 * dd1 + s1[j - 1];
     for (j -= 2; j >= 1; j--) {
       auto const tmp1 = d1;
       d1 = x2 * d1 - dd1 + s1[j];
@@ -375,17 +377,16 @@ double LPK1(double x) {
   {
     /* I0/1 series */
     int j = 10;
-    double ret, x2 = (2. / 4.5) * x * x - 2.;
-    double dd1, d1;
-    dd1 = bi1_cs[j];
-    d1 = x2 * dd1 + bi1_cs[j - 1];
+    auto x2 = (2. / 4.5) * x * x - 2.;
+    auto dd1 = bi1_cs[j];
+    auto d1 = x2 * dd1 + bi1_cs[j - 1];
     for (j -= 2; j >= 1; j--) {
       auto const tmp1 = d1;
       d1 = x2 * d1 - dd1 + bi1_cs[j];
       dd1 = tmp1;
     }
     auto const tmp = log(x) - Utils::ln_2();
-    ret = x * tmp * (0.5 * (bi1_cs[0] + x2 * d1) - dd1);
+    auto const ret = x * tmp * (0.5 * (bi1_cs[0] + x2 * d1) - dd1);
 
     /* K0/K1 correction */
     j = 9;
@@ -401,7 +402,7 @@ double LPK1(double x) {
   }
 }
 
-std::tuple<double, double> LPK01(double x) {
+std::pair<double, double> LPK01(double x) {
   if (x >= 27.) {
     auto const tmp = .5 * exp(-x) / sqrt(x);
     auto const K0 = tmp * ak0_cs[0];
@@ -417,7 +418,6 @@ std::tuple<double, double> LPK01(double x) {
   if (x > 2) {
     int j = ak01_orders[((int)x) - 2];
     double x2;
-    double dd0, dd1, d0, d1;
     double *s0, *s1;
     if (x <= 8) {
       s0 = ak0_cs;
@@ -428,10 +428,10 @@ std::tuple<double, double> LPK01(double x) {
       s1 = ak12_cs;
       x2 = (2. * 16.) / x - 2.;
     }
-    dd0 = s0[j];
-    dd1 = s1[j];
-    d0 = x2 * dd0 + s0[j - 1];
-    d1 = x2 * dd1 + s1[j - 1];
+    auto dd0 = s0[j];
+    auto dd1 = s1[j];
+    auto d0 = x2 * dd0 + s0[j - 1];
+    auto d1 = x2 * dd1 + s1[j - 1];
     for (j -= 2; j >= 1; j--) {
       auto const tmp0 = d0, tmp1 = d1;
       d0 = x2 * d0 - dd0 + s0[j];
       d1 = x2 * d1 - dd1 + s1[j];
@@ -448,12 +448,11 @@ std::tuple<double, double> LPK01(double x) {
   {
     /* I0/1 series */
     int j = 10;
-    double x2 = (2. / 4.5) * x * x - 2.;
-    double dd0, dd1, d0, d1;
-    dd0 = bi0_cs[j];
-    dd1 = bi1_cs[j];
-    d0 = x2 * dd0 + bi0_cs[j - 1];
-    d1 = x2 * dd1 + bi1_cs[j - 1];
+    auto x2 = (2. / 4.5) * x * x - 2.;
+    auto dd0 = bi0_cs[j];
+    auto dd1 = bi1_cs[j];
+    auto d0 = x2 * dd0 + bi0_cs[j - 1];
+    auto d1 = x2 * dd1 + bi1_cs[j - 1];
     for (j -= 2; j >= 1; j--) {
       auto const tmp0 = d0, tmp1 = d1;
       d0 = x2 * d0 - dd0 + bi0_cs[j];
diff --git a/src/core/electrostatics_magnetostatics/specfunc.hpp b/src/core/electrostatics_magnetostatics/specfunc.hpp
index b9a0e7fb4e5..3df2ae621b2 100644
--- a/src/core/electrostatics_magnetostatics/specfunc.hpp
+++ b/src/core/electrostatics_magnetostatics/specfunc.hpp
@@ -38,7 +38,7 @@
 #include
 #include
 
-#include <tuple>
+#include <utility>
 
 /** Hurwitz zeta function. This function was taken from the GSL code. */
 double hzeta(double order, double x);
@@ -74,7 +74,7 @@ double LPK1(double x);
  * comparable to the relative precision sqrt implementation of current
  * hardware.
  */
-std::tuple<double, double> LPK01(double x);
+std::pair<double, double> LPK01(double x);
 
 /** Evaluate the polynomial interpreted as a Taylor series via the
  * Horner scheme.
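A remark on the LPK01() change in the hunks above: the return type is
narrowed from std::tuple to std::pair because the function computes exactly
two values, K0 and K1, sharing the argument-dependent setup between them.
A minimal sketch of a call site (illustrative only, not part of the patch:
the wrapper function is hypothetical and a C++17 structured binding is
assumed; fq and rxy_d are the quantities visible in the patch 1 hunk of
mmm1d.cpp):

    #include "electrostatics_magnetostatics/specfunc.hpp"

    #include <utility>

    double far_formula_term(double fq, double rxy_d) {
      // one call produces both low-precision Bessel values K0 and K1;
      // with std::pair, a structured binding replaces the std::tie() pattern
      auto const [k0, k1] = LPK01(fq * rxy_d);
      return k0 + k1; // placeholder combination; the real force/energy sums differ
    }
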
From 88a2afe08e80ceb7ca5c079c8942917ee925c648 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Mon, 19 Jul 2021 18:19:45 +0200 Subject: [PATCH 3/6] core: Modernize MMM1D GPU code --- src/core/EspressoSystemInterface.hpp | 2 +- src/core/SystemInterface.hpp | 2 +- src/core/actor/Mmm1dgpuForce.hpp | 2 +- src/core/actor/Mmm1dgpuForce_cuda.cu | 139 ++++++++++++--------------- src/core/actor/specfunc_cuda.hpp | 10 +- 5 files changed, 72 insertions(+), 83 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index 47413ba70af..f7f25b59464 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -148,7 +148,7 @@ class EspressoSystemInterface : public SystemInterface { Utils::Vector3d box() const override; - unsigned int npart_gpu() override { + unsigned int npart_gpu() const override { #ifdef CUDA return gpu_get_particle_pointer().size(); #else diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index d2dbe1009bf..d09014a25ea 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -93,7 +93,7 @@ class SystemInterface { return m_needsDirectorGpu; } - virtual unsigned int npart_gpu() { return 0; }; + virtual unsigned int npart_gpu() const { return 0; }; virtual Vector3 box() const = 0; virtual bool needsRGpu() { return m_needsRGpu; }; diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp index 24a8c2fc9d8..4941839a823 100644 --- a/src/core/actor/Mmm1dgpuForce.hpp +++ b/src/core/actor/Mmm1dgpuForce.hpp @@ -48,7 +48,7 @@ class Mmm1dgpuForce : public Actor { private: // CUDA parameters unsigned int numThreads; - unsigned int numBlocks(SystemInterface &s); + unsigned int numBlocks(SystemInterface const &s) const; // the box length currently set on the GPU // Needed to make sure it hasn't been modified after inter coulomb was used. diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index f2b138c2af1..bd2c2e519f3 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -34,6 +34,7 @@ #include "electrostatics_magnetostatics/mmm1d.hpp" #include +#include #include @@ -45,7 +46,7 @@ #endif // the code is mostly multi-GPU capable, but ESPResSo is not yet -const int deviceCount = 1; +constexpr int deviceCount = 1; #undef cudaSetDevice #define cudaSetDevice(d) @@ -57,12 +58,11 @@ __constant__ float coulomb_prefactor[1] = {1.0f}; __constant__ int bessel_cutoff[1] = {5}; __constant__ float maxPWerror[1] = {1e-5f}; -// order hardcoded. mmm1d_recalcTables() typically does order less than 30. // As the coefficients are stored in __constant__ memory, the array needs to be // sized in advance. We don't know exactly how many coefficients per order, so // we size plentiful. 
-const int modpsi_order = 30; -const int modpsi_constant_size = modpsi_order * modpsi_order * 2; +constexpr int modpsi_order = 30; +constexpr int modpsi_constant_size = modpsi_order * modpsi_order * 2; // linearized array on device __constant__ int device_n_modPsi[1] = {0}; @@ -108,11 +108,11 @@ int modpsi_init() { cudaSetDevice(d); // copy to GPU - int linModPsiSize = linModPsi_offsets[modPsi.size() - 1] + - linModPsi_lengths[modPsi.size() - 1]; + auto const linModPsiSize = linModPsi_offsets[modPsi.size() - 1] + + linModPsi_lengths[modPsi.size() - 1]; if (linModPsiSize > modpsi_constant_size) { - printf("ERROR: __constant__ device_linModPsi[] is not large enough\n"); - std::abort(); + throw std::runtime_error( + "__constant__ device_linModPsi[] is not large enough"); } cuda_safe_mem(cudaMemcpyToSymbol(device_linModPsi_offsets, linModPsi_offsets.data(), @@ -155,7 +155,7 @@ Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, float _coulomb_prefactor, void Mmm1dgpuForce::setup(SystemInterface &s) { if (s.box()[2] <= 0) { throw std::runtime_error( - "Error: Please set box length before initializing MMM1D!"); + "Please set box length before initializing MMM1D!"); } if (need_tune && s.npart_gpu() > 0) { set_params(static_cast(s.box()[2]), @@ -166,24 +166,22 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { if (s.box()[2] != host_boxz) { set_params(static_cast(s.box()[2]), 0, -1, -1, -1); } - if (s.npart_gpu() == host_npart) // unchanged - { + if (s.npart_gpu() == host_npart) { // unchanged return; } // For all but the largest systems, it is faster to store force pairs and then // sum them up. Atomics are just so slow: so unless we're limited by memory, // do the latter. + auto const part_mem_size = 3 * Utils::sqr(s.npart_gpu()) * sizeof(float); pairs = 2; for (int d = 0; d < deviceCount; d++) { cudaSetDevice(d); size_t freeMem, totalMem; cudaMemGetInfo(&freeMem, &totalMem); - if (freeMem / 2 < - 3 * s.npart_gpu() * s.npart_gpu() * - sizeof(float)) // don't use more than half the device's memory - { + if (freeMem / 2 < part_mem_size) { + // don't use more than half the device's memory std::cerr << "Switching to atomicAdd due to memory constraints." 
<< std::endl; pairs = 0; @@ -192,11 +190,9 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { } if (dev_forcePairs) cudaFree(dev_forcePairs); - if (pairs) // we need memory to store force pairs - { - cuda_safe_mem( - cudaMalloc((void **)&dev_forcePairs, - 3 * s.npart_gpu() * s.npart_gpu() * sizeof(float))); + if (pairs) { + // we need memory to store force pairs + cuda_safe_mem(cudaMalloc((void **)&dev_forcePairs, part_mem_size)); } if (dev_energyBlocks) cudaFree(dev_energyBlocks); @@ -205,7 +201,7 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { host_npart = static_cast(s.npart_gpu()); } -unsigned int Mmm1dgpuForce::numBlocks(SystemInterface &s) { +unsigned int Mmm1dgpuForce::numBlocks(SystemInterface const &s) const { auto b = static_cast(s.npart_gpu() * s.npart_gpu() / numThreads) + 1; if (b > 65535) b = 65535; @@ -233,8 +229,8 @@ __global__ void sumKernel(float *data, int N) { extern __shared__ float partialsums[]; if (blockIdx.x != 0) return; - auto tid = static_cast(threadIdx.x); - float result = 0; + auto const tid = static_cast(threadIdx.x); + auto result = 0.f; for (int i = 0; i < N; i += static_cast(blockDim.x)) { if (i + tid >= N) @@ -274,9 +270,8 @@ void Mmm1dgpuForce::tune(SystemInterface &s, float _maxPWerror, int bessel_cutoff = _bessel_cutoff; float maxrad = host_boxz; - if (_far_switch_radius < 0 && _bessel_cutoff < 0) - // autodetermine switching radius and Bessel cutoff - { + if (_far_switch_radius < 0 && _bessel_cutoff < 0) { + // autodetermine switching radius and Bessel cutoff float bestrad = 0, besttime = INFINITY; // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter) @@ -284,7 +279,7 @@ void Mmm1dgpuForce::tune(SystemInterface &s, float _maxPWerror, far_switch_radius += 0.05f * maxrad) { set_params(0, 0, _maxPWerror, far_switch_radius, bessel_cutoff); tune(s, _maxPWerror, far_switch_radius, -2); // tune Bessel cutoff - auto runtime = force_benchmark(s); + auto const runtime = force_benchmark(s); if (runtime < besttime) { besttime = runtime; bestrad = far_switch_radius; @@ -294,24 +289,19 @@ void Mmm1dgpuForce::tune(SystemInterface &s, float _maxPWerror, set_params(0, 0, _maxPWerror, far_switch_radius, bessel_cutoff); tune(s, _maxPWerror, far_switch_radius, -2); // tune Bessel cutoff - } - - else if (_bessel_cutoff < 0) - // autodetermine Bessel cutoff - { + } else if (_bessel_cutoff < 0) { + // autodetermine Bessel cutoff int *dev_cutoff; - int maxCut = 30; + constexpr auto maxCut = 30; cuda_safe_mem(cudaMalloc((void **)&dev_cutoff, sizeof(int))); besselTuneKernel<<>>( dev_cutoff, far_switch_radius, maxCut); cuda_safe_mem(cudaMemcpy(&bessel_cutoff, dev_cutoff, sizeof(int), cudaMemcpyDeviceToHost)); cudaFree(dev_cutoff); - if (_bessel_cutoff != -2 && - bessel_cutoff >= - maxCut) // we already have our switching radius and only need to - // determine the cutoff, i.e. this is the final tuning round - { + if (_bessel_cutoff != -2 && bessel_cutoff >= maxCut) { + // we already had our switching radius and only needed to + // determine the cutoff, i.e. 
this was the final tuning round throw std::runtime_error( "No reasonable Bessel cutoff could be determined."); } @@ -327,8 +317,8 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, throw std::runtime_error( "switching radius must not be larger than box length"); } - float _far_switch_radius_2 = _far_switch_radius * _far_switch_radius; - float _uz = 1.0f / _boxz; + auto const _far_switch_radius_2 = Utils::sqr(_far_switch_radius); + auto const _uz = 1.0f / _boxz; for (int d = 0; d < deviceCount; d++) { // double colons are needed to access the constant memory variables because // they are file globals and we have identically named class variables @@ -339,8 +329,7 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, bessel_cutoff = _bessel_cutoff; } if (_far_switch_radius >= 0) { - mmm1d_params.far_switch_radius_2 = - _far_switch_radius * _far_switch_radius; + mmm1d_params.far_switch_radius_2 = _far_switch_radius_2; cuda_safe_mem(cudaMemcpyToSymbol(::far_switch_radius_2, &_far_switch_radius_2, sizeof(float))); far_switch_radius = _far_switch_radius; @@ -370,12 +359,10 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, } need_tune = true; - // The changed parameters in mmm1d_params do not need to be broadcast: they - // are only accessed by the TCL print function (on node 0) when you call inter - // coulomb. The CUDA code only runs on node 0, so other nodes do not need the - // parameters. We couldn't broadcast from here anyway because set_params() - // might be called from inside computeForces() which is not a time at which - // the MPI loop on the slave nodes is waiting for broadcasts. + // The changed parameters in @ref mmm1d_params do not need to be broadcast: + // they are only accessed by the CUDA code which only runs on the head node. + // We couldn't broadcast from here anyway because @ref set_params() is + // called from the @ref computeForces() and @ref tune() functions. 
} __global__ void forcesKernel(const float *__restrict__ r, @@ -385,17 +372,19 @@ __global__ void forcesKernel(const float *__restrict__ r, if (tStop < 0) tStop = N * N; - const float c_2pif = 2 * Utils::pi(); + auto const c_2pif = 2 * Utils::pi(); for (int tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { - int p1 = tid % N, p2 = tid / N; - float x = r[3 * p2] - r[3 * p1], y = r[3 * p2 + 1] - r[3 * p1 + 1], - z = r[3 * p2 + 2] - r[3 * p1 + 2]; - float rxy2 = sqpow(x) + sqpow(y); - float rxy = sqrt(rxy2); - float sum_r = 0, sum_z = 0; + auto const p1 = tid % N, p2 = tid / N; + auto x = r[3 * p2] - r[3 * p1]; + auto y = r[3 * p2 + 1] - r[3 * p1 + 1]; + auto z = r[3 * p2 + 2] - r[3 * p1 + 2]; + auto const rxy2 = sqpow(x) + sqpow(y); + auto rxy = sqrt(rxy2); + auto sum_r = 0.f; + auto sum_z = 0.f; // if (*boxz <= 0.0) return; // in case we are not initialized yet @@ -408,14 +397,14 @@ __global__ void forcesKernel(const float *__restrict__ r, // anyway) } else if (rxy2 <= *far_switch_radius_2) // near formula { - float uzz = *uz * z; - float uzr = *uz * rxy; + auto const uzz = *uz * z; + auto const uzr = *uz * rxy; sum_z = dev_mod_psi_odd(0, uzz); - float uzrpow = uzr; + auto uzrpow = uzr; for (int n = 1; n < *device_n_modPsi; n++) { - float sum_r_old = sum_r; - float mpe = dev_mod_psi_even(n, uzz); - float mpo = dev_mod_psi_odd(n, uzz); + auto const sum_r_old = sum_r; + auto const mpe = dev_mod_psi_even(n, uzz); + auto const mpo = dev_mod_psi_odd(n, uzz); sum_r += 2 * static_cast(n) * mpe * uzrpow; uzrpow *= uzr; @@ -455,7 +444,7 @@ __global__ void forcesKernel(const float *__restrict__ r, sum_r += 2 * *uz / rxy; } - float pref = *coulomb_prefactor * q[p1] * q[p2]; + auto const pref = *coulomb_prefactor * q[p1] * q[p2]; if (pairs) { force[3 * (p1 + p2 * N - tStart)] = pref * sum_r / rxy * x; force[3 * (p1 + p2 * N - tStart) + 1] = pref * sum_r / rxy * y; @@ -486,12 +475,12 @@ __global__ void energiesKernel(const float *__restrict__ r, for (int tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { - int p1 = tid % N, p2 = tid / N; - float z = r[3 * p2 + 2] - r[3 * p1 + 2]; - float rxy2 = + auto const p1 = tid % N, p2 = tid / N; + auto z = r[3 * p2 + 2] - r[3 * p1 + 2]; + auto const rxy2 = sqpow(r[3 * p2] - r[3 * p1]) + sqpow(r[3 * p2 + 1] - r[3 * p1 + 1]); - float rxy = sqrt(rxy2); - float sum_e = 0; + auto rxy = sqrt(rxy2); + auto sum_e = 0.f; // if (*boxz <= 0.0) return; // in case we are not initialized yet @@ -502,13 +491,13 @@ __global__ void energiesKernel(const float *__restrict__ r, { } else if (rxy2 <= *far_switch_radius_2) // near formula { - float uzz = *uz * z; - float uzr2 = sqpow(*uz * rxy); - float uzrpow = uzr2; + auto const uzz = *uz * z; + auto const uzr2 = sqpow(*uz * rxy); + auto uzrpow = uzr2; sum_e = dev_mod_psi_even(0, uzz); for (int n = 1; n < *device_n_modPsi; n++) { - float sum_e_old = sum_e; - float mpe = dev_mod_psi_even(n, uzz); + auto const sum_e_old = sum_e; + auto const mpe = dev_mod_psi_even(n, uzz); sum_e += mpe * uzrpow; uzrpow *= uzr2; @@ -525,7 +514,7 @@ __global__ void energiesKernel(const float *__restrict__ r, { sum_e = -(log(rxy * *uz / 2) + c_gammaf) / 2; for (int p = 1; p < *bessel_cutoff; p++) { - float arg = c_2pif * *uz * static_cast(p); + auto const arg = c_2pif * *uz * static_cast(p); sum_e += dev_K0(arg * rxy) * cos(arg * z); } sum_e *= *uz * 4; @@ -549,7 +538,7 @@ __global__ void 
vectorReductionKernel(float const *src, float *dst, int N, for (auto tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x); tid < N; tid += static_cast(blockDim.x * gridDim.x)) { - int offset = ((tid + (tStart % N)) % N); + auto const offset = (tid + (tStart % N)) % N; for (int i = 0; tid + i * N < (tStop - tStart); i++) { #pragma unroll 3 @@ -604,7 +593,7 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { if (pairs < 0) { throw std::runtime_error("MMM1D was not initialized correctly"); } - auto shared = static_cast(numThreads * sizeof(float)); + auto const shared = numThreads * static_cast(sizeof(float)); KERNELCALL_shared(energiesKernel, numBlocks(s), numThreads, shared, s.rGpuBegin(), s.qGpuBegin(), dev_energyBlocks, diff --git a/src/core/actor/specfunc_cuda.hpp b/src/core/actor/specfunc_cuda.hpp index 1664d4e48e2..3ff410e165b 100644 --- a/src/core/actor/specfunc_cuda.hpp +++ b/src/core/actor/specfunc_cuda.hpp @@ -132,11 +132,11 @@ __constant__ static int bi1_size = 11; /**@}*/ __device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) { - float x2 = 2 * x; - float dd = c[n - 1]; - float d = x2 * dd + c[n - 2]; + auto const x2 = 2 * x; + auto dd = c[n - 1]; + auto d = x2 * dd + c[n - 2]; for (int j = n - 3; j >= 1; j--) { - float tmp = d; + auto const tmp = d; d = x2 * d - dd + c[j]; dd = tmp; } @@ -145,7 +145,7 @@ __device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) { __device__ float evaluateAsTaylorSeriesAt(float const *c, int n, float x) { int cnt = n - 1; - float r = c[cnt]; + auto r = c[cnt]; while (--cnt >= 0) r = r * x + c[cnt]; return r; From 268f52872beeb8780aec86535be9d78c38a598d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 23 Jul 2021 13:26:19 +0200 Subject: [PATCH 4/6] core: Remove unused argument in CUDA kernels --- src/core/actor/Mmm1dgpuForce_cuda.cu | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index bd2c2e519f3..46a5b0d3f2d 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -368,11 +368,10 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, __global__ void forcesKernel(const float *__restrict__ r, const float *__restrict__ q, float *__restrict__ force, int N, int pairs, - int tStart, int tStop) { - if (tStop < 0) - tStop = N * N; + int tStart) { auto const c_2pif = 2 * Utils::pi(); + auto const tStop = Utils::sqr(N); for (int tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; @@ -460,12 +459,11 @@ __global__ void forcesKernel(const float *__restrict__ r, __global__ void energiesKernel(const float *__restrict__ r, const float *__restrict__ q, float *__restrict__ energy, int N, int pairs, - int tStart, int tStop) { - if (tStop < 0) - tStop = N * N; + int tStart) { auto const c_2pif = 2 * Utils::pi(); auto const c_gammaf = Utils::gamma(); + auto const tStop = Utils::sqr(N); extern __shared__ float partialsums[]; if (!pairs) { @@ -532,9 +530,9 @@ __global__ void energiesKernel(const float *__restrict__ r, } __global__ void vectorReductionKernel(float const *src, float *dst, int N, - int tStart, int tStop) { - if (tStop < 0) - tStop = N * N; + int tStart) { + + auto const tStop = Utils::sqr(N); for (auto tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x); tid < N; tid += static_cast(blockDim.x * gridDim.x)) { @@ -565,12 +563,12 @@ void Mmm1dgpuForce::computeForces(SystemInterface 
&s) { { auto blocksRed = static_cast(s.npart_gpu() / numThreads) + 1; KERNELCALL(forcesKernel, numBlocks(s), numThreads, s.rGpuBegin(), - s.qGpuBegin(), dev_forcePairs, s.npart_gpu(), pairs, 0, -1) + s.qGpuBegin(), dev_forcePairs, s.npart_gpu(), pairs, 0) KERNELCALL(vectorReductionKernel, blocksRed, numThreads, dev_forcePairs, - s.fGpuBegin(), s.npart_gpu(), 0, -1) + s.fGpuBegin(), s.npart_gpu(), 0) } else { KERNELCALL(forcesKernel, numBlocks(s), numThreads, s.rGpuBegin(), - s.qGpuBegin(), s.fGpuBegin(), s.npart_gpu(), pairs, 0, -1) + s.qGpuBegin(), s.fGpuBegin(), s.npart_gpu(), pairs, 0) } } @@ -597,7 +595,7 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { KERNELCALL_shared(energiesKernel, numBlocks(s), numThreads, shared, s.rGpuBegin(), s.qGpuBegin(), dev_energyBlocks, - s.npart_gpu(), 0, 0, -1); + s.npart_gpu(), 0, 0); KERNELCALL_shared(sumKernel, 1, numThreads, shared, dev_energyBlocks, numBlocks(s)); KERNELCALL(scaleAndAddKernel, 1, 1, &(((CUDA_energy *)s.eGpu())->coulomb), @@ -617,7 +615,7 @@ float Mmm1dgpuForce::force_benchmark(SystemInterface &s) { cuda_safe_mem(cudaEventCreate(&eventStop)); cuda_safe_mem(cudaEventRecord(eventStart, stream[0])); KERNELCALL(forcesKernel, numBlocks(s), numThreads, s.rGpuBegin(), - s.qGpuBegin(), dev_f_benchmark, s.npart_gpu(), 0, 0, -1) + s.qGpuBegin(), dev_f_benchmark, s.npart_gpu(), 0, 0) cuda_safe_mem(cudaEventRecord(eventStop, stream[0])); cuda_safe_mem(cudaEventSynchronize(eventStop)); cuda_safe_mem(cudaEventElapsedTime(&elapsedTime, eventStart, eventStop)); From c38907abd0b01df0fa8e4c9d17cb02a5a2c33030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 23 Jul 2021 14:20:21 +0200 Subject: [PATCH 5/6] core: Fix -Wconversion warnings in MMM1D code --- src/core/EspressoSystemInterface.hpp | 18 ++- src/core/EspressoSystemInterface_cuda.cu | 35 +++-- src/core/SystemInterface.hpp | 5 +- src/core/actor/DipolarBarnesHut.hpp | 3 +- src/core/actor/DipolarDirectSum.hpp | 2 +- src/core/actor/Mmm1dgpuForce.hpp | 2 +- src/core/actor/Mmm1dgpuForce_cuda.cu | 133 +++++++++--------- src/core/actor/specfunc_cuda.hpp | 34 ++--- .../electrostatics_magnetostatics/mmm1d.cpp | 24 ++-- .../p3m-common.hpp | 2 +- src/python/espressomd/electrostatics.pxd | 2 +- 11 files changed, 138 insertions(+), 122 deletions(-) diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp index f7f25b59464..fa01a3aa067 100644 --- a/src/core/EspressoSystemInterface.hpp +++ b/src/core/EspressoSystemInterface.hpp @@ -22,6 +22,8 @@ #include "SystemInterface.hpp" #include "cuda_interface.hpp" +#include + /* Syntactic sugar */ #define espressoSystemInterface EspressoSystemInterface::Instance() @@ -117,12 +119,16 @@ class EspressoSystemInterface : public SystemInterface { float *fGpuEnd() override { return gpu_get_particle_force_pointer() + 3 * m_gpu_npart; }; - float *eGpu() override { return (float *)gpu_get_energy_pointer(); }; + float *eGpu() override { + // cast struct of floats to array of floats + // https://stackoverflow.com/a/29278260 + return reinterpret_cast(gpu_get_energy_pointer()); + }; float *torqueGpuBegin() override { - return (float *)gpu_get_particle_torque_pointer(); + return gpu_get_particle_torque_pointer(); }; float *torqueGpuEnd() override { - return (float *)(gpu_get_particle_torque_pointer()) + 3 * m_gpu_npart; + return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart; }; bool hasFGpu() override { return true; }; bool requestFGpu() override { @@ -148,7 +154,7 @@ class EspressoSystemInterface : public 
SystemInterface { Utils::Vector3d box() const override; - unsigned int npart_gpu() const override { + std::size_t npart_gpu() const override { #ifdef CUDA return gpu_get_particle_pointer().size(); #else @@ -177,10 +183,10 @@ class EspressoSystemInterface : public SystemInterface { reallocDeviceMemory(gpu_get_particle_pointer().size()); } }; - void reallocDeviceMemory(int n); + void reallocDeviceMemory(std::size_t n); #endif - int m_gpu_npart; + std::size_t m_gpu_npart; bool m_gpu; float *m_r_gpu_begin; diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu index 81948ad920b..2904f2994fb 100644 --- a/src/core/EspressoSystemInterface_cuda.cu +++ b/src/core/EspressoSystemInterface_cuda.cu @@ -22,6 +22,8 @@ #include "cuda_utils.cuh" #include "errorhandling.hpp" +#include + #include #if defined(OMPI_MPI_H) || defined(_MPI_H) @@ -33,8 +35,8 @@ // Position and charge __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r, - float *q, int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + float *q, unsigned int n) { + auto const idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -49,8 +51,9 @@ __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r, } // Charge only -__global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); +__global__ void split_kernel_q(CUDA_particle_data *particles, float *q, + unsigned int n) { + auto const idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -62,8 +65,9 @@ __global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) { } // Position only -__global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); +__global__ void split_kernel_r(CUDA_particle_data *particles, float *r, + unsigned int n) { + auto idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -78,8 +82,9 @@ __global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) { #ifdef CUDA // Velocity -__global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); +__global__ void split_kernel_v(CUDA_particle_data *particles, float *v, + unsigned int n) { + auto idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -96,8 +101,8 @@ __global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) { #ifdef DIPOLES // Dipole moment __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip, - int n) { - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + unsigned int n) { + auto idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -112,9 +117,9 @@ __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip, #endif __global__ void split_kernel_director(CUDA_particle_data *particles, - float *director, int n) { + float *director, unsigned int n) { #ifdef ROTATION - auto idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + auto idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; @@ -128,7 +133,7 @@ __global__ void split_kernel_director(CUDA_particle_data *particles, #endif } -void EspressoSystemInterface::reallocDeviceMemory(int n) { +void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { if (m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == nullptr))) { if 
(m_r_gpu_begin != nullptr) cuda_safe_mem(cudaFree(m_r_gpu_begin)); @@ -169,8 +174,8 @@ void EspressoSystemInterface::reallocDeviceMemory(int n) { } void EspressoSystemInterface::split_particle_struct() { - auto device_particles = gpu_get_particle_pointer(); - int n = device_particles.size(); + auto const device_particles = gpu_get_particle_pointer(); + auto const n = static_cast(device_particles.size()); if (n == 0) return; diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp index d09014a25ea..5f78a98581b 100644 --- a/src/core/SystemInterface.hpp +++ b/src/core/SystemInterface.hpp @@ -20,8 +20,11 @@ #define SYSTEMINTERFACE_H #include "config.hpp" + #include +#include + /** @todo: Turn needsXY in getter/setter **/ class SystemInterface { @@ -93,7 +96,7 @@ class SystemInterface { return m_needsDirectorGpu; } - virtual unsigned int npart_gpu() const { return 0; }; + virtual std::size_t npart_gpu() const { return 0; }; virtual Vector3 box() const = 0; virtual bool needsRGpu() { return m_needsRGpu; }; diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index f4019c34d29..8631e8c9eb6 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -82,7 +82,8 @@ class DipolarBarnesHut : public Actor { buildTreeBH(m_bh_data.blocks); summarizeBH(m_bh_data.blocks); sortBH(m_bh_data.blocks); - if (energyBH(&m_bh_data, m_k, (&(((CUDA_energy *)s.eGpu())->dipolar)))) { + if (energyBH(&m_bh_data, m_k, + &(reinterpret_cast(s.eGpu())->dipolar))) { runtimeErrorMsg() << "kernels encountered a functional error"; } }; diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 809bec5b828..0e9dd8d505b 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -74,7 +74,7 @@ class DipolarDirectSum : public Actor { } DipolarDirectSum_kernel_wrapper_energy( k, static_cast(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), box, - per, (&(((CUDA_energy *)s.eGpu())->dipolar))); + per, &(reinterpret_cast(s.eGpu())->dipolar)); }; private: diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp index 4941839a823..a767a38cad5 100644 --- a/src/core/actor/Mmm1dgpuForce.hpp +++ b/src/core/actor/Mmm1dgpuForce.hpp @@ -55,7 +55,7 @@ class Mmm1dgpuForce : public Actor { float host_boxz; // the number of particles we had during the last run. 
Needed to check if we // have to realloc dev_forcePairs - int host_npart; + unsigned int host_npart; bool need_tune; // pairs==0: return forces using atomicAdd diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index 46a5b0d3f2d..e4c389981e6 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -66,40 +66,40 @@ constexpr int modpsi_constant_size = modpsi_order * modpsi_order * 2; // linearized array on device __constant__ int device_n_modPsi[1] = {0}; -__constant__ int device_linModPsi_offsets[2 * modpsi_order], - device_linModPsi_lengths[2 * modpsi_order]; +__constant__ unsigned int device_linModPsi_offsets[2 * modpsi_order]; +__constant__ unsigned int device_linModPsi_lengths[2 * modpsi_order]; __constant__ float device_linModPsi[modpsi_constant_size]; __device__ float dev_mod_psi_even(int n, float x) { return evaluateAsTaylorSeriesAt( &device_linModPsi[device_linModPsi_offsets[2 * n]], - device_linModPsi_lengths[2 * n], x * x); + static_cast(device_linModPsi_lengths[2 * n]), x * x); } __device__ float dev_mod_psi_odd(int n, float x) { return x * evaluateAsTaylorSeriesAt( &device_linModPsi[device_linModPsi_offsets[2 * n + 1]], - device_linModPsi_lengths[2 * n + 1], x * x); + static_cast(device_linModPsi_lengths[2 * n + 1]), x * x); } int modpsi_init() { create_mod_psi_up_to(modpsi_order); // linearized array on host - std::vector linModPsi_offsets(modPsi.size()); - std::vector linModPsi_lengths(modPsi.size()); - for (size_t i = 0; i < modPsi.size(); i++) { + std::vector linModPsi_offsets(modPsi.size()); + std::vector linModPsi_lengths(modPsi.size()); + for (std::size_t i = 0; i < modPsi.size(); i++) { if (i) linModPsi_offsets[i] = linModPsi_offsets[i - 1] + linModPsi_lengths[i - 1]; - linModPsi_lengths[i] = modPsi[i].size(); + linModPsi_lengths[i] = static_cast(modPsi[i].size()); } // linearize the coefficients array std::vector linModPsi(linModPsi_offsets[modPsi.size() - 1] + linModPsi_lengths[modPsi.size() - 1]); - for (size_t i = 0; i < modPsi.size(); i++) { - for (size_t j = 0; j < modPsi[i].size(); j++) { + for (std::size_t i = 0; i < modPsi.size(); i++) { + for (std::size_t j = 0; j < modPsi[i].size(); j++) { linModPsi[linModPsi_offsets[i] + j] = static_cast(modPsi[i][j]); } } @@ -110,7 +110,7 @@ int modpsi_init() { // copy to GPU auto const linModPsiSize = linModPsi_offsets[modPsi.size() - 1] + linModPsi_lengths[modPsi.size() - 1]; - if (linModPsiSize > modpsi_constant_size) { + if (linModPsiSize > static_cast(modpsi_constant_size)) { throw std::runtime_error( "__constant__ device_linModPsi[] is not large enough"); } @@ -153,18 +153,18 @@ Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, float _coulomb_prefactor, } void Mmm1dgpuForce::setup(SystemInterface &s) { - if (s.box()[2] <= 0) { + auto const box_z = static_cast(s.box()[2]); + if (box_z <= 0) { throw std::runtime_error( "Please set box length before initializing MMM1D!"); } if (need_tune && s.npart_gpu() > 0) { - set_params(static_cast(s.box()[2]), - static_cast(coulomb.prefactor), maxPWerror, + set_params(box_z, static_cast(coulomb.prefactor), maxPWerror, far_switch_radius, bessel_cutoff); tune(s, maxPWerror, far_switch_radius, bessel_cutoff); } - if (s.box()[2] != host_boxz) { - set_params(static_cast(s.box()[2]), 0, -1, -1, -1); + if (box_z != host_boxz) { + set_params(box_z, 0, -1, -1, -1); } if (s.npart_gpu() == host_npart) { // unchanged return; @@ -178,7 +178,7 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { for (int d = 0; d < 
deviceCount; d++) { cudaSetDevice(d); - size_t freeMem, totalMem; + std::size_t freeMem, totalMem; cudaMemGetInfo(&freeMem, &totalMem); if (freeMem / 2 < part_mem_size) { // don't use more than half the device's memory @@ -198,11 +198,12 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { cudaFree(dev_energyBlocks); cuda_safe_mem( cudaMalloc((void **)&dev_energyBlocks, numBlocks(s) * sizeof(float))); - host_npart = static_cast(s.npart_gpu()); + host_npart = static_cast(s.npart_gpu()); } unsigned int Mmm1dgpuForce::numBlocks(SystemInterface const &s) const { - auto b = static_cast(s.npart_gpu() * s.npart_gpu() / numThreads) + 1; + auto b = 1 + static_cast(Utils::sqr(s.npart_gpu()) / + static_cast(numThreads)); if (b > 65535) b = 65535; return b; @@ -214,8 +215,8 @@ __forceinline__ __device__ float sqpow(float x) { return x * x; } __forceinline__ __device__ float cbpow(float x) { return x * x * x; } __device__ void sumReduction(float *input, float *sum) { - auto tid = static_cast(threadIdx.x); - for (auto i = static_cast(blockDim.x) / 2; i > 0; i /= 2) { + auto const tid = threadIdx.x; + for (auto i = blockDim.x / 2; i > 0; i /= 2) { __syncthreads(); if (tid < i) input[tid] += input[i + tid]; @@ -225,23 +226,23 @@ __device__ void sumReduction(float *input, float *sum) { sum[0] = input[0]; } -__global__ void sumKernel(float *data, int N) { +__global__ void sumKernel(float *data, std::size_t N) { extern __shared__ float partialsums[]; if (blockIdx.x != 0) return; - auto const tid = static_cast(threadIdx.x); + std::size_t const tid = threadIdx.x; auto result = 0.f; - for (int i = 0; i < N; i += static_cast(blockDim.x)) { + for (std::size_t i = 0; i < N; i += blockDim.x) { if (i + tid >= N) - partialsums[tid] = 0; + partialsums[tid] = 0.f; else partialsums[tid] = data[i + tid]; sumReduction(partialsums, &result); if (tid == 0) { if (i == 0) - data[0] = 0; + data[0] = 0.f; data[0] += result; } } @@ -249,9 +250,9 @@ __global__ void sumKernel(float *data, int N) { __global__ void besselTuneKernel(int *result, float far_switch_radius, int maxCut) { - const float c_2pif = 2 * Utils::pi(); - float arg = c_2pif * *uz * far_switch_radius; - float pref = 4 * *uz * max(1.0f, c_2pif * *uz); + constexpr auto c_2pif = 2 * Utils::pi(); + auto const arg = c_2pif * *uz * far_switch_radius; + auto const pref = 4 * *uz * max(1.0f, c_2pif * *uz); float err; int P = 1; do { @@ -367,17 +368,16 @@ void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, __global__ void forcesKernel(const float *__restrict__ r, const float *__restrict__ q, - float *__restrict__ force, int N, int pairs, - int tStart) { + float *__restrict__ force, std::size_t N, + int pairs, std::size_t tStart) { - auto const c_2pif = 2 * Utils::pi(); + constexpr auto c_2pif = 2 * Utils::pi(); auto const tStop = Utils::sqr(N); - for (int tid = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; - tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { + for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x + tStart; + tid < tStop; tid += blockDim.x * gridDim.x) { auto const p1 = tid % N, p2 = tid / N; - auto x = r[3 * p2] - r[3 * p1]; + auto x = r[3 * p2 + 0] - r[3 * p1 + 0]; auto y = r[3 * p2 + 1] - r[3 * p1 + 1]; auto z = r[3 * p2 + 2] - r[3 * p1 + 2]; auto const rxy2 = sqpow(x) + sqpow(y); @@ -392,8 +392,8 @@ __global__ void forcesKernel(const float *__restrict__ r, if (p1 == p2) // particle exerts no force on itself { - rxy = 1; // so the division at the end doesn't fail with NaN (sum_r is 0 - // anyway) + rxy = 
1.f; // so the division at the end doesn't fail with NaN + // (sum_r is 0 anyway) } else if (rxy2 <= *far_switch_radius_2) // near formula { auto const uzz = *uz * z; @@ -428,8 +428,8 @@ __global__ void forcesKernel(const float *__restrict__ r, if (rxy == 0) // particles at the same radial position only exert a force // in z direction { - rxy = 1; // so the division at the end doesn't fail with NaN (sum_r is 0 - // anyway) + rxy = 1.f; // so the division at the end doesn't fail with NaN + // (sum_r is 0 anyway) } } else // far formula { @@ -445,11 +445,11 @@ __global__ void forcesKernel(const float *__restrict__ r, auto const pref = *coulomb_prefactor * q[p1] * q[p2]; if (pairs) { - force[3 * (p1 + p2 * N - tStart)] = pref * sum_r / rxy * x; + force[3 * (p1 + p2 * N - tStart) + 0] = pref * sum_r / rxy * x; force[3 * (p1 + p2 * N - tStart) + 1] = pref * sum_r / rxy * y; force[3 * (p1 + p2 * N - tStart) + 2] = pref * sum_z; } else { - atomicAdd(&force[3 * p2], pref * sum_r / rxy * x); + atomicAdd(&force[3 * p2 + 0], pref * sum_r / rxy * x); atomicAdd(&force[3 * p2 + 1], pref * sum_r / rxy * y); atomicAdd(&force[3 * p2 + 2], pref * sum_z); } @@ -458,11 +458,11 @@ __global__ void forcesKernel(const float *__restrict__ r, __global__ void energiesKernel(const float *__restrict__ r, const float *__restrict__ q, - float *__restrict__ energy, int N, int pairs, - int tStart) { + float *__restrict__ energy, std::size_t N, + int pairs, std::size_t tStart) { - auto const c_2pif = 2 * Utils::pi(); - auto const c_gammaf = Utils::gamma(); + constexpr auto c_2pif = 2 * Utils::pi(); + constexpr auto c_gammaf = Utils::gamma(); auto const tStop = Utils::sqr(N); extern __shared__ float partialsums[]; @@ -470,13 +470,12 @@ __global__ void energiesKernel(const float *__restrict__ r, partialsums[threadIdx.x] = 0; __syncthreads(); } - for (int tid = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; - tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { + for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x + tStart; + tid < tStop; tid += blockDim.x * gridDim.x) { auto const p1 = tid % N, p2 = tid / N; auto z = r[3 * p2 + 2] - r[3 * p1 + 2]; - auto const rxy2 = - sqpow(r[3 * p2] - r[3 * p1]) + sqpow(r[3 * p2 + 1] - r[3 * p1 + 1]); + auto const rxy2 = sqpow(r[3 * p2 + 0] - r[3 * p1 + 0]) + + sqpow(r[3 * p2 + 1] - r[3 * p1 + 1]); auto rxy = sqrt(rxy2); auto sum_e = 0.f; @@ -529,18 +528,17 @@ __global__ void energiesKernel(const float *__restrict__ r, } } -__global__ void vectorReductionKernel(float const *src, float *dst, int N, - int tStart) { +__global__ void vectorReductionKernel(float const *src, float *dst, + std::size_t N, std::size_t tStart) { auto const tStop = Utils::sqr(N); - for (auto tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - tid < N; tid += static_cast(blockDim.x * gridDim.x)) { + for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; + tid += blockDim.x * gridDim.x) { auto const offset = (tid + (tStart % N)) % N; - - for (int i = 0; tid + i * N < (tStop - tStart); i++) { + for (std::size_t i = 0; tid + i * N < (tStop - tStart); i++) { #pragma unroll 3 - for (int d = 0; d < 3; d++) { + for (std::size_t d = 0; d < 3; d++) { dst[3 * offset + d] -= src[3 * (tid + i * N) + d]; } } @@ -561,7 +559,9 @@ void Mmm1dgpuForce::computeForces(SystemInterface &s) { if (pairs) // if we calculate force pairs, we need to reduce them to forces { - auto blocksRed = static_cast(s.npart_gpu() / numThreads) + 1; + auto const blocksRed = + 1 + 
static_cast(s.npart_gpu() / + static_cast(numThreads)); KERNELCALL(forcesKernel, numBlocks(s), numThreads, s.rGpuBegin(), s.qGpuBegin(), dev_forcePairs, s.npart_gpu(), pairs, 0) KERNELCALL(vectorReductionKernel, blocksRed, numThreads, dev_forcePairs, @@ -572,10 +572,10 @@ void Mmm1dgpuForce::computeForces(SystemInterface &s) { } } -__global__ void scaleAndAddKernel(float *dst, float const *src, int N, +__global__ void scaleAndAddKernel(float *dst, float const *src, std::size_t N, float factor) { - for (auto tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - tid < N; tid += static_cast(blockDim.x * gridDim.x)) { + for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; + tid += blockDim.x * gridDim.x) { dst[tid] += src[tid] * factor; } } @@ -591,17 +591,18 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { if (pairs < 0) { throw std::runtime_error("MMM1D was not initialized correctly"); } - auto const shared = numThreads * static_cast(sizeof(float)); + auto const shared = numThreads * static_cast(sizeof(float)); KERNELCALL_shared(energiesKernel, numBlocks(s), numThreads, shared, s.rGpuBegin(), s.qGpuBegin(), dev_energyBlocks, s.npart_gpu(), 0, 0); KERNELCALL_shared(sumKernel, 1, numThreads, shared, dev_energyBlocks, numBlocks(s)); - KERNELCALL(scaleAndAddKernel, 1, 1, &(((CUDA_energy *)s.eGpu())->coulomb), + KERNELCALL(scaleAndAddKernel, 1, 1, + &(reinterpret_cast(s.eGpu())->coulomb), &dev_energyBlocks[0], 1, - 0.5); // we have counted every interaction twice, so halve the - // total energy + 0.5f); // we have counted every interaction twice, so halve the + // total energy } float Mmm1dgpuForce::force_benchmark(SystemInterface &s) { diff --git a/src/core/actor/specfunc_cuda.hpp b/src/core/actor/specfunc_cuda.hpp index 3ff410e165b..ca5770cdb1a 100644 --- a/src/core/actor/specfunc_cuda.hpp +++ b/src/core/actor/specfunc_cuda.hpp @@ -132,7 +132,7 @@ __constant__ static int bi1_size = 11; /**@}*/ __device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) { - auto const x2 = 2 * x; + auto const x2 = 2.0f * x; auto dd = c[n - 1]; auto d = x2 * dd + c[n - 2]; for (int j = n - 3; j >= 1; j--) { @@ -140,7 +140,7 @@ __device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) { d = x2 * d - dd + c[j]; dd = tmp; } - return x * d - dd + c[0] / 2; + return x * d - dd + c[0] / 2.0f; } __device__ float evaluateAsTaylorSeriesAt(float const *c, int n, float x) { @@ -152,13 +152,13 @@ __device__ float evaluateAsTaylorSeriesAt(float const *c, int n, float x) { } __device__ float dev_K0(float x) { - float c = evaluateAsChebychevSeriesAt( - x <= 2 ? bk0_data : x <= 8 ? ak0_data : ak02_data, - x <= 2 ? bk0_size : x <= 8 ? ak0_size : ak02_size, - x <= 2 ? x * x / 2 - 1.0f - : x <= 8 ? (16 / x - 5.0f) / 3.0f : (16 / x - 1.0f)); - if (x <= 2) { - float I0 = + auto const c = evaluateAsChebychevSeriesAt( + x <= 2.0f ? bk0_data : x <= 8.0f ? ak0_data : ak02_data, + x <= 2.0f ? bk0_size : x <= 8.0f ? ak0_size : ak02_size, + x <= 2.0f ? x * x / 2.0f - 1.0f + : x <= 8.0f ? (16.0f / x - 5.0f) / 3.0f : (16.0f / x - 1.0f)); + if (x <= 2.0f) { + auto const I0 = evaluateAsChebychevSeriesAt(bi0_data, bi0_size, x * x / 4.5f - 1.0f); return (-log(x) + Utils::ln_2()) * I0 + c; } @@ -166,14 +166,14 @@ __device__ float dev_K0(float x) { } __device__ float dev_K1(float x) { - float c = evaluateAsChebychevSeriesAt( - x <= 2 ? bk1_data : x <= 8 ? ak1_data : ak12_data, - x <= 2 ? bk1_size : x <= 8 ? ak1_size : ak12_size, - x <= 2 ? x * x / 2 - 1.0f - : x <= 8 ? 
(16 / x - 5.0f) / 3.0f : (16 / x - 1.0f)); - if (x <= 2) { - float I1 = x * evaluateAsChebychevSeriesAt(bi1_data, bi1_size, - x * x / 4.5f - 1.0f); + auto const c = evaluateAsChebychevSeriesAt( + x <= 2.0f ? bk1_data : x <= 8.0f ? ak1_data : ak12_data, + x <= 2.0f ? bk1_size : x <= 8.0f ? ak1_size : ak12_size, + x <= 2.0f ? x * x / 2.0f - 1.0f + : x <= 8.0f ? (16.0f / x - 5.0f) / 3.0f : (16.0f / x - 1.0f)); + if (x <= 2.0f) { + auto const I1 = x * evaluateAsChebychevSeriesAt(bi1_data, bi1_size, + x * x / 4.5f - 1.0f); return (log(x) - Utils::ln_2()) * I1 + c / x; } return exp(-x) * c * rsqrt(x); diff --git a/src/core/electrostatics_magnetostatics/mmm1d.cpp b/src/core/electrostatics_magnetostatics/mmm1d.cpp index d4eadc66121..db6a3e5798d 100644 --- a/src/core/electrostatics_magnetostatics/mmm1d.cpp +++ b/src/core/electrostatics_magnetostatics/mmm1d.cpp @@ -77,10 +77,10 @@ MMM1D_struct mmm1d_params = {0.05, 1e-5, 0}; static std::array bessel_radii; static double far_error(int P, double minrad) { + auto const wavenumber = 2 * Utils::pi() * box_geo.length_inv()[2]; // this uses an upper bound to all force components and the potential - auto const rhores = 2 * Utils::pi() * box_geo.length_inv()[2] * minrad; - auto const pref = 4 * box_geo.length_inv()[2] * - std::max(1.0, 2 * Utils::pi() * box_geo.length_inv()[2]); + auto const rhores = wavenumber * minrad; + auto const pref = 4 * box_geo.length_inv()[2] * std::max(1.0, wavenumber); return pref * K1(rhores * P) * exp(rhores) / rhores * (P - 1 + 1 / rhores); } @@ -177,7 +177,7 @@ int MMM1D_init() { void add_mmm1d_coulomb_pair_force(double chpref, Utils::Vector3d const &d, double r, Utils::Vector3d &force) { - constexpr double c_2pi = 2 * Utils::pi(); + constexpr auto c_2pi = 2 * Utils::pi(); auto const n_modPsi = static_cast(modPsi.size() >> 1); auto const rxy2 = d[0] * d[0] + d[1] * d[1]; auto const rxy2_d = rxy2 * uz2; @@ -186,9 +186,9 @@ void add_mmm1d_coulomb_pair_force(double chpref, Utils::Vector3d const &d, if (rxy2 <= mmm1d_params.far_switch_radius_2) { /* polygamma summation */ - double sr = 0; - double sz = mod_psi_odd(0, z_d); - double r2nm1 = 1.0; + auto sr = 0.; + auto sz = mod_psi_odd(0, z_d); + auto r2nm1 = 1.0; for (int n = 1; n < n_modPsi; n++) { auto const deriv = static_cast(2 * n); auto const mpe = mod_psi_even(n, z_d); @@ -238,7 +238,7 @@ void add_mmm1d_coulomb_pair_force(double chpref, Utils::Vector3d const &d, /* far range formula */ auto const rxy = sqrt(rxy2); auto const rxy_d = rxy * box_geo.length_inv()[2]; - double sr = 0, sz = 0; + auto sr = 0., sz = 0.; for (int bp = 1; bp < MAXIMAL_B_CUT; bp++) { if (bessel_radii[bp - 1] < rxy) @@ -271,7 +271,7 @@ double mmm1d_coulomb_pair_energy(double const chpref, Utils::Vector3d const &d, if (chpref == 0) return 0; - constexpr double c_2pi = 2 * Utils::pi(); + constexpr auto c_2pi = 2 * Utils::pi(); auto const n_modPsi = static_cast(modPsi.size() >> 1); auto const rxy2 = d[0] * d[0] + d[1] * d[1]; auto const rxy2_d = rxy2 * uz2; @@ -299,15 +299,15 @@ double mmm1d_coulomb_pair_energy(double const chpref, Utils::Vector3d const &d, double rt, shift_z; - E += 1 / r; + E += 1. / r; shift_z = d[2] + box_geo.length()[2]; rt = sqrt(rxy2 + shift_z * shift_z); - E += 1 / rt; + E += 1. / rt; shift_z = d[2] - box_geo.length()[2]; rt = sqrt(rxy2 + shift_z * shift_z); - E += 1 / rt; + E += 1. 
/ rt; } else { /* far range formula */ auto const rxy = sqrt(rxy2); diff --git a/src/core/electrostatics_magnetostatics/p3m-common.hpp b/src/core/electrostatics_magnetostatics/p3m-common.hpp index 26e9fbaf3e5..e8fe39d78cc 100644 --- a/src/core/electrostatics_magnetostatics/p3m-common.hpp +++ b/src/core/electrostatics_magnetostatics/p3m-common.hpp @@ -210,7 +210,7 @@ std::array, 3> inline calc_meshift( std::array const &mesh_size, bool zero_out_midpoint = false) { std::array, 3> ret{}; - for (size_t i = 0; i < 3; i++) { + for (std::size_t i = 0; i < 3; i++) { ret[i] = std::vector(mesh_size[i]); for (int j = 1; j <= mesh_size[i] / 2; j++) { diff --git a/src/python/espressomd/electrostatics.pxd b/src/python/espressomd/electrostatics.pxd index 3acec46b13d..88f26489159 100644 --- a/src/python/espressomd/electrostatics.pxd +++ b/src/python/espressomd/electrostatics.pxd @@ -151,7 +151,7 @@ IF ELECTROSTATICS and MMM1D_GPU: unsigned int numBlocks(SystemInterface & s) float host_boxz - int host_npart + unsigned int host_npart bool need_tune int pairs From 66e54eaeff8210edcf758820e1c527834f93aa27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 23 Jul 2021 14:30:34 +0200 Subject: [PATCH 6/6] core: Update exception mechanism There can only be one MMM actor in an ESPResSo system, so coulomb.method checks can be converted to assertions. system.box_l cannot be <= 0. Remove the iostream global. --- src/core/actor/Mmm1dgpuForce_cuda.cu | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index e4c389981e6..5a4c037a5dc 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -38,8 +38,11 @@ #include +#include #include -#include +#include +#include +#include #if defined(OMPI_MPI_H) || defined(_MPI_H) #error CU-file includes mpi.h! This should not happen! @@ -154,10 +157,6 @@ Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, float _coulomb_prefactor, void Mmm1dgpuForce::setup(SystemInterface &s) { auto const box_z = static_cast(s.box()[2]); - if (box_z <= 0) { - throw std::runtime_error( - "Please set box length before initializing MMM1D!"); - } if (need_tune && s.npart_gpu() > 0) { set_params(box_z, static_cast(coulomb.prefactor), maxPWerror, far_switch_radius, bessel_cutoff); @@ -182,8 +181,7 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { cudaMemGetInfo(&freeMem, &totalMem); if (freeMem / 2 < part_mem_size) { // don't use more than half the device's memory - std::cerr << "Switching to atomicAdd due to memory constraints." - << std::endl; + fprintf(stderr, "Switching to atomicAdd due to memory constraints.\n"); pairs = 0; break; } @@ -546,11 +544,7 @@ __global__ void vectorReductionKernel(float const *src, float *dst, } void Mmm1dgpuForce::computeForces(SystemInterface &s) { - if (coulomb.method != COULOMB_MMM1D_GPU) { - std::cerr << "MMM1D: coulomb.method has been changed, skipping calculation" - << std::endl; - return; - } + assert(coulomb.method == COULOMB_MMM1D_GPU); setup(s); if (pairs < 0) { @@ -581,11 +575,7 @@ __global__ void scaleAndAddKernel(float *dst, float const *src, std::size_t N, } void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { - if (coulomb.method != COULOMB_MMM1D_GPU) { - std::cerr << "MMM1D: coulomb.method has been changed, skipping calculation" - << std::endl; - return; - } + assert(coulomb.method == COULOMB_MMM1D_GPU); setup(s); if (pairs < 0) {
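
A closing remark on the series evaluators touched throughout this patch set:
LPK0() and LPK1() on the CPU and evaluateAsChebychevSeriesAt() on the GPU all
rely on the same backward (Clenshaw) recurrence for Chebyshev expansions.
A self-contained sketch of the pattern (illustrative only: the function name
and the n >= 3 precondition are assumptions, and a real coefficient table
such as ak0_cs must be supplied by the caller):

    #include <cassert>

    // Evaluates c[0] / 2 + sum_{j=1}^{n-1} c[j] * T_j(x) via the backward
    // (Clenshaw) recurrence b_j = 2 * x * b_{j+1} - b_{j+2} + c[j], the same
    // loop structure as in specfunc.cpp and specfunc_cuda.hpp.
    double chebychev_eval(double const *c, int n, double x) {
      assert(n >= 3);
      auto const x2 = 2. * x;
      auto dd = c[n - 1];
      auto d = x2 * dd + c[n - 2];
      for (int j = n - 3; j >= 1; j--) {
        auto const tmp = d;
        d = x2 * d - dd + c[j];
        dd = tmp;
      }
      return x * d - dd + c[0] / 2.;
    }

The GPU variant performs the same loop in single precision; patch 5 makes the
literals explicitly float (2.0f * x, c[0] / 2.0f) so that no intermediate is
silently promoted to double.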