Add new P3M tuning options (#5017)

Description of changes: - new feature: mesh tuning can now be constrained to yield a value between two user-specified mesh sizes - incompatible FFT domain decompositions are now discarded during CPU tuning to avoid undefined behavior
espressomd · Dec 12, 2024 · b200c23 · b200c23
2 parents 6dbf05f + 75f0812
commit b200c23
Show file tree

Hide file tree

Showing 22 changed files with 382 additions and 107 deletions.
diff --git a/doc/doxygen/Doxyfile.in b/doc/doxygen/Doxyfile.in
@@ -1511,7 +1511,7 @@ MATHJAX_RELPATH        = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/
 # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-MATHJAX_EXTENSIONS     =
+MATHJAX_EXTENSIONS     = TeX/AMSmath
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site

diff --git a/src/core/electrostatics/elc.cpp b/src/core/electrostatics/elc.cpp
@@ -368,10 +368,6 @@ ElectrostaticLayerCorrection::z_energy(ParticleRange const &particles) const {
   auto const &box_geo = *get_system().box_geo;
   auto const xy_area_inv = box_geo.length_inv()[0] * box_geo.length_inv()[1];
   auto const pref = prefactor * 2. * std::numbers::pi * xy_area_inv;
-  auto const delta = elc.delta_mid_top * elc.delta_mid_bot;
-  auto const fac_delta_mid_bot = elc.delta_mid_bot / (1. - delta);
-  auto const fac_delta_mid_top = elc.delta_mid_top / (1. - delta);
-  auto const fac_delta = delta / (1. - delta);
 
   /* for non-neutral systems, this shift gives the background contribution
    * (rsp. for this shift, the DM of the background is zero) */
@@ -397,6 +393,10 @@ ElectrostaticLayerCorrection::z_energy(ParticleRange const &particles) const {
       }
     } else {
       // metallic boundaries
+      auto const delta = elc.delta_mid_top * elc.delta_mid_bot;
+      auto const fac_delta_mid_bot = elc.delta_mid_bot / (1. - delta);
+      auto const fac_delta_mid_top = elc.delta_mid_top / (1. - delta);
+      auto const fac_delta = delta / (1. - delta);
       clear_vec(gblcblk, size);
       auto const h = elc.box_h;
       ImageSum const image_sum{delta, shift, lz};

diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp
@@ -66,6 +66,7 @@
 #include <utils/integral_parameter.hpp>
 #include <utils/math/int_pow.hpp>
 #include <utils/math/sqr.hpp>
+#include <utils/serialization/array.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
 #include <boost/mpi/collectives/broadcast.hpp>
@@ -126,7 +127,7 @@ void CoulombP3MImpl<FloatType, Architecture>::count_charged_particles() {
 template <typename FloatType, Arch Architecture>
 void CoulombP3MImpl<FloatType, Architecture>::calc_influence_function_force() {
   auto const [KX, KY, KZ] = p3m.fft->get_permutations();
-  p3m.g_force = grid_influence_function<FloatType, 1>(
+  p3m.g_force = grid_influence_function<FloatType, 1, P3M_BRILLOUIN>(
       p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, KY, KZ,
       get_system().box_geo->length_inv());
 }
@@ -137,7 +138,7 @@ void CoulombP3MImpl<FloatType, Architecture>::calc_influence_function_force() {
 template <typename FloatType, Arch Architecture>
 void CoulombP3MImpl<FloatType, Architecture>::calc_influence_function_energy() {
   auto const [KX, KY, KZ] = p3m.fft->get_permutations();
-  p3m.g_energy = grid_influence_function<FloatType, 0>(
+  p3m.g_energy = grid_influence_function<FloatType, 0, P3M_BRILLOUIN>(
       p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, KY, KZ,
       get_system().box_geo->length_inv());
 }
@@ -597,14 +598,17 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
   double m_mesh_density_min = -1., m_mesh_density_max = -1.;
   // indicates if mesh should be tuned
   bool m_tune_mesh = false;
+  std::pair<std::optional<int>, std::optional<int>> m_tune_limits;
 
 protected:
   P3MParameters &get_params() override { return p3m.params; }
 
 public:
   CoulombTuningAlgorithm(System::System &system, auto &input_p3m,
-                         double prefactor, int timings)
-      : TuningAlgorithm(system, prefactor, timings), p3m{input_p3m} {}
+                         double prefactor, int timings,
+                         decltype(m_tune_limits) tune_limits)
+      : TuningAlgorithm(system, prefactor, timings), p3m{input_p3m},
+        m_tune_limits{std::move(tune_limits)} {}
 
   void on_solver_change() const override { m_system.on_coulomb_change(); }
 
@@ -632,6 +636,38 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
     return {};
   }
 
+  std::optional<std::string> fft_decomposition_veto(
+      Utils::Vector3i const &mesh_size_r_space) const override {
+#ifdef CUDA
+    if constexpr (Architecture == Arch::GPU) {
+      return std::nullopt;
+    }
+#endif
+    using Array3i = Utils::Array<int, 3u>;
+    auto const [KX, KY, KZ] = p3m.fft->get_permutations();
+    auto valid_decomposition = false;
+    Array3i mesh_size_k_space = {};
+    boost::mpi::reduce(
+        ::comm_cart, Array3i(p3m.mesh.stop), mesh_size_k_space,
+        [](Array3i const &lhs, Array3i const &rhs) {
+          return Array3i{{std::max(lhs[0u], rhs[0u]),
+                          std::max(lhs[1u], rhs[1u]),
+                          std::max(lhs[2u], rhs[2u])}};
+        },
+        0);
+    if (::this_node == 0) {
+      valid_decomposition = (mesh_size_r_space[0u] == mesh_size_k_space[KX] and
+                             mesh_size_r_space[1u] == mesh_size_k_space[KY] and
+                             mesh_size_r_space[2u] == mesh_size_k_space[KZ]);
+    }
+    boost::mpi::broadcast(::comm_cart, valid_decomposition, 0);
+    std::optional<std::string> retval{"conflict with FFT domain decomposition"};
+    if (valid_decomposition) {
+      retval = std::nullopt;
+    }
+    return retval;
+  }
+
   std::tuple<double, double, double, double>
   calculate_accuracy(Utils::Vector3i const &mesh, int cao,
                      double r_cut_iL) const override {
@@ -689,6 +725,16 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
           max_npart_per_dim, std::cbrt(static_cast<double>(p3m.sum_qpart)));
       m_mesh_density_min = min_npart_per_dim / normalized_box_dim;
       m_mesh_density_max = max_npart_per_dim / normalized_box_dim;
+      if (m_tune_limits.first or m_tune_limits.second) {
+        auto const &box_l = box_geo.length();
+        auto const dim = std::max({box_l[0], box_l[1], box_l[2]});
+        if (m_tune_limits.first) {
+          m_mesh_density_min = static_cast<double>(*m_tune_limits.first) / dim;
+        }
+        if (m_tune_limits.second) {
+          m_mesh_density_max = static_cast<double>(*m_tune_limits.second) / dim;
+        }
+      }
       m_tune_mesh = true;
     } else {
       m_mesh_density_min = m_mesh_density_max = mesh_density;
@@ -772,7 +818,7 @@ void CoulombP3MImpl<FloatType, Architecture>::tune() {
     }
     try {
       CoulombTuningAlgorithm<FloatType, Architecture> parameters(
-          system, p3m, prefactor, tune_timings);
+          system, p3m, prefactor, tune_timings, tune_limits);
       parameters.setup_logger(tune_verbose);
       // parameter ranges
       parameters.determine_mesh_limits();

diff --git a/src/core/electrostatics/p3m.impl.hpp b/src/core/electrostatics/p3m.impl.hpp
@@ -36,6 +36,7 @@
 #include <utils/Vector.hpp>
 
 #include <memory>
+#include <optional>
 #include <stdexcept>
 #include <type_traits>
 #include <utility>
@@ -69,6 +70,7 @@ struct CoulombP3MImpl : public CoulombP3M {
   /** @brief Coulomb P3M meshes and FFT algorithm. */
   std::unique_ptr<p3m_data_struct_coulomb<FloatType>> p3m_impl;
   int tune_timings;
+  std::pair<std::optional<int>, std::optional<int>> tune_limits;
   bool tune_verbose;
   bool check_complex_residuals;
   bool m_is_tuned;
@@ -77,10 +79,10 @@ struct CoulombP3MImpl : public CoulombP3M {
   CoulombP3MImpl(
       std::unique_ptr<p3m_data_struct_coulomb<FloatType>> &&p3m_handle,
       double prefactor, int tune_timings, bool tune_verbose,
-      bool check_complex_residuals)
+      decltype(tune_limits) tune_limits, bool check_complex_residuals)
       : CoulombP3M(p3m_handle->params), p3m{*p3m_handle},
         p3m_impl{std::move(p3m_handle)}, tune_timings{tune_timings},
-        tune_verbose{tune_verbose},
+        tune_limits{std::move(tune_limits)}, tune_verbose{tune_verbose},
         check_complex_residuals{check_complex_residuals} {
 
     if (tune_timings <= 0) {

diff --git a/src/core/magnetostatics/dlc.cpp b/src/core/magnetostatics/dlc.cpp
@@ -50,6 +50,7 @@
 #include <cmath>
 #include <cstdio>
 #include <functional>
+#include <limits>
 #include <numbers>
 #include <numeric>
 #include <stdexcept>
@@ -430,15 +431,26 @@ double DipolarLayerCorrection::tune_far_cut() const {
   }
 
   auto constexpr limitkc = 200;
+  auto constexpr exp_max = +709.8; // for IEEE-compatible double
+  auto constexpr exp_min = -708.4; // for IEEE-compatible double
+  auto const log_max = std::log(std::numeric_limits<double>::max());
   auto const piarea = std::numbers::pi / (lx * ly);
   auto const nmp = static_cast<double>(count_magnetic_particles(system));
   auto const h = dlc.box_h;
   auto far_cut = -1.;
   for (int kc = 1; kc < limitkc; kc++) {
     auto const gc = kc * 2. * std::numbers::pi / lx;
-    auto const fa0 = sqrt(9. * exp(+2. * gc * h) * g1_DLC_dip(gc, lz - h) +
-                          9. * exp(-2. * gc * h) * g1_DLC_dip(gc, lz + h) +
-                          22. * g1_DLC_dip(gc, lz));
+    auto const pref1 = g1_DLC_dip(gc, lz - h);
+    auto const pref2 = g1_DLC_dip(gc, lz + h);
+    auto const pref0 = g1_DLC_dip(gc, lz);
+    auto const exp_term = 2. * gc * h;
+    auto const log_term1 = exp_term + std::log(9. * pref1);
+    if (exp_term >= exp_max or log_term1 >= log_max or -exp_term <= exp_min) {
+      // no solution can be found at larger k values due to overflow/underflow
+      break;
+    }
+    auto const fa0 = std::sqrt(9. * std::exp(+exp_term) * pref1 +
+                               9. * std::exp(-exp_term) * pref2 + 22. * pref0);
     auto const fa1 = sqrt(0.125 * piarea) * fa0;
     auto const fa2 = g2_DLC_dip(gc, lz);
     auto const de = nmp * mu_max_sq / (4. * (exp(gc * lz) - 1.)) * (fa1 + fa2);

diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp
@@ -74,6 +74,7 @@
 #include <sstream>
 #include <stdexcept>
 #include <tuple>
+#include <utility>
 #include <vector>
 
 #ifdef FFTW3_H
@@ -115,8 +116,9 @@ double
 DipolarP3MImpl<FloatType, Architecture>::calc_average_self_energy_k_space()
     const {
   auto const &box_geo = *get_system().box_geo;
-  auto const node_phi = grid_influence_function_self_energy(
-      dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy);
+  auto const node_phi =
+      grid_influence_function_self_energy<FloatType, P3M_BRILLOUIN>(
+          dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy);
 
   double phi = 0.;
   boost::mpi::reduce(comm_cart, node_phi, phi, std::plus<>(), 0);
@@ -518,14 +520,14 @@ double DipolarP3MImpl<FloatType, Architecture>::calc_surface_term(
 
 template <typename FloatType, Arch Architecture>
 void DipolarP3MImpl<FloatType, Architecture>::calc_influence_function_force() {
-  dp3m.g_force = grid_influence_function<FloatType, 3>(
+  dp3m.g_force = grid_influence_function<FloatType, 3, P3M_BRILLOUIN>(
       dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
       get_system().box_geo->length_inv());
 }
 
 template <typename FloatType, Arch Architecture>
 void DipolarP3MImpl<FloatType, Architecture>::calc_influence_function_energy() {
-  dp3m.g_energy = grid_influence_function<FloatType, 2>(
+  dp3m.g_energy = grid_influence_function<FloatType, 2, P3M_BRILLOUIN>(
       dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
       get_system().box_geo->length_inv());
 }
@@ -534,11 +536,14 @@ template <typename FloatType, Arch Architecture>
 class DipolarTuningAlgorithm : public TuningAlgorithm {
   p3m_data_struct_dipoles<FloatType> &dp3m;
   int m_mesh_max = -1, m_mesh_min = -1;
+  std::pair<std::optional<int>, std::optional<int>> m_tune_limits;
 
 public:
   DipolarTuningAlgorithm(System::System &system, decltype(dp3m) &input_dp3m,
-                         double prefactor, int timings)
-      : TuningAlgorithm(system, prefactor, timings), dp3m{input_dp3m} {}
+                         double prefactor, int timings,
+                         decltype(m_tune_limits) tune_limits)
+      : TuningAlgorithm(system, prefactor, timings), dp3m{input_dp3m},
+        m_tune_limits{std::move(tune_limits)} {}
 
   P3MParameters &get_params() override { return dp3m.params; }
 
@@ -603,6 +608,12 @@ class DipolarTuningAlgorithm : public TuningAlgorithm {
       m_mesh_min = static_cast<int>(std::round(std::pow(2., std::floor(expo))));
       /* avoid using more than 1 GB of FFT arrays */
       m_mesh_max = 128;
+      if (m_tune_limits.first) {
+        m_mesh_min = *m_tune_limits.first;
+      }
+      if (m_tune_limits.second) {
+        m_mesh_max = *m_tune_limits.second;
+      }
     } else {
       m_mesh_min = m_mesh_max = dp3m.params.mesh[0];
       m_logger->report_fixed_mesh(dp3m.params.mesh);
@@ -662,7 +673,7 @@ void DipolarP3MImpl<FloatType, Architecture>::tune() {
     }
     try {
       DipolarTuningAlgorithm<FloatType, Architecture> parameters(
-          system, dp3m, prefactor, tune_timings);
+          system, dp3m, prefactor, tune_timings, tune_limits);
       parameters.setup_logger(tune_verbose);
       // parameter ranges
       parameters.determine_mesh_limits();

diff --git a/src/core/magnetostatics/dp3m.impl.hpp b/src/core/magnetostatics/dp3m.impl.hpp
@@ -36,6 +36,7 @@
 #include <utils/Vector.hpp>
 
 #include <memory>
+#include <optional>
 #include <stdexcept>
 #include <type_traits>
 #include <utility>
@@ -72,16 +73,18 @@ struct DipolarP3MImpl : public DipolarP3M {
   /** @brief Coulomb P3M meshes and FFT algorithm. */
   std::unique_ptr<p3m_data_struct_dipoles<FloatType>> dp3m_impl;
   int tune_timings;
+  std::pair<std::optional<int>, std::optional<int>> tune_limits;
   bool tune_verbose;
   bool m_is_tuned;
 
 public:
   DipolarP3MImpl(
       std::unique_ptr<p3m_data_struct_dipoles<FloatType>> &&dp3m_handle,
-      double prefactor, int tune_timings, bool tune_verbose)
+      double prefactor, int tune_timings, bool tune_verbose,
+      decltype(tune_limits) tune_limits)
       : DipolarP3M(dp3m_handle->params), dp3m{*dp3m_handle},
         dp3m_impl{std::move(dp3m_handle)}, tune_timings{tune_timings},
-        tune_verbose{tune_verbose} {
+        tune_limits{std::move(tune_limits)}, tune_verbose{tune_verbose} {
 
     if (tune_timings <= 0) {
       throw std::domain_error("Parameter 'timings' must be > 0");

diff --git a/src/core/p3m/TuningAlgorithm.cpp b/src/core/p3m/TuningAlgorithm.cpp
@@ -49,6 +49,8 @@ static auto constexpr P3M_TUNE_CAO_TOO_LARGE = 1.;
 static auto constexpr P3M_TUNE_ELC_GAP_SIZE = 2.;
 /** could not achieve target accuracy */
 static auto constexpr P3M_TUNE_ACCURACY_TOO_LARGE = 3.;
+/** conflict with FFT domain decomposition */
+static auto constexpr P3M_TUNE_FFT_MESH_SIZE = 4.;
 /**@}*/
 
 /** @brief Precision threshold for a non-zero real-space cutoff. */
@@ -110,7 +112,7 @@ void TuningAlgorithm::commit(Utils::Vector3i const &mesh, int cao,
  * @param[in,out] tuned_accuracy  @copybrief P3MParameters::accuracy
  *
  * @returns The integration time in case of success, otherwise
- *          -@ref P3M_TUNE_ACCURACY_TOO_LARGE,
+ *          -@ref P3M_TUNE_ACCURACY_TOO_LARGE, -@ref P3M_TUNE_FFT_MESH_SIZE,
  *          -@ref P3M_TUNE_CAO_TOO_LARGE, or -@ref P3M_TUNE_ELC_GAP_SIZE
  */
 double TuningAlgorithm::get_mc_time(Utils::Vector3i const &mesh, int cao,
@@ -171,17 +173,25 @@ double TuningAlgorithm::get_mc_time(Utils::Vector3i const &mesh, int cao,
    * we know that the desired minimal accuracy is obtained */
   tuned_r_cut_iL = r_cut_iL = r_cut_iL_max;
 
+  auto const report_veto = [&](auto const &veto) {
+    if (veto) {
+      m_logger->log_skip(*veto, mesh[0], cao, r_cut_iL, tuned_alpha_L,
+                         tuned_accuracy, rs_err, ks_err);
+    }
+    return static_cast<bool>(veto);
+  };
+
   /* if we are running P3M+ELC, check that r_cut is compatible */
   auto const r_cut = r_cut_iL * box_geo.length()[0];
-  auto const veto = layer_correction_veto_r_cut(r_cut);
-  if (veto) {
-    m_logger->log_skip(*veto, mesh[0], cao, r_cut_iL, tuned_alpha_L,
-                       tuned_accuracy, rs_err, ks_err);
+  if (report_veto(layer_correction_veto_r_cut(r_cut))) {
     return -P3M_TUNE_ELC_GAP_SIZE;
   }
 
   commit(mesh, cao, r_cut_iL, tuned_alpha_L);
   on_solver_change();
+  if (report_veto(fft_decomposition_veto(mesh))) {
+    return -P3M_TUNE_FFT_MESH_SIZE;
+  }
   auto const int_time = benchmark_integration_step(m_system, m_timings);
 
   std::tie(tuned_accuracy, rs_err, ks_err, tuned_alpha_L) =

diff --git a/src/core/p3m/TuningAlgorithm.hpp b/src/core/p3m/TuningAlgorithm.hpp
@@ -147,6 +147,12 @@ class TuningAlgorithm {
   virtual std::optional<std::string>
   layer_correction_veto_r_cut(double r_cut) const = 0;
 
+  /** @brief Veto FFT decomposition in non-cubic boxes. */
+  virtual std::optional<std::string>
+  fft_decomposition_veto(Utils::Vector3i const &) const {
+    return std::nullopt;
+  }
+
   /** @brief Write tuned parameters to the P3M parameter struct. */
   void commit(Utils::Vector3i const &mesh, int cao, double r_cut_iL,
               double alpha_L);