Skip to content

Commit

Permalink
Add new P3M tuning options (#5017)
Browse files Browse the repository at this point in the history
Description of changes:
- new feature: mesh tuning can now be constrained to yield a value between two user-specified mesh sizes
- incompatible FFT domain decompositions are now discarded during CPU tuning to avoid undefined behavior
  • Loading branch information
kodiakhq[bot] authored Dec 12, 2024
2 parents 6dbf05f + 75f0812 commit b200c23
Show file tree
Hide file tree
Showing 22 changed files with 382 additions and 107 deletions.
2 changes: 1 addition & 1 deletion doc/doxygen/Doxyfile.in
Original file line number Diff line number Diff line change
Expand Up @@ -1511,7 +1511,7 @@ MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_EXTENSIONS =
MATHJAX_EXTENSIONS = TeX/AMSmath

# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
Expand Down
8 changes: 4 additions & 4 deletions src/core/electrostatics/elc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,10 +368,6 @@ ElectrostaticLayerCorrection::z_energy(ParticleRange const &particles) const {
auto const &box_geo = *get_system().box_geo;
auto const xy_area_inv = box_geo.length_inv()[0] * box_geo.length_inv()[1];
auto const pref = prefactor * 2. * std::numbers::pi * xy_area_inv;
auto const delta = elc.delta_mid_top * elc.delta_mid_bot;
auto const fac_delta_mid_bot = elc.delta_mid_bot / (1. - delta);
auto const fac_delta_mid_top = elc.delta_mid_top / (1. - delta);
auto const fac_delta = delta / (1. - delta);

/* for non-neutral systems, this shift gives the background contribution
* (rsp. for this shift, the DM of the background is zero) */
Expand All @@ -397,6 +393,10 @@ ElectrostaticLayerCorrection::z_energy(ParticleRange const &particles) const {
}
} else {
// metallic boundaries
auto const delta = elc.delta_mid_top * elc.delta_mid_bot;
auto const fac_delta_mid_bot = elc.delta_mid_bot / (1. - delta);
auto const fac_delta_mid_top = elc.delta_mid_top / (1. - delta);
auto const fac_delta = delta / (1. - delta);
clear_vec(gblcblk, size);
auto const h = elc.box_h;
ImageSum const image_sum{delta, shift, lz};
Expand Down
56 changes: 51 additions & 5 deletions src/core/electrostatics/p3m.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
#include <utils/integral_parameter.hpp>
#include <utils/math/int_pow.hpp>
#include <utils/math/sqr.hpp>
#include <utils/serialization/array.hpp>

#include <boost/mpi/collectives/all_reduce.hpp>
#include <boost/mpi/collectives/broadcast.hpp>
Expand Down Expand Up @@ -126,7 +127,7 @@ void CoulombP3MImpl<FloatType, Architecture>::count_charged_particles() {
template <typename FloatType, Arch Architecture>
void CoulombP3MImpl<FloatType, Architecture>::calc_influence_function_force() {
auto const [KX, KY, KZ] = p3m.fft->get_permutations();
p3m.g_force = grid_influence_function<FloatType, 1>(
p3m.g_force = grid_influence_function<FloatType, 1, P3M_BRILLOUIN>(
p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, KY, KZ,
get_system().box_geo->length_inv());
}
Expand All @@ -137,7 +138,7 @@ void CoulombP3MImpl<FloatType, Architecture>::calc_influence_function_force() {
template <typename FloatType, Arch Architecture>
void CoulombP3MImpl<FloatType, Architecture>::calc_influence_function_energy() {
auto const [KX, KY, KZ] = p3m.fft->get_permutations();
p3m.g_energy = grid_influence_function<FloatType, 0>(
p3m.g_energy = grid_influence_function<FloatType, 0, P3M_BRILLOUIN>(
p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, KY, KZ,
get_system().box_geo->length_inv());
}
Expand Down Expand Up @@ -597,14 +598,17 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
double m_mesh_density_min = -1., m_mesh_density_max = -1.;
// indicates if mesh should be tuned
bool m_tune_mesh = false;
std::pair<std::optional<int>, std::optional<int>> m_tune_limits;

protected:
P3MParameters &get_params() override { return p3m.params; }

public:
CoulombTuningAlgorithm(System::System &system, auto &input_p3m,
double prefactor, int timings)
: TuningAlgorithm(system, prefactor, timings), p3m{input_p3m} {}
double prefactor, int timings,
decltype(m_tune_limits) tune_limits)
: TuningAlgorithm(system, prefactor, timings), p3m{input_p3m},
m_tune_limits{std::move(tune_limits)} {}

void on_solver_change() const override { m_system.on_coulomb_change(); }

Expand Down Expand Up @@ -632,6 +636,38 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
return {};
}

std::optional<std::string> fft_decomposition_veto(
Utils::Vector3i const &mesh_size_r_space) const override {
#ifdef CUDA
if constexpr (Architecture == Arch::GPU) {
return std::nullopt;
}
#endif
using Array3i = Utils::Array<int, 3u>;
auto const [KX, KY, KZ] = p3m.fft->get_permutations();
auto valid_decomposition = false;
Array3i mesh_size_k_space = {};
boost::mpi::reduce(
::comm_cart, Array3i(p3m.mesh.stop), mesh_size_k_space,
[](Array3i const &lhs, Array3i const &rhs) {
return Array3i{{std::max(lhs[0u], rhs[0u]),
std::max(lhs[1u], rhs[1u]),
std::max(lhs[2u], rhs[2u])}};
},
0);
if (::this_node == 0) {
valid_decomposition = (mesh_size_r_space[0u] == mesh_size_k_space[KX] and
mesh_size_r_space[1u] == mesh_size_k_space[KY] and
mesh_size_r_space[2u] == mesh_size_k_space[KZ]);
}
boost::mpi::broadcast(::comm_cart, valid_decomposition, 0);
std::optional<std::string> retval{"conflict with FFT domain decomposition"};
if (valid_decomposition) {
retval = std::nullopt;
}
return retval;
}

std::tuple<double, double, double, double>
calculate_accuracy(Utils::Vector3i const &mesh, int cao,
double r_cut_iL) const override {
Expand Down Expand Up @@ -689,6 +725,16 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
max_npart_per_dim, std::cbrt(static_cast<double>(p3m.sum_qpart)));
m_mesh_density_min = min_npart_per_dim / normalized_box_dim;
m_mesh_density_max = max_npart_per_dim / normalized_box_dim;
if (m_tune_limits.first or m_tune_limits.second) {
auto const &box_l = box_geo.length();
auto const dim = std::max({box_l[0], box_l[1], box_l[2]});
if (m_tune_limits.first) {
m_mesh_density_min = static_cast<double>(*m_tune_limits.first) / dim;
}
if (m_tune_limits.second) {
m_mesh_density_max = static_cast<double>(*m_tune_limits.second) / dim;
}
}
m_tune_mesh = true;
} else {
m_mesh_density_min = m_mesh_density_max = mesh_density;
Expand Down Expand Up @@ -772,7 +818,7 @@ void CoulombP3MImpl<FloatType, Architecture>::tune() {
}
try {
CoulombTuningAlgorithm<FloatType, Architecture> parameters(
system, p3m, prefactor, tune_timings);
system, p3m, prefactor, tune_timings, tune_limits);
parameters.setup_logger(tune_verbose);
// parameter ranges
parameters.determine_mesh_limits();
Expand Down
6 changes: 4 additions & 2 deletions src/core/electrostatics/p3m.impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <utils/Vector.hpp>

#include <memory>
#include <optional>
#include <stdexcept>
#include <type_traits>
#include <utility>
Expand Down Expand Up @@ -69,6 +70,7 @@ struct CoulombP3MImpl : public CoulombP3M {
/** @brief Coulomb P3M meshes and FFT algorithm. */
std::unique_ptr<p3m_data_struct_coulomb<FloatType>> p3m_impl;
int tune_timings;
std::pair<std::optional<int>, std::optional<int>> tune_limits;
bool tune_verbose;
bool check_complex_residuals;
bool m_is_tuned;
Expand All @@ -77,10 +79,10 @@ struct CoulombP3MImpl : public CoulombP3M {
CoulombP3MImpl(
std::unique_ptr<p3m_data_struct_coulomb<FloatType>> &&p3m_handle,
double prefactor, int tune_timings, bool tune_verbose,
bool check_complex_residuals)
decltype(tune_limits) tune_limits, bool check_complex_residuals)
: CoulombP3M(p3m_handle->params), p3m{*p3m_handle},
p3m_impl{std::move(p3m_handle)}, tune_timings{tune_timings},
tune_verbose{tune_verbose},
tune_limits{std::move(tune_limits)}, tune_verbose{tune_verbose},
check_complex_residuals{check_complex_residuals} {

if (tune_timings <= 0) {
Expand Down
18 changes: 15 additions & 3 deletions src/core/magnetostatics/dlc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include <cmath>
#include <cstdio>
#include <functional>
#include <limits>
#include <numbers>
#include <numeric>
#include <stdexcept>
Expand Down Expand Up @@ -430,15 +431,26 @@ double DipolarLayerCorrection::tune_far_cut() const {
}

auto constexpr limitkc = 200;
auto constexpr exp_max = +709.8; // for IEEE-compatible double
auto constexpr exp_min = -708.4; // for IEEE-compatible double
auto const log_max = std::log(std::numeric_limits<double>::max());
auto const piarea = std::numbers::pi / (lx * ly);
auto const nmp = static_cast<double>(count_magnetic_particles(system));
auto const h = dlc.box_h;
auto far_cut = -1.;
for (int kc = 1; kc < limitkc; kc++) {
auto const gc = kc * 2. * std::numbers::pi / lx;
auto const fa0 = sqrt(9. * exp(+2. * gc * h) * g1_DLC_dip(gc, lz - h) +
9. * exp(-2. * gc * h) * g1_DLC_dip(gc, lz + h) +
22. * g1_DLC_dip(gc, lz));
auto const pref1 = g1_DLC_dip(gc, lz - h);
auto const pref2 = g1_DLC_dip(gc, lz + h);
auto const pref0 = g1_DLC_dip(gc, lz);
auto const exp_term = 2. * gc * h;
auto const log_term1 = exp_term + std::log(9. * pref1);
if (exp_term >= exp_max or log_term1 >= log_max or -exp_term <= exp_min) {
// no solution can be found at larger k values due to overflow/underflow
break;
}
auto const fa0 = std::sqrt(9. * std::exp(+exp_term) * pref1 +
9. * std::exp(-exp_term) * pref2 + 22. * pref0);
auto const fa1 = sqrt(0.125 * piarea) * fa0;
auto const fa2 = g2_DLC_dip(gc, lz);
auto const de = nmp * mu_max_sq / (4. * (exp(gc * lz) - 1.)) * (fa1 + fa2);
Expand Down
25 changes: 18 additions & 7 deletions src/core/magnetostatics/dp3m.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
#include <sstream>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>

#ifdef FFTW3_H
Expand Down Expand Up @@ -115,8 +116,9 @@ double
DipolarP3MImpl<FloatType, Architecture>::calc_average_self_energy_k_space()
const {
auto const &box_geo = *get_system().box_geo;
auto const node_phi = grid_influence_function_self_energy(
dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy);
auto const node_phi =
grid_influence_function_self_energy<FloatType, P3M_BRILLOUIN>(
dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy);

double phi = 0.;
boost::mpi::reduce(comm_cart, node_phi, phi, std::plus<>(), 0);
Expand Down Expand Up @@ -518,14 +520,14 @@ double DipolarP3MImpl<FloatType, Architecture>::calc_surface_term(

template <typename FloatType, Arch Architecture>
void DipolarP3MImpl<FloatType, Architecture>::calc_influence_function_force() {
dp3m.g_force = grid_influence_function<FloatType, 3>(
dp3m.g_force = grid_influence_function<FloatType, 3, P3M_BRILLOUIN>(
dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
get_system().box_geo->length_inv());
}

template <typename FloatType, Arch Architecture>
void DipolarP3MImpl<FloatType, Architecture>::calc_influence_function_energy() {
dp3m.g_energy = grid_influence_function<FloatType, 2>(
dp3m.g_energy = grid_influence_function<FloatType, 2, P3M_BRILLOUIN>(
dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
get_system().box_geo->length_inv());
}
Expand All @@ -534,11 +536,14 @@ template <typename FloatType, Arch Architecture>
class DipolarTuningAlgorithm : public TuningAlgorithm {
p3m_data_struct_dipoles<FloatType> &dp3m;
int m_mesh_max = -1, m_mesh_min = -1;
std::pair<std::optional<int>, std::optional<int>> m_tune_limits;

public:
DipolarTuningAlgorithm(System::System &system, decltype(dp3m) &input_dp3m,
double prefactor, int timings)
: TuningAlgorithm(system, prefactor, timings), dp3m{input_dp3m} {}
double prefactor, int timings,
decltype(m_tune_limits) tune_limits)
: TuningAlgorithm(system, prefactor, timings), dp3m{input_dp3m},
m_tune_limits{std::move(tune_limits)} {}

P3MParameters &get_params() override { return dp3m.params; }

Expand Down Expand Up @@ -603,6 +608,12 @@ class DipolarTuningAlgorithm : public TuningAlgorithm {
m_mesh_min = static_cast<int>(std::round(std::pow(2., std::floor(expo))));
/* avoid using more than 1 GB of FFT arrays */
m_mesh_max = 128;
if (m_tune_limits.first) {
m_mesh_min = *m_tune_limits.first;
}
if (m_tune_limits.second) {
m_mesh_max = *m_tune_limits.second;
}
} else {
m_mesh_min = m_mesh_max = dp3m.params.mesh[0];
m_logger->report_fixed_mesh(dp3m.params.mesh);
Expand Down Expand Up @@ -662,7 +673,7 @@ void DipolarP3MImpl<FloatType, Architecture>::tune() {
}
try {
DipolarTuningAlgorithm<FloatType, Architecture> parameters(
system, dp3m, prefactor, tune_timings);
system, dp3m, prefactor, tune_timings, tune_limits);
parameters.setup_logger(tune_verbose);
// parameter ranges
parameters.determine_mesh_limits();
Expand Down
7 changes: 5 additions & 2 deletions src/core/magnetostatics/dp3m.impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <utils/Vector.hpp>

#include <memory>
#include <optional>
#include <stdexcept>
#include <type_traits>
#include <utility>
Expand Down Expand Up @@ -72,16 +73,18 @@ struct DipolarP3MImpl : public DipolarP3M {
/** @brief Coulomb P3M meshes and FFT algorithm. */
std::unique_ptr<p3m_data_struct_dipoles<FloatType>> dp3m_impl;
int tune_timings;
std::pair<std::optional<int>, std::optional<int>> tune_limits;
bool tune_verbose;
bool m_is_tuned;

public:
DipolarP3MImpl(
std::unique_ptr<p3m_data_struct_dipoles<FloatType>> &&dp3m_handle,
double prefactor, int tune_timings, bool tune_verbose)
double prefactor, int tune_timings, bool tune_verbose,
decltype(tune_limits) tune_limits)
: DipolarP3M(dp3m_handle->params), dp3m{*dp3m_handle},
dp3m_impl{std::move(dp3m_handle)}, tune_timings{tune_timings},
tune_verbose{tune_verbose} {
tune_limits{std::move(tune_limits)}, tune_verbose{tune_verbose} {

if (tune_timings <= 0) {
throw std::domain_error("Parameter 'timings' must be > 0");
Expand Down
20 changes: 15 additions & 5 deletions src/core/p3m/TuningAlgorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ static auto constexpr P3M_TUNE_CAO_TOO_LARGE = 1.;
static auto constexpr P3M_TUNE_ELC_GAP_SIZE = 2.;
/** could not achieve target accuracy */
static auto constexpr P3M_TUNE_ACCURACY_TOO_LARGE = 3.;
/** conflict with FFT domain decomposition */
static auto constexpr P3M_TUNE_FFT_MESH_SIZE = 4.;
/**@}*/

/** @brief Precision threshold for a non-zero real-space cutoff. */
Expand Down Expand Up @@ -110,7 +112,7 @@ void TuningAlgorithm::commit(Utils::Vector3i const &mesh, int cao,
* @param[in,out] tuned_accuracy @copybrief P3MParameters::accuracy
*
* @returns The integration time in case of success, otherwise
* -@ref P3M_TUNE_ACCURACY_TOO_LARGE,
* -@ref P3M_TUNE_ACCURACY_TOO_LARGE, -@ref P3M_TUNE_FFT_MESH_SIZE,
* -@ref P3M_TUNE_CAO_TOO_LARGE, or -@ref P3M_TUNE_ELC_GAP_SIZE
*/
double TuningAlgorithm::get_mc_time(Utils::Vector3i const &mesh, int cao,
Expand Down Expand Up @@ -171,17 +173,25 @@ double TuningAlgorithm::get_mc_time(Utils::Vector3i const &mesh, int cao,
* we know that the desired minimal accuracy is obtained */
tuned_r_cut_iL = r_cut_iL = r_cut_iL_max;

auto const report_veto = [&](auto const &veto) {
if (veto) {
m_logger->log_skip(*veto, mesh[0], cao, r_cut_iL, tuned_alpha_L,
tuned_accuracy, rs_err, ks_err);
}
return static_cast<bool>(veto);
};

/* if we are running P3M+ELC, check that r_cut is compatible */
auto const r_cut = r_cut_iL * box_geo.length()[0];
auto const veto = layer_correction_veto_r_cut(r_cut);
if (veto) {
m_logger->log_skip(*veto, mesh[0], cao, r_cut_iL, tuned_alpha_L,
tuned_accuracy, rs_err, ks_err);
if (report_veto(layer_correction_veto_r_cut(r_cut))) {
return -P3M_TUNE_ELC_GAP_SIZE;
}

commit(mesh, cao, r_cut_iL, tuned_alpha_L);
on_solver_change();
if (report_veto(fft_decomposition_veto(mesh))) {
return -P3M_TUNE_FFT_MESH_SIZE;
}
auto const int_time = benchmark_integration_step(m_system, m_timings);

std::tie(tuned_accuracy, rs_err, ks_err, tuned_alpha_L) =
Expand Down
6 changes: 6 additions & 0 deletions src/core/p3m/TuningAlgorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@ class TuningAlgorithm {
virtual std::optional<std::string>
layer_correction_veto_r_cut(double r_cut) const = 0;

/** @brief Veto FFT decomposition in non-cubic boxes. */
virtual std::optional<std::string>
fft_decomposition_veto(Utils::Vector3i const &) const {
return std::nullopt;
}

/** @brief Write tuned parameters to the P3M parameter struct. */
void commit(Utils::Vector3i const &mesh, int cao, double r_cut_iL,
double alpha_L);
Expand Down
Loading

0 comments on commit b200c23

Please sign in to comment.