Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MMM1D maintenance #4311

Merged
merged 6 commits into from
Jul 27, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion doc/sphinx/installation.rst
Original file line number Diff line number Diff line change
@@ -294,7 +294,13 @@ General features

.. seealso:: :ref:`Electrostatics`

- ``MMM1D_GPU``
- ``MMM1D_GPU``: This enables MMM1D on GPU. It is faster than the CPU version
by several orders of magnitude, but has float precision instead of double
precision.

- ``MMM1D_MACHINE_PREC``: This enables high-precision Bessel functions
for MMM1D on CPU. Comes with a 60% slow-down penalty. The low-precision
functions are in most cases precise enough and are enabled by default.

- ``DIPOLES`` This activates the dipole-moment property of particles; In addition,
the various magnetostatics algorithms, such as P3M are switched on.
1 change: 1 addition & 0 deletions maintainer/configs/no_rotation.hpp
Original file line number Diff line number Diff line change
@@ -36,6 +36,7 @@

// Charges and dipoles
#define ELECTROSTATICS
#define MMM1D_MACHINE_PREC
#ifdef CUDA
#define MMM1D_GPU
#endif
1 change: 1 addition & 0 deletions src/config/features.def
Original file line number Diff line number Diff line change
@@ -33,6 +33,7 @@ ROTATIONAL_INERTIA implies ROTATION
ELECTROSTATICS
P3M equals ELECTROSTATICS and FFTW
MMM1D_GPU requires CUDA and ELECTROSTATICS
MMM1D_MACHINE_PREC requires ELECTROSTATICS

/* Magnetostatics */
DIPOLES implies ROTATION
18 changes: 12 additions & 6 deletions src/core/EspressoSystemInterface.hpp
Original file line number Diff line number Diff line change
@@ -22,6 +22,8 @@
#include "SystemInterface.hpp"
#include "cuda_interface.hpp"

#include <cstddef>

/* Syntactic sugar */
#define espressoSystemInterface EspressoSystemInterface::Instance()

@@ -117,12 +119,16 @@ class EspressoSystemInterface : public SystemInterface {
float *fGpuEnd() override {
return gpu_get_particle_force_pointer() + 3 * m_gpu_npart;
};
float *eGpu() override { return (float *)gpu_get_energy_pointer(); };
float *eGpu() override {
// cast struct of floats to array of floats
// https://stackoverflow.com/a/29278260
return reinterpret_cast<float *>(gpu_get_energy_pointer());
};
float *torqueGpuBegin() override {
return (float *)gpu_get_particle_torque_pointer();
return gpu_get_particle_torque_pointer();
};
float *torqueGpuEnd() override {
return (float *)(gpu_get_particle_torque_pointer()) + 3 * m_gpu_npart;
return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart;
};
bool hasFGpu() override { return true; };
bool requestFGpu() override {
@@ -148,7 +154,7 @@ class EspressoSystemInterface : public SystemInterface {

Utils::Vector3d box() const override;

unsigned int npart_gpu() override {
std::size_t npart_gpu() const override {
#ifdef CUDA
return gpu_get_particle_pointer().size();
#else
@@ -177,10 +183,10 @@ class EspressoSystemInterface : public SystemInterface {
reallocDeviceMemory(gpu_get_particle_pointer().size());
}
};
void reallocDeviceMemory(int n);
void reallocDeviceMemory(std::size_t n);
#endif

int m_gpu_npart;
std::size_t m_gpu_npart;
bool m_gpu;

float *m_r_gpu_begin;
35 changes: 20 additions & 15 deletions src/core/EspressoSystemInterface_cuda.cu
Original file line number Diff line number Diff line change
@@ -22,6 +22,8 @@
#include "cuda_utils.cuh"
#include "errorhandling.hpp"

#include <cstddef>

#include <cuda.h>

#if defined(OMPI_MPI_H) || defined(_MPI_H)
@@ -33,8 +35,8 @@

// Position and charge
__global__ void split_kernel_rq(CUDA_particle_data *particles, float *r,
float *q, int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
float *q, unsigned int n) {
auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

@@ -49,8 +51,9 @@ __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r,
}

// Charge only
__global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
__global__ void split_kernel_q(CUDA_particle_data *particles, float *q,
unsigned int n) {
auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

@@ -62,8 +65,9 @@ __global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) {
}

// Position only
__global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
__global__ void split_kernel_r(CUDA_particle_data *particles, float *r,
unsigned int n) {
auto idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

@@ -78,8 +82,9 @@ __global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) {

#ifdef CUDA
// Velocity
__global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
__global__ void split_kernel_v(CUDA_particle_data *particles, float *v,
unsigned int n) {
auto idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

@@ -96,8 +101,8 @@ __global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) {
#ifdef DIPOLES
// Dipole moment
__global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip,
int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
unsigned int n) {
auto idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

@@ -112,9 +117,9 @@ __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip,
#endif

__global__ void split_kernel_director(CUDA_particle_data *particles,
float *director, int n) {
float *director, unsigned int n) {
#ifdef ROTATION
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
auto idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

@@ -128,7 +133,7 @@ __global__ void split_kernel_director(CUDA_particle_data *particles,
#endif
}

void EspressoSystemInterface::reallocDeviceMemory(int n) {
void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) {
if (m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == nullptr))) {
if (m_r_gpu_begin != nullptr)
cuda_safe_mem(cudaFree(m_r_gpu_begin));
@@ -169,8 +174,8 @@ void EspressoSystemInterface::reallocDeviceMemory(int n) {
}

void EspressoSystemInterface::split_particle_struct() {
auto device_particles = gpu_get_particle_pointer();
int n = device_particles.size();
auto const device_particles = gpu_get_particle_pointer();
auto const n = static_cast<unsigned int>(device_particles.size());
if (n == 0)
return;

5 changes: 4 additions & 1 deletion src/core/SystemInterface.hpp
Original file line number Diff line number Diff line change
@@ -20,8 +20,11 @@
#define SYSTEMINTERFACE_H

#include "config.hpp"

#include <utils/Vector.hpp>

#include <cstddef>

/** @todo: Turn needsXY in getter/setter **/

class SystemInterface {
@@ -93,7 +96,7 @@ class SystemInterface {
return m_needsDirectorGpu;
}

virtual unsigned int npart_gpu() { return 0; };
virtual std::size_t npart_gpu() const { return 0; };
virtual Vector3 box() const = 0;

virtual bool needsRGpu() { return m_needsRGpu; };
3 changes: 2 additions & 1 deletion src/core/actor/DipolarBarnesHut.hpp
Original file line number Diff line number Diff line change
@@ -82,7 +82,8 @@ class DipolarBarnesHut : public Actor {
buildTreeBH(m_bh_data.blocks);
summarizeBH(m_bh_data.blocks);
sortBH(m_bh_data.blocks);
if (energyBH(&m_bh_data, m_k, (&(((CUDA_energy *)s.eGpu())->dipolar)))) {
if (energyBH(&m_bh_data, m_k,
&(reinterpret_cast<CUDA_energy *>(s.eGpu())->dipolar))) {
runtimeErrorMsg() << "kernels encountered a functional error";
}
};
2 changes: 1 addition & 1 deletion src/core/actor/DipolarDirectSum.hpp
Original file line number Diff line number Diff line change
@@ -74,7 +74,7 @@ class DipolarDirectSum : public Actor {
}
DipolarDirectSum_kernel_wrapper_energy(
k, static_cast<int>(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), box,
per, (&(((CUDA_energy *)s.eGpu())->dipolar)));
per, &(reinterpret_cast<CUDA_energy *>(s.eGpu())->dipolar));
};

private:
4 changes: 2 additions & 2 deletions src/core/actor/Mmm1dgpuForce.hpp
Original file line number Diff line number Diff line change
@@ -48,14 +48,14 @@ class Mmm1dgpuForce : public Actor {
private:
// CUDA parameters
unsigned int numThreads;
unsigned int numBlocks(SystemInterface &s);
unsigned int numBlocks(SystemInterface const &s) const;

// the box length currently set on the GPU
// Needed to make sure it hasn't been modified after inter coulomb was used.
float host_boxz;
// the number of particles we had during the last run. Needed to check if we
// have to realloc dev_forcePairs
int host_npart;
unsigned int host_npart;
bool need_tune;

// pairs==0: return forces using atomicAdd
Loading