Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MMM1D maintenance #4311

Merged
merged 6 commits into from
Jul 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion doc/sphinx/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,13 @@ General features

.. seealso:: :ref:`Electrostatics`

- ``MMM1D_GPU``
- ``MMM1D_GPU``: This enables MMM1D on GPU. It is faster than the CPU version
by several orders of magnitude, but has float precision instead of double
precision.

- ``MMM1D_MACHINE_PREC``: This enables high-precision Bessel functions
for MMM1D on CPU. Comes with a 60% slow-down penalty. The low-precision
functions are in most cases precise enough and are enabled by default.

- ``DIPOLES`` This activates the dipole-moment property of particles; In addition,
the various magnetostatics algorithms, such as P3M are switched on.
Expand Down
1 change: 1 addition & 0 deletions maintainer/configs/no_rotation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

// Charges and dipoles
#define ELECTROSTATICS
#define MMM1D_MACHINE_PREC
#ifdef CUDA
#define MMM1D_GPU
#endif
Expand Down
1 change: 1 addition & 0 deletions src/config/features.def
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ ROTATIONAL_INERTIA implies ROTATION
ELECTROSTATICS
P3M equals ELECTROSTATICS and FFTW
MMM1D_GPU requires CUDA and ELECTROSTATICS
MMM1D_MACHINE_PREC requires ELECTROSTATICS

/* Magnetostatics */
DIPOLES implies ROTATION
Expand Down
18 changes: 12 additions & 6 deletions src/core/EspressoSystemInterface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include "SystemInterface.hpp"
#include "cuda_interface.hpp"

#include <cstddef>

/* Syntactic sugar */
#define espressoSystemInterface EspressoSystemInterface::Instance()

Expand Down Expand Up @@ -117,12 +119,16 @@ class EspressoSystemInterface : public SystemInterface {
float *fGpuEnd() override {
return gpu_get_particle_force_pointer() + 3 * m_gpu_npart;
};
float *eGpu() override { return (float *)gpu_get_energy_pointer(); };
float *eGpu() override {
// cast struct of floats to array of floats
// https://stackoverflow.com/a/29278260
return reinterpret_cast<float *>(gpu_get_energy_pointer());
};
float *torqueGpuBegin() override {
return (float *)gpu_get_particle_torque_pointer();
return gpu_get_particle_torque_pointer();
};
float *torqueGpuEnd() override {
return (float *)(gpu_get_particle_torque_pointer()) + 3 * m_gpu_npart;
return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart;
};
bool hasFGpu() override { return true; };
bool requestFGpu() override {
Expand All @@ -148,7 +154,7 @@ class EspressoSystemInterface : public SystemInterface {

Utils::Vector3d box() const override;

unsigned int npart_gpu() override {
std::size_t npart_gpu() const override {
#ifdef CUDA
return gpu_get_particle_pointer().size();
#else
Expand Down Expand Up @@ -177,10 +183,10 @@ class EspressoSystemInterface : public SystemInterface {
reallocDeviceMemory(gpu_get_particle_pointer().size());
}
};
void reallocDeviceMemory(int n);
void reallocDeviceMemory(std::size_t n);
#endif

int m_gpu_npart;
std::size_t m_gpu_npart;
bool m_gpu;

float *m_r_gpu_begin;
Expand Down
35 changes: 20 additions & 15 deletions src/core/EspressoSystemInterface_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include "cuda_utils.cuh"
#include "errorhandling.hpp"

#include <cstddef>

#include <cuda.h>

#if defined(OMPI_MPI_H) || defined(_MPI_H)
Expand All @@ -33,8 +35,8 @@

// Position and charge
__global__ void split_kernel_rq(CUDA_particle_data *particles, float *r,
float *q, int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
float *q, unsigned int n) {
auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

Expand All @@ -49,8 +51,9 @@ __global__ void split_kernel_rq(CUDA_particle_data *particles, float *r,
}

// Charge only
__global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
__global__ void split_kernel_q(CUDA_particle_data *particles, float *q,
unsigned int n) {
auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

Expand All @@ -62,8 +65,9 @@ __global__ void split_kernel_q(CUDA_particle_data *particles, float *q, int n) {
}

// Position only
__global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
__global__ void split_kernel_r(CUDA_particle_data *particles, float *r,
unsigned int n) {
auto idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

Expand All @@ -78,8 +82,9 @@ __global__ void split_kernel_r(CUDA_particle_data *particles, float *r, int n) {

#ifdef CUDA
// Velocity
__global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
__global__ void split_kernel_v(CUDA_particle_data *particles, float *v,
unsigned int n) {
auto idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

Expand All @@ -96,8 +101,8 @@ __global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) {
#ifdef DIPOLES
// Dipole moment
__global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip,
int n) {
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
unsigned int n) {
auto idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

Expand All @@ -112,9 +117,9 @@ __global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip,
#endif

__global__ void split_kernel_director(CUDA_particle_data *particles,
float *director, int n) {
float *director, unsigned int n) {
#ifdef ROTATION
auto idx = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
auto idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= n)
return;

Expand All @@ -128,7 +133,7 @@ __global__ void split_kernel_director(CUDA_particle_data *particles,
#endif
}

void EspressoSystemInterface::reallocDeviceMemory(int n) {
void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) {
if (m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == nullptr))) {
if (m_r_gpu_begin != nullptr)
cuda_safe_mem(cudaFree(m_r_gpu_begin));
Expand Down Expand Up @@ -169,8 +174,8 @@ void EspressoSystemInterface::reallocDeviceMemory(int n) {
}

void EspressoSystemInterface::split_particle_struct() {
auto device_particles = gpu_get_particle_pointer();
int n = device_particles.size();
auto const device_particles = gpu_get_particle_pointer();
auto const n = static_cast<unsigned int>(device_particles.size());
if (n == 0)
return;

Expand Down
5 changes: 4 additions & 1 deletion src/core/SystemInterface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@
#define SYSTEMINTERFACE_H

#include "config.hpp"

#include <utils/Vector.hpp>

#include <cstddef>

/** @todo: Turn needsXY in getter/setter **/

class SystemInterface {
Expand Down Expand Up @@ -93,7 +96,7 @@ class SystemInterface {
return m_needsDirectorGpu;
}

virtual unsigned int npart_gpu() { return 0; };
virtual std::size_t npart_gpu() const { return 0; };
virtual Vector3 box() const = 0;

virtual bool needsRGpu() { return m_needsRGpu; };
Expand Down
3 changes: 2 additions & 1 deletion src/core/actor/DipolarBarnesHut.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ class DipolarBarnesHut : public Actor {
buildTreeBH(m_bh_data.blocks);
summarizeBH(m_bh_data.blocks);
sortBH(m_bh_data.blocks);
if (energyBH(&m_bh_data, m_k, (&(((CUDA_energy *)s.eGpu())->dipolar)))) {
if (energyBH(&m_bh_data, m_k,
&(reinterpret_cast<CUDA_energy *>(s.eGpu())->dipolar))) {
runtimeErrorMsg() << "kernels encountered a functional error";
}
};
Expand Down
2 changes: 1 addition & 1 deletion src/core/actor/DipolarDirectSum.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class DipolarDirectSum : public Actor {
}
DipolarDirectSum_kernel_wrapper_energy(
k, static_cast<int>(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(), box,
per, (&(((CUDA_energy *)s.eGpu())->dipolar)));
per, &(reinterpret_cast<CUDA_energy *>(s.eGpu())->dipolar));
};

private:
Expand Down
4 changes: 2 additions & 2 deletions src/core/actor/Mmm1dgpuForce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,14 @@ class Mmm1dgpuForce : public Actor {
private:
// CUDA parameters
unsigned int numThreads;
unsigned int numBlocks(SystemInterface &s);
unsigned int numBlocks(SystemInterface const &s) const;

// the box length currently set on the GPU
// Needed to make sure it hasn't been modified after inter coulomb was used.
float host_boxz;
// the number of particles we had during the last run. Needed to check if we
// have to realloc dev_forcePairs
int host_npart;
unsigned int host_npart;
bool need_tune;

// pairs==0: return forces using atomicAdd
Expand Down
Loading