Skip to content

Commit

Permalink
CUDA code maintenance (#4404)
Browse files Browse the repository at this point in the history
Partial fix for #2628 and #4219

Description of changes:
- simplify CUDA kernels and GPU interface code
   - remove superfluous GPU-related macros and unused code
   - around 300 lines of CUDA code were deleted
- remove GPU magnetostatics global variables
   - they are now shared pointers in the Cython interface
- rewrite `SystemInterface` framework
   - document framework
   - remove unused methods
   - remove broken past-the-end device memory pointers (access to device memory with the wrong alignment is dangerous)
   - throw an error when requesting a particle list conversion from AoS to SoA for features that are not compiled in
   - unit test `EspressoSystemInterface` implementation
- improve separation of concerns:
   - `magnetostatics.pyx` no longer knows about the `dipole` global nor the associated enum values
   - header files of GPU long-range methods no longer expose implementation details
   - narrow includes of `cells.hpp`, `thermostat.hpp`, `rotation.hpp` to the strict minimum
      - the majority of indirect includes were unnecessary, exposed global variables and increased compile time
      - where necessary, functions were moved to different files and global variables `this_node` and `cell_structure` were forwarded by function argument
- bugfixes:
   - fix device memory leak in GPU Barnes-Hut (can fill up device memory quickly when creating, using and then deleting a Barnes-Hut actor in a loop)
   - GPU dipolar direct sum and Barnes-Hut used to emit a runtime error message, but `handle_errors()` was missing in the Cython interface
  • Loading branch information
kodiakhq[bot] authored Dec 23, 2021
2 parents 9997b68 + 189a51e commit 9e363c4
Show file tree
Hide file tree
Showing 62 changed files with 1,318 additions and 1,336 deletions.
20 changes: 15 additions & 5 deletions src/core/EspressoSystemInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,15 @@
#include "cuda_interface.hpp"
#include "grid.hpp"

/* Initialize instance pointer */
EspressoSystemInterface *EspressoSystemInterface::m_instance = nullptr;
#include <utils/Vector.hpp>

/********************************************************************************************/
EspressoSystemInterface *EspressoSystemInterface::m_instance = nullptr;

void EspressoSystemInterface::gatherParticles() {
// get particles from other nodes
#ifdef CUDA
if (m_gpu) {
if (gpu_get_global_particle_vars_pointer_host()->communication_enabled) {
copy_part_data_to_gpu(cell_structure.local_particles());
copy_part_data_to_gpu(cell_structure.local_particles(), this_node);
reallocDeviceMemory(gpu_get_particle_pointer().size());
if (m_splitParticleStructGpu && (this_node == 0))
split_particle_struct();
Expand All @@ -42,6 +40,18 @@ void EspressoSystemInterface::gatherParticles() {
#endif
}

#ifdef CUDA
void EspressoSystemInterface::enableParticleCommunication() {
if (m_gpu) {
if (!gpu_get_global_particle_vars_pointer_host()->communication_enabled) {
gpu_init_particle_comm(this_node);
cuda_bcast_global_part_params();
reallocDeviceMemory(gpu_get_particle_pointer().size());
}
}
}
#endif

void EspressoSystemInterface::init() { gatherParticles(); }

void EspressoSystemInterface::update() { gatherParticles(); }
Expand Down
164 changes: 55 additions & 109 deletions src/core/EspressoSystemInterface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,136 +20,102 @@
#define ESPRESSOSYSTEMINTERFACE_H

#include "SystemInterface.hpp"
#include "config.hpp"
#include "cuda_interface.hpp"

#include <cstddef>
#include <utils/Vector.hpp>

/* Syntactic sugar */
#define espressoSystemInterface EspressoSystemInterface::Instance()
#include <cstddef>

/**
* @brief CUDA implementation of @ref SystemInterface.
*
* When data is synchronized between host and device memory, a subset
* of the @ref Particle struct is copied from each particle on the host
* to the corresponding @ref CUDA_particle_data struct on the device via
* @ref EspressoSystemInterface::gatherParticles(). Once the transfer is
* complete, the particle AoS on the device is copied (or "split") to
* a SoA via @ref EspressoSystemInterface::split_particle_struct().
*/
class EspressoSystemInterface : public SystemInterface {
public:
EspressoSystemInterface() = default;
~EspressoSystemInterface() override = default;

static EspressoSystemInterface &Instance() {
if (!m_instance)
m_instance = new EspressoSystemInterface;

return *m_instance;
};

static EspressoSystemInterface *_Instance() {
if (!m_instance)
m_instance = new EspressoSystemInterface;

return m_instance;
};

void init() override;
void update() override;

#ifdef CUDA
float *rGpuBegin() override { return m_r_gpu_begin; };
float *rGpuEnd() override { return m_r_gpu_end; };
bool hasRGpu() override { return true; };
bool requestRGpu() override {
void requestRGpu() override {
m_needsRGpu = hasRGpu();
m_splitParticleStructGpu |= m_needsRGpu;
m_gpu |= m_needsRGpu;
if (m_gpu)
enableParticleCommunication();
return m_needsRGpu;
enableParticleCommunication();
};

#ifdef DIPOLES
float *dipGpuBegin() override { return m_dip_gpu_begin; };
float *dipGpuEnd() override { return m_dip_gpu_end; };
bool hasDipGpu() override { return true; };
bool requestDipGpu() override {
void requestDipGpu() override {
m_needsDipGpu = hasDipGpu();
m_splitParticleStructGpu |= m_needsRGpu;
m_gpu |= m_needsRGpu;
if (m_gpu)
enableParticleCommunication();
return m_needsDipGpu;
enableParticleCommunication();
};
#endif
float *vGpuBegin() override { return m_v_gpu_begin; };
float *vGpuEnd() override { return m_v_gpu_end; };
bool hasVGpu() override { return true; };
bool requestVGpu() override {
m_needsVGpu = hasVGpu();
m_splitParticleStructGpu |= m_needsVGpu;
m_gpu |= m_needsVGpu;
if (m_gpu)
enableParticleCommunication();
return m_needsVGpu;
};

#ifdef ELECTROSTATICS
float *qGpuBegin() override { return m_q_gpu_begin; };
float *qGpuEnd() override { return m_q_gpu_end; };
bool hasQGpu() override { return true; };
bool requestQGpu() override {
void requestQGpu() override {
m_needsQGpu = hasQGpu();
m_splitParticleStructGpu |= m_needsQGpu;
m_gpu |= m_needsQGpu;
if (m_gpu)
enableParticleCommunication();
return m_needsQGpu;
};

float *directorGpuBegin() override { return m_director_gpu_begin; };
float *directorGpuEnd() override { return m_director_gpu_end; };
bool hasDirectorGpu() override { return true; };
bool requestDirectorGpu() override {
m_needsDirectorGpu = hasDirectorGpu();
m_splitParticleStructGpu |= m_needsDirectorGpu;
m_gpu |= m_needsDirectorGpu;
if (m_gpu)
enableParticleCommunication();
return m_needsDirectorGpu;
enableParticleCommunication();
};
#endif

bool requestParticleStructGpu() {
void requestParticleStructGpu() {
m_needsParticleStructGpu = true;
m_gpu |= m_needsParticleStructGpu;
if (m_gpu)
enableParticleCommunication();
return true;
enableParticleCommunication();
};

float *fGpuBegin() override { return gpu_get_particle_force_pointer(); };
float *fGpuEnd() override {
return gpu_get_particle_force_pointer() + 3 * m_gpu_npart;
};
float *eGpu() override {
// cast pointer to struct of floats to array of floats
// https://stackoverflow.com/a/29278260
return reinterpret_cast<float *>(gpu_get_energy_pointer());
};
float *torqueGpuBegin() override {
return gpu_get_particle_torque_pointer();
};
float *torqueGpuEnd() override {
return gpu_get_particle_torque_pointer() + 3 * m_gpu_npart;
};
bool hasFGpu() override { return true; };
bool requestFGpu() override {
void requestFGpu() override {
m_needsFGpu = hasFGpu();
m_gpu |= m_needsFGpu;
if (m_gpu)
enableParticleCommunication();
return m_needsFGpu;
enableParticleCommunication();
};

#ifdef ROTATION
float *torqueGpuBegin() override {
return gpu_get_particle_torque_pointer();
};
bool hasTorqueGpu() override { return true; };
bool requestTorqueGpu() override {
void requestTorqueGpu() override {
m_needsTorqueGpu = hasTorqueGpu();
m_gpu |= m_needsTorqueGpu;
if (m_gpu)
enableParticleCommunication();
return m_needsTorqueGpu;
enableParticleCommunication();
};
#endif

float *eGpu() override {
// cast pointer from struct of floats to array of floats
// https://stackoverflow.com/a/29278260
return reinterpret_cast<float *>(gpu_get_energy_pointer());
};

#endif // ifdef CUDA

Utils::Vector3d box() const override;
Expand All @@ -164,50 +130,30 @@ class EspressoSystemInterface : public SystemInterface {

protected:
static EspressoSystemInterface *m_instance;
EspressoSystemInterface()
: m_gpu(false), m_r_gpu_begin(nullptr), m_r_gpu_end(nullptr),
m_dip_gpu_begin(nullptr), m_dip_gpu_end(nullptr),
m_v_gpu_begin(nullptr), m_v_gpu_end(nullptr), m_q_gpu_begin(nullptr),
m_q_gpu_end(nullptr), m_director_gpu_begin(nullptr),
m_director_gpu_end(nullptr), m_needsParticleStructGpu(false),
m_splitParticleStructGpu(false){};
~EspressoSystemInterface() override = default;

void gatherParticles();
void split_particle_struct();
#ifdef CUDA
void enableParticleCommunication() {
if (!gpu_get_global_particle_vars_pointer_host()->communication_enabled) {
gpu_init_particle_comm();
cuda_bcast_global_part_params();
reallocDeviceMemory(gpu_get_particle_pointer().size());
}
};
void enableParticleCommunication();
void reallocDeviceMemory(std::size_t n);
#endif

std::size_t m_gpu_npart;
bool m_gpu;
private:
std::size_t m_gpu_npart = 0;
bool m_gpu = false;

float *m_r_gpu_begin;
float *m_r_gpu_end;
float *m_r_gpu_begin = nullptr;
float *m_dip_gpu_begin = nullptr;
float *m_q_gpu_begin = nullptr;

float *m_dip_gpu_begin;
float *m_dip_gpu_end;
bool m_needsParticleStructGpu = false;
bool m_splitParticleStructGpu = false;

float *m_v_gpu_begin;
float *m_v_gpu_end;

float *m_q_gpu_begin;
float *m_q_gpu_end;

float *m_director_gpu_begin;
float *m_director_gpu_end;

bool m_needsParticleStructGpu;
bool m_splitParticleStructGpu;
bool m_needsRGpu = false;
bool m_needsQGpu = false;
bool m_needsFGpu = false;
bool m_needsDipGpu = false;
bool m_needsTorqueGpu = false;
#endif
};

/********************************************************************************************/

#endif
Loading

0 comments on commit 9e363c4

Please sign in to comment.