better particle dat append implementation, removed sycl::buffer from library implementation
will-saunders-ukaea committed Jan 7, 2025
1 parent ddedc7f commit dc61a1c
Showing 3 changed files with 98 additions and 72 deletions.
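All three files follow one pattern: host-side std::vector data that was previously wrapped in sycl::buffer objects (with accessors created inside each kernel submission) is now staged into device allocations up front, and kernels read plain device pointers. Below is a minimal standalone sketch of the before/after idea in plain SYCL 2020; the function name scale_usm and its arguments are illustrative, not part of the library.

#include <sycl/sycl.hpp>
#include <vector>

// Sketch: fill a USM device allocation with an explicit asynchronous copy,
// then launch a kernel that captures the raw pointer. This is the pattern
// that replaces sycl::buffer + accessors in this commit.
void scale_usm(sycl::queue &q, std::vector<double> &h_x, const double factor) {
  const std::size_t n = h_x.size();
  double *d_x = sycl::malloc_device<double>(n, q);
  // Explicit host-to-device copy; returns an event instead of relying on
  // implicit buffer transfers.
  auto e_copy = q.memcpy(d_x, h_x.data(), n * sizeof(double));
  // The kernel depends on the copy event and reads the pointer directly.
  q.parallel_for(sycl::range<1>(n), e_copy,
                 [=](sycl::id<1> idx) { d_x[idx] *= factor; })
      .wait_and_throw();
  // Copy the result back and release the allocation.
  q.memcpy(h_x.data(), d_x, n * sizeof(double)).wait_and_throw();
  sycl::free(d_x, q);
}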
include/neso_particles/particle_dat.hpp (26 additions, 31 deletions)
@@ -259,16 +259,19 @@ template <typename T> class ParticleDatT {
    * @param npart_new Number of new particles to add.
    * @param new_data_exists Indicate if there is new data to copy or if the
    * data should be initialised with zeros.
-   * @param cells Cell indices of the new particles.
-   * @param layers Layer (row) indices of the new particles.
-   * @param data Particle data to copy into the ParticleDat.
+   * @param h_cells Cell indices of the new particles on the host.
+   * @param d_cells Cell indices of the new particles on the device.
+   * @param d_layers Layer (row) indices of the new particles on the device.
+   * @param d_data Particle data to copy into the ParticleDat on the device.
    * @param es EventStack to push events onto.
    */
-  inline void append_particle_data(const int npart_new,
-                                   const bool new_data_exists,
-                                   std::vector<INT> &cells,
-                                   std::vector<INT> &layers,
-                                   std::vector<T> &data, EventStack &es);
+  inline void
+  append_particle_data(const int npart_new, const bool new_data_exists,
+                       std::vector<INT> &h_cells,
+                       const std::shared_ptr<BufferDevice<INT>> d_cells,
+                       const std::shared_ptr<BufferDevice<INT>> d_layers,
+                       const std::shared_ptr<BufferDevice<T>> d_data,
+                       EventStack &es);
 
   /**
    * Realloc the underlying CellDat such that the indicated new number of
@@ -607,55 +610,47 @@ template <typename T> inline void ParticleDatT<T>::trim_cell_dat_rows() {
  */
 template <typename T>
 inline void ParticleDatT<T>::append_particle_data(
-    const int npart_new, const bool new_data_exists, std::vector<INT> &cells,
-    std::vector<INT> &layers, std::vector<T> &data, EventStack &es) {
+    const int npart_new, const bool new_data_exists, std::vector<INT> &h_cells,
+    const std::shared_ptr<BufferDevice<INT>> d_cells,
+    const std::shared_ptr<BufferDevice<INT>> d_layers,
+    const std::shared_ptr<BufferDevice<T>> d_data, EventStack &es) {
+
+  if (npart_new == 0) {
+    return;
+  }
 
   this->write_callback_wrapper(0);
 
-  NESOASSERT(static_cast<std::size_t>(npart_new) <= cells.size(),
+  NESOASSERT(static_cast<std::size_t>(npart_new) <= d_cells->size,
              "incorrect number of cells");
 
   // using "this" in the kernel causes segfaults on the device so we make a
   // copy here.
-  const size_t size_npart_new = static_cast<size_t>(npart_new);
   const int ncomp = this->ncomp;
   T ***d_cell_dat_ptr = this->impl_get();
 
-  sycl::buffer<INT, 1> b_cells(cells.data(), sycl::range<1>{size_npart_new});
-  sycl::buffer<INT, 1> b_layers(layers.data(), sycl::range<1>{size_npart_new});
+  const INT *k_cells = d_cells->ptr;
+  const INT *k_layers = d_layers->ptr;
 
   // If data is supplied copy the data otherwise zero the components.
   if (new_data_exists) {
-    sycl::buffer<T, 1> b_data(data.data(),
-                              sycl::range<1>{size_npart_new * this->ncomp});
+    // The new data
+    const T *k_data = d_data->ptr;
     es.push(this->sycl_target->queue.submit([&](sycl::handler &cgh) {
-      // The cell counts on this dat
-      auto a_cells = b_cells.get_access<sycl::access::mode::read>(cgh);
-      auto a_layers = b_layers.get_access<sycl::access::mode::read>(cgh);
-      // The new data
-      auto a_data = b_data.template get_access<sycl::access::mode::read>(cgh);
       cgh.parallel_for<>(sycl::range<1>(npart_new), [=](sycl::id<1> idx) {
-        const INT cellx = a_cells[idx];
-        const INT layerx = a_layers[idx];
+        const INT cellx = k_cells[idx];
+        const INT layerx = k_layers[idx];
         // copy the data into the dat.
         for (int cx = 0; cx < ncomp; cx++) {
-          d_cell_dat_ptr[cellx][cx][layerx] = a_data[cx * npart_new + idx];
+          d_cell_dat_ptr[cellx][cx][layerx] = k_data[cx * npart_new + idx];
         }
       });
     }));
   } else {
     es.push(this->sycl_target->queue.submit([&](sycl::handler &cgh) {
-      // The cell counts on this dat
-      auto a_cells = b_cells.get_access<sycl::access::mode::read>(cgh);
-      auto a_layers = b_layers.get_access<sycl::access::mode::read>(cgh);
-
       cgh.parallel_for<>(sycl::range<1>(npart_new), [=](sycl::id<1> idx) {
-        const INT cellx = a_cells[idx];
-        const INT layerx = a_layers[idx];
+        const INT cellx = k_cells[idx];
+        const INT layerx = k_layers[idx];
         // zero the new components in the dat
         for (int cx = 0; cx < ncomp; cx++) {
           d_cell_dat_ptr[cellx][cx][layerx] = ((T)0);
@@ -665,7 +660,7 @@ inline void ParticleDatT<T>::append_particle_data(
   }
 
   for (int px = 0; px < npart_new; px++) {
-    auto cellx = cells[px];
+    auto cellx = h_cells[px];
     this->h_npart_cell[cellx]++;
   }
   es.push(this->async_npart_host_to_device());
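For reference, a hedged sketch of the new calling convention, assembled only from calls visible in this commit (BufferDevice exposing ptr, size and set_async, EventStack with push and wait). The names dat, sycl_target, ncomp and the literal vectors are illustrative stand-ins for caller state, not library code.

// Host-side staging for an append of 3 new particles (sketch).
std::vector<INT> h_cells = {0, 0, 1};  // cell index per new particle
std::vector<INT> h_layers = {0, 1, 0}; // layer (row) index per new particle
// Component cx of particle idx lives at cx * npart_new + idx, matching the
// k_data indexing in the kernel above.
std::vector<REAL> h_data(ncomp * 3);

auto d_cells =
    std::make_shared<BufferDevice<INT>>(sycl_target, h_cells.size());
auto d_layers =
    std::make_shared<BufferDevice<INT>>(sycl_target, h_layers.size());
auto d_data =
    std::make_shared<BufferDevice<REAL>>(sycl_target, h_data.size());

EventStack es;
// Stage the host data on the device; set_async returns an event.
es.push(d_cells->set_async(h_cells));
es.push(d_layers->set_async(h_layers));
es.push(d_data->set_async(h_data));
es.wait();

// The append itself is asynchronous and pushes its events onto es.
dat->append_particle_data(3, true, h_cells, d_cells, d_layers, d_data, es);
es.wait();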
include/neso_particles/particle_group_impl.hpp (56 additions, 35 deletions)
@@ -74,20 +74,46 @@ inline void ParticleGroup::add_particles_local(ParticleSet &particle_data) {
   buffer_memcpy(this->d_npart_cell, this->h_npart_cell).wait_and_throw();
   this->recompute_npart_cell_es();
 
+  // Containers for device copies of the data
+  std::map<Sym<INT>, std::shared_ptr<BufferDevice<INT>>> map_int;
+  std::map<Sym<REAL>, std::shared_ptr<BufferDevice<REAL>>> map_real;
+
+  // Make these buffers once for all ParticleDats.
+  auto d_cells =
+      std::make_shared<BufferDevice<INT>>(this->sycl_target, cellids.size());
+  auto d_layers =
+      std::make_shared<BufferDevice<INT>>(this->sycl_target, layers.size());
+
   EventStack es;
+  es.push(d_cells->set_async(cellids));
+  es.push(d_layers->set_async(layers));
+  for (auto &dat : this->particle_dats_real) {
+    auto &tmphostdata = particle_data.get(dat.first);
+    auto tmpdata = std::make_shared<BufferDevice<REAL>>(this->sycl_target,
+                                                        tmphostdata.size());
+    map_real[dat.first] = tmpdata;
+    es.push(tmpdata->set_async(tmphostdata));
+  }
+  for (auto &dat : this->particle_dats_int) {
+    auto &tmphostdata = particle_data.get(dat.first);
+    auto tmpdata = std::make_shared<BufferDevice<INT>>(this->sycl_target,
+                                                       tmphostdata.size());
+    map_int[dat.first] = tmpdata;
+    es.push(tmpdata->set_async(tmphostdata));
+  }
+  es.wait();
+
   for (auto &dat : this->particle_dats_real) {
     realloc_dat(dat.second);
-    dat.second->append_particle_data(npart_new,
-                                     particle_data.contains(dat.first), cellids,
-                                     layers, particle_data.get(dat.first), es);
+    dat.second->append_particle_data(
+        npart_new, particle_data.contains(dat.first), cellids, d_cells,
+        d_layers, map_real.at(dat.first), es);
   }
 
   for (auto &dat : this->particle_dats_int) {
     realloc_dat(dat.second);
-    dat.second->append_particle_data(npart_new,
-                                     particle_data.contains(dat.first), cellids,
-                                     layers, particle_data.get(dat.first), es);
+    dat.second->append_particle_data(
+        npart_new, particle_data.contains(dat.first), cellids, d_cells,
+        d_layers, map_int.at(dat.first), es);
   }
 
   es.wait();
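Note the structure of the hunk above: the cell and layer buffers are created once and shared by every ParticleDat, each per-dat host-to-device copy is pushed onto a single EventStack, and there is exactly one wait before the appends start. A sketch of the same batching idea in plain SYCL 2020 (illustrative helper, not library code):

#include <sycl/sycl.hpp>
#include <vector>

// Sketch: queue every independent host-to-device copy, then synchronise once.
void stage_all(sycl::queue &q, const std::vector<std::vector<double>> &host,
               const std::vector<double *> &device) {
  std::vector<sycl::event> copy_events;
  for (std::size_t i = 0; i < host.size(); i++) {
    copy_events.push_back(
        q.memcpy(device[i], host[i].data(), host[i].size() * sizeof(double)));
  }
  // One synchronisation point covers every copy.
  for (auto &e : copy_events) {
    e.wait_and_throw();
  }
}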
@@ -112,34 +138,29 @@
 inline void ParticleGroup::remove_particles(const int npart,
                                             const std::vector<INT> &cells,
                                             const std::vector<INT> &layers) {

-  this->d_remove_cells.realloc_no_copy(npart);
-  this->d_remove_layers.realloc_no_copy(npart);
-
-  auto k_cells = this->d_remove_cells.ptr;
-  auto k_layers = this->d_remove_layers.ptr;
-
-  NESOASSERT(cells.size() >= static_cast<std::size_t>(npart),
-             "Bad cells length compared to npart");
-  NESOASSERT(layers.size() >= static_cast<std::size_t>(npart),
-             "Bad layers length compared to npart");
-
-  auto b_cells = sycl::buffer<INT>(cells.data(), sycl::range<1>(npart));
-  auto b_layers = sycl::buffer<INT>(layers.data(), sycl::range<1>(npart));
-
-  this->sycl_target->queue
-      .submit([&](sycl::handler &cgh) {
-        auto a_cells = b_cells.get_access<sycl::access::mode::read>(cgh);
-        auto a_layers = b_layers.get_access<sycl::access::mode::read>(cgh);
-        cgh.parallel_for<>(sycl::range<1>(static_cast<size_t>(npart)),
-                           [=](sycl::id<1> idx) {
-                             k_cells[idx] = static_cast<INT>(a_cells[idx]);
-                             k_layers[idx] = static_cast<INT>(a_layers[idx]);
-                           });
-      })
-      .wait_and_throw();
-
-  this->remove_particles(npart, this->d_remove_cells.ptr,
-                         this->d_remove_layers.ptr);
+  if (npart > 0) {
+    this->d_remove_cells.realloc_no_copy(npart);
+    this->d_remove_layers.realloc_no_copy(npart);
+
+    auto k_cells = this->d_remove_cells.ptr;
+    auto k_layers = this->d_remove_layers.ptr;
+
+    NESOASSERT(cells.size() >= static_cast<std::size_t>(npart),
+               "Bad cells length compared to npart");
+    NESOASSERT(layers.size() >= static_cast<std::size_t>(npart),
+               "Bad layers length compared to npart");
+
+    const std::size_t num_bytes =
+        static_cast<std::size_t>(npart) * sizeof(INT);
+    auto event_cells =
+        this->sycl_target->queue.memcpy(k_cells, cells.data(), num_bytes);
+    auto event_layers =
+        this->sycl_target->queue.memcpy(k_layers, layers.data(), num_bytes);
+
+    event_cells.wait_and_throw();
+    event_layers.wait_and_throw();
+
+    this->remove_particles(npart, k_cells, k_layers);
+  }
 }
 
 inline void ParticleGroup::remove_particles(
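Two things changed in remove_particles: the host-to-device copy of the removal indices is now a pair of direct queue.memcpy calls instead of a round trip through sycl::buffer objects plus a copy kernel, and the body is guarded by npart > 0, which avoids constructing zero-sized buffers and launching empty kernels (that motivation is inferred from the diff, not stated in the commit). queue.memcpy is the standard SYCL 2020 USM copy and returns an event. A minimal sketch, assuming INT is the library's integer typedef and d_ptr points at a sufficiently large device allocation:

// Sketch: direct host-to-device copy of an index vector.
inline void copy_indices(sycl::queue &q, const std::vector<INT> &h_idx,
                         INT *d_ptr) {
  if (!h_idx.empty()) {
    // One contiguous copy replaces the old buffer + kernel round trip.
    q.memcpy(d_ptr, h_idx.data(), h_idx.size() * sizeof(INT))
        .wait_and_throw();
  }
}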
test/test_particle_dat.cpp (16 additions, 6 deletions)
@@ -51,9 +51,14 @@ TEST(ParticleDat, test_particle_dat_append_1) {
     ASSERT_TRUE(A->cell_dat.nrow[cellx] >= counts[cellx]);
   }
   EventStack es;
-  A->append_particle_data(N, false, cells0, layers0, data0, es);
-  // the append is async
-  es.wait();
+  {
+    auto d_cells = std::make_shared<BufferDevice<INT>>(sycl_target, cells0);
+    auto d_layers = std::make_shared<BufferDevice<INT>>(sycl_target, layers0);
+    auto d_data = std::make_shared<BufferDevice<INT>>(sycl_target, data0);
+    A->append_particle_data(N, false, cells0, d_cells, d_layers, d_data, es);
+    // the append is async
+    es.wait();
+  }
 
   for (int cellx = 0; cellx < cell_count; cellx++) {
     ASSERT_TRUE(A->h_npart_cell[cellx] == counts[cellx]);
@@ -78,9 +83,14 @@ TEST(ParticleDat, test_particle_dat_append_1) {
     ASSERT_TRUE(A->cell_dat.nrow[cellx] >= counts[cellx]);
   }
 
-  A->append_particle_data(N, true, cells0, layers0, data0, es);
-  // the append is async
-  es.wait();
+  {
+    auto d_cells = std::make_shared<BufferDevice<INT>>(sycl_target, cells0);
+    auto d_layers = std::make_shared<BufferDevice<INT>>(sycl_target, layers0);
+    auto d_data = std::make_shared<BufferDevice<INT>>(sycl_target, data0);
+    A->append_particle_data(N, true, cells0, d_cells, d_layers, d_data, es);
+    // the append is async
+    es.wait();
+  }
 
   for (int cellx = 0; cellx < cell_count; cellx++) {
 
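The tests construct each BufferDevice directly from a host vector, rather than from a size followed by set_async as the library code above does. Judging by this usage the vector constructor allocates and fills the buffer in one step, though that semantic is inferred from the diff rather than documented in it. The two construction styles side by side (sketch, reusing the test's sycl_target and cells0):

// Size constructor plus an explicit asynchronous fill, as in the library:
auto d_sized = std::make_shared<BufferDevice<INT>>(sycl_target, cells0.size());
EventStack es;
es.push(d_sized->set_async(cells0));
es.wait();
// Vector constructor, as in the tests (apparently allocate + copy in one step):
auto d_filled = std::make_shared<BufferDevice<INT>>(sycl_target, cells0);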
