better particle dat append implementation, removed sycl::buffer from library implementation
will-saunders-ukaea committed Jan 7, 2025
1 parent ddedc7f commit dc61a1c
Showing 3 changed files with 98 additions and 72 deletions.
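All three files follow one pattern: host-side std::vector data that was previously wrapped in sycl::buffer objects (with accessors created inside each kernel submission) is now staged into device allocations up front, and kernels read plain device pointers. Below is a minimal standalone sketch of the before/after idea in plain SYCL 2020; the function name scale_usm and its arguments are illustrative, not part of the library.

#include <sycl/sycl.hpp>
#include <vector>

// Sketch: fill a USM device allocation with an explicit asynchronous copy,
// then launch a kernel that captures the raw pointer. This is the pattern
// that replaces sycl::buffer + accessors in this commit.
void scale_usm(sycl::queue &q, std::vector<double> &h_x, const double factor) {
  const std::size_t n = h_x.size();
  double *d_x = sycl::malloc_device<double>(n, q);
  // Explicit host-to-device copy; returns an event instead of relying on
  // implicit buffer transfers.
  auto e_copy = q.memcpy(d_x, h_x.data(), n * sizeof(double));
  // The kernel depends on the copy event and reads the pointer directly.
  q.parallel_for(sycl::range<1>(n), e_copy,
                 [=](sycl::id<1> idx) { d_x[idx] *= factor; })
      .wait_and_throw();
  // Copy the result back and release the allocation.
  q.memcpy(h_x.data(), d_x, n * sizeof(double)).wait_and_throw();
  sycl::free(d_x, q);
}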
include/neso_particles/particle_dat.hpp (26 additions, 31 deletions)
@@ -259,16 +259,19 @@ template <typename T> class ParticleDatT {
    * @param npart_new Number of new particles to add.
    * @param new_data_exists Indicate if there is new data to copy or if the
    * data should be initialised with zeros.
-   * @param cells Cell indices of the new particles.
-   * @param layers Layer (row) indices of the new particles.
-   * @param data Particle data to copy into the ParticleDat.
+   * @param h_cells Cell indices of the new particles on the host.
+   * @param d_cells Cell indices of the new particles on the device.
+   * @param d_layers Layer (row) indices of the new particles on the device.
+   * @param d_data Particle data to copy into the ParticleDat on the device.
    * @param es EventStack to push events onto.
    */
-  inline void append_particle_data(const int npart_new,
-                                   const bool new_data_exists,
-                                   std::vector<INT> &cells,
-                                   std::vector<INT> &layers,
-                                   std::vector<T> &data, EventStack &es);
+  inline void
+  append_particle_data(const int npart_new, const bool new_data_exists,
+                       std::vector<INT> &h_cells,
+                       const std::shared_ptr<BufferDevice<INT>> d_cells,
+                       const std::shared_ptr<BufferDevice<INT>> d_layers,
+                       const std::shared_ptr<BufferDevice<T>> d_data,
+                       EventStack &es);
 
   /**
    * Realloc the underlying CellDat such that the indicated new number of
@@ -607,55 +610,47 @@ template <typename T> inline void ParticleDatT<T>::trim_cell_dat_rows() {
  */
 template <typename T>
 inline void ParticleDatT<T>::append_particle_data(
-    const int npart_new, const bool new_data_exists, std::vector<INT> &cells,
-    std::vector<INT> &layers, std::vector<T> &data, EventStack &es) {
+    const int npart_new, const bool new_data_exists, std::vector<INT> &h_cells,
+    const std::shared_ptr<BufferDevice<INT>> d_cells,
+    const std::shared_ptr<BufferDevice<INT>> d_layers,
+    const std::shared_ptr<BufferDevice<T>> d_data, EventStack &es) {
+
+  if (npart_new == 0) {
+    return;
+  }
 
   this->write_callback_wrapper(0);
 
-  NESOASSERT(static_cast<std::size_t>(npart_new) <= cells.size(),
+  NESOASSERT(static_cast<std::size_t>(npart_new) <= d_cells->size,
              "incorrect number of cells");
 
   // using "this" in the kernel causes segfaults on the device so we make a
   // copy here.
-  const size_t size_npart_new = static_cast<size_t>(npart_new);
   const int ncomp = this->ncomp;
   T ***d_cell_dat_ptr = this->impl_get();
 
-  sycl::buffer<INT, 1> b_cells(cells.data(), sycl::range<1>{size_npart_new});
-  sycl::buffer<INT, 1> b_layers(layers.data(), sycl::range<1>{size_npart_new});
+  const INT *k_cells = d_cells->ptr;
+  const INT *k_layers = d_layers->ptr;
 
   // If data is supplied copy the data otherwise zero the components.
   if (new_data_exists) {
-    sycl::buffer<T, 1> b_data(data.data(),
-                              sycl::range<1>{size_npart_new * this->ncomp});
+    // The new data
+    const T *k_data = d_data->ptr;
     es.push(this->sycl_target->queue.submit([&](sycl::handler &cgh) {
-      // The cell counts on this dat
-      auto a_cells = b_cells.get_access<sycl::access::mode::read>(cgh);
-      auto a_layers = b_layers.get_access<sycl::access::mode::read>(cgh);
-      // The new data
-      auto a_data = b_data.template get_access<sycl::access::mode::read>(cgh);
       cgh.parallel_for<>(sycl::range<1>(npart_new), [=](sycl::id<1> idx) {
-        const INT cellx = a_cells[idx];
-        const INT layerx = a_layers[idx];
+        const INT cellx = k_cells[idx];
+        const INT layerx = k_layers[idx];
         // copy the data into the dat.
         for (int cx = 0; cx < ncomp; cx++) {
-          d_cell_dat_ptr[cellx][cx][layerx] = a_data[cx * npart_new + idx];
+          d_cell_dat_ptr[cellx][cx][layerx] = k_data[cx * npart_new + idx];
         }
       });
     }));
   } else {
     es.push(this->sycl_target->queue.submit([&](sycl::handler &cgh) {
-      // The cell counts on this dat
-      auto a_cells = b_cells.get_access<sycl::access::mode::read>(cgh);
-      auto a_layers = b_layers.get_access<sycl::access::mode::read>(cgh);
-
       cgh.parallel_for<>(sycl::range<1>(npart_new), [=](sycl::id<1> idx) {
-        const INT cellx = a_cells[idx];
-        const INT layerx = a_layers[idx];
+        const INT cellx = k_cells[idx];
+        const INT layerx = k_layers[idx];
         // zero the new components in the dat
         for (int cx = 0; cx < ncomp; cx++) {
           d_cell_dat_ptr[cellx][cx][layerx] = ((T)0);
@@ -665,7 +660,7 @@ inline void ParticleDatT<T>::append_particle_data(
   }
 
   for (int px = 0; px < npart_new; px++) {
-    auto cellx = cells[px];
+    auto cellx = h_cells[px];
     this->h_npart_cell[cellx]++;
   }
   es.push(this->async_npart_host_to_device());
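For reference, a hedged sketch of the new calling convention, assembled only from calls visible in this commit (BufferDevice exposing ptr, size and set_async, EventStack with push and wait). The names dat, sycl_target, ncomp and the literal vectors are illustrative stand-ins for caller state, not library code.

// Host-side staging for an append of 3 new particles (sketch).
std::vector<INT> h_cells = {0, 0, 1};  // cell index per new particle
std::vector<INT> h_layers = {0, 1, 0}; // layer (row) index per new particle
// Component cx of particle idx lives at cx * npart_new + idx, matching the
// k_data indexing in the kernel above.
std::vector<REAL> h_data(ncomp * 3);

auto d_cells =
    std::make_shared<BufferDevice<INT>>(sycl_target, h_cells.size());
auto d_layers =
    std::make_shared<BufferDevice<INT>>(sycl_target, h_layers.size());
auto d_data =
    std::make_shared<BufferDevice<REAL>>(sycl_target, h_data.size());

EventStack es;
// Stage the host data on the device; set_async returns an event.
es.push(d_cells->set_async(h_cells));
es.push(d_layers->set_async(h_layers));
es.push(d_data->set_async(h_data));
es.wait();

// The append itself is asynchronous and pushes its events onto es.
dat->append_particle_data(3, true, h_cells, d_cells, d_layers, d_data, es);
es.wait();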
include/neso_particles/particle_group_impl.hpp (56 additions, 35 deletions)
@@ -74,20 +74,46 @@ inline void ParticleGroup::add_particles_local(ParticleSet &particle_data) {
   buffer_memcpy(this->d_npart_cell, this->h_npart_cell).wait_and_throw();
   this->recompute_npart_cell_es();
 
+  // Containers for device copies of the data
+  std::map<Sym<INT>, std::shared_ptr<BufferDevice<INT>>> map_int;
+  std::map<Sym<REAL>, std::shared_ptr<BufferDevice<REAL>>> map_real;
+
+  // Make these buffers once for all ParticleDats.
+  auto d_cells =
+      std::make_shared<BufferDevice<INT>>(this->sycl_target, cellids.size());
+  auto d_layers =
+      std::make_shared<BufferDevice<INT>>(this->sycl_target, layers.size());
+
   EventStack es;
+  es.push(d_cells->set_async(cellids));
+  es.push(d_layers->set_async(layers));
+  for (auto &dat : this->particle_dats_real) {
+    auto &tmphostdata = particle_data.get(dat.first);
+    auto tmpdata = std::make_shared<BufferDevice<REAL>>(this->sycl_target,
+                                                        tmphostdata.size());
+    map_real[dat.first] = tmpdata;
+    es.push(tmpdata->set_async(tmphostdata));
+  }
+  for (auto &dat : this->particle_dats_int) {
+    auto &tmphostdata = particle_data.get(dat.first);
+    auto tmpdata = std::make_shared<BufferDevice<INT>>(this->sycl_target,
+                                                       tmphostdata.size());
+    map_int[dat.first] = tmpdata;
+    es.push(tmpdata->set_async(tmphostdata));
+  }
+  es.wait();
+
   for (auto &dat : this->particle_dats_real) {
     realloc_dat(dat.second);
-    dat.second->append_particle_data(npart_new,
-                                     particle_data.contains(dat.first), cellids,
-                                     layers, particle_data.get(dat.first), es);
+    dat.second->append_particle_data(
+        npart_new, particle_data.contains(dat.first), cellids, d_cells,
+        d_layers, map_real.at(dat.first), es);
   }
 
   for (auto &dat : this->particle_dats_int) {
     realloc_dat(dat.second);
-    dat.second->append_particle_data(npart_new,
-                                     particle_data.contains(dat.first), cellids,
-                                     layers, particle_data.get(dat.first), es);
+    dat.second->append_particle_data(
+        npart_new, particle_data.contains(dat.first), cellids, d_cells,
+        d_layers, map_int.at(dat.first), es);
   }
 
   es.wait();
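Note the structure of the hunk above: the cell and layer buffers are created once and shared by every ParticleDat, each per-dat host-to-device copy is pushed onto a single EventStack, and there is exactly one wait before the appends start. A sketch of the same batching idea in plain SYCL 2020 (illustrative helper, not library code):

#include <sycl/sycl.hpp>
#include <vector>

// Sketch: queue every independent host-to-device copy, then synchronise once.
void stage_all(sycl::queue &q, const std::vector<std::vector<double>> &host,
               const std::vector<double *> &device) {
  std::vector<sycl::event> copy_events;
  for (std::size_t i = 0; i < host.size(); i++) {
    copy_events.push_back(
        q.memcpy(device[i], host[i].data(), host[i].size() * sizeof(double)));
  }
  // One synchronisation point covers every copy.
  for (auto &e : copy_events) {
    e.wait_and_throw();
  }
}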
@@ -112,34 +138,29 @@
 inline void ParticleGroup::remove_particles(const int npart,
                                             const std::vector<INT> &cells,
                                             const std::vector<INT> &layers) {

-  this->d_remove_cells.realloc_no_copy(npart);
-  this->d_remove_layers.realloc_no_copy(npart);
-
-  auto k_cells = this->d_remove_cells.ptr;
-  auto k_layers = this->d_remove_layers.ptr;
-
-  NESOASSERT(cells.size() >= static_cast<std::size_t>(npart),
-             "Bad cells length compared to npart");
-  NESOASSERT(layers.size() >= static_cast<std::size_t>(npart),
-             "Bad layers length compared to npart");
-
-  auto b_cells = sycl::buffer<INT>(cells.data(), sycl::range<1>(npart));
-  auto b_layers = sycl::buffer<INT>(layers.data(), sycl::range<1>(npart));
-
-  this->sycl_target->queue
-      .submit([&](sycl::handler &cgh) {
-        auto a_cells = b_cells.get_access<sycl::access::mode::read>(cgh);
-        auto a_layers = b_layers.get_access<sycl::access::mode::read>(cgh);
-        cgh.parallel_for<>(sycl::range<1>(static_cast<size_t>(npart)),
-                           [=](sycl::id<1> idx) {
-                             k_cells[idx] = static_cast<INT>(a_cells[idx]);
-                             k_layers[idx] = static_cast<INT>(a_layers[idx]);
-                           });
-      })
-      .wait_and_throw();
-
-  this->remove_particles(npart, this->d_remove_cells.ptr,
-                         this->d_remove_layers.ptr);
+  if (npart > 0) {
+    this->d_remove_cells.realloc_no_copy(npart);
+    this->d_remove_layers.realloc_no_copy(npart);
+
+    auto k_cells = this->d_remove_cells.ptr;
+    auto k_layers = this->d_remove_layers.ptr;
+
+    NESOASSERT(cells.size() >= static_cast<std::size_t>(npart),
+               "Bad cells length compared to npart");
+    NESOASSERT(layers.size() >= static_cast<std::size_t>(npart),
+               "Bad layers length compared to npart");
+
+    const std::size_t num_bytes =
+        static_cast<std::size_t>(npart) * sizeof(INT);
+    auto event_cells =
+        this->sycl_target->queue.memcpy(k_cells, cells.data(), num_bytes);
+    auto event_layers =
+        this->sycl_target->queue.memcpy(k_layers, layers.data(), num_bytes);
+
+    event_cells.wait_and_throw();
+    event_layers.wait_and_throw();
+
+    this->remove_particles(npart, k_cells, k_layers);
+  }
 }
 
 inline void ParticleGroup::remove_particles(
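Two things changed in remove_particles: the host-to-device copy of the removal indices is now a pair of direct queue.memcpy calls instead of a round trip through sycl::buffer objects plus a copy kernel, and the body is guarded by npart > 0, which avoids constructing zero-sized buffers and launching empty kernels (that motivation is inferred from the diff, not stated in the commit). queue.memcpy is the standard SYCL 2020 USM copy and returns an event. A minimal sketch, assuming INT is the library's integer typedef and d_ptr points at a sufficiently large device allocation:

// Sketch: direct host-to-device copy of an index vector.
inline void copy_indices(sycl::queue &q, const std::vector<INT> &h_idx,
                         INT *d_ptr) {
  if (!h_idx.empty()) {
    // One contiguous copy replaces the old buffer + kernel round trip.
    q.memcpy(d_ptr, h_idx.data(), h_idx.size() * sizeof(INT))
        .wait_and_throw();
  }
}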
test/test_particle_dat.cpp (16 additions, 6 deletions)
@@ -51,9 +51,14 @@ TEST(ParticleDat, test_particle_dat_append_1) {
     ASSERT_TRUE(A->cell_dat.nrow[cellx] >= counts[cellx]);
   }
   EventStack es;
-  A->append_particle_data(N, false, cells0, layers0, data0, es);
-  // the append is async
-  es.wait();
+  {
+    auto d_cells = std::make_shared<BufferDevice<INT>>(sycl_target, cells0);
+    auto d_layers = std::make_shared<BufferDevice<INT>>(sycl_target, layers0);
+    auto d_data = std::make_shared<BufferDevice<INT>>(sycl_target, data0);
+    A->append_particle_data(N, false, cells0, d_cells, d_layers, d_data, es);
+    // the append is async
+    es.wait();
+  }
 
   for (int cellx = 0; cellx < cell_count; cellx++) {
     ASSERT_TRUE(A->h_npart_cell[cellx] == counts[cellx]);
@@ -78,9 +83,14 @@ TEST(ParticleDat, test_particle_dat_append_1) {
     ASSERT_TRUE(A->cell_dat.nrow[cellx] >= counts[cellx]);
   }
 
-  A->append_particle_data(N, true, cells0, layers0, data0, es);
-  // the append is async
-  es.wait();
+  {
+    auto d_cells = std::make_shared<BufferDevice<INT>>(sycl_target, cells0);
+    auto d_layers = std::make_shared<BufferDevice<INT>>(sycl_target, layers0);
+    auto d_data = std::make_shared<BufferDevice<INT>>(sycl_target, data0);
+    A->append_particle_data(N, true, cells0, d_cells, d_layers, d_data, es);
+    // the append is async
+    es.wait();
+  }
 
   for (int cellx = 0; cellx < cell_count; cellx++) {
 
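The tests construct each BufferDevice directly from a host vector, rather than from a size followed by set_async as the library code above does. Judging by this usage the vector constructor allocates and fills the buffer in one step, though that semantic is inferred from the diff rather than documented in it. The two construction styles side by side (sketch, reusing the test's sycl_target and cells0):

// Size constructor plus an explicit asynchronous fill, as in the library:
auto d_sized = std::make_shared<BufferDevice<INT>>(sycl_target, cells0.size());
EventStack es;
es.push(d_sized->set_async(cells0));
es.wait();
// Vector constructor, as in the tests (apparently allocate + copy in one step):
auto d_filled = std::make_shared<BufferDevice<INT>>(sycl_target, cells0);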
