Skip to content

Commit

Permalink
Merge pull request #79 from ExCALIBUR-NEPTUNE/feature/better_scaling_…
Browse files Browse the repository at this point in the history
…dat_compression

Feature/better scaling dat compression
  • Loading branch information
will-saunders-ukaea authored Oct 21, 2024
2 parents a6f671b + caed538 commit 345dc82
Show file tree
Hide file tree
Showing 17 changed files with 703 additions and 216 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,12 @@ set(HEADER_FILES
${INCLUDE_DIR_NESO_PARTICLES}/local_mapping.hpp
${INCLUDE_DIR_NESO_PARTICLES}/local_move.hpp
${INCLUDE_DIR_NESO_PARTICLES}/loop/access_descriptors.hpp
${INCLUDE_DIR_NESO_PARTICLES}/loop/kernel.hpp
${INCLUDE_DIR_NESO_PARTICLES}/loop/particle_loop.hpp
${INCLUDE_DIR_NESO_PARTICLES}/loop/particle_loop_base.hpp
${INCLUDE_DIR_NESO_PARTICLES}/loop/particle_loop_utility.hpp
${INCLUDE_DIR_NESO_PARTICLES}/loop/particle_loop_index.hpp
${INCLUDE_DIR_NESO_PARTICLES}/loop/particle_loop_iteration_set.hpp
${INCLUDE_DIR_NESO_PARTICLES}/loop/pli_particle_dat.hpp
${INCLUDE_DIR_NESO_PARTICLES}/mesh_hierarchy.hpp
${INCLUDE_DIR_NESO_PARTICLES}/mesh_hierarchy_data/mesh_hierarchy_container.hpp
Expand Down
235 changes: 179 additions & 56 deletions include/neso_particles/cell_dat_compression.hpp

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions include/neso_particles/cell_dat_move.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ class CellMove {
// ErrorPropagate object to detect bad cell indices
ErrorPropagate ep_bad_cell_indices;

std::size_t num_bytes_per_particle;

inline void get_particle_dat_info() {

this->num_dats_real = this->particle_dats_real.size();
Expand All @@ -81,17 +83,20 @@ class CellMove {
this->d_particle_dat_ptr_int.realloc_no_copy(this->num_dats_int);
this->d_particle_dat_ncomp_int.realloc_no_copy(this->num_dats_int);

this->num_bytes_per_particle = 0;
int index = 0;
for (auto &dat : this->particle_dats_real) {
this->h_particle_dat_ptr_real.ptr[index] = dat.second->impl_get();
this->h_particle_dat_ncomp_real.ptr[index] = dat.second->ncomp;
index++;
this->num_bytes_per_particle += dat.second->ncomp * sizeof(REAL);
}
index = 0;
for (auto &dat : particle_dats_int) {
this->h_particle_dat_ptr_int.ptr[index] = dat.second->impl_get();
this->h_particle_dat_ncomp_int.ptr[index] = dat.second->ncomp;
index++;
this->num_bytes_per_particle += dat.second->ncomp * sizeof(INT);
}

// copy to the device
Expand Down
1 change: 1 addition & 0 deletions include/neso_particles/cell_dat_move_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ inline void CellMove::move() {
.wait_and_throw();

r.end();
r.num_bytes = move_count * this->num_bytes_per_particle * 2;
this->sycl_target->profile_map.add_region(r);

sycl_target->profile_map.inc("CellMove", "cell_move", 1,
Expand Down
32 changes: 32 additions & 0 deletions include/neso_particles/compute_target.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,38 @@ inline NDRangePeel1D get_nd_range_peel_1d(const std::size_t size,
sycl::range<1>(local_size))};
}

/**
* Compute the exclusive scan of an array using the SYCL group built-ins.
*
* @param[in] sycl_target Compute device to use.
* @param[in] N Number of elements.
* @param[in] d_src Device poitner to source values.
* @param[in, d_dst Device pointer to destination values.
* @returns Event to wait on for completion.
*/
template <typename T>
[[nodiscard]] inline sycl::event
joint_exclusive_scan(SYCLTargetSharedPtr sycl_target, std::size_t N, T *d_src,
T *d_dst) {
const std::size_t group_size =
std::min(static_cast<std::size_t>(
sycl_target->device
.get_info<sycl::info::device::max_work_group_size>()),
static_cast<std::size_t>(N));
NESOASSERT(group_size >= 1, "Bad group size for exclusive_scan.");

return sycl_target->queue.submit([&](sycl::handler &cgh) {
cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(group_size),
sycl::range<1>(group_size)),
[=](sycl::nd_item<1> it) {
T *first = d_src;
T *last = first + N;
sycl::joint_exclusive_scan(it.get_group(), first, last,
d_dst, sycl::plus<T>());
});
});
}

} // namespace NESO::Particles

#endif
128 changes: 128 additions & 0 deletions include/neso_particles/loop/kernel.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#ifndef __NESO_PARTICLES_LOOP_KERNEL_HPP_
#define __NESO_PARTICLES_LOOP_KERNEL_HPP_

#include "../compute_target.hpp"

namespace NESO::Particles {

namespace Kernel {

/**
* Type for holding the number of bytes read and written by a kernel.
*/
struct NumBytes {
std::size_t value{0};
NumBytes() = default;
NumBytes(const std::size_t value) : value(value) {}
};

/**
* Type for holding the number of FLOPs a kernel performs.
*/
struct NumFLOP {
std::size_t value{0};
NumFLOP() = default;
NumFLOP(const std::size_t value) : value(value) {}
};

/**
* Container to store the metadata for a kernel.
*/
class Metadata {
protected:
inline void unpack_arg(NumBytes &arg) { this->num_bytes = arg; }
inline void unpack_arg(NumFLOP &arg) { this->num_flops = arg; }
template <typename T> inline void recurse_args(T first) {
this->unpack_arg(first);
}
template <typename T, typename... ARGS>
inline void recurse_args(T first, ARGS... args) {
this->unpack_arg(first);
this->recurse_args(args...);
}

public:
/// The number of bytes moved by a single execution of the kernel.
NumBytes num_bytes;
/// The number of FLOP performed by a single execution of the kernel.
NumFLOP num_flops;

Metadata() = default;

/**
* Create a metadata store from a collection of attributes.
*
* @param args NumBytes and NumFLOP instances.
*/
template <typename... ARGS> Metadata(ARGS... args) {
this->recurse_args(args...);
}
};

/**
* This is a container to wrap a device copyable callable type as the kernel
* function along with user provided metadata.
*/
template <typename KERNEL_TYPE> struct Kernel {
/// The kernel for the parallel loop.
KERNEL_TYPE kernel;
/// Metadata that accompanies the kernel.
Metadata metadata;
/**
* Wrap a kernel without any metadata.
*
* @param kernel Device copyable callable to use as a kernel.
*/
Kernel(KERNEL_TYPE kernel) : kernel(kernel) {}
/**
* Wrap a kernel with metadata.
*
* @param kernel Device copyable callable to use as a kernel.
* @param metadata Metadata for the kernel.
*/
Kernel(KERNEL_TYPE kernel, Metadata metadata)
: kernel(kernel), metadata(metadata) {}
};

} // namespace Kernel

namespace ParticleLoopImplementation {

/**
* Extract the kernel from an object for a ParticleLoop.
* @param kernel Device copyable callable type to use as kernel.
* @returns kernel.
*/
template <typename T> inline T &get_kernel(Kernel::Kernel<T> &kernel) {
return kernel.kernel;
}

/**
* Extract the number of bytes per kernel invocation from the kernel
* metadata.
*
* @param kernel Device copyable callable type to use as kernel.
* @returns Number of bytes in kernel metadata.
*/
template <typename T>
inline std::size_t get_kernel_num_bytes(Kernel::Kernel<T> &kernel) {
return kernel.metadata.num_bytes.value;
}

/**
* Extract the number of flops per kernel invocation from the kernel
* metadata.
*
* @param kernel Device copyable callable type to use as kernel.
* @returns Number flops in kernel metadata.
*/
template <typename T>
inline std::size_t get_kernel_num_flops(Kernel::Kernel<T> &kernel) {
return kernel.metadata.num_flops.value;
}

}; // namespace ParticleLoopImplementation

} // namespace NESO::Particles

#endif
116 changes: 11 additions & 105 deletions include/neso_particles/loop/particle_loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,114 +21,12 @@
#include "../particle_dat.hpp"
#include "../particle_spec.hpp"
#include "../sycl_typedefs.hpp"
#include "kernel.hpp"
#include "particle_loop_base.hpp"
#include "particle_loop_index.hpp"
#include "particle_loop_iteration_set.hpp"
#include "pli_particle_dat.hpp"

namespace NESO::Particles::ParticleLoopImplementation {

/**
* For a set of cells containing particles create several sycl::nd_range
* instances which cover the iteration space of all particles. This exists to
* create an iteration set over all particles which is blocked, to reduce the
* number of kernel launches, and reasonably robust to non-uniform.
*/
struct ParticleLoopIterationSet {

/// The number of blocks of cells.
const int nbin;
/// The number of cells.
const int ncell;
/// Host accessible pointer to the number of particles in each cell.
int *h_npart_cell;
/// Container to store the sycl::nd_ranges.
std::vector<sycl::nd_range<2>> iteration_set;
/// Offsets to add to the cell index to map to the correct cell.
std::vector<std::size_t> cell_offsets;

/**
* Creates iteration set creator for a given set of cell particle counts.
*
* @param nbin Number of blocks of cells.
* @param ncell Number of cells.
* @param h_npart_cell Host accessible array of cell particle counts.
*/
ParticleLoopIterationSet(const int nbin, const int ncell, int *h_npart_cell)
: nbin(std::min(ncell, nbin)), ncell(ncell), h_npart_cell(h_npart_cell) {
this->iteration_set.reserve(nbin);
this->cell_offsets.reserve(nbin);
}

/**
* Create and return an iteration set which is formed as nbin
* sycl::nd_ranges.
*
* @param cell If set iteration set will only cover this cell.
* @param local_size Optional size of SYCL work groups.
* @returns Tuple containing: Number of bins, sycl::nd_ranges, cell index
* offsets.
*/
inline std::tuple<int, std::vector<sycl::nd_range<2>> &,
std::vector<std::size_t> &>
get(const std::optional<int> cell = std::nullopt,
const size_t local_size = 256) {

this->iteration_set.clear();
this->cell_offsets.clear();

if (cell == std::nullopt) {
for (int binx = 0; binx < nbin; binx++) {
int start, end;
get_decomp_1d(nbin, ncell, binx, &start, &end);
const int bin_width = end - start;
int cell_maxi = 0;
int cell_avg = 0;
for (int cellx = start; cellx < end; cellx++) {
const int cell_occ = h_npart_cell[cellx];
cell_maxi = std::max(cell_maxi, cell_occ);
cell_avg += cell_occ;
}
cell_avg = (((REAL)cell_avg) / ((REAL)(end - start)));
const size_t cell_local_size =
get_min_power_of_two((size_t)cell_avg, local_size);
const auto div_mod = std::div(static_cast<long long>(cell_maxi),
static_cast<long long>(cell_local_size));
const std::size_t outer_size =
static_cast<std::size_t>(div_mod.quot +
(div_mod.rem == 0 ? 0 : 1)) *
cell_local_size;

if (cell_maxi > 0) {
this->iteration_set.emplace_back(
sycl::nd_range<2>(sycl::range<2>(bin_width, outer_size),
sycl::range<2>(1, cell_local_size)));
this->cell_offsets.push_back(static_cast<std::size_t>(start));
}
}

return {this->iteration_set.size(), this->iteration_set,
this->cell_offsets};
} else {
const int cellx = cell.value();
const size_t cell_maxi = static_cast<size_t>(h_npart_cell[cellx]);
const size_t cell_local_size =
get_min_power_of_two((size_t)cell_maxi, local_size);

const auto div_mod = std::div(static_cast<long long>(cell_maxi),
static_cast<long long>(cell_local_size));
const std::size_t outer_size =
static_cast<std::size_t>(div_mod.quot + (div_mod.rem == 0 ? 0 : 1)) *
cell_local_size;
this->iteration_set.emplace_back(sycl::nd_range<2>(
sycl::range<2>(1, outer_size), sycl::range<2>(1, cell_local_size)));
this->cell_offsets.push_back(static_cast<std::size_t>(cellx));
return {1, this->iteration_set, this->cell_offsets};
}
}
};

} // namespace NESO::Particles::ParticleLoopImplementation

namespace NESO::Particles {

namespace ParticleLoopImplementation {
Expand Down Expand Up @@ -417,6 +315,13 @@ class ParticleLoop : public ParticleLoopBase {
this->profile_region = ProfileRegion(this->loop_type, this->name);
}

inline void profiling_region_metrics(const std::size_t size) {
this->profile_region.num_bytes =
size * ParticleLoopImplementation::get_kernel_num_bytes(this->kernel);
this->profile_region.num_flops =
size * ParticleLoopImplementation::get_kernel_num_flops(this->kernel);
}

inline void profile_region_finalise() {
this->profile_region.end();
this->sycl_target->profile_map.add_region(this->profile_region);
Expand Down Expand Up @@ -530,7 +435,8 @@ class ParticleLoop : public ParticleLoopBase {

auto k_npart_cell_lb = this->d_npart_cell_lb;
auto is = this->iteration_set->get(cell, this->local_size);
auto k_kernel = this->kernel;
this->profiling_region_metrics(this->iteration_set->iteration_set_size);
auto k_kernel = ParticleLoopImplementation::get_kernel(this->kernel);

const int nbin = std::get<0>(is);
this->sycl_target->profile_map.inc(
Expand Down
29 changes: 29 additions & 0 deletions include/neso_particles/loop/particle_loop_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,35 @@ inline void pre_loop(ParticleLoopGlobalInfo *global_info, T &arg) {}
template <typename T>
inline void post_loop(ParticleLoopGlobalInfo *global_info, T &arg) {}

/**
* Extract the kernel from an object for a ParticleLoop.
* @param kernel Device copyable callable type to use as kernel.
* @returns kernel.
*/
template <typename T> inline T &get_kernel(T &kernel) { return kernel; }

/**
* Extract the number of bytes per kernel invocation from the kernel. No-op
* implementation for when kernel is a generic callable.
*
* @param kernel Device copyable callable type to use as kernel.
* @returns 0.
*/
template <typename T> inline std::size_t get_kernel_num_bytes(T &kernel) {
return 0;
}

/**
* Extract the number of flops per kernel invocation from the kernel. No-op
* implementation for when kernel is a generic callable.
*
* @param kernel Device copyable callable type to use as kernel.
* @returns 0.
*/
template <typename T> inline std::size_t get_kernel_num_flops(T &kernel) {
return 0;
}

} // namespace ParticleLoopImplementation

/**
Expand Down
Loading

0 comments on commit 345dc82

Please sign in to comment.