Skip to content

Commit

Permalink
Update RMM adaptors, containers and tests to use get/set_current_device_resource_ref() (#1661)
Browse files Browse the repository at this point in the history

Closes #1660.

This adds a constructor to each MR adaptor to take a resource_ref rather than an `Upstream*`. It also updates RMM to use `get_current_device_resource_ref()` everywhere: in containers, in tests, in adaptors, Thrust allocator, polymorphic allocator, execution_policy, etc.

Importantly, this PR also modifies `set_current_device_resource()` to call `set_current_device_resource_ref()` under the hood. This is necessary because, while RMM C++ uses `get_current_device_resource_ref()` everywhere, the Python API still uses the raw pointer API `set_current_device_resource()`. So the latter must update the state used by the former. This is a temporary bootstrap to help with the refactoring.

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Michael Schellenberger Costa (https://github.com/miscco)
  - Lawrence Mitchell (https://github.com/wence-)
  - Rong Ou (https://github.com/rongou)
  - Bradley Dice (https://github.com/bdice)

URL: #1661
  • Loading branch information
harrism authored Sep 9, 2024
1 parent 687ed5c commit 6729def
Show file tree
Hide file tree
Showing 44 changed files with 639 additions and 387 deletions.
18 changes: 9 additions & 9 deletions benchmarks/device_uvector/device_uvector_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
&cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);
rmm::mr::set_current_device_resource_ref(mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
rmm::device_uvector<std::int32_t> vec(state.range(0), rmm::cuda_stream_view{});
Expand All @@ -49,7 +49,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)

state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK(BM_UvectorSizeConstruction)
Expand All @@ -62,7 +62,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
&cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);
rmm::mr::set_current_device_resource_ref(mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
rmm::device_vector<std::int32_t> vec(state.range(0));
Expand All @@ -71,7 +71,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)

state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK(BM_ThrustVectorSizeConstruction)
Expand Down Expand Up @@ -140,7 +140,7 @@ template <typename Vector>
void BM_VectorWorkflow(benchmark::State& state)
{
rmm::mr::cuda_async_memory_resource cuda_async_mr{};
rmm::mr::set_current_device_resource(&cuda_async_mr);
rmm::mr::set_current_device_resource_ref(cuda_async_mr);

rmm::cuda_stream input_stream;
std::vector<rmm::cuda_stream> streams(4);
Expand All @@ -158,7 +158,7 @@ void BM_VectorWorkflow(benchmark::State& state)
auto const bytes = num_elements * sizeof(std::int32_t) * num_accesses;
state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations() * bytes));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT
Expand All @@ -167,9 +167,9 @@ BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT
->Unit(benchmark::kMicrosecond)
->UseManualTime();

// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()`
// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use
// `cuda_async_memory_resource`, which is faster.
// The only difference here is that `rmm::device_vector` uses
// `rmm::get_current_device_resource_ref()` for allocation while `thrust::device_vector` uses
// cudaMalloc/cudaFree. In the benchmarks we use `cuda_async_memory_resource`, which is faster.
BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector) // NOLINT
->RangeMultiplier(10) // NOLINT
->Range(100'000, 100'000'000) // NOLINT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
{
auto mr = factory();

rmm::mr::set_current_device_resource(mr.get());
rmm::mr::set_current_device_resource_ref(mr.get());

auto num_streams = state.range(0);
auto num_kernels = state.range(1);
Expand All @@ -92,7 +92,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con

state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * num_kernels));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
Expand Down
14 changes: 7 additions & 7 deletions include/rmm/device_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace RMM_NAMESPACE {
*
* This class allocates untyped and *uninitialized* device memory using a
* `device_async_resource_ref`. If not explicitly specified, the memory resource
* returned from `get_current_device_resource()` is used.
* returned from `get_current_device_resource_ref()` is used.
*
* @note Unlike `std::vector` or `thrust::device_vector`, the device memory
* allocated by a `device_buffer` is uninitialized. Therefore, it is undefined
Expand Down Expand Up @@ -95,7 +95,7 @@ class device_buffer {
// `__host__ __device__` specifiers to the defaulted constructor when it is called within the
// context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host-
// device function. This causes warnings/errors because this ctor invokes host-only functions.
device_buffer() : _mr{rmm::mr::get_current_device_resource()} {}
device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {}

/**
* @brief Constructs a new device buffer of `size` uninitialized bytes
Expand All @@ -109,7 +109,7 @@ class device_buffer {
*/
explicit device_buffer(std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
Expand Down Expand Up @@ -138,7 +138,7 @@ class device_buffer {
device_buffer(void const* source_data,
std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
Expand Down Expand Up @@ -169,7 +169,7 @@ class device_buffer {
*/
device_buffer(device_buffer const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: device_buffer{other.data(), other.size(), stream, mr}
{
}
Expand Down Expand Up @@ -419,8 +419,8 @@ class device_buffer {
cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation

rmm::device_async_resource_ref _mr{
rmm::mr::get_current_device_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory
rmm::mr::get_current_device_resource_ref()}; ///< The memory resource used to
///< allocate/deallocate device memory
cuda_device_id _device{get_current_cuda_device()};

/**
Expand Down
6 changes: 3 additions & 3 deletions include/rmm/device_scalar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ class device_scalar {
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{1, stream, mr}
{
}
Expand All @@ -118,7 +118,7 @@ class device_scalar {
*/
explicit device_scalar(value_type const& initial_value,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{1, stream, mr}
{
set_value_async(initial_value, stream);
Expand All @@ -138,7 +138,7 @@ class device_scalar {
*/
device_scalar(device_scalar const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{other._storage, stream, mr}
{
}
Expand Down
6 changes: 3 additions & 3 deletions include/rmm/device_uvector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace RMM_NAMESPACE {
*
* Example:
* @code{.cpp}
* rmm::mr::device_memory_resource * mr = new my_custom_resource();
* auto mr = new my_custom_resource();
* rmm::cuda_stream_view s{};
*
* // Allocates *uninitialized* device memory on stream `s` sufficient for 100 ints using the
Expand Down Expand Up @@ -126,7 +126,7 @@ class device_uvector {
*/
explicit device_uvector(std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{elements_to_bytes(size), stream, mr}
{
}
Expand All @@ -142,7 +142,7 @@ class device_uvector {
*/
explicit device_uvector(device_uvector const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{other._storage, stream, mr}
{
}
Expand Down
4 changes: 2 additions & 2 deletions include/rmm/exec_policy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class exec_policy : public thrust_exec_policy_t {
* @param mr The resource to use for allocating temporary memory
*/
explicit exec_policy(cuda_stream_view stream = cuda_stream_default,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: thrust_exec_policy_t(
thrust::cuda::par(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
{
Expand All @@ -81,7 +81,7 @@ using thrust_exec_policy_nosync_t =
class exec_policy_nosync : public thrust_exec_policy_nosync_t {
public:
explicit exec_policy_nosync(cuda_stream_view stream = cuda_stream_default,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: thrust_exec_policy_nosync_t(
thrust::cuda::par_nosync(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
{
Expand Down
51 changes: 34 additions & 17 deletions include/rmm/mr/device/aligned_resource_adaptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cstddef>
Expand Down Expand Up @@ -59,20 +60,40 @@ class aligned_resource_adaptor final : public device_memory_resource {
/**
* @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
*
* @throws rmm::logic_error if `upstream == nullptr`
* @throws rmm::logic_error if `allocation_alignment` is not a power of 2
*
* @param upstream The resource used for allocating/deallocating device memory.
* @param alignment The size used for allocation alignment.
* @param alignment_threshold Only allocations with a size larger than or equal to this threshold
* are aligned.
*/
explicit aligned_resource_adaptor(Upstream* upstream,
explicit aligned_resource_adaptor(device_async_resource_ref upstream,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
std::size_t alignment_threshold = default_alignment_threshold)
: upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold}
{
RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
RMM_EXPECTS(rmm::is_supported_alignment(alignment),
"Allocation alignment is not a power of 2.");
}

/**
* @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
*
* @throws rmm::logic_error if `upstream == nullptr`
* @throws rmm::logic_error if `alignment` is not a power of 2
*
* @param upstream The resource used for allocating/deallocating device memory.
* @param alignment The size used for allocation alignment.
* @param alignment_threshold Only allocations with a size larger than or equal to this threshold
* are aligned.
*/
explicit aligned_resource_adaptor(Upstream* upstream,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
std::size_t alignment_threshold = default_alignment_threshold)
: upstream_{to_device_async_resource_ref_checked(upstream)},
alignment_{alignment},
alignment_threshold_{alignment_threshold}
{
RMM_EXPECTS(rmm::is_supported_alignment(alignment),
"Allocation alignment is not a power of 2.");
}
Expand All @@ -92,11 +113,6 @@ class aligned_resource_adaptor final : public device_memory_resource {
return upstream_;
}

/**
* @briefreturn{Upstream* to the upstream memory resource}
*/
[[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }

/**
* @brief The default alignment used by the adaptor.
*/
Expand All @@ -106,8 +122,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
using lock_guard = std::lock_guard<std::mutex>;

/**
* @brief Allocates memory of size at least `bytes` using the upstream resource with the specified
* alignment.
* @brief Allocates memory of size at least `bytes` using the upstream resource with the
* specified alignment.
*
* @throws rmm::bad_alloc if the requested allocation could not be fulfilled
* by the upstream resource.
Expand All @@ -119,10 +135,10 @@ class aligned_resource_adaptor final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
return upstream_->allocate(bytes, stream);
return get_upstream_resource().allocate_async(bytes, 1, stream);
}
auto const size = upstream_allocation_size(bytes);
void* pointer = upstream_->allocate(size, stream);
void* pointer = get_upstream_resource().allocate_async(size, 1, stream);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const address = reinterpret_cast<std::size_t>(pointer);
auto const aligned_address = rmm::align_up(address, alignment_);
Expand All @@ -145,7 +161,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
upstream_->deallocate(ptr, bytes, stream);
get_upstream_resource().deallocate_async(ptr, bytes, 1, stream);
} else {
{
lock_guard lock(mtx_);
Expand All @@ -155,7 +171,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
pointers_.erase(iter);
}
}
upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream);
get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream);
}
}

Expand All @@ -176,8 +192,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
}

/**
* @brief Calculate the allocation size needed from upstream to account for alignments of both the
* size and the base pointer.
* @brief Calculate the allocation size needed from upstream to account for alignments of both
* the size and the base pointer.
*
* @param bytes The requested allocation size.
* @return Allocation size needed from upstream to align both the size and the base pointer.
Expand All @@ -188,7 +204,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT;
}

Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests
/// The upstream resource used for satisfying allocation requests
device_async_resource_ref upstream_;
std::unordered_map<void*, void*> pointers_; ///< Map of aligned pointers to upstream pointers.
std::size_t alignment_; ///< The size used for allocation alignment
std::size_t alignment_threshold_; ///< The size above which allocations should be aligned
Expand Down
Loading

0 comments on commit 6729def

Please sign in to comment.