Update RMM adaptors, containers and tests to use get/set_current_device_resource_ref() #1661

Merged: 27 commits merged into branch-24.10 from fea-resource_ref-upstream on Sep 9, 2024

Changes from all commits

Commits
a9555c5
Add _ref versions of get/set_current_device_resource and friends.
harrism Jun 27, 2024
d893698
doc: explain that set_current_device_resource is in transition
harrism Aug 20, 2024
b068ded
Update/add tests for get/set_current_device_resource_ref
harrism Aug 20, 2024
c7d443d
Apply suggestions from code review
harrism Aug 20, 2024
3cd6917
RMM_EXPORT ref_map_lock()
harrism Aug 21, 2024
adbf7f6
Apply suggestions from code review
harrism Aug 21, 2024
db53c65
Merge branch 'fea-get_current_device_resource_ref' of github.com:harr…
harrism Aug 21, 2024
b5a831a
Explicit RMM_EXPORT on all functions that return a function-local sta…
harrism Aug 21, 2024
812d08d
Doc: ref and ptr versions are not interchangeable.
harrism Aug 21, 2024
1c43d63
Use resource_ref in MR adaptors and related tests.
harrism Aug 22, 2024
bd70c42
Use resource_ref in torch_allocator and polymorphic_allocator
harrism Aug 22, 2024
00bd3ac
Make the raw pointer version of set_per_device_resource update the re…
harrism Aug 22, 2024
d7df60e
Update containers and tests to use get/set_current_device_resource_ref()
harrism Aug 22, 2024
81ba5a0
Merge branch 'branch-24.10' into fea-resource_ref-upstream
harrism Aug 28, 2024
8d2bc62
Remove an utterance of device_memory_resource* in test.
harrism Aug 28, 2024
3d2ef08
Remove another unnecessary device_memory_resource*
harrism Aug 28, 2024
13172c5
Copyrights
harrism Aug 28, 2024
efee997
Merge branch 'branch-24.10' into fea-resource_ref-upstream
harrism Aug 28, 2024
4884f53
Add `to_device_async_resource_ref_checked()` to simplify adaptor cons…
harrism Aug 29, 2024
1bcb81a
Use to_device_async_resource_ref_checked() and more adaptor tidy / co…
harrism Aug 29, 2024
56e5eaa
Make test debug builds use -O0
harrism Aug 29, 2024
545323b
Fix quoted include.
harrism Aug 29, 2024
5598840
cmake style
harrism Aug 29, 2024
c1d7b0d
Don't take the address when passing MR to resource_ref.
harrism Sep 4, 2024
4c11382
Fix yoda code.
harrism Sep 5, 2024
6d2f09d
Use resource_ref upstreams in binning_mr
harrism Sep 5, 2024
a367af6
Remove incorrect @throws documentation
harrism Sep 5, 2024
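
The commits above replace the pointer-based current-resource API with the `_ref` overloads throughout the adaptors, containers, benchmarks, and tests. As a quick orientation — a minimal before/after sketch, not part of the diff; the header paths and the assumption that `reset_current_device_resource_ref()` restores the default resource are mine:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{
    &cuda_mr, rmm::percent_of_free_device_memory(50)};

  // Before: rmm::mr::set_current_device_resource(&pool_mr);
  // After: pass the resource itself; a non-owning resource_ref is stored.
  rmm::mr::set_current_device_resource_ref(pool_mr);

  // Containers whose `mr` parameter is defaulted now pick up the ref.
  rmm::device_buffer buf{1024, rmm::cuda_stream_view{}};

  // Before: rmm::mr::set_current_device_resource(nullptr);
  // After: explicitly reset to the default resource.
  rmm::mr::reset_current_device_resource_ref();
}
```
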
18 changes: 9 additions & 9 deletions benchmarks/device_uvector/device_uvector_bench.cu
@@ -40,7 +40,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
&cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);
rmm::mr::set_current_device_resource_ref(mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
rmm::device_uvector<std::int32_t> vec(state.range(0), rmm::cuda_stream_view{});
@@ -49,7 +49,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)

state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK(BM_UvectorSizeConstruction)
@@ -62,7 +62,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
&cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);
rmm::mr::set_current_device_resource_ref(mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
rmm::device_vector<std::int32_t> vec(state.range(0));
@@ -71,7 +71,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)

state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK(BM_ThrustVectorSizeConstruction)
@@ -140,7 +140,7 @@ template <typename Vector>
void BM_VectorWorkflow(benchmark::State& state)
{
rmm::mr::cuda_async_memory_resource cuda_async_mr{};
rmm::mr::set_current_device_resource(&cuda_async_mr);
rmm::mr::set_current_device_resource_ref(cuda_async_mr);

rmm::cuda_stream input_stream;
std::vector<rmm::cuda_stream> streams(4);
@@ -158,7 +158,7 @@ void BM_VectorWorkflow(benchmark::State& state)
auto const bytes = num_elements * sizeof(std::int32_t) * num_accesses;
state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations() * bytes));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT
@@ -167,9 +167,9 @@ BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector)  // NOLINT
->Unit(benchmark::kMicrosecond)
->UseManualTime();

// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()`
// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use
// `cuda_async_memory_resource`, which is faster.
// The only difference here is that `rmm::device_vector` uses
// `rmm::get_current_device_resource_ref()` for allocation while `thrust::device_vector` uses
// cudaMalloc/cudaFree. In the benchmarks we use `cuda_async_memory_resource`, which is faster.
BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector) // NOLINT
->RangeMultiplier(10) // NOLINT
->Range(100'000, 100'000'000) // NOLINT
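
The comment in the hunk above is the point of this benchmark: `rmm::device_vector` allocates through the current device resource ref, while `thrust::device_vector` always goes through cudaMalloc/cudaFree. A sketch of that contrast, not taken from the diff (the function name and header paths are illustrative):

```cpp
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <thrust/device_vector.h>

void compare_allocation_paths()
{
  rmm::mr::cuda_async_memory_resource async_mr{};  // cudaMallocAsync-backed resource
  rmm::mr::set_current_device_resource_ref(async_mr);
  {
    rmm::device_vector<int> rmm_vec(1'000'000);        // allocated via async_mr
    thrust::device_vector<int> thrust_vec(1'000'000);  // allocated via plain cudaMalloc
  }  // both vectors are freed here, before the resource ref is reset
  rmm::mr::reset_current_device_resource_ref();
}
```
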
@@ -75,7 +75,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
{
auto mr = factory();

rmm::mr::set_current_device_resource(mr.get());
rmm::mr::set_current_device_resource_ref(mr.get());

auto num_streams = state.range(0);
auto num_kernels = state.range(1);
@@ -92,7 +92,7 @@

state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * num_kernels));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
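
In this benchmark the resource comes from a factory that returns a `std::shared_ptr`. Worth keeping in mind: the installed `resource_ref` does not own the resource, so the shared_ptr must stay alive until the ref is reset. A sketch under that assumption (not part of the diff; the diff passes `mr.get()`, the sketch passes `*mr`, and both bind the same non-owning ref):

```cpp
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }

void run_case()
{
  auto mr = make_cuda();                          // shared_ptr keeps the resource alive
  rmm::mr::set_current_device_resource_ref(*mr);  // installs a non-owning ref

  // ... allocate from the current device resource here ...

  rmm::mr::reset_current_device_resource_ref();   // drop the ref before `mr` is destroyed
}
```
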
14 changes: 7 additions & 7 deletions include/rmm/device_buffer.hpp
@@ -41,7 +41,7 @@ namespace RMM_NAMESPACE {
*
* This class allocates untyped and *uninitialized* device memory using a
* `device_async_resource_ref`. If not explicitly specified, the memory resource
* returned from `get_current_device_resource()` is used.
* returned from `get_current_device_resource_ref()` is used.
*
* @note Unlike `std::vector` or `thrust::device_vector`, the device memory
* allocated by a `device_buffer` is uninitialized. Therefore, it is undefined
@@ -95,7 +95,7 @@ class device_buffer {
// `__host__ __device__` specifiers to the defaulted constructor when it is called within the
// context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host-
// device function. This causes warnings/errors because this ctor invokes host-only functions.
device_buffer() : _mr{rmm::mr::get_current_device_resource()} {}
device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {}

/**
* @brief Constructs a new device buffer of `size` uninitialized bytes
@@ -109,7 +109,7 @@
*/
explicit device_buffer(std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
@@ -138,7 +138,7 @@ class device_buffer {
device_buffer(void const* source_data,
std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
@@ -169,7 +169,7 @@
*/
device_buffer(device_buffer const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: device_buffer{other.data(), other.size(), stream, mr}
{
}
@@ -419,8 +419,8 @@ class device_buffer {
cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation

rmm::device_async_resource_ref _mr{
rmm::mr::get_current_device_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory
rmm::mr::get_current_device_resource_ref()}; ///< The memory resource used to
///< allocate/deallocate device memory
cuda_device_id _device{get_current_cuda_device()};

/**
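
A short usage sketch of the defaulted parameter changed above (illustrative only, not from the PR): when no resource is supplied, `device_buffer` captures `get_current_device_resource_ref()`; when one is supplied, it stores a non-owning ref to it.

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

void make_buffers()
{
  rmm::cuda_stream stream;

  // mr defaults to rmm::mr::get_current_device_resource_ref().
  rmm::device_buffer defaulted{1024, stream.view()};

  // Explicit resource: the buffer keeps a non-owning ref, so `explicit_mr`
  // must outlive `from_explicit`.
  rmm::mr::cuda_memory_resource explicit_mr{};
  rmm::device_buffer from_explicit{1024, stream.view(), explicit_mr};
}
```
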
6 changes: 3 additions & 3 deletions include/rmm/device_scalar.hpp
@@ -95,7 +95,7 @@ class device_scalar {
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{1, stream, mr}
{
}
@@ -118,7 +118,7 @@
*/
explicit device_scalar(value_type const& initial_value,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{1, stream, mr}
{
set_value_async(initial_value, stream);
@@ -138,7 +138,7 @@
*/
device_scalar(device_scalar const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{other._storage, stream, mr}
{
}
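
The same default applies to `device_scalar`. A minimal round-trip sketch, assumed usage rather than code from the PR:

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>

int scalar_round_trip()
{
  rmm::cuda_stream stream;

  // Allocates storage for one int from get_current_device_resource_ref().
  rmm::device_scalar<int> scalar{42, stream.view()};

  int const new_value{7};
  scalar.set_value_async(new_value, stream.view());  // async host-to-device copy on `stream`
  return scalar.value(stream.view());                // synchronizes `stream`, returns 7
}
```
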
6 changes: 3 additions & 3 deletions include/rmm/device_uvector.hpp
@@ -48,7 +48,7 @@ namespace RMM_NAMESPACE {
*
* Example:
* @code{.cpp}
* rmm::mr::device_memory_resource * mr = new my_custom_resource();
* auto mr = new my_custom_resource();
* rmm::cuda_stream_view s{};
*
* // Allocates *uninitialized* device memory on stream `s` sufficient for 100 ints using the
@@ -126,7 +126,7 @@ class device_uvector {
*/
explicit device_uvector(std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{elements_to_bytes(size), stream, mr}
{
}
@@ -142,7 +142,7 @@
*/
explicit device_uvector(device_uvector const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{other._storage, stream, mr}
{
}
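
The doxygen example above now avoids the `device_memory_resource*` spelling. A fuller sketch of the same idea (not from the PR; header paths and names are assumptions), with and without an explicit resource:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

void make_uvectors()
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{
    &cuda_mr, rmm::percent_of_free_device_memory(50)};
  rmm::cuda_stream_view stream{};

  // Uninitialized storage for 100 ints, allocated from `pool_mr` on `stream`.
  rmm::device_uvector<int> vec{100, stream, pool_mr};

  // Omitting the resource falls back to get_current_device_resource_ref().
  rmm::device_uvector<int> vec_default{100, stream};
}
```
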
4 changes: 2 additions & 2 deletions include/rmm/exec_policy.hpp
@@ -57,7 +57,7 @@ class exec_policy : public thrust_exec_policy_t {
* @param mr The resource to use for allocating temporary memory
*/
explicit exec_policy(cuda_stream_view stream = cuda_stream_default,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: thrust_exec_policy_t(
thrust::cuda::par(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
{
@@ -81,7 +81,7 @@ using thrust_exec_policy_nosync_t =
class exec_policy_nosync : public thrust_exec_policy_nosync_t {
public:
explicit exec_policy_nosync(cuda_stream_view stream = cuda_stream_default,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: thrust_exec_policy_nosync_t(
thrust::cuda::par_nosync(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
{
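
For `exec_policy`, the changed default determines where Thrust's temporary allocations come from. A hedged sketch of passing a specific resource instead, compiled as a `.cu` translation unit (not from the PR; `thrust::sort` is chosen only for illustration):

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <thrust/sequence.h>
#include <thrust/sort.h>

void sort_with_pool()
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{
    &cuda_mr, rmm::percent_of_free_device_memory(50)};
  rmm::cuda_stream stream;

  rmm::device_uvector<int> vec{1 << 20, stream.view(), pool_mr};
  thrust::sequence(rmm::exec_policy(stream.view(), pool_mr), vec.begin(), vec.end());

  // Temporary memory used by the sort is allocated from `pool_mr`; omitting the
  // second argument would fall back to get_current_device_resource_ref().
  thrust::sort(rmm::exec_policy(stream.view(), pool_mr), vec.begin(), vec.end());
}
```
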
51 changes: 34 additions & 17 deletions include/rmm/mr/device/aligned_resource_adaptor.hpp
@@ -20,6 +20,7 @@
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cstddef>
@@ -59,20 +60,40 @@ class aligned_resource_adaptor final : public device_memory_resource {
/**
* @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
*
* @throws rmm::logic_error if `upstream == nullptr`
* @throws rmm::logic_error if `allocation_alignment` is not a power of 2
*
* @param upstream The resource used for allocating/deallocating device memory.
* @param alignment The size used for allocation alignment.
* @param alignment_threshold Only allocations with a size larger than or equal to this threshold
* are aligned.
*/
explicit aligned_resource_adaptor(Upstream* upstream,
explicit aligned_resource_adaptor(device_async_resource_ref upstream,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
std::size_t alignment_threshold = default_alignment_threshold)
: upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold}
{
RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
RMM_EXPECTS(rmm::is_supported_alignment(alignment),
"Allocation alignment is not a power of 2.");
}

/**
* @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
*
* @throws rmm::logic_error if `upstream == nullptr`
* @throws rmm::logic_error if `alignment` is not a power of 2
*
* @param upstream The resource used for allocating/deallocating device memory.
* @param alignment The size used for allocation alignment.
* @param alignment_threshold Only allocations with a size larger than or equal to this threshold
* are aligned.
*/
explicit aligned_resource_adaptor(Upstream* upstream,
Inline review comment (Contributor):

I'm not sure how to handle this migration without making a breaking change, but this class no longer needs to be a templated class, but rather this constructor should be templated.

And hence, question: does the constructor even need to exist, if there is transparent conversion from Upstream * to device_async_resource_ref?

That is, what doesn't work if the only constructor is:

aligned_resource_adaptor(device_async_resource_ref upstream, ...);

Applies mutatis mutandis to the other adaptor MR changes as well, I think.

Reply (Member Author):

Right, the plan is to remove the template parameter and the Upstream* constructors, once we add the resource_ref constructors and convert all of RAPIDS to use them. But we can't do it yet.

See #1457

std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
std::size_t alignment_threshold = default_alignment_threshold)
: upstream_{to_device_async_resource_ref_checked(upstream)},
alignment_{alignment},
alignment_threshold_{alignment_threshold}
{
RMM_EXPECTS(rmm::is_supported_alignment(alignment),
"Allocation alignment is not a power of 2.");
}
@@ -92,11 +113,6 @@ class aligned_resource_adaptor final : public device_memory_resource {
return upstream_;
}

/**
* @briefreturn{Upstream* to the upstream memory resource}
*/
[[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }

/**
* @brief The default alignment used by the adaptor.
*/
@@ -106,8 +122,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
using lock_guard = std::lock_guard<std::mutex>;

/**
* @brief Allocates memory of size at least `bytes` using the upstream resource with the specified
* alignment.
* @brief Allocates memory of size at least `bytes` using the upstream resource with the
* specified alignment.
*
* @throws rmm::bad_alloc if the requested allocation could not be fulfilled
* by the upstream resource.
@@ -119,10 +135,10 @@ class aligned_resource_adaptor final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
return upstream_->allocate(bytes, stream);
return get_upstream_resource().allocate_async(bytes, 1, stream);
}
auto const size = upstream_allocation_size(bytes);
void* pointer = upstream_->allocate(size, stream);
void* pointer = get_upstream_resource().allocate_async(size, 1, stream);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const address = reinterpret_cast<std::size_t>(pointer);
auto const aligned_address = rmm::align_up(address, alignment_);
@@ -145,7 +161,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
upstream_->deallocate(ptr, bytes, stream);
get_upstream_resource().deallocate_async(ptr, bytes, 1, stream);
} else {
{
lock_guard lock(mtx_);
@@ -155,7 +171,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
pointers_.erase(iter);
}
}
upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream);
get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream);
}
}

@@ -176,8 +192,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
}

/**
* @brief Calculate the allocation size needed from upstream to account for alignments of both the
* size and the base pointer.
* @brief Calculate the allocation size needed from upstream to account for alignments of both
* the size and the base pointer.
*
* @param bytes The requested allocation size.
* @return Allocation size needed from upstream to align both the size and the base pointer.
@@ -188,7 +204,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT;
}

Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests
/// The upstream resource used for satisfying allocation requests
device_async_resource_ref upstream_;
std::unordered_map<void*, void*> pointers_; ///< Map of aligned pointers to upstream pointers.
std::size_t alignment_; ///< The size used for allocation alignment
std::size_t alignment_threshold_; ///< The size above which allocations should be aligned
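
To tie the adaptor changes together, a sketch rather than the PR's own code: both constructors remain for now — the legacy `Upstream*` form routed through `to_device_async_resource_ref_checked()`, and the new `device_async_resource_ref` form — while allocations flow through `get_upstream_resource().allocate_async()`. The alignment value below is arbitrary, and the usage is my assumption:

```cpp
#include <rmm/mr/device/aligned_resource_adaptor.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

void build_aligned_adaptors()
{
  rmm::mr::cuda_memory_resource upstream{};

  // Legacy pointer constructor (still validates upstream != nullptr).
  rmm::mr::aligned_resource_adaptor<rmm::mr::cuda_memory_resource> from_ptr{&upstream, 4096};

  // New resource_ref constructor; the Upstream template argument is still required
  // until the pointer-based API is removed (see the discussion and #1457 above).
  rmm::device_async_resource_ref upstream_ref{upstream};
  rmm::mr::aligned_resource_adaptor<rmm::mr::cuda_memory_resource> from_ref{upstream_ref, 4096};

  // Allocations at or above the alignment threshold come back 4096-byte aligned,
  // padded accordingly when requested from the upstream resource.
  void* ptr = from_ref.allocate(1 << 20);
  from_ref.deallocate(ptr, 1 << 20);
}
```
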