Skip to content

Commit

Permalink
Update RMM adaptors, containers and tests to use get/set_current_device_resource_ref() (#1661)
Browse files Browse the repository at this point in the history

Closes #1660.

This adds a constructor to each MR adaptor to take a resource_ref rather than an `Upstream*`. It also updates RMM to use `get_current_device_resource_ref()` everywhere: in containers, in tests, in adaptors, Thrust allocator, polymorphic allocator, execution_policy, etc.

Importantly, this PR also modifies `set_current_device_resource()` to call `set_current_device_resource_ref()` under the hood. This is necessary because, while RMM C++ uses `get_current_device_resource_ref()` everywhere, the Python API still uses the raw pointer API `set_current_device_resource()`. So the latter must update the state used by the former. This is a temporary bootstrap to help with the refactoring.

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Michael Schellenberger Costa (https://github.com/miscco)
  - Lawrence Mitchell (https://github.com/wence-)
  - Rong Ou (https://github.com/rongou)
  - Bradley Dice (https://github.com/bdice)

URL: #1661
  • Loading branch information
harrism authored Sep 9, 2024
1 parent 687ed5c commit 6729def
Show file tree
Hide file tree
Showing 44 changed files with 639 additions and 387 deletions.
18 changes: 9 additions & 9 deletions benchmarks/device_uvector/device_uvector_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
&cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);
rmm::mr::set_current_device_resource_ref(mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
rmm::device_uvector<std::int32_t> vec(state.range(0), rmm::cuda_stream_view{});
Expand All @@ -49,7 +49,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)

state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK(BM_UvectorSizeConstruction)
Expand All @@ -62,7 +62,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
&cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);
rmm::mr::set_current_device_resource_ref(mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
rmm::device_vector<std::int32_t> vec(state.range(0));
Expand All @@ -71,7 +71,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)

state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK(BM_ThrustVectorSizeConstruction)
Expand Down Expand Up @@ -140,7 +140,7 @@ template <typename Vector>
void BM_VectorWorkflow(benchmark::State& state)
{
rmm::mr::cuda_async_memory_resource cuda_async_mr{};
rmm::mr::set_current_device_resource(&cuda_async_mr);
rmm::mr::set_current_device_resource_ref(cuda_async_mr);

rmm::cuda_stream input_stream;
std::vector<rmm::cuda_stream> streams(4);
Expand All @@ -158,7 +158,7 @@ void BM_VectorWorkflow(benchmark::State& state)
auto const bytes = num_elements * sizeof(std::int32_t) * num_accesses;
state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations() * bytes));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT
Expand All @@ -167,9 +167,9 @@ BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT
->Unit(benchmark::kMicrosecond)
->UseManualTime();

// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()`
// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use
// `cuda_async_memory_resource`, which is faster.
// The only difference here is that `rmm::device_vector` uses
// `rmm::get_current_device_resource_ref()` for allocation while `thrust::device_vector` uses
// cudaMalloc/cudaFree. In the benchmarks we use `cuda_async_memory_resource`, which is faster.
BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector) // NOLINT
->RangeMultiplier(10) // NOLINT
->Range(100'000, 100'000'000) // NOLINT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
{
auto mr = factory();

rmm::mr::set_current_device_resource(mr.get());
rmm::mr::set_current_device_resource_ref(mr.get());

auto num_streams = state.range(0);
auto num_kernels = state.range(1);
Expand All @@ -92,7 +92,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con

state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * num_kernels));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
Expand Down
14 changes: 7 additions & 7 deletions include/rmm/device_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace RMM_NAMESPACE {
*
* This class allocates untyped and *uninitialized* device memory using a
* `device_async_resource_ref`. If not explicitly specified, the memory resource
* returned from `get_current_device_resource()` is used.
* returned from `get_current_device_resource_ref()` is used.
*
* @note Unlike `std::vector` or `thrust::device_vector`, the device memory
* allocated by a `device_buffer` is uninitialized. Therefore, it is undefined
Expand Down Expand Up @@ -95,7 +95,7 @@ class device_buffer {
// `__host__ __device__` specifiers to the defaulted constructor when it is called within the
// context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host-
// device function. This causes warnings/errors because this ctor invokes host-only functions.
device_buffer() : _mr{rmm::mr::get_current_device_resource()} {}
device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {}

/**
* @brief Constructs a new device buffer of `size` uninitialized bytes
Expand All @@ -109,7 +109,7 @@ class device_buffer {
*/
explicit device_buffer(std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
Expand Down Expand Up @@ -138,7 +138,7 @@ class device_buffer {
device_buffer(void const* source_data,
std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
Expand Down Expand Up @@ -169,7 +169,7 @@ class device_buffer {
*/
device_buffer(device_buffer const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: device_buffer{other.data(), other.size(), stream, mr}
{
}
Expand Down Expand Up @@ -419,8 +419,8 @@ class device_buffer {
cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation

rmm::device_async_resource_ref _mr{
rmm::mr::get_current_device_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory
rmm::mr::get_current_device_resource_ref()}; ///< The memory resource used to
///< allocate/deallocate device memory
cuda_device_id _device{get_current_cuda_device()};

/**
Expand Down
6 changes: 3 additions & 3 deletions include/rmm/device_scalar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ class device_scalar {
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{1, stream, mr}
{
}
Expand All @@ -118,7 +118,7 @@ class device_scalar {
*/
explicit device_scalar(value_type const& initial_value,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{1, stream, mr}
{
set_value_async(initial_value, stream);
Expand All @@ -138,7 +138,7 @@ class device_scalar {
*/
device_scalar(device_scalar const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{other._storage, stream, mr}
{
}
Expand Down
6 changes: 3 additions & 3 deletions include/rmm/device_uvector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace RMM_NAMESPACE {
*
* Example:
* @code{.cpp}
* rmm::mr::device_memory_resource * mr = new my_custom_resource();
* auto mr = new my_custom_resource();
* rmm::cuda_stream_view s{};
*
* // Allocates *uninitialized* device memory on stream `s` sufficient for 100 ints using the
Expand Down Expand Up @@ -126,7 +126,7 @@ class device_uvector {
*/
explicit device_uvector(std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{elements_to_bytes(size), stream, mr}
{
}
Expand All @@ -142,7 +142,7 @@ class device_uvector {
*/
explicit device_uvector(device_uvector const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{other._storage, stream, mr}
{
}
Expand Down
4 changes: 2 additions & 2 deletions include/rmm/exec_policy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class exec_policy : public thrust_exec_policy_t {
* @param mr The resource to use for allocating temporary memory
*/
explicit exec_policy(cuda_stream_view stream = cuda_stream_default,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: thrust_exec_policy_t(
thrust::cuda::par(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
{
Expand All @@ -81,7 +81,7 @@ using thrust_exec_policy_nosync_t =
class exec_policy_nosync : public thrust_exec_policy_nosync_t {
public:
explicit exec_policy_nosync(cuda_stream_view stream = cuda_stream_default,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: thrust_exec_policy_nosync_t(
thrust::cuda::par_nosync(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
{
Expand Down
51 changes: 34 additions & 17 deletions include/rmm/mr/device/aligned_resource_adaptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cstddef>
Expand Down Expand Up @@ -59,20 +60,40 @@ class aligned_resource_adaptor final : public device_memory_resource {
/**
* @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
*
* @throws rmm::logic_error if `upstream == nullptr`
* @throws rmm::logic_error if `allocation_alignment` is not a power of 2
*
* @param upstream The resource used for allocating/deallocating device memory.
* @param alignment The size used for allocation alignment.
* @param alignment_threshold Only allocations with a size larger than or equal to this threshold
* are aligned.
*/
explicit aligned_resource_adaptor(Upstream* upstream,
explicit aligned_resource_adaptor(device_async_resource_ref upstream,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
std::size_t alignment_threshold = default_alignment_threshold)
: upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold}
{
RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
RMM_EXPECTS(rmm::is_supported_alignment(alignment),
"Allocation alignment is not a power of 2.");
}

/**
* @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
*
* @throws rmm::logic_error if `upstream == nullptr`
* @throws rmm::logic_error if `alignment` is not a power of 2
*
* @param upstream The resource used for allocating/deallocating device memory.
* @param alignment The size used for allocation alignment.
* @param alignment_threshold Only allocations with a size larger than or equal to this threshold
* are aligned.
*/
explicit aligned_resource_adaptor(Upstream* upstream,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
std::size_t alignment_threshold = default_alignment_threshold)
: upstream_{to_device_async_resource_ref_checked(upstream)},
alignment_{alignment},
alignment_threshold_{alignment_threshold}
{
RMM_EXPECTS(rmm::is_supported_alignment(alignment),
"Allocation alignment is not a power of 2.");
}
Expand All @@ -92,11 +113,6 @@ class aligned_resource_adaptor final : public device_memory_resource {
return upstream_;
}

/**
* @briefreturn{Upstream* to the upstream memory resource}
*/
[[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }

/**
* @brief The default alignment used by the adaptor.
*/
Expand All @@ -106,8 +122,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
using lock_guard = std::lock_guard<std::mutex>;

/**
* @brief Allocates memory of size at least `bytes` using the upstream resource with the specified
* alignment.
* @brief Allocates memory of size at least `bytes` using the upstream resource with the
* specified alignment.
*
* @throws rmm::bad_alloc if the requested allocation could not be fulfilled
* by the upstream resource.
Expand All @@ -119,10 +135,10 @@ class aligned_resource_adaptor final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
return upstream_->allocate(bytes, stream);
return get_upstream_resource().allocate_async(bytes, 1, stream);
}
auto const size = upstream_allocation_size(bytes);
void* pointer = upstream_->allocate(size, stream);
void* pointer = get_upstream_resource().allocate_async(size, 1, stream);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const address = reinterpret_cast<std::size_t>(pointer);
auto const aligned_address = rmm::align_up(address, alignment_);
Expand All @@ -145,7 +161,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
upstream_->deallocate(ptr, bytes, stream);
get_upstream_resource().deallocate_async(ptr, bytes, 1, stream);
} else {
{
lock_guard lock(mtx_);
Expand All @@ -155,7 +171,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
pointers_.erase(iter);
}
}
upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream);
get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream);
}
}

Expand All @@ -176,8 +192,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
}

/**
* @brief Calculate the allocation size needed from upstream to account for alignments of both the
* size and the base pointer.
* @brief Calculate the allocation size needed from upstream to account for alignments of both
* the size and the base pointer.
*
* @param bytes The requested allocation size.
* @return Allocation size needed from upstream to align both the size and the base pointer.
Expand All @@ -188,7 +204,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT;
}

Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests
/// The upstream resource used for satisfying allocation requests
device_async_resource_ref upstream_;
std::unordered_map<void*, void*> pointers_; ///< Map of aligned pointers to upstream pointers.
std::size_t alignment_; ///< The size used for allocation alignment
std::size_t alignment_threshold_; ///< The size above which allocations should be aligned
Expand Down
Loading

0 comments on commit 6729def

Please sign in to comment.