From 588928f5ff2418aadcec8e6c91fcea4dd8cb9265 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 16 Apr 2024 15:27:18 +0100 Subject: [PATCH] Make thrust_allocator deallocate safe in multi-device setting (#1533) Previously, the user had to arrange that the device active when a thrust_allocator object was created was also active when allocate and deallocate was called. This is hard to manage if exceptions are thrown. Instead, save the active device on construction and ensure that it is active when calling allocate and deallocate. This means that device_vector is safe to destruct with RAII semantics in a multi-device setting. Add tests of this facility, and correct the parameterization usage in the other thrust allocator tests such that we actually check the MRs we're parameterizing over. - Closes #1527 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Rong Ou (https://github.com/rongou) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1533 --- README.md | 26 ++++++++++++------- .../mr/device/thrust_allocator_adaptor.hpp | 9 ++++++- tests/mr/device/thrust_allocator_tests.cu | 19 ++++++++++++++ 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 5b7dc69c0..f378cfe50 100644 --- a/README.md +++ b/README.md @@ -375,14 +375,16 @@ this code is correct: #### Use of `rmm::device_vector` with multiple devices -> [!CAUTION] In contrast to the uninitialized `rmm:device_uvector`, `rmm::device_vector` **DOES -> NOT** store the active device during construction, and therefore cannot arrange for it to be -> active when the destructor runs. It is therefore the responsibility of the user to ensure the -> currently active device is correct. +`rmm::device_vector` uses an `rmm::mr::thrust_allocator` to enable `thrust::device_vector` to +allocate and deallocate memory using RMM. 
As such, the usual rules for usage of the backing memory +resource apply: the active device must match the active device at resource construction time. To +facilitate use in an RAII setting, `rmm::mr::thrust_allocator` records the active device at +construction time and ensures that device is active whenever it allocates or deallocates memory. +Usage of `rmm::device_vector` with multiple devices is therefore the same as `rmm::device_buffer`. +One must _create_ `device_vector`s with the correct device active, but it is safe to destroy them +with a different active device. -`rmm::device_vector` is therefore slightly less ergonomic to use in a multiple device setting since -the caller must arrange that active devices on allocation and deallocation match. Recapitulating the -previous example using `rmm::device_vector`: +For example, recapitulating the previous example using `rmm::device_vector`: ```c++ { @@ -391,12 +393,16 @@ previous example using `rmm::device_vector`: rmm::device_vector vec(16, rmm::mr::thrust_allocator(rmm::cuda_stream_default, &mr)); RMM_CUDA_TRY(cudaSetDevice(1)); ... - // ERROR: ~vec runs with device 1 active, but needs device 0 to be active + // No need to switch back to device 0 before ~vec runs } ``` -A correct example adds a call to `cudaSetDevice(0)` on the line of the error comment before the dtor -for `~vec` runs. +> [!NOTE] +> Although allocation and deallocation in the `thrust_allocator` run with the correct active device, +> modification of `rmm::device_vector` might necessitate a kernel launch, and this must run with the +> correct device active. For example, `.resize()` might both allocate _and_ launch a kernel to +> initialize new elements: the user must arrange for this kernel launch to occur with the correct +> device for the memory resource active. 
## `cuda_stream_view` and `cuda_stream` diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp index 41973e04b..3bfd65996 100644 --- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp +++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -39,6 +40,9 @@ namespace rmm::mr { * allocate objects of a specific type `T`, but can be freely rebound to other * types. * + * The allocator records the current cuda device and may only be used with a backing + * `device_async_resource_ref` valid for the same device. + * * @tparam T The type of the objects that will be allocated by this allocator */ template @@ -92,7 +96,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { */ template thrust_allocator(thrust_allocator const& other) - : _mr(other.resource()), _stream{other.stream()} + : _mr(other.resource()), _stream{other.stream()}, _device{other._device} { } @@ -104,6 +108,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { */ pointer allocate(size_type num) { + cuda_set_device_raii dev{_device}; return thrust::device_pointer_cast( static_cast(_mr.allocate_async(num * sizeof(T), _stream))); } @@ -117,6 +122,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { */ void deallocate(pointer ptr, size_type num) { + cuda_set_device_raii dev{_device}; return _mr.deallocate_async(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream); } @@ -143,6 +149,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { private: cuda_stream_view _stream{}; rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()}; + cuda_device_id _device{get_current_cuda_device()}; }; /** @} */ // end of group } // namespace rmm::mr diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu index b94d6b3e1..e855d1036 100644 --- 
a/tests/mr/device/thrust_allocator_tests.cu +++ b/tests/mr/device/thrust_allocator_tests.cu @@ -16,7 +16,9 @@ #include "mr_ref_test.hpp" +#include #include +#include #include #include #include @@ -36,6 +38,7 @@ struct allocator_test : public mr_ref_test {}; TEST_P(allocator_test, first) { + rmm::mr::set_current_device_resource(this->mr.get()); auto const num_ints{100}; rmm::device_vector ints(num_ints, 1); EXPECT_EQ(num_ints, thrust::reduce(ints.begin(), ints.end())); @@ -43,12 +46,28 @@ TEST_P(allocator_test, first) TEST_P(allocator_test, defaults) { + rmm::mr::set_current_device_resource(this->mr.get()); rmm::mr::thrust_allocator allocator(rmm::cuda_stream_default); EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default); EXPECT_EQ(allocator.get_upstream_resource(), rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); } +TEST_P(allocator_test, multi_device) +{ + if (rmm::get_num_cuda_devices() < 2) { GTEST_SKIP() << "Needs at least two devices"; } + cuda_set_device_raii with_device{rmm::get_current_cuda_device()}; + rmm::cuda_stream stream{}; + // make allocator on device-0 + rmm::mr::thrust_allocator allocator(stream.view(), this->ref); + auto const size{100}; + EXPECT_NO_THROW([&]() { + auto vec = rmm::device_vector(size, allocator); + // Destruct with device-1 active + RMM_CUDA_TRY(cudaSetDevice(1)); + }()); +} + INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests, allocator_test, ::testing::Values(mr_factory{"CUDA", &make_cuda},