From a9555c58c6f9e01608d24bba777fee14bdc45ab9 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 27 Jun 2024 05:53:06 +0000 Subject: [PATCH 01/24] Add _ref versions of get/set_current_device_resource and friends. --- include/rmm/mr/device/per_device_resource.hpp | 205 +++++++++++++++++- 1 file changed, 199 insertions(+), 6 deletions(-) diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index a56a784a1..67aac51ac 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -121,6 +122,25 @@ RMM_EXPORT inline auto& get_map() return device_id_to_resource; } +/** + * @briefreturn{Reference to the lock} + */ +inline std::mutex& ref_map_lock() +{ + static std::mutex ref_map_lock; + return ref_map_lock; +} + +// This symbol must have default visibility, see: https://github.com/rapidsai/rmm/issues/826 +/** + * @briefreturn{Reference to the map from device id -> resource_ref} + */ +RMM_EXPORT inline auto& get_ref_map() +{ + static std::map device_id_to_resource_ref; + return device_id_to_resource_ref; +} + } // namespace detail /** @@ -129,20 +149,21 @@ RMM_EXPORT inline auto& get_map() * Returns a pointer to the `device_memory_resource` for the specified device. The initial * resource is a `cuda_memory_resource`. * - * `id.value()` must be in the range `[0, cudaGetDeviceCount())`, otherwise behavior is undefined. + * `device_id.value()` must be in the range `[0, cudaGetDeviceCount())`, otherwise behavior is + * undefined. * * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`, * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`. * Concurrent calls to any of these functions will result in a valid state, but the order of * execution is undefined. * - * @note The returned `device_memory_resource` should only be used when CUDA device `id` is the - * current device (e.g. set using `cudaSetDevice()`). The behavior of a device_memory_resource is - * undefined if used while the active CUDA device is a different device from the one that was active - * when the device_memory_resource was created. + * @note The returned `device_memory_resource` should only be used when CUDA device `device_id` is + * the current device (e.g. set using `cudaSetDevice()`). The behavior of a + * `device_memory_resource` is undefined if used while the active CUDA device is a different device + * from the one that was active when the `device_memory_resource` was created. * * @param device_id The id of the target device - * @return Pointer to the current `device_memory_resource` for device `id` + * @return Pointer to the current `device_memory_resource` for device `device_id` */ inline device_memory_resource* get_per_device_resource(cuda_device_id device_id) { @@ -247,5 +268,177 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc { return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); } + +/** + * @brief Get the `device_async_resource_ref for the specified device. + * + * Returns a `device_async_resource_ref` for the specified device. The initial resource_ref + * references a `cuda_memory_resource`. + * + * `device_id.value()` must be in the range `[0, cudaGetDeviceCount())`, otherwise behavior is + * undefined. 
+ * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource_ref`, + * `get_per_device_resource_ref`, `get_current_device_resource_ref`, + * `set_current_device_resource_ref` and `reset_current_device_resource_ref. Concurrent calls to any + * of these functions will result in a valid state, but the order of execution is undefined. + * + * @note The returned `device_async_resource_ref` should only be used when CUDA device `device_id` + * is the current device (e.g. set using `cudaSetDevice()`). The behavior of a + * `device_async_resource_ref` is undefined if used while the active CUDA device is a different + * device from the one that was active when the memory resource was created. + * + * @param device_id The id of the target device + * @return The current `device_async_resource_ref` for device `device_id` + */ +inline device_async_resource_ref get_per_device_resource_ref(cuda_device_id device_id) +{ + std::lock_guard lock{detail::ref_map_lock()}; + auto& map = detail::get_ref_map(); + // If a resource was never set for `id`, set to the initial resource + auto const found = map.find(device_id.value()); + if (found == map.end()) { + // + auto item = map.insert({device_id.value(), detail::initial_resource()}); + return item.first->second; + } + return found->second; +} + +/** + * @brief Set the `device_async_resource_ref` for the specified device to `new_resource_ref` + * + * `device_id.value()` must be in the range `[0, cudaGetDeviceCount())`, otherwise behavior is + * undefined. + * + * The object referenced by `new_resource_ref` must outlive the last use of the resource, otherwise + * behavior is undefined. It is the caller's responsibility to maintain the lifetime of the resource + * object. + * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource_ref`, + * `get_per_device_resource_ref`, `get_current_device_resource_ref`, + * `set_current_device_resource_ref` and `reset_current_device_resource_ref. Concurrent calls to any + * of these functions will result in a valid state, but the order of execution is undefined. + * + * @note The resource passed in `new_resource_ref` must have been created when device `device_id` + * was the current CUDA device (e.g. set using `cudaSetDevice()`). The behavior of a + * `device_async_resource_ref` is undefined if used while the active CUDA device is a different + * device from the one that was active when the memory resource was created. 
+ * + * @param device_id The id of the target device + * @param new_resource_ref new `device_async_resource_ref` to use as new resource for `device_id` + * @return The previous `device_async_resource_ref` for `device_id` + */ +inline device_async_resource_ref set_per_device_resource_ref( + cuda_device_id device_id, device_async_resource_ref new_resource_ref) +{ + std::lock_guard lock{detail::ref_map_lock()}; + auto& map = detail::get_ref_map(); + auto const old_itr = map.find(device_id.value()); + // If a resource didn't previously exist for `id`, return pointer to initial_resource + // Note: because resource_ref is not default-constructible, we can't use std::map::operator[] + if (old_itr == map.end()) { + map.insert({device_id.value(), new_resource_ref}); + std::cout << "returning initial resource in set_per_device_resource_ref\n"; + return device_async_resource_ref{detail::initial_resource()}; + } + + auto old_resource_ref = old_itr->second; + old_itr->second = new_resource_ref; // update map directly via iterator + return old_resource_ref; +} + +/** + * @brief Get the `device_async_resource_ref` for the current device. + * + * Returns the `device_async_resource_ref` set for the current device. The initial resource_ref + * references a `cuda_memory_resource`. + * + * The "current device" is the device returned by `cudaGetDevice`. + * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource_ref`, + * `get_per_device_resource_ref`, `get_current_device_resource_ref`, + * `set_current_device_resource_ref` and `reset_current_device_resource_ref. Concurrent calls to any + * of these functions will result in a valid state, but the order of execution is undefined. + + * + * @note The returned `device_async_resource_ref` should only be used with the current CUDA device. + * Changing the current device (e.g. using `cudaSetDevice()`) and then using the returned + * `resource_ref` can result in undefined behavior. The behavior of a `device_async_resource_ref` is + * undefined if used while the active CUDA device is a different device from the one that was active + * when the memory resource was created. + * + * @return `device_async_resource_ref` active for the current device + */ +inline device_async_resource_ref get_current_device_resource_ref() +{ + return get_per_device_resource_ref(rmm::get_current_cuda_device()); +} + +/** + * @brief Set the `device_async_resource_ref` for the current device. + * + * The "current device" is the device returned by `cudaGetDevice`. + * + * The object referenced by `new_resource_ref` must outlive the last use of the resource, otherwise + * behavior is undefined. It is the caller's responsibility to maintain the lifetime of the resource + * object. + * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource_ref`, + * `get_per_device_resource_ref`, `get_current_device_resource_ref`, + * `set_current_device_resource_ref` and `reset_current_device_resource_ref. Concurrent calls to any + * of these functions will result in a valid state, but the order of execution is undefined. + * + * @note The resource passed in `new_resource` must have been created for the current CUDA device. + * The behavior of a `device_async_resource_ref` is undefined if used while the active CUDA device + * is a different device from the one that was active when the memory resource was created. 
+ * + * @param new_resource_ref New `device_async_resource_ref` to use for the current device + * @return Previous `device_async_resource_ref` for the current device + */ +inline device_async_resource_ref set_current_device_resource_ref( + device_async_resource_ref new_resource_ref) +{ + return set_per_device_resource_ref(rmm::get_current_cuda_device(), new_resource_ref); +} + +/** + * @brief Reset the `device_async_resource_ref` for the specified device to the initial resource. + * + * Resets to a reference to the initial `cuda_memory_resource`. + * + * `device_id.value()` must be in the range `[0, cudaGetDeviceCount())`, otherwise behavior is + * undefined. + * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource_ref`, + * `get_per_device_resource_ref`, `get_current_device_resource_ref`, + * `set_current_device_resource_ref` and `reset_current_device_resource_ref. Concurrent calls to any + * of these functions will result in a valid state, but the order of execution is undefined. + * + * @param device_id The id of the target device + * @return Previous `device_async_resource_ref` for `device_id` + */ +inline device_async_resource_ref reset_per_device_resource_ref(cuda_device_id device_id) +{ + return set_per_device_resource_ref(device_id, detail::initial_resource()); +} + +/** + * @brief Reset the `device_async_resource_ref` for the current device to the initial resource. + * + * Resets to a reference to the initial `cuda_memory_resource`. The "current device" is the device + * returned by `cudaGetDevice`. + * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource_ref`, + * `get_per_device_resource_ref`, `get_current_device_resource_ref`, + * `set_current_device_resource_ref` and `reset_current_device_resource_ref. Concurrent calls to any + * of these functions will result in a valid state, but the order of execution is undefined. + * + * @return Previous `device_async_resource_ref` for `device_id` + */ +inline device_async_resource_ref reset_current_device_resource_ref() +{ + return reset_per_device_resource_ref(rmm::get_current_cuda_device()); +} /** @} */ // end of group } // namespace rmm::mr From d8936988d6d35e860d8bd4b83bda91566d3668b3 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 20 Aug 2024 05:56:19 +0000 Subject: [PATCH 02/24] doc: explain that set_current_device_resource is in transition --- include/rmm/mr/device/per_device_resource.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 67aac51ac..15823cfe9 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -53,9 +53,16 @@ * `get_per_device_resource(id)` will return a pointer to a `cuda_memory_resource`. * * To fetch and modify the resource for the current CUDA device, `get_current_device_resource()` and - * `set_current_device_resource()` will automatically use the current CUDA device id from + * `set_current_device_resource()` automatically use the current CUDA device id from * `cudaGetDevice()`. * + * RMM is in transition to use `cuda::mr::async_resource_ref` in place of raw pointers to + * `device_memory_resource`. 
The `set_per_device_resource_ref`, `get_per_device_resource_ref`, + * `get_current_device_resource_ref`, `set_current_device_resource_ref`, and + * `reset_current_device_resource_ref` functions provide the same functionality as their + * `device_memory_resource` counterparts, but with `device_async_resource_ref` objects. + * The raw pointer versions are expected to be deprecated and removed in a future release. + * * Creating a device_memory_resource for each device requires care to set the current device * before creating each resource, and to maintain the lifetime of the resources as long as they * are set as per-device resources. Here is an example loop that creates `unique_ptr`s to From b068ded2a7ea07e9ce04a6a5617b1924ecce2cbc Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 20 Aug 2024 06:26:22 +0000 Subject: [PATCH 03/24] Update/add tests for get/set_current_device_resource_ref --- .../mr/device/mr_ref_multithreaded_tests.cpp | 86 +++++++++++-------- tests/mr/device/mr_ref_test.hpp | 13 ++- tests/mr/device/mr_ref_tests.cpp | 36 +++++--- 3 files changed, 87 insertions(+), 48 deletions(-) diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp index 3ba32445f..7d749efd1 100644 --- a/tests/mr/device/mr_ref_multithreaded_tests.cpp +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -69,6 +70,8 @@ void spawn(Task task, Arguments&&... args) TEST(DefaultTest, UseCurrentDeviceResource_mt) { spawn(test_get_current_device_resource); } +TEST(DefaultTest, UseCurrentDeviceResourceRef_mt) { spawn(test_get_current_device_resource_ref); } + TEST(DefaultTest, CurrentDeviceResourceIsCUDA_mt) { spawn([]() { @@ -77,6 +80,14 @@ TEST(DefaultTest, CurrentDeviceResourceIsCUDA_mt) }); } +TEST(DefaultTest, CurrentDeviceResourceRefIsCUDA_mt) +{ + spawn([]() { + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), + rmm::device_async_resource_ref{rmm::mr::detail::initial_resource()}); + }); +} + TEST(DefaultTest, GetCurrentDeviceResource_mt) { spawn([]() { @@ -86,57 +97,64 @@ TEST(DefaultTest, GetCurrentDeviceResource_mt) }); } -// Disable until we support resource_ref with set_current_device_resource -/*TEST_P(mr_ref_test_mt, SetCurrentDeviceResource_mt) +TEST(DefaultTest, GetCurrentDeviceResourceRef_mt) { - // single thread changes default resource, then multiple threads use it + spawn([]() { + auto mr = rmm::mr::get_current_device_resource_ref(); + EXPECT_EQ(mr, rmm::device_async_resource_ref{rmm::mr::detail::initial_resource()}); + }); +} - rmm::mr::device_memory_resource* old = rmm::mr::set_current_device_resource(this->mr.get()); - EXPECT_NE(nullptr, old); +TEST_P(mr_ref_test_mt, SetCurrentDeviceResourceRef_mt) +{ + // single thread changes default resource, then multiple threads use it + auto old = rmm::mr::set_current_device_resource_ref(this->ref); - spawn([mr = this->mr.get()]() { - EXPECT_EQ(mr, rmm::mr::get_current_device_resource()); - test_get_current_device_resource(); // test allocating with the new default resource + spawn([mr = this->ref]() { + EXPECT_EQ(mr, rmm::mr::get_current_device_resource_ref()); + test_get_current_device_resource_ref(); // test allocating with the new default resource }); - // setting default resource w/ nullptr should reset to initial - rmm::mr::set_current_device_resource(nullptr); - EXPECT_TRUE(old->is_equal(*rmm::mr::get_current_device_resource())); -}*/ + // resetting default resource should reset to initial + 
rmm::mr::reset_current_device_resource_ref(); + EXPECT_EQ(old, rmm::mr::get_current_device_resource_ref()); +} -/*TEST_P(mr_ref_test_mt, SetCurrentDeviceResourcePerThread_mt) +TEST_P(mr_ref_test_mt, SetCurrentDeviceResourceRefPerThread_mt) { int num_devices{}; RMM_CUDA_TRY(cudaGetDeviceCount(&num_devices)); std::vector threads; threads.reserve(num_devices); + + auto mr = this->ref; + for (int i = 0; i < num_devices; ++i) { - threads.emplace_back(std::thread{[mr = this->mr.get()](auto dev_id) { - RMM_CUDA_TRY(cudaSetDevice(dev_id)); - rmm::mr::device_memory_resource* old = - rmm::mr::set_current_device_resource(mr); - EXPECT_NE(nullptr, old); - // initial resource for this device should be CUDA mr - EXPECT_TRUE(old->is_equal(rmm::mr::cuda_memory_resource{})); - // get_current_device_resource should equal the resource we - // just set - EXPECT_EQ(mr, rmm::mr::get_current_device_resource()); - // Setting current dev resource to nullptr should reset to - // cuda MR and return the MR we previously set - old = rmm::mr::set_current_device_resource(nullptr); - EXPECT_NE(nullptr, old); - EXPECT_EQ(old, mr); - EXPECT_TRUE(rmm::mr::get_current_device_resource()->is_equal( - rmm::mr::cuda_memory_resource{})); - }, - i}); + threads.emplace_back( + [mr](auto dev_id) { + RMM_CUDA_TRY(cudaSetDevice(dev_id)); + auto cuda_ref = rmm::mr::get_current_device_resource_ref(); + auto old = rmm::mr::set_current_device_resource_ref(mr); + + // initial resource for this device should be CUDA mr + EXPECT_EQ(old, cuda_ref); + // get_current_device_resource_ref should equal the resource we + // just set + EXPECT_EQ(mr, rmm::mr::get_current_device_resource_ref()); + // Resetting current dev resource ref should make it + // cuda MR and return the MR we previously set + old = rmm::mr::reset_current_device_resource_ref(); + EXPECT_EQ(old, mr); + EXPECT_EQ(cuda_ref, rmm::mr::get_current_device_resource_ref()); + }, + i); } for (auto& thread : threads) { thread.join(); } -}*/ +} TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); } @@ -247,7 +265,7 @@ void test_allocate_async_free_different_threads(rmm::device_async_resource_ref r std::mutex mtx; std::condition_variable allocations_ready; std::list allocations; - cudaEvent_t event; + cudaEvent_t event{}; RMM_CUDA_TRY(cudaEventCreate(&event)); diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp index df0045d2b..6e63b3838 100644 --- a/tests/mr/device/mr_ref_test.hpp +++ b/tests/mr/device/mr_ref_test.hpp @@ -73,6 +73,15 @@ inline void test_get_current_device_resource() rmm::mr::get_current_device_resource()->deallocate(ptr, 1_MiB); } +inline void test_get_current_device_resource_ref() +{ + void* ptr = rmm::mr::get_current_device_resource_ref().allocate(1_MiB); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_properly_aligned(ptr)); + EXPECT_TRUE(is_device_accessible_memory(ptr)); + rmm::mr::get_current_device_resource_ref().deallocate(ptr, 1_MiB); +} + inline void test_allocate(resource_ref ref, std::size_t bytes) { try { @@ -392,7 +401,7 @@ inline auto make_binning() struct mr_factory_base { std::string name{}; ///< Name to associate with tests that use this factory - resource_ref mr{rmm::mr::get_current_device_resource()}; + resource_ref mr{rmm::mr::get_current_device_resource_ref()}; bool skip_test{false}; }; @@ -468,7 +477,7 @@ struct mr_ref_test : public ::testing::TestWithParam { } std::shared_ptr factory_obj{}; - resource_ref ref{rmm::mr::get_current_device_resource()}; + resource_ref 
ref{rmm::mr::get_current_device_resource_ref()}; rmm::cuda_stream stream{}; }; diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp index d94817bef..55e91d765 100644 --- a/tests/mr/device/mr_ref_tests.cpp +++ b/tests/mr/device/mr_ref_tests.cpp @@ -16,7 +16,9 @@ #include "mr_ref_test.hpp" +#include #include +#include #include @@ -63,6 +65,8 @@ TEST(DefaultTest, CurrentDeviceResourceIsCUDA) TEST(DefaultTest, UseCurrentDeviceResource) { test_get_current_device_resource(); } +TEST(DefaultTest, UseCurrentDeviceResourceRef) { test_get_current_device_resource_ref(); } + TEST(DefaultTest, GetCurrentDeviceResource) { auto* mr = rmm::mr::get_current_device_resource(); @@ -70,25 +74,33 @@ TEST(DefaultTest, GetCurrentDeviceResource) EXPECT_TRUE(mr->is_equal(rmm::mr::cuda_memory_resource{})); } -// Disable until we support resource_ref with set_current_device_resource -/*TEST_P(mr_ref_test, SetCurrentDeviceResource) +TEST(DefaultTest, GetCurrentDeviceResourceRef) +{ + auto mr = rmm::mr::get_current_device_resource_ref(); + EXPECT_EQ(mr, rmm::device_async_resource_ref{rmm::mr::detail::initial_resource()}); +} + +TEST_P(mr_ref_test, SetCurrentDeviceResourceRef) { - rmm::mr::device_memory_resource* old{}; - old = rmm::mr::set_current_device_resource(this->mr.get()); - EXPECT_NE(nullptr, old); + rmm::mr::cuda_memory_resource cuda_mr{}; + auto cuda_ref = rmm::device_async_resource_ref{cuda_mr}; + + rmm::mr::set_current_device_resource_ref(cuda_ref); + auto old = rmm::mr::set_current_device_resource_ref(this->ref); // old mr should equal a cuda mr - EXPECT_TRUE(old->is_equal(rmm::mr::cuda_memory_resource{})); + EXPECT_EQ(old, cuda_ref); // current dev resource should equal this resource - EXPECT_TRUE(this->mr->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(this->ref, rmm::mr::get_current_device_resource_ref()); - test_get_current_device_resource(); + test_get_current_device_resource_ref(); - // setting to `nullptr` should reset to initial cuda resource - rmm::mr::set_current_device_resource(nullptr); - EXPECT_TRUE(rmm::mr::get_current_device_resource()->is_equal(rmm::mr::cuda_memory_resource{})); -}*/ + // Resetting should reset to initial cuda resource + rmm::mr::reset_current_device_resource_ref(); + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::detail::initial_resource()}, + rmm::mr::get_current_device_resource_ref()); +} TEST_P(mr_ref_test, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); } From c7d443d3c015677ddc8e25bfaf45380377174cf1 Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Tue, 20 Aug 2024 22:37:34 +1000 Subject: [PATCH 04/24] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- include/rmm/mr/device/per_device_resource.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 15823cfe9..e9fbc74b6 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -305,7 +305,6 @@ inline device_async_resource_ref get_per_device_resource_ref(cuda_device_id devi // If a resource was never set for `id`, set to the initial resource auto const found = map.find(device_id.value()); if (found == map.end()) { - // auto item = map.insert({device_id.value(), detail::initial_resource()}); return item.first->second; } @@ -342,11 +341,10 @@ inline device_async_resource_ref set_per_device_resource_ref( std::lock_guard 
lock{detail::ref_map_lock()}; auto& map = detail::get_ref_map(); auto const old_itr = map.find(device_id.value()); - // If a resource didn't previously exist for `id`, return pointer to initial_resource + // If a resource didn't previously exist for `device_id`, return pointer to initial_resource // Note: because resource_ref is not default-constructible, we can't use std::map::operator[] if (old_itr == map.end()) { map.insert({device_id.value(), new_resource_ref}); - std::cout << "returning initial resource in set_per_device_resource_ref\n"; return device_async_resource_ref{detail::initial_resource()}; } From 3cd6917cbb44f734d98d6a598a37e00d7c6739cd Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:37:17 +1000 Subject: [PATCH 05/24] RMM_EXPORT ref_map_lock() Co-authored-by: Lawrence Mitchell --- include/rmm/mr/device/per_device_resource.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index e9fbc74b6..2e963b5fd 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -132,7 +132,7 @@ RMM_EXPORT inline auto& get_map() /** * @briefreturn{Reference to the lock} */ -inline std::mutex& ref_map_lock() +RMM_EXPORT inline std::mutex& ref_map_lock() { static std::mutex ref_map_lock; return ref_map_lock; From adbf7f666c8d2a8105b1f9deaafeb4baa5d02ac8 Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:39:01 +1000 Subject: [PATCH 06/24] Apply suggestions from code review Co-authored-by: Bradley Dice --- include/rmm/mr/device/per_device_resource.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 2e963b5fd..a2ca1819e 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -277,7 +277,7 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc } /** - * @brief Get the `device_async_resource_ref for the specified device. + * @brief Get the `device_async_resource_ref` for the specified device. * * Returns a `device_async_resource_ref` for the specified device. The initial resource_ref * references a `cuda_memory_resource`. @@ -287,7 +287,7 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc * * This function is thread-safe with respect to concurrent calls to `set_per_device_resource_ref`, * `get_per_device_resource_ref`, `get_current_device_resource_ref`, - * `set_current_device_resource_ref` and `reset_current_device_resource_ref. Concurrent calls to any + * `set_current_device_resource_ref` and `reset_current_device_resource_ref`. Concurrent calls to any * of these functions will result in a valid state, but the order of execution is undefined. * * @note The returned `device_async_resource_ref` should only be used when CUDA device `device_id` From b5a831ab4f4541dd9d19e53579a4fc130d292e7a Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 21 Aug 2024 02:16:49 +0000 Subject: [PATCH 07/24] Explicit RMM_EXPORT on all functions that return a function-local static. 
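Context for this change: with hidden symbol visibility, an inline function that returns a
function-local static can be instantiated separately in every shared library that includes the
header, so each library sees its own mutex or map (see
https://github.com/rapidsai/rmm/issues/826, already noted for get_map()). Marking the accessor
RMM_EXPORT gives it default visibility so the whole process shares one instance. A minimal
standalone sketch of the pattern follows; the EXAMPLE_EXPORT macro is illustrative and is not
RMM's actual RMM_EXPORT definition.

    // Illustrative accessor for a function-local static with default visibility,
    // so all shared libraries in a process share the same instance.
    #include <mutex>

    #if defined(__GNUC__) || defined(__clang__)
    #define EXAMPLE_EXPORT __attribute__((visibility("default")))
    #else
    #define EXAMPLE_EXPORT
    #endif

    EXAMPLE_EXPORT inline std::mutex& example_lock()
    {
      static std::mutex lock;  // one instance per process, not one per shared library
      return lock;
    }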
--- include/rmm/mr/device/per_device_resource.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index a2ca1819e..7d747395f 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -104,7 +104,7 @@ namespace detail { * * @return Pointer to the static cuda_memory_resource used as the initial, default resource */ -inline device_memory_resource* initial_resource() +RMM_EXPORT inline device_memory_resource* initial_resource() { static cuda_memory_resource mr{}; return &mr; @@ -113,7 +113,7 @@ inline device_memory_resource* initial_resource() /** * @briefreturn{Reference to the lock} */ -inline std::mutex& map_lock() +RMM_EXPORT inline std::mutex& map_lock() { static std::mutex map_lock; return map_lock; @@ -287,8 +287,8 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc * * This function is thread-safe with respect to concurrent calls to `set_per_device_resource_ref`, * `get_per_device_resource_ref`, `get_current_device_resource_ref`, - * `set_current_device_resource_ref` and `reset_current_device_resource_ref`. Concurrent calls to any - * of these functions will result in a valid state, but the order of execution is undefined. + * `set_current_device_resource_ref` and `reset_current_device_resource_ref`. Concurrent calls to + * any of these functions will result in a valid state, but the order of execution is undefined. * * @note The returned `device_async_resource_ref` should only be used when CUDA device `device_id` * is the current device (e.g. set using `cudaSetDevice()`). The behavior of a From 812d08d115203e78705f01d252182db7753963ec Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 21 Aug 2024 02:20:46 +0000 Subject: [PATCH 08/24] Doc: ref and ptr versions are not interchangeable. --- include/rmm/mr/device/per_device_resource.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 7d747395f..6b78ee463 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -60,8 +60,10 @@ * `device_memory_resource`. The `set_per_device_resource_ref`, `get_per_device_resource_ref`, * `get_current_device_resource_ref`, `set_current_device_resource_ref`, and * `reset_current_device_resource_ref` functions provide the same functionality as their - * `device_memory_resource` counterparts, but with `device_async_resource_ref` objects. - * The raw pointer versions are expected to be deprecated and removed in a future release. + * `device_memory_resource` counterparts, but with `device_async_resource_ref` objects. The raw + * pointer versions and the `resource_ref` versions maintain distinc state and are not + * interchangeable. The raw pointer versions are expected to be deprecated and removed in a future + * release. * * Creating a device_memory_resource for each device requires care to set the current device * before creating each resource, and to maintain the lifetime of the resources as long as they From 1c43d63da1b6776a83ec9473b7f2871299af4916 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 22 Aug 2024 03:12:40 +0000 Subject: [PATCH 09/24] Use resource_ref in MR adaptors and related tests. 
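The adaptor changes below replace stored Upstream* pointers with device_async_resource_ref.
An untested usage sketch of the new constructor form, using limiting_resource_adaptor as an
example (header paths are assumed from RMM's public layout, and a GPU is required to actually
run it):

    #include <rmm/mr/device/cuda_memory_resource.hpp>
    #include <rmm/mr/device/limiting_resource_adaptor.hpp>
    #include <rmm/resource_ref.hpp>

    int main()
    {
      rmm::mr::cuda_memory_resource cuda_mr{};
      // The adaptor can now be constructed from a device_async_resource_ref instead of a raw
      // pointer. The caller still owns cuda_mr and must keep it alive while the adaptor is used.
      rmm::mr::limiting_resource_adaptor<rmm::mr::cuda_memory_resource> limited{
        rmm::device_async_resource_ref{cuda_mr}, 1 << 20};

      void* ptr = limited.allocate(256);
      limited.deallocate(ptr, 256);
      return 0;
    }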
--- .../mr/device/aligned_resource_adaptor.hpp | 53 +++++++++++++------ .../rmm/mr/device/arena_memory_resource.hpp | 35 ++++++++++-- .../mr/device/callback_memory_resource.hpp | 13 ++--- include/rmm/mr/device/detail/arena.hpp | 31 ++++++----- .../failure_callback_resource_adaptor.hpp | 42 ++++++++++----- .../mr/device/limiting_resource_adaptor.hpp | 42 ++++++++++----- .../rmm/mr/device/pool_memory_resource.hpp | 41 +++++++++++--- .../mr/device/statistics_resource_adaptor.hpp | 27 +++++----- .../mr/device/thrust_allocator_adaptor.hpp | 2 +- .../mr/device/tracking_resource_adaptor.hpp | 37 ++++++++----- tests/container_multidevice_tests.cu | 24 ++++----- tests/cuda_stream_tests.cpp | 2 +- tests/device_check_resource_adaptor.hpp | 16 +++--- tests/mock_resource.hpp | 5 ++ tests/mr/device/aligned_mr_tests.cpp | 4 +- tests/mr/device/arena_mr_tests.cpp | 37 +++++++------ tests/mr/device/callback_mr_tests.cpp | 37 +++++++------ tests/mr/device/limiting_mr_tests.cpp | 14 ++--- tests/mr/device/pool_mr_tests.cpp | 40 +++++--------- tests/mr/device/statistics_mr_tests.cpp | 20 +++---- tests/mr/device/tracking_mr_tests.cpp | 24 ++++----- tests/mr/host/mr_ref_tests.cpp | 3 ++ 22 files changed, 335 insertions(+), 214 deletions(-) diff --git a/include/rmm/mr/device/aligned_resource_adaptor.hpp b/include/rmm/mr/device/aligned_resource_adaptor.hpp index a91056dfa..088d28413 100644 --- a/include/rmm/mr/device/aligned_resource_adaptor.hpp +++ b/include/rmm/mr/device/aligned_resource_adaptor.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -65,12 +66,36 @@ class aligned_resource_adaptor final : public device_memory_resource { * @param alignment_threshold Only allocations with a size larger than or equal to this threshold * are aligned. */ - explicit aligned_resource_adaptor(Upstream* upstream, + explicit aligned_resource_adaptor(device_async_resource_ref upstream, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT, std::size_t alignment_threshold = default_alignment_threshold) : upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + RMM_EXPECTS(rmm::is_supported_alignment(alignment), + "Allocation alignment is not a power of 2."); + } + + /** + * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests. + * + * @throws rmm::logic_error if `upstream == nullptr` + * @throws rmm::logic_error if `allocation_alignment` is not a power of 2 + * + * @param upstream The resource used for allocating/deallocating device memory. + * @param alignment The size used for allocation alignment. + * @param alignment_threshold Only allocations with a size larger than or equal to this threshold + * are aligned. 
+ */ + explicit aligned_resource_adaptor(Upstream* upstream, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT, + std::size_t alignment_threshold = default_alignment_threshold) + : upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()}, + alignment_{alignment}, + alignment_threshold_{alignment_threshold} + { RMM_EXPECTS(rmm::is_supported_alignment(alignment), "Allocation alignment is not a power of 2."); } @@ -90,11 +115,6 @@ class aligned_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief The default alignment used by the adaptor. */ @@ -104,8 +124,8 @@ class aligned_resource_adaptor final : public device_memory_resource { using lock_guard = std::lock_guard; /** - * @brief Allocates memory of size at least `bytes` using the upstream resource with the specified - * alignment. + * @brief Allocates memory of size at least `bytes` using the upstream resource with the + * specified alignment. * * @throws rmm::bad_alloc if the requested allocation could not be fulfilled * by the upstream resource. @@ -117,10 +137,10 @@ class aligned_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) { - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, 1, stream); } auto const size = upstream_allocation_size(bytes); - void* pointer = upstream_->allocate(size, stream); + void* pointer = get_upstream_resource().allocate_async(size, 1, stream); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) auto const address = reinterpret_cast(pointer); auto const aligned_address = rmm::align_up(address, alignment_); @@ -143,7 +163,7 @@ class aligned_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, 1, stream); } else { { lock_guard lock(mtx_); @@ -153,7 +173,7 @@ class aligned_resource_adaptor final : public device_memory_resource { pointers_.erase(iter); } } - upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream); + get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream); } } @@ -174,8 +194,8 @@ class aligned_resource_adaptor final : public device_memory_resource { } /** - * @brief Calculate the allocation size needed from upstream to account for alignments of both the - * size and the base pointer. + * @brief Calculate the allocation size needed from upstream to account for alignments of both + * the size and the base pointer. * * @param bytes The requested allocation size. * @return Allocation size needed from upstream to align both the size and the base pointer. 
@@ -186,7 +206,8 @@ class aligned_resource_adaptor final : public device_memory_resource { return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT; } - Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests + /// The upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{rmm::mr::get_current_device_resource_ref()}; std::unordered_map pointers_; ///< Map of aligned pointers to upstream pointers. std::size_t alignment_; ///< The size used for allocation alignment std::size_t alignment_threshold_; ///< The size above which allocations should be aligned diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index b274e0c18..0860a46d5 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -80,6 +81,26 @@ namespace rmm::mr { template class arena_memory_resource final : public device_memory_resource { public: + /** + * @brief Construct an `arena_memory_resource`. + * + * @param upstream_mr The memory resource from which to allocate blocks for the global arena. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available + * memory on the current device. + * @param dump_log_on_failure If true, dump memory log when running out of memory. + */ + explicit arena_memory_resource(device_async_resource_ref upstream_mr, + std::optional arena_size = std::nullopt, + bool dump_log_on_failure = false) + : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} + { + if (dump_log_on_failure_) { + logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + // Set the level to `debug` for more detailed output. + logger_->set_level(spdlog::level::info); + } + } + /** * @brief Construct an `arena_memory_resource`. * @@ -93,7 +114,13 @@ class arena_memory_resource final : public device_memory_resource { explicit arena_memory_resource(Upstream* upstream_mr, std::optional arena_size = std::nullopt, bool dump_log_on_failure = false) - : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} + : global_arena_{[upstream_mr]() { + RMM_EXPECTS(upstream_mr != nullptr, + "Unexpected null upstream memory resource."); + return device_async_resource_ref{*upstream_mr}; + }(), + arena_size}, + dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); @@ -111,8 +138,8 @@ class arena_memory_resource final : public device_memory_resource { arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete; private: - using global_arena = rmm::mr::detail::arena::global_arena; - using arena = rmm::mr::detail::arena::arena; + using global_arena = rmm::mr::detail::arena::global_arena; + using arena = rmm::mr::detail::arena::arena; /** * @brief Allocates memory of size at least `bytes`. 
@@ -272,7 +299,7 @@ class arena_memory_resource final : public device_memory_resource { std::unique_lock lock(map_mtx_); auto thread_arena = std::make_shared(global_arena_); thread_arenas_.emplace(thread_id, thread_arena); - thread_local detail::arena::arena_cleaner cleaner{thread_arena}; + thread_local detail::arena::arena_cleaner cleaner{thread_arena}; return *thread_arena; } } diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index 1483925de..74af8679a 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -84,12 +84,13 @@ class callback_memory_resource final : public device_memory_resource { * It is the caller's responsibility to maintain the lifetime of the pointed-to data * for the duration of the lifetime of the `callback_memory_resource`. */ - callback_memory_resource(allocate_callback_t allocate_callback, - deallocate_callback_t deallocate_callback, - void* allocate_callback_arg = nullptr, - void* deallocate_callback_arg = nullptr) noexcept - : allocate_callback_(allocate_callback), - deallocate_callback_(deallocate_callback), + callback_memory_resource( + allocate_callback_t allocate_callback, + deallocate_callback_t deallocate_callback, + void* allocate_callback_arg = nullptr, // NOLINT(bugprone-easily-swappable-parameters) + void* deallocate_callback_arg = nullptr) noexcept + : allocate_callback_(std::move(allocate_callback)), + deallocate_callback_(std::move(deallocate_callback)), allocate_callback_arg_(allocate_callback_arg), deallocate_callback_arg_(deallocate_callback_arg) { diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index c7965ca34..324b9c928 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -492,7 +493,6 @@ inline auto max_free_size(std::set const& superblocks) * @tparam Upstream Memory resource to use for allocating the arena. Implements * rmm::mr::device_memory_resource interface. */ -template class global_arena final { public: /** @@ -504,10 +504,9 @@ class global_arena final { * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory * on the current device. */ - global_arena(Upstream* upstream_mr, std::optional arena_size) + global_arena(device_async_resource_ref upstream_mr, std::optional arena_size) : upstream_mr_{upstream_mr} { - RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer."); auto const size = rmm::align_down(arena_size.value_or(default_size()), rmm::CUDA_ALLOCATION_ALIGNMENT); RMM_EXPECTS(size >= superblock::minimum_size, @@ -528,7 +527,7 @@ class global_arena final { ~global_arena() { std::lock_guard lock(mtx_); - upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); + upstream_mr_.deallocate(upstream_block_.pointer(), upstream_block_.size()); } /** @@ -537,7 +536,7 @@ class global_arena final { * @param size The size in bytes of the allocation. * @return bool True if the allocation should be handled by the global arena. */ - bool handles(std::size_t size) const { return size > superblock::minimum_size; } + static bool handles(std::size_t size) { return size > superblock::minimum_size; } /** * @brief Acquire a superblock that can fit a block of the given size. @@ -608,7 +607,7 @@ class global_arena final { * @param stream Stream on which to perform deallocation. 
* @return bool true if the allocation is found, false otherwise. */ - bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) + bool deallocate_async(void* ptr, std::size_t size, cuda_stream_view stream) { RMM_LOGGING_ASSERT(handles(size)); stream.synchronize_no_throw(); @@ -690,7 +689,7 @@ class global_arena final { * @brief Default size of the global arena if unspecified. * @return the default global arena size. */ - constexpr std::size_t default_size() const + static std::size_t default_size() { auto const [free, total] = rmm::available_device_memory(); return free / 2; @@ -703,7 +702,7 @@ class global_arena final { */ void initialize(std::size_t size) { - upstream_block_ = {upstream_mr_->allocate(size), size}; + upstream_block_ = {upstream_mr_.allocate(size), size}; superblocks_.emplace(upstream_block_.pointer(), size); } @@ -775,7 +774,7 @@ class global_arena final { } /// The upstream resource to allocate memory from. - Upstream* upstream_mr_; + device_async_resource_ref upstream_mr_; /// Block allocated from upstream so that it can be quickly freed. block upstream_block_; /// Address-ordered set of superblocks. @@ -793,7 +792,6 @@ class global_arena final { * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. */ -template class arena { public: /** @@ -801,7 +799,7 @@ class arena { * * @param global_arena The global arena from which to allocate superblocks. */ - explicit arena(global_arena& global_arena) : global_arena_{global_arena} {} + explicit arena(global_arena& global_arena) : global_arena_{global_arena} {} // Disable copy (and move) semantics. arena(arena const&) = delete; @@ -835,7 +833,9 @@ class arena { */ bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - if (global_arena_.handles(size) && global_arena_.deallocate(ptr, size, stream)) { return true; } + if (global_arena::handles(size) && global_arena_.deallocate_async(ptr, size, stream)) { + return true; + } return deallocate(ptr, size); } @@ -959,7 +959,7 @@ class arena { } /// The global arena to allocate superblocks from. - global_arena& global_arena_; + global_arena& global_arena_; /// Acquired superblocks. std::set superblocks_; /// Mutex for exclusive lock. @@ -974,10 +974,9 @@ class arena { * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. */ -template class arena_cleaner { public: - explicit arena_cleaner(std::shared_ptr> const& arena) : arena_(arena) {} + explicit arena_cleaner(std::shared_ptr const& arena) : arena_(arena) {} // Disable copy (and move) semantics. arena_cleaner(arena_cleaner const&) = delete; @@ -995,7 +994,7 @@ class arena_cleaner { private: /// A non-owning pointer to the arena that may need cleaning. 
- std::weak_ptr> arena_; + std::weak_ptr arena_; }; } // namespace rmm::mr::detail::arena diff --git a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp index 53bc572c2..f1a2178ee 100644 --- a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -79,7 +80,7 @@ using failure_callback_t = std::function; * { * bool retried{false}; * failure_callback_adaptor mr{ - * rmm::mr::get_current_device_resource(), failure_handler, &retried + * rmm::mr::get_current_device_resource_ref(), failure_handler, &retried * }; * rmm::mr::set_current_device_resource(&mr); * } @@ -103,12 +104,33 @@ class failure_callback_resource_adaptor final : public device_memory_resource { * @param callback Callback function @see failure_callback_t * @param callback_arg Extra argument passed to `callback` */ - failure_callback_resource_adaptor(Upstream* upstream, + failure_callback_resource_adaptor(device_async_resource_ref upstream, failure_callback_t callback, void* callback_arg) : upstream_{upstream}, callback_{std::move(callback)}, callback_arg_{callback_arg} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + } + + /** + * @brief Construct a new `failure_callback_resource_adaptor` using `upstream` to satisfy + * allocation requests. + * + * @throws rmm::logic_error if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + * @param callback Callback function @see failure_callback_t + * @param callback_arg Extra argument passed to `callback` + */ + failure_callback_resource_adaptor(Upstream* upstream, + failure_callback_t callback, + void* callback_arg) + : upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()}, + callback_{std::move(callback)}, + callback_arg_{callback_arg} + { } failure_callback_resource_adaptor() = delete; @@ -128,11 +150,6 @@ class failure_callback_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -151,7 +168,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { while (true) { try { - ret = upstream_->allocate(bytes, stream); + ret = get_upstream_resource().allocate_async(bytes, stream); break; } catch (exception_type const& e) { if (!callback_(bytes, callback_arg_)) { throw; } @@ -169,7 +186,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -183,11 +200,12 @@ class failure_callback_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the 
upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{rmm::mr::get_current_device_resource_ref()}; failure_callback_t callback_; void* callback_arg_; }; diff --git a/include/rmm/mr/device/limiting_resource_adaptor.hpp b/include/rmm/mr/device/limiting_resource_adaptor.hpp index 76a5a31c1..933682ead 100644 --- a/include/rmm/mr/device/limiting_resource_adaptor.hpp +++ b/include/rmm/mr/device/limiting_resource_adaptor.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,7 @@ class limiting_resource_adaptor final : public device_memory_resource { * @param allocation_limit Maximum memory allowed for this allocator * @param alignment Alignment in bytes for the start of each allocated buffer */ - limiting_resource_adaptor(Upstream* upstream, + limiting_resource_adaptor(device_async_resource_ref upstream, std::size_t allocation_limit, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) : allocation_limit_{allocation_limit}, @@ -61,7 +62,29 @@ class limiting_resource_adaptor final : public device_memory_resource { alignment_(alignment), upstream_{upstream} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + } + + /** + * @brief Construct a new limiting resource adaptor using `upstream` to satisfy + * allocation requests and limiting the total allocation amount possible. + * + * @throws rmm::logic_error if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + * @param allocation_limit Maximum memory allowed for this allocator + * @param alignment Alignment in bytes for the start of each allocated buffer + */ + limiting_resource_adaptor(Upstream* upstream, + std::size_t allocation_limit, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) + : allocation_limit_{allocation_limit}, + allocated_bytes_(0), + alignment_(alignment), + upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()} + { } limiting_resource_adaptor() = delete; @@ -81,11 +104,6 @@ class limiting_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Query the number of bytes that have been allocated. Note that * this can not be used to know how large of an allocation is possible due @@ -126,7 +144,7 @@ class limiting_resource_adaptor final : public device_memory_resource { auto const old = allocated_bytes_.fetch_add(proposed_size); if (old + proposed_size <= allocation_limit_) { try { - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, stream); } catch (...) 
{ allocated_bytes_ -= proposed_size; throw; @@ -147,7 +165,7 @@ class limiting_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { std::size_t allocated_size = rmm::align_up(bytes, alignment_); - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); allocated_bytes_ -= allocated_size; } @@ -162,7 +180,7 @@ class limiting_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto const* cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -175,8 +193,8 @@ class limiting_resource_adaptor final : public device_memory_resource { // todo: should be some way to ask the upstream... std::size_t alignment_; - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests + // The upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{rmm::mr::get_current_device_resource_ref()}; }; /** diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index a3a972904..5e76aaf74 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -112,6 +113,34 @@ class pool_memory_resource final friend class detail::stream_ordered_memory_resource, detail::coalescing_free_list>; + /** + * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using + * `upstream_mr`. + * + * @throws rmm::logic_error if `upstream_mr == nullptr` + * @throws rmm::logic_error if `initial_pool_size` is not aligned to a multiple of + * pool_memory_resource::allocation_alignment bytes. + * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a + * multiple of pool_memory_resource::allocation_alignment bytes. + * + * @param upstream_mr The memory_resource from which to allocate blocks for the pool. + * @param initial_pool_size Minimum size, in bytes, of the initial pool. + * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all + * of the available from the upstream resource. + */ + explicit pool_memory_resource(device_async_resource_ref upstream_mr, + std::size_t initial_pool_size, + std::optional maximum_pool_size = std::nullopt) + : upstream_mr_{upstream_mr} + { + RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT), + "Error, Initial pool size required to be a multiple of 256 bytes"); + RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT), + "Error, Maximum pool size required to be a multiple of 256 bytes"); + + initialize_pool(initial_pool_size, maximum_pool_size); + } + /** * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using * `upstream_mr`. 
@@ -132,7 +161,7 @@ class pool_memory_resource final std::optional maximum_pool_size = std::nullopt) : upstream_mr_{[upstream_mr]() { RMM_EXPECTS(nullptr != upstream_mr, "Unexpected null upstream pointer."); - return upstream_mr; + return device_async_resource_ref{*upstream_mr}; }()} { RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT), @@ -182,16 +211,11 @@ class pool_memory_resource final /** * @briefreturn{rmm::device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Computes the size of the current pool * @@ -464,7 +488,8 @@ class pool_memory_resource final } private: - Upstream* upstream_mr_; // The "heap" to allocate the pool from + // The "heap" to allocate the pool from + device_async_resource_ref upstream_mr_; std::size_t current_pool_size_{}; std::optional maximum_pool_size_{}; diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp index cbf1b87d2..fcb364ea8 100644 --- a/include/rmm/mr/device/statistics_resource_adaptor.hpp +++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -111,6 +112,8 @@ class statistics_resource_adaptor final : public device_memory_resource { } }; + statistics_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new statistics resource adaptor using `upstream` to satisfy * allocation requests. 
@@ -119,9 +122,12 @@ class statistics_resource_adaptor final : public device_memory_resource { * * @param upstream The resource used for allocating/deallocating device memory */ - statistics_resource_adaptor(Upstream* upstream) : upstream_{upstream} + statistics_resource_adaptor(Upstream* upstream) + : upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } statistics_resource_adaptor() = delete; @@ -141,11 +147,6 @@ class statistics_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Returns a `counter` struct for this adaptor containing the current, * peak, and total number of allocated bytes for this @@ -224,7 +225,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); + void* ptr = upstream_.allocate_async(bytes, stream); // increment the stats { @@ -247,7 +248,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + upstream_.deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -269,7 +270,7 @@ class statistics_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -277,14 +278,14 @@ class statistics_resource_adaptor final : public device_memory_resource { // Invariant: the stack always contains at least one entry std::stack> counter_stack_{{std::make_pair(counter{}, counter{})}}; std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{rmm::mr::get_current_device_resource_ref()}; }; /** * @brief Convenience factory to return a `statistics_resource_adaptor` around the * upstream resource `upstream`. * - * @tparam Upstream Type of the upstream `device_memory_resource`. 
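
Since `get_upstream()` is removed above, callers that previously held the raw upstream pointer would instead hold a non-owning ref. A hedged migration sketch (size and stream values are arbitrary), using the same two-argument `allocate_async`/`deallocate_async` calls the adaptors themselves use:

    rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource> stats{
      rmm::mr::get_current_device_resource_ref()};

    // Before: Upstream* raw = stats.get_upstream(); raw->allocate(bytes, stream);
    // After: hold a non-owning device_async_resource_ref instead.
    rmm::device_async_resource_ref upstream = stats.get_upstream_resource();
    rmm::cuda_stream_view stream{};
    void* ptr = upstream.allocate_async(256, stream);
    upstream.deallocate_async(ptr, 256, stream);
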
* @param upstream Pointer to the upstream resource * @return The new statistics resource adaptor */ @@ -295,7 +296,7 @@ template "instead.")]] statistics_resource_adaptor make_statistics_adaptor(Upstream* upstream) { - return statistics_resource_adaptor{upstream}; + return statistics_resource_adaptor{upstream}; } /** @} */ // end of group diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp index 3bfd65996..bb60ea522 100644 --- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp +++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp @@ -148,7 +148,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { private: cuda_stream_view _stream{}; - rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()}; + rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource_ref()}; cuda_device_id _device{get_current_cuda_device()}; }; /** @} */ // end of group diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp index 0d3046973..f5bc43ee6 100644 --- a/include/rmm/mr/device/tracking_resource_adaptor.hpp +++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -90,10 +91,26 @@ class tracking_resource_adaptor final : public device_memory_resource { * @param upstream The resource used for allocating/deallocating device memory * @param capture_stacks If true, capture stacks for allocation calls */ - tracking_resource_adaptor(Upstream* upstream, bool capture_stacks = false) + tracking_resource_adaptor(device_async_resource_ref upstream, bool capture_stacks = false) : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{upstream} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + } + + /** + * @brief Construct a new tracking resource adaptor using `upstream` to satisfy + * allocation requests. + * + * @throws rmm::logic_error if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + * @param capture_stacks If true, capture stacks for allocation calls + */ + tracking_resource_adaptor(Upstream* upstream, bool capture_stacks = false) + : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{[upstream]() { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + return device_async_resource_ref{*upstream}; + }()} + { } tracking_resource_adaptor() = delete; @@ -113,11 +130,6 @@ class tracking_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Get the outstanding allocations map * @@ -197,8 +209,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); - + void* ptr = upstream_.allocate_async(bytes, stream); // track it. 
{ write_lock_t lock(mtx_); @@ -218,7 +229,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + upstream_.deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -263,7 +274,7 @@ class tracking_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -271,7 +282,9 @@ class tracking_resource_adaptor final : public device_memory_resource { std::map allocations_; // map of active allocations std::atomic allocated_bytes_; // number of bytes currently allocated std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_{ + rmm::mr::get_current_device_resource_ref()}; // the upstream resource used for satisfying + // allocation requests }; /** diff --git a/tests/container_multidevice_tests.cu b/tests/container_multidevice_tests.cu index e58ba53a2..4d00173ac 100644 --- a/tests/container_multidevice_tests.cu +++ b/tests/container_multidevice_tests.cu @@ -42,9 +42,9 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateDestroyDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); { if constexpr (std::is_same_v>) { @@ -57,7 +57,7 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateDestroyDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } @@ -69,9 +69,9 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(&check_mr); { auto buf_1 = []() { @@ -97,7 +97,7 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } @@ -109,9 +109,9 @@ TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(&check_mr); if constexpr (not std::is_same_v>) { auto buf = TypeParam(128, rmm::cuda_stream_view{}); @@ -120,7 +120,7 @@ 
TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } @@ -132,9 +132,9 @@ TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(&check_mr); if constexpr (not std::is_same_v>) { auto buf = TypeParam(128, rmm::cuda_stream_view{}); @@ -144,6 +144,6 @@ TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } diff --git a/tests/cuda_stream_tests.cpp b/tests/cuda_stream_tests.cpp index 1cc068434..c4d0a6b9f 100644 --- a/tests/cuda_stream_tests.cpp +++ b/tests/cuda_stream_tests.cpp @@ -96,6 +96,6 @@ TEST_F(CudaStreamDeathTest, TestSyncNoThrow) // should assert here or in `~cuda_stream()` stream_a.synchronize_no_throw(); }; - EXPECT_DEATH(test(), "Assertion"); + EXPECT_DEATH(test(), ""); } #endif diff --git a/tests/device_check_resource_adaptor.hpp b/tests/device_check_resource_adaptor.hpp index fcb578fdf..c77c078db 100644 --- a/tests/device_check_resource_adaptor.hpp +++ b/tests/device_check_resource_adaptor.hpp @@ -17,13 +17,14 @@ #include #include #include +#include #include #include class device_check_resource_adaptor final : public rmm::mr::device_memory_resource { public: - device_check_resource_adaptor(rmm::mr::device_memory_resource* upstream) + device_check_resource_adaptor(rmm::device_async_resource_ref upstream) : device_id{rmm::get_current_cuda_device()}, upstream_(upstream) { } @@ -36,11 +37,6 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour return upstream_; } - /** - * @briefreturn{device_memory_resource* to the upstream memory resource} - */ - [[nodiscard]] device_memory_resource* get_upstream() const noexcept { return upstream_; } - private: [[nodiscard]] bool check_device_id() const { return device_id == rmm::get_current_cuda_device(); } @@ -48,7 +44,7 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { bool const is_correct_device = check_device_id(); EXPECT_TRUE(is_correct_device); - if (is_correct_device) { return upstream_->allocate(bytes, stream); } + if (is_correct_device) { return get_upstream_resource().allocate_async(bytes, stream); } return nullptr; } @@ -56,7 +52,7 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { bool const is_correct_device = check_device_id(); EXPECT_TRUE(is_correct_device); - if (is_correct_device) { upstream_->deallocate(ptr, bytes, stream); } + if (is_correct_device) { get_upstream_resource().deallocate_async(ptr, bytes, stream); } } [[nodiscard]] bool do_is_equal( @@ -64,10 +60,10 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { if (this == &other) { return true; } auto const* cast = dynamic_cast(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } rmm::cuda_device_id device_id; 
- rmm::mr::device_memory_resource* upstream_{}; + rmm::device_async_resource_ref upstream_{rmm::mr::get_current_device_resource()}; }; diff --git a/tests/mock_resource.hpp b/tests/mock_resource.hpp index e06148d3a..555cf0d74 100644 --- a/tests/mock_resource.hpp +++ b/tests/mock_resource.hpp @@ -25,7 +25,12 @@ class mock_resource : public rmm::mr::device_memory_resource { public: MOCK_METHOD(void*, do_allocate, (std::size_t, cuda_stream_view), (override)); MOCK_METHOD(void, do_deallocate, (void*, std::size_t, cuda_stream_view), (override)); + bool operator==(mock_resource const&) const noexcept { return true; } + bool operator!=(mock_resource const&) const { return false; } + friend void get_property(mock_resource const&, cuda::mr::device_accessible) noexcept {} using size_pair = std::pair; }; +static_assert(cuda::mr::async_resource_with); + } // namespace rmm::test diff --git a/tests/mr/device/aligned_mr_tests.cpp b/tests/mr/device/aligned_mr_tests.cpp index b9ecbc8ca..85262c29d 100644 --- a/tests/mr/device/aligned_mr_tests.cpp +++ b/tests/mr/device/aligned_mr_tests.cpp @@ -59,13 +59,13 @@ TEST(AlignedTest, ThrowOnInvalidAllocationAlignment) TEST(AlignedTest, SupportsGetMemInfo) { mock_resource mock; - aligned_mock mr{&mock}; + aligned_mock mr{mock}; } TEST(AlignedTest, DefaultAllocationAlignmentPassthrough) { mock_resource mock; - aligned_mock mr{&mock}; + aligned_mock mr{mock}; cuda_stream_view stream; void* const pointer = int_to_address(123); diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 1eb38888e..6b7468d74 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -15,6 +15,7 @@ */ #include "../../byte_literals.hpp" +#include "cuda/stream_ref" #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -37,15 +39,22 @@ namespace { class mock_memory_resource { public: - MOCK_METHOD(void*, allocate, (std::size_t)); - MOCK_METHOD(void, deallocate, (void*, std::size_t)); + MOCK_METHOD(void*, allocate, (std::size_t, std::size_t)); + MOCK_METHOD(void, deallocate, (void*, std::size_t, std::size_t)); + MOCK_METHOD(void*, allocate_async, (std::size_t, std::size_t, cuda::stream_ref)); + MOCK_METHOD(void, deallocate_async, (void*, std::size_t, std::size_t, cuda::stream_ref)); + bool operator==(mock_memory_resource const&) const noexcept { return true; } + bool operator!=(mock_memory_resource const&) const { return false; } + friend void get_property(mock_memory_resource const&, cuda::mr::device_accessible) noexcept {} }; +static_assert(cuda::mr::async_resource_with); + using rmm::mr::detail::arena::block; using rmm::mr::detail::arena::byte_span; using rmm::mr::detail::arena::superblock; -using global_arena = rmm::mr::detail::arena::global_arena; -using arena = rmm::mr::detail::arena::arena; +using global_arena = rmm::mr::detail::arena::global_arena; +using arena = rmm::mr::detail::arena::arena; using arena_mr = rmm::mr::arena_memory_resource; using ::testing::Return; @@ -59,9 +68,10 @@ auto const fake_address4 = reinterpret_cast(superblock::minimum_size * 2) struct ArenaTest : public ::testing::Test { void SetUp() override { - EXPECT_CALL(mock_mr, allocate(arena_size)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size)); - global = std::make_unique(&mock_mr, arena_size); + EXPECT_CALL(mock_mr, allocate(arena_size, ::testing::_)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size, ::testing::_)); + + 
global = std::make_unique(mock_mr, arena_size); per_thread = std::make_unique(*global); } @@ -293,13 +303,6 @@ TEST_F(ArenaTest, SuperblockMaxFreeSizeWhenFull) // NOLINT /** * Test global_arena. */ - -TEST_F(ArenaTest, GlobalArenaNullUpstream) // NOLINT -{ - auto construct_nullptr = []() { global_arena global{nullptr, std::nullopt}; }; - EXPECT_THROW(construct_nullptr(), rmm::logic_error); // NOLINT(cppcoreguidelines-avoid-goto) -} - TEST_F(ArenaTest, GlobalArenaAcquire) // NOLINT { auto const sblk = global->acquire(256); @@ -378,7 +381,7 @@ TEST_F(ArenaTest, GlobalArenaDeallocate) // NOLINT { auto* ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); - global->deallocate(ptr, superblock::minimum_size * 2, {}); + global->deallocate_async(ptr, superblock::minimum_size * 2, {}); ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } @@ -387,8 +390,8 @@ TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp) // NOLINT { auto* ptr = global->allocate(superblock::minimum_size + 256); auto* ptr2 = global->allocate(superblock::minimum_size + 512); - global->deallocate(ptr, superblock::minimum_size + 256, {}); - global->deallocate(ptr2, superblock::minimum_size + 512, {}); + global->deallocate_async(ptr, superblock::minimum_size + 256, {}); + global->deallocate_async(ptr2, superblock::minimum_size + 512, {}); EXPECT_EQ(global->allocate(arena_size), fake_address3); } diff --git a/tests/mr/device/callback_mr_tests.cpp b/tests/mr/device/callback_mr_tests.cpp index 34a2cc8cc..445e0aed5 100644 --- a/tests/mr/device/callback_mr_tests.cpp +++ b/tests/mr/device/callback_mr_tests.cpp @@ -36,47 +36,50 @@ using ::testing::_; TEST(CallbackTest, TestCallbacksAreInvoked) { - auto base_mr = mock_resource(); + auto base_mr = mock_resource(); + auto base_ref = device_async_resource_ref{base_mr}; EXPECT_CALL(base_mr, do_allocate(10_MiB, cuda_stream_view{})).Times(1); EXPECT_CALL(base_mr, do_deallocate(_, 10_MiB, cuda_stream_view{})).Times(1); auto allocate_callback = [](std::size_t size, cuda_stream_view stream, void* arg) { - auto base_mr = static_cast(arg); - return base_mr->allocate(size, stream); + auto base_mr = *static_cast(arg); + return base_mr.allocate_async(size, stream); }; auto deallocate_callback = [](void* ptr, std::size_t size, cuda_stream_view stream, void* arg) { - auto base_mr = static_cast(arg); - base_mr->deallocate(ptr, size, stream); + auto base_mr = *static_cast(arg); + base_mr.deallocate_async(ptr, size, stream); }; auto mr = - rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_mr, &base_mr); - auto ptr = mr.allocate(10_MiB); - mr.deallocate(ptr, 10_MiB); + rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_ref, &base_ref); + auto const size = std::size_t{10_MiB}; + auto* ptr = mr.allocate(size); + mr.deallocate(ptr, size); } TEST(CallbackTest, LoggingTest) { testing::internal::CaptureStdout(); - auto base_mr = rmm::mr::get_current_device_resource(); + auto base_mr = rmm::mr::get_current_device_resource_ref(); auto allocate_callback = [](std::size_t size, cuda_stream_view stream, void* arg) { std::cout << "Allocating " << size << " bytes" << std::endl; - auto base_mr = static_cast(arg); - return base_mr->allocate(size, stream); + auto base_mr = *static_cast(arg); + return base_mr.allocate_async(size, stream); }; auto deallocate_callback = [](void* ptr, std::size_t size, cuda_stream_view stream, void* arg) { std::cout << "Deallocating " << size << " bytes" << std::endl; - auto 
base_mr = static_cast(arg); - base_mr->deallocate(ptr, size, stream); + auto base_mr = *static_cast(arg); + base_mr.deallocate_async(ptr, size, stream); }; auto mr = - rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, base_mr, base_mr); - auto ptr = mr.allocate(10_MiB); - mr.deallocate(ptr, 10_MiB); + rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_mr, &base_mr); + auto const size = std::size_t{10_MiB}; + auto* ptr = mr.allocate(size); + mr.deallocate(ptr, size); std::string output = testing::internal::GetCapturedStdout(); - std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", 10_MiB, 10_MiB); + std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", size, size); ASSERT_EQ(expect, output); } diff --git a/tests/mr/device/limiting_mr_tests.cpp b/tests/mr/device/limiting_mr_tests.cpp index 777ce9428..3a6178c95 100644 --- a/tests/mr/device/limiting_mr_tests.cpp +++ b/tests/mr/device/limiting_mr_tests.cpp @@ -25,19 +25,19 @@ namespace rmm::test { namespace { -using Limiting_adaptor = rmm::mr::limiting_resource_adaptor; +using limiting_adaptor = rmm::mr::limiting_resource_adaptor; TEST(LimitingTest, ThrowOnNullUpstream) { auto const max_size{5_MiB}; - auto construct_nullptr = []() { Limiting_adaptor mr{nullptr, max_size}; }; + auto construct_nullptr = []() { limiting_adaptor mr{nullptr, max_size}; }; EXPECT_THROW(construct_nullptr(), rmm::logic_error); } TEST(LimitingTest, TooBig) { auto const max_size{5_MiB}; - Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; EXPECT_THROW(mr.allocate(max_size + 1), rmm::out_of_memory); } @@ -45,15 +45,15 @@ TEST(LimitingTest, UpstreamFailure) { auto const max_size_1{2_MiB}; auto const max_size_2{5_MiB}; - Limiting_adaptor mr1{rmm::mr::get_current_device_resource(), max_size_1}; - Limiting_adaptor mr2{&mr1, max_size_2}; + limiting_adaptor mr1{rmm::mr::get_current_device_resource(), max_size_1}; + limiting_adaptor mr2{&mr1, max_size_2}; EXPECT_THROW(mr2.allocate(4_MiB), rmm::out_of_memory); } TEST(LimitingTest, UnderLimitDueToFrees) { auto const max_size{10_MiB}; - Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; auto const size1{4_MiB}; auto* ptr1 = mr.allocate(size1); auto allocated_bytes = size1; @@ -81,7 +81,7 @@ TEST(LimitingTest, UnderLimitDueToFrees) TEST(LimitingTest, OverLimit) { auto const max_size{10_MiB}; - Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; auto const size1{4_MiB}; auto* ptr1 = mr.allocate(size1); auto allocated_bytes = size1; diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp index c63a61844..9db63eb1b 100644 --- a/tests/mr/device/pool_mr_tests.cpp +++ b/tests/mr/device/pool_mr_tests.cpp @@ -49,19 +49,7 @@ TEST(PoolTest, ThrowMaxLessThanInitial) auto max_less_than_initial = []() { const auto initial{1024}; const auto maximum{256}; - pool_mr mr{rmm::mr::get_current_device_resource(), initial, maximum}; - }; - EXPECT_THROW(max_less_than_initial(), rmm::logic_error); -} - -TEST(PoolTest, ReferenceThrowMaxLessThanInitial) -{ - // Make sure first argument is enough larger than the second that alignment rounding doesn't - // make them equal - auto max_less_than_initial = []() { - const auto initial{1024}; - const auto 
maximum{256}; - pool_mr mr{*rmm::mr::get_current_device_resource(), initial, maximum}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), initial, maximum}; }; EXPECT_THROW(max_less_than_initial(), rmm::logic_error); } @@ -72,7 +60,7 @@ TEST(PoolTest, AllocateNinetyPercent) auto const [free, total] = rmm::available_device_memory(); (void)total; auto const ninety_percent_pool = rmm::percent_of_free_device_memory(90); - pool_mr mr{rmm::mr::get_current_device_resource(), ninety_percent_pool}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), ninety_percent_pool}; }; EXPECT_NO_THROW(allocate_ninety()); } @@ -81,7 +69,7 @@ TEST(PoolTest, TwoLargeBuffers) { auto two_large = []() { [[maybe_unused]] auto const [free, total] = rmm::available_device_memory(); - pool_mr mr{rmm::mr::get_current_device_resource(), rmm::percent_of_free_device_memory(50)}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), rmm::percent_of_free_device_memory(50)}; auto* ptr1 = mr.allocate(free / 4); auto* ptr2 = mr.allocate(free / 4); mr.deallocate(ptr1, free / 4); @@ -116,7 +104,7 @@ TEST(PoolTest, ForceGrowth) TEST(PoolTest, DeletedStream) { - pool_mr mr{rmm::mr::get_current_device_resource(), 0}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), 0}; cudaStream_t stream{}; // we don't use rmm::cuda_stream here to make destruction more explicit const int size = 10000; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); @@ -129,7 +117,7 @@ TEST(PoolTest, DeletedStream) TEST(PoolTest, InitialAndMaxPoolSizeEqual) { EXPECT_NO_THROW([]() { - pool_mr mr(rmm::mr::get_current_device_resource(), 1000192, 1000192); + pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1000192, 1000192); mr.allocate(1000); }()); } @@ -138,14 +126,14 @@ TEST(PoolTest, NonAlignedPoolSize) { EXPECT_THROW( []() { - pool_mr mr(rmm::mr::get_current_device_resource(), 1000031, 1000192); + pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1000031, 1000192); mr.allocate(1000); }(), rmm::logic_error); EXPECT_THROW( []() { - pool_mr mr(rmm::mr::get_current_device_resource(), 1000192, 1000200); + pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1000192, 1000200); mr.allocate(1000); }(), rmm::logic_error); @@ -203,18 +191,18 @@ namespace test_properties { class fake_async_resource { public: // To model `async_resource` - void* allocate(std::size_t, std::size_t) { return nullptr; } - void deallocate(void* ptr, std::size_t, std::size_t) {} - void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; } - void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {} + static void* allocate(std::size_t, std::size_t) { return nullptr; } + static void deallocate(void* ptr, std::size_t, std::size_t) {} + static void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; } + static void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {} bool operator==(const fake_async_resource& other) const { return true; } bool operator!=(const fake_async_resource& other) const { return false; } private: - void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; } - void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {} - [[nodiscard]] bool do_is_equal(fake_async_resource const& other) const noexcept { return true; } + static void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; } + static void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {} + [[nodiscard]] static bool do_is_equal(fake_async_resource 
const& other) noexcept { return true; } }; static_assert(!cuda::has_property); static_assert(!cuda::has_property, diff --git a/tests/mr/device/statistics_mr_tests.cpp b/tests/mr/device/statistics_mr_tests.cpp index 6c5700f0b..1fdf01aa5 100644 --- a/tests/mr/device/statistics_mr_tests.cpp +++ b/tests/mr/device/statistics_mr_tests.cpp @@ -40,7 +40,7 @@ TEST(StatisticsTest, ThrowOnNullUpstream) TEST(StatisticsTest, Empty) { - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; EXPECT_EQ(mr.get_bytes_counter().peak, 0); EXPECT_EQ(mr.get_bytes_counter().total, 0); @@ -53,7 +53,7 @@ TEST(StatisticsTest, Empty) TEST(StatisticsTest, AllFreed) { - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; allocations.reserve(num_allocations); @@ -71,7 +71,7 @@ TEST(StatisticsTest, AllFreed) TEST(StatisticsTest, PeakAllocations) { - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { @@ -127,9 +127,9 @@ TEST(StatisticsTest, PeakAllocations) TEST(StatisticsTest, MultiTracking) { - auto* orig_device_resource = rmm::mr::get_current_device_resource(); + auto orig_device_resource = rmm::mr::get_current_device_resource_ref(); statistics_adaptor mr{orig_device_resource}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(&mr); std::vector> allocations; for (std::size_t i = 0; i < num_allocations; ++i) { @@ -139,8 +139,8 @@ TEST(StatisticsTest, MultiTracking) EXPECT_EQ(mr.get_allocations_counter().value, 10); - statistics_adaptor inner_mr{rmm::mr::get_current_device_resource()}; - rmm::mr::set_current_device_resource(&inner_mr); + statistics_adaptor inner_mr{rmm::mr::get_current_device_resource_ref()}; + rmm::mr::set_current_device_resource_ref(&inner_mr); for (std::size_t i = 0; i < num_more_allocations; ++i) { allocations.emplace_back( @@ -172,7 +172,7 @@ TEST(StatisticsTest, MultiTracking) EXPECT_EQ(inner_mr.get_allocations_counter().peak, 5); // Reset the current device resource - rmm::mr::set_current_device_resource(orig_device_resource); + rmm::mr::set_current_device_resource_ref(orig_device_resource); } TEST(StatisticsTest, NegativeInnerTracking) @@ -180,7 +180,7 @@ TEST(StatisticsTest, NegativeInnerTracking) // This tests the unlikely scenario where pointers are deallocated on an inner // wrapped memory resource. 
This can happen if the MR is not saved with the // memory pointer - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { allocations.push_back(mr.allocate(ten_MiB)); @@ -236,7 +236,7 @@ TEST(StatisticsTest, NegativeInnerTracking) TEST(StatisticsTest, Nested) { - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; void* a0 = mr.allocate(ten_MiB); EXPECT_EQ(mr.get_bytes_counter().value, ten_MiB); EXPECT_EQ(mr.get_allocations_counter().value, 1); diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp index 7c2532c60..45284144f 100644 --- a/tests/mr/device/tracking_mr_tests.cpp +++ b/tests/mr/device/tracking_mr_tests.cpp @@ -42,14 +42,14 @@ TEST(TrackingTest, ThrowOnNullUpstream) TEST(TrackingTest, Empty) { - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; EXPECT_EQ(mr.get_outstanding_allocations().size(), 0); EXPECT_EQ(mr.get_allocated_bytes(), 0); } TEST(TrackingTest, AllFreed) { - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; allocations.reserve(num_allocations); for (int i = 0; i < num_allocations; ++i) { @@ -64,7 +64,7 @@ TEST(TrackingTest, AllFreed) TEST(TrackingTest, AllocationsLeftWithStacks) { - tracking_adaptor mr{rmm::mr::get_current_device_resource(), true}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref(), true}; std::vector allocations; allocations.reserve(num_allocations); for (int i = 0; i < num_allocations; ++i) { @@ -82,7 +82,7 @@ TEST(TrackingTest, AllocationsLeftWithStacks) TEST(TrackingTest, AllocationsLeftWithoutStacks) { - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; allocations.reserve(num_allocations); for (int i = 0; i < num_allocations; ++i) { @@ -101,9 +101,9 @@ TEST(TrackingTest, AllocationsLeftWithoutStacks) TEST(TrackingTest, MultiTracking) { - auto* orig_device_resource = rmm::mr::get_current_device_resource(); + auto orig_device_resource = rmm::mr::get_current_device_resource_ref(); tracking_adaptor mr{orig_device_resource, true}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(&mr); std::vector> allocations; for (std::size_t i = 0; i < num_allocations; ++i) { @@ -113,8 +113,8 @@ TEST(TrackingTest, MultiTracking) EXPECT_EQ(mr.get_outstanding_allocations().size(), num_allocations); - tracking_adaptor inner_mr{rmm::mr::get_current_device_resource()}; - rmm::mr::set_current_device_resource(&inner_mr); + tracking_adaptor inner_mr{rmm::mr::get_current_device_resource_ref()}; + rmm::mr::set_current_device_resource_ref(&inner_mr); for (std::size_t i = 0; i < num_more_allocations; ++i) { allocations.emplace_back( @@ -141,7 +141,7 @@ TEST(TrackingTest, MultiTracking) EXPECT_EQ(inner_mr.get_allocated_bytes(), 0); // Reset the current device resource - rmm::mr::set_current_device_resource(orig_device_resource); + rmm::mr::set_current_device_resource_ref(orig_device_resource); } TEST(TrackingTest, NegativeInnerTracking) @@ -149,7 +149,7 @@ TEST(TrackingTest, NegativeInnerTracking) // This tests the unlikely scenario where pointers are deallocated on an inner // 
wrapped memory resource. This can happen if the MR is not saved with the // memory pointer - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { allocations.push_back(mr.allocate(ten_MiB)); @@ -181,7 +181,7 @@ TEST(TrackingTest, NegativeInnerTracking) TEST(TrackingTest, DeallocWrongBytes) { - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { allocations.push_back(mr.allocate(ten_MiB)); @@ -207,7 +207,7 @@ TEST(TrackingTest, LogOutstandingAllocations) rmm::logger().sinks().push_back(oss_sink); auto old_level = rmm::logger().level(); - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { allocations.push_back(mr.allocate(ten_MiB)); diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp index 8445ab1f5..071739575 100644 --- a/tests/mr/host/mr_ref_tests.cpp +++ b/tests/mr/host/mr_ref_tests.cpp @@ -233,14 +233,17 @@ TYPED_TEST(MRRefTest, UnsupportedAlignmentTest) for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; alignment *= TestedAlignmentMultiplier) { +#ifdef NDEBUG auto allocation_size = size_distribution(generator); void* ptr{nullptr}; // An unsupported alignment (like an odd number) should result in an // alignment of `alignof(std::max_align_t)` auto const bad_alignment = alignment + 1; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment)); EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t))); EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment)); +#endif } } } From bd70c4284f1c8358f44d8935caaeea93f5d39834 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 22 Aug 2024 03:14:00 +0000 Subject: [PATCH 10/24] Use resource_ref in torch_allocator and polymorphic_allocator --- include/rmm/mr/device/polymorphic_allocator.hpp | 4 ++-- python/rmm/rmm/_lib/_torch_allocator.cpp | 12 ++++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/rmm/mr/device/polymorphic_allocator.hpp b/include/rmm/mr/device/polymorphic_allocator.hpp index 863ba6f62..a1152c77e 100644 --- a/include/rmm/mr/device/polymorphic_allocator.hpp +++ b/include/rmm/mr/device/polymorphic_allocator.hpp @@ -50,7 +50,7 @@ class polymorphic_allocator { using value_type = T; ///< T, the value type of objects allocated by this allocator /** * @brief Construct a `polymorphic_allocator` using the return value of - * `rmm::mr::get_current_device_resource()` as the underlying memory resource. + * `rmm::mr::get_current_device_resource_ref()` as the underlying memory resource. 
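
To illustrate the default change above: a default-constructed `polymorphic_allocator` now refers to `get_current_device_resource_ref()`. The allocate/deallocate calls below assume the allocator's existing (element count, stream) signatures and are only a sketch:

    rmm::mr::polymorphic_allocator<int> alloc{};  // refers to get_current_device_resource_ref()
    rmm::cuda_stream_view stream{};
    int* data = alloc.allocate(128, stream);      // space for 128 ints
    alloc.deallocate(data, 128, stream);
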
* */ polymorphic_allocator() = default; @@ -114,7 +114,7 @@ class polymorphic_allocator { private: rmm::device_async_resource_ref mr_{ - get_current_device_resource()}; ///< Underlying resource used for (de)allocation + get_current_device_resource_ref()}; ///< Underlying resource used for (de)allocation }; /** diff --git a/python/rmm/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/_lib/_torch_allocator.cpp index dc92e4639..64885cc2f 100644 --- a/python/rmm/rmm/_lib/_torch_allocator.cpp +++ b/python/rmm/rmm/_lib/_torch_allocator.cpp @@ -39,8 +39,9 @@ extern "C" void* allocate(std::size_t size, int device, void* stream) { rmm::cuda_device_id const device_id{device}; rmm::cuda_set_device_raii with_device{device_id}; - auto mr = rmm::mr::get_per_device_resource(device_id); - return mr->allocate(size, rmm::cuda_stream_view{static_cast(stream)}); + auto mr = rmm::mr::get_per_device_resource_ref(device_id); + return mr.allocate_async( + size, rmm::CUDA_ALLOCATION_ALIGNMENT, rmm::cuda_stream_view{static_cast(stream)}); } /** @@ -55,6 +56,9 @@ extern "C" void deallocate(void* ptr, std::size_t size, int device, void* stream { rmm::cuda_device_id const device_id{device}; rmm::cuda_set_device_raii with_device{device_id}; - auto mr = rmm::mr::get_per_device_resource(device_id); - mr->deallocate(ptr, size, rmm::cuda_stream_view{static_cast(stream)}); + auto mr = rmm::mr::get_per_device_resource_ref(device_id); + mr.deallocate_async(ptr, + size, + rmm::CUDA_ALLOCATION_ALIGNMENT, + rmm::cuda_stream_view{static_cast(stream)}); } From 00bd3ac131108ae868f173c56408f4165008dae4 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 22 Aug 2024 04:06:45 +0000 Subject: [PATCH 11/24] Make the raw pointer version of set_per_device_resource update the resource_ref state as well. --- include/rmm/mr/device/per_device_resource.hpp | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 6b78ee463..bc4242874 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -184,6 +184,30 @@ inline device_memory_resource* get_per_device_resource(cuda_device_id device_id) : found->second; } +namespace detail { + +// The non-thread-safe implementation of `set_per_device_resource_ref`. This exists because +// we need to call this function from two places: the thread-safe version of +// `set_per_device_resource_ref` and the thread-safe version of `set_per_device_resource`, +// both of which take the lock, so we need an implementation that doesn't take the lock. +inline device_async_resource_ref set_per_device_resource_ref_unsafe( + cuda_device_id device_id, device_async_resource_ref new_resource_ref) +{ + auto& map = detail::get_ref_map(); + auto const old_itr = map.find(device_id.value()); + // If a resource didn't previously exist for `device_id`, return pointer to initial_resource + // Note: because resource_ref is not default-constructible, we can't use std::map::operator[] + if (old_itr == map.end()) { + map.insert({device_id.value(), new_resource_ref}); + return device_async_resource_ref{detail::initial_resource()}; + } + + auto old_resource_ref = old_itr->second; + old_itr->second = new_resource_ref; // update map directly via iterator + return old_resource_ref; +} +} // namespace detail + /** * @brief Set the `device_memory_resource` for the specified device. 
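
A small sketch of the per-device resource_ref API that the raw-pointer setter now keeps in sync; `my_mr` is a caller-owned resource that must outlive its last use, and device 0 is assumed to exist:

    rmm::mr::cuda_memory_resource my_mr{};
    rmm::cuda_device_id const dev0{0};

    auto old_ref = rmm::mr::set_per_device_resource_ref(dev0, my_mr);
    auto cur_ref = rmm::mr::get_per_device_resource_ref(dev0);  // now refers to my_mr

    rmm::mr::set_per_device_resource_ref(dev0, old_ref);  // restore the previous resource
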
* @@ -214,6 +238,13 @@ inline device_memory_resource* get_per_device_resource(cuda_device_id device_id) inline device_memory_resource* set_per_device_resource(cuda_device_id device_id, device_memory_resource* new_mr) { + // Note: even though set_per_device_resource() and set_per_device_resource_ref() are not + // interchangeable, we call the latter from the former to maintain resource_ref + // state consistent with the resource pointer state. This is necessary because the + // Python API still uses the raw pointer API. Once the Python API is updated to use + // resource_ref, this call can be removed. + detail::set_per_device_resource_ref_unsafe(device_id, new_mr); + std::lock_guard lock{detail::map_lock()}; auto& map = detail::get_map(); auto const old_itr = map.find(device_id.value()); @@ -341,18 +372,7 @@ inline device_async_resource_ref set_per_device_resource_ref( cuda_device_id device_id, device_async_resource_ref new_resource_ref) { std::lock_guard lock{detail::ref_map_lock()}; - auto& map = detail::get_ref_map(); - auto const old_itr = map.find(device_id.value()); - // If a resource didn't previously exist for `device_id`, return pointer to initial_resource - // Note: because resource_ref is not default-constructible, we can't use std::map::operator[] - if (old_itr == map.end()) { - map.insert({device_id.value(), new_resource_ref}); - return device_async_resource_ref{detail::initial_resource()}; - } - - auto old_resource_ref = old_itr->second; - old_itr->second = new_resource_ref; // update map directly via iterator - return old_resource_ref; + return detail::set_per_device_resource_ref_unsafe(device_id, new_resource_ref); } /** From d7df60efde1f16f8dfb14dafe3ec83983b125384 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 22 Aug 2024 04:09:26 +0000 Subject: [PATCH 12/24] Update containers and tests to use get/set_current_device_resource_ref() --- .../device_uvector/device_uvector_bench.cu | 18 +++++++++--------- .../multi_stream_allocations_bench.cu | 4 ++-- include/rmm/device_buffer.hpp | 14 +++++++------- include/rmm/device_scalar.hpp | 6 +++--- include/rmm/device_uvector.hpp | 4 ++-- include/rmm/exec_policy.hpp | 4 ++-- .../failure_callback_resource_adaptor.hpp | 2 +- tests/device_buffer_tests.cu | 14 +++++++------- tests/device_check_resource_adaptor.hpp | 2 +- tests/device_scalar_tests.cpp | 2 +- tests/device_uvector_tests.cpp | 2 +- tests/mr/device/aligned_mr_tests.cpp | 2 +- tests/mr/device/arena_mr_tests.cpp | 12 ++++++------ tests/mr/device/failure_callback_mr_tests.cpp | 3 ++- tests/mr/device/limiting_mr_tests.cpp | 8 ++++---- tests/mr/device/thrust_allocator_tests.cu | 10 +++++----- 16 files changed, 54 insertions(+), 53 deletions(-) diff --git a/benchmarks/device_uvector/device_uvector_bench.cu b/benchmarks/device_uvector/device_uvector_bench.cu index 36c9183f9..e56926827 100644 --- a/benchmarks/device_uvector/device_uvector_bench.cu +++ b/benchmarks/device_uvector/device_uvector_bench.cu @@ -40,7 +40,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state) rmm::mr::cuda_memory_resource cuda_mr{}; rmm::mr::pool_memory_resource mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(&mr); for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) rmm::device_uvector vec(state.range(0), rmm::cuda_stream_view{}); @@ -49,7 +49,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state) state.SetItemsProcessed(static_cast(state.iterations())); - 
rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } BENCHMARK(BM_UvectorSizeConstruction) @@ -62,7 +62,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state) rmm::mr::cuda_memory_resource cuda_mr{}; rmm::mr::pool_memory_resource mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(&mr); for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) rmm::device_vector vec(state.range(0)); @@ -71,7 +71,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state) state.SetItemsProcessed(static_cast(state.iterations())); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } BENCHMARK(BM_ThrustVectorSizeConstruction) @@ -140,7 +140,7 @@ template void BM_VectorWorkflow(benchmark::State& state) { rmm::mr::cuda_async_memory_resource cuda_async_mr{}; - rmm::mr::set_current_device_resource(&cuda_async_mr); + rmm::mr::set_current_device_resource_ref(&cuda_async_mr); rmm::cuda_stream input_stream; std::vector streams(4); @@ -158,7 +158,7 @@ void BM_VectorWorkflow(benchmark::State& state) auto const bytes = num_elements * sizeof(std::int32_t) * num_accesses; state.SetBytesProcessed(static_cast(state.iterations() * bytes)); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT @@ -167,9 +167,9 @@ BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT ->Unit(benchmark::kMicrosecond) ->UseManualTime(); -// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()` -// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use -// `cuda_async_memory_resource`, which is faster. +// The only difference here is that `rmm::device_vector` uses +// `rmm::get_current_device_resource_ref()` for allocation while `thrust::device_vector` uses +// cudaMalloc/cudaFree. In the benchmarks we use `cuda_async_memory_resource`, which is faster. BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector) // NOLINT ->RangeMultiplier(10) // NOLINT ->Range(100'000, 100'000'000) // NOLINT diff --git a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu index b73ef54f8..86e761c80 100644 --- a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu +++ b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu @@ -75,7 +75,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con { auto mr = factory(); - rmm::mr::set_current_device_resource(mr.get()); + rmm::mr::set_current_device_resource_ref(mr.get()); auto num_streams = state.range(0); auto num_kernels = state.range(1); @@ -92,7 +92,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con state.SetItemsProcessed(static_cast(state.iterations() * num_kernels)); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } inline auto make_cuda() { return std::make_shared(); } diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 4663b5733..64940a78c 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -40,7 +40,7 @@ namespace rmm { * * This class allocates untyped and *uninitialized* device memory using a * `device_async_resource_ref`. 
If not explicitly specified, the memory resource - * returned from `get_current_device_resource()` is used. + * returned from `get_current_device_resource_ref()` is used. * * @note Unlike `std::vector` or `thrust::device_vector`, the device memory * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined @@ -94,7 +94,7 @@ class device_buffer { // `__host__ __device__` specifiers to the defaulted constructor when it is called within the // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host- // device function. This causes warnings/errors because this ctor invokes host-only functions. - device_buffer() : _mr{rmm::mr::get_current_device_resource()} {} + device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} /** * @brief Constructs a new device buffer of `size` uninitialized bytes @@ -108,7 +108,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { cuda_set_device_raii dev{_device}; @@ -137,7 +137,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { cuda_set_device_raii dev{_device}; @@ -168,7 +168,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -418,8 +418,8 @@ class device_buffer { cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation rmm::device_async_resource_ref _mr{ - rmm::mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + rmm::mr::get_current_device_resource_ref()}; ///< The memory resource used to + ///< allocate/deallocate device memory cuda_device_id _device{get_current_cuda_device()}; /** diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp index 47eed78b2..37c607e53 100644 --- a/include/rmm/device_scalar.hpp +++ b/include/rmm/device_scalar.hpp @@ -94,7 +94,7 @@ class device_scalar { * @param mr Optional, resource with which to allocate. 
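
In practice the new defaults above mean the following two buffers use the same resource whenever the current device resource ref has not been changed (sizes and the default stream are illustrative):

    rmm::cuda_stream_view stream{};

    rmm::device_buffer implicit_mr{1024, stream};  // defaults to get_current_device_resource_ref()
    rmm::device_buffer explicit_mr{1024, stream, rmm::mr::get_current_device_resource_ref()};

    rmm::device_scalar<int> scalar{42, stream};    // same default applies to device_scalar
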
*/ explicit device_scalar(cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{1, stream, mr} { } @@ -117,7 +117,7 @@ class device_scalar { */ explicit device_scalar(value_type const& initial_value, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{1, stream, mr} { set_value_async(initial_value, stream); @@ -137,7 +137,7 @@ class device_scalar { */ device_scalar(device_scalar const& other, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index ff6a7d837..111f194c0 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -125,7 +125,7 @@ class device_uvector { */ explicit device_uvector(std::size_t size, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -141,7 +141,7 @@ class device_uvector { */ explicit device_uvector(device_uvector const& other, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } diff --git a/include/rmm/exec_policy.hpp b/include/rmm/exec_policy.hpp index a343afb8b..713094441 100644 --- a/include/rmm/exec_policy.hpp +++ b/include/rmm/exec_policy.hpp @@ -56,7 +56,7 @@ class exec_policy : public thrust_exec_policy_t { * @param mr The resource to use for allocating temporary memory */ explicit exec_policy(cuda_stream_view stream = cuda_stream_default, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : thrust_exec_policy_t( thrust::cuda::par(mr::thrust_allocator(stream, mr)).on(stream.value())) { @@ -80,7 +80,7 @@ using thrust_exec_policy_nosync_t = class exec_policy_nosync : public thrust_exec_policy_nosync_t { public: explicit exec_policy_nosync(cuda_stream_view stream = cuda_stream_default, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : thrust_exec_policy_nosync_t( thrust::cuda::par_nosync(mr::thrust_allocator(stream, mr)).on(stream.value())) { diff --git a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp index f1a2178ee..0ed274a75 100644 --- a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp @@ -82,7 +82,7 @@ using failure_callback_t = std::function; * failure_callback_adaptor mr{ * rmm::mr::get_current_device_resource_ref(), failure_handler, &retried * }; - * rmm::mr::set_current_device_resource(&mr); + * rmm::mr::set_current_device_resource_ref(&mr); * } * @endcode * diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index c095eecf8..5e48504d6 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -75,7 +75,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) 
EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -87,7 +87,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -121,7 +121,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); @@ -138,7 +138,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); @@ -152,7 +152,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -180,7 +180,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff_copy.memory_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -223,7 +223,7 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_EQ(buff_copy.memory_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), diff --git a/tests/device_check_resource_adaptor.hpp b/tests/device_check_resource_adaptor.hpp index c77c078db..23b0c4bb1 100644 --- a/tests/device_check_resource_adaptor.hpp +++ b/tests/device_check_resource_adaptor.hpp @@ -65,5 +65,5 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour } rmm::cuda_device_id device_id; - rmm::device_async_resource_ref upstream_{rmm::mr::get_current_device_resource()}; + rmm::device_async_resource_ref 
upstream_{rmm::mr::get_current_device_resource_ref()}; }; diff --git a/tests/device_scalar_tests.cpp b/tests/device_scalar_tests.cpp index 6f80a5de1..d0c5e327f 100644 --- a/tests/device_scalar_tests.cpp +++ b/tests/device_scalar_tests.cpp @@ -38,7 +38,7 @@ struct DeviceScalarTest : public ::testing::Test { std::default_random_engine generator{}; T value{}; rmm::cuda_stream stream{}; - rmm::device_async_resource_ref mr{rmm::mr::get_current_device_resource()}; + rmm::device_async_resource_ref mr{rmm::mr::get_current_device_resource_ref()}; DeviceScalarTest() : value{random_value()} {} diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 1c93ef138..90955c24c 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -39,7 +39,7 @@ TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); EXPECT_EQ(vec.memory_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) diff --git a/tests/mr/device/aligned_mr_tests.cpp b/tests/mr/device/aligned_mr_tests.cpp index 85262c29d..9b90bf751 100644 --- a/tests/mr/device/aligned_mr_tests.cpp +++ b/tests/mr/device/aligned_mr_tests.cpp @@ -204,7 +204,7 @@ TEST(AlignedTest, AlignRealPointer) { auto const alignment{4096}; auto const threshold{65536}; - aligned_real mr{rmm::mr::get_current_device_resource(), alignment, threshold}; + aligned_real mr{rmm::mr::get_current_device_resource_ref(), alignment, threshold}; void* alloc = mr.allocate(threshold); EXPECT_TRUE(rmm::is_pointer_aligned(alloc, alignment)); mr.deallocate(alloc, threshold); diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 6b7468d74..9db623394 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -482,7 +482,7 @@ TEST_F(ArenaTest, ThrowOnNullUpstream) // NOLINT TEST_F(ArenaTest, SizeSmallerThanSuperblockSize) // NOLINT { - auto construct_small = []() { arena_mr mr{rmm::mr::get_current_device_resource(), 256}; }; + auto construct_small = []() { arena_mr mr{rmm::mr::get_current_device_resource_ref(), 256}; }; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) EXPECT_THROW(construct_small(), rmm::logic_error); } @@ -493,14 +493,14 @@ TEST_F(ArenaTest, AllocateNinetyPercent) // NOLINT auto const free = rmm::available_device_memory().first; auto const ninety_percent = rmm::align_up( static_cast(static_cast(free) * 0.9), rmm::CUDA_ALLOCATION_ALIGNMENT); - arena_mr mr(rmm::mr::get_current_device_resource(), ninety_percent); + arena_mr mr(rmm::mr::get_current_device_resource_ref(), ninety_percent); }()); } TEST_F(ArenaTest, SmallMediumLarge) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) - arena_mr mr(rmm::mr::get_current_device_resource()); + arena_mr mr(rmm::mr::get_current_device_resource_ref()); auto* small = mr.allocate(256); auto* medium = mr.allocate(64_MiB); auto const free = rmm::available_device_memory().first; @@ -515,7 +515,7 @@ TEST_F(ArenaTest, Defragment) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) auto const arena_size = superblock::minimum_size * 4; - arena_mr mr(rmm::mr::get_current_device_resource(), arena_size); + arena_mr mr(rmm::mr::get_current_device_resource_ref(), arena_size); std::vector threads; std::size_t num_threads{4}; threads.reserve(num_threads); @@ -542,7 +542,7 @@ TEST_F(ArenaTest, 
PerThreadToStreamDealloc) // NOLINT // arena that then moved to global arena during a defragmentation // and then moved to a stream arena. auto const arena_size = superblock::minimum_size * 2; - arena_mr mr(rmm::mr::get_current_device_resource(), arena_size); + arena_mr mr(rmm::mr::get_current_device_resource_ref(), arena_size); // Create an allocation from a per thread arena void* thread_ptr = mr.allocate(256, rmm::cuda_stream_per_thread); // Create an allocation in a stream arena to force global arena @@ -568,7 +568,7 @@ TEST_F(ArenaTest, PerThreadToStreamDealloc) // NOLINT TEST_F(ArenaTest, DumpLogOnFailure) // NOLINT { - arena_mr mr{rmm::mr::get_current_device_resource(), 1_MiB, true}; + arena_mr mr{rmm::mr::get_current_device_resource_ref(), 1_MiB, true}; { // make the log interesting std::vector threads; diff --git a/tests/mr/device/failure_callback_mr_tests.cpp b/tests/mr/device/failure_callback_mr_tests.cpp index 683aee86e..4b3d084d5 100644 --- a/tests/mr/device/failure_callback_mr_tests.cpp +++ b/tests/mr/device/failure_callback_mr_tests.cpp @@ -47,7 +47,8 @@ bool failure_handler(std::size_t /*bytes*/, void* arg) TEST(FailureCallbackTest, RetryAllocationOnce) { bool retried{false}; - failure_callback_adaptor<> mr{rmm::mr::get_current_device_resource(), failure_handler, &retried}; + failure_callback_adaptor<> mr{ + rmm::mr::get_current_device_resource_ref(), failure_handler, &retried}; EXPECT_EQ(retried, false); EXPECT_THROW(mr.allocate(512_GiB), std::bad_alloc); EXPECT_EQ(retried, true); diff --git a/tests/mr/device/limiting_mr_tests.cpp b/tests/mr/device/limiting_mr_tests.cpp index 3a6178c95..58dbfd5f5 100644 --- a/tests/mr/device/limiting_mr_tests.cpp +++ b/tests/mr/device/limiting_mr_tests.cpp @@ -37,7 +37,7 @@ TEST(LimitingTest, ThrowOnNullUpstream) TEST(LimitingTest, TooBig) { auto const max_size{5_MiB}; - limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource_ref(), max_size}; EXPECT_THROW(mr.allocate(max_size + 1), rmm::out_of_memory); } @@ -45,7 +45,7 @@ TEST(LimitingTest, UpstreamFailure) { auto const max_size_1{2_MiB}; auto const max_size_2{5_MiB}; - limiting_adaptor mr1{rmm::mr::get_current_device_resource(), max_size_1}; + limiting_adaptor mr1{rmm::mr::get_current_device_resource_ref(), max_size_1}; limiting_adaptor mr2{&mr1, max_size_2}; EXPECT_THROW(mr2.allocate(4_MiB), rmm::out_of_memory); } @@ -53,7 +53,7 @@ TEST(LimitingTest, UpstreamFailure) TEST(LimitingTest, UnderLimitDueToFrees) { auto const max_size{10_MiB}; - limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource_ref(), max_size}; auto const size1{4_MiB}; auto* ptr1 = mr.allocate(size1); auto allocated_bytes = size1; @@ -81,7 +81,7 @@ TEST(LimitingTest, UnderLimitDueToFrees) TEST(LimitingTest, OverLimit) { auto const max_size{10_MiB}; - limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource_ref(), max_size}; auto const size1{4_MiB}; auto* ptr1 = mr.allocate(size1); auto allocated_bytes = size1; diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu index 91ae396ed..84f599957 100644 --- a/tests/mr/device/thrust_allocator_tests.cu +++ b/tests/mr/device/thrust_allocator_tests.cu @@ -37,9 +37,9 @@ namespace { struct allocator_test : public mr_ref_test {}; // Disable until we support resource_ref with set_current_device_resource 
-/*TEST_P(allocator_test, first) +TEST_P(allocator_test, first) { - rmm::mr::set_current_device_resource(this->mr.get()); + rmm::mr::set_current_device_resource_ref(this->ref); auto const num_ints{100}; rmm::device_vector ints(num_ints, 1); EXPECT_EQ(num_ints, thrust::reduce(ints.begin(), ints.end())); @@ -47,12 +47,12 @@ struct allocator_test : public mr_ref_test {}; TEST_P(allocator_test, defaults) { - rmm::mr::set_current_device_resource(this->mr.get()); + rmm::mr::set_current_device_resource_ref(this->ref); rmm::mr::thrust_allocator allocator(rmm::cuda_stream_default); EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default); EXPECT_EQ(allocator.get_upstream_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); -}*/ + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); +} TEST_P(allocator_test, multi_device) { From 8d2bc627255e9a991092b0d8db269118d984e6b9 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 28 Aug 2024 01:28:02 +0000 Subject: [PATCH 13/24] Remove an utterance of device_memory_resource* in test. --- tests/mr/device/adaptor_tests.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index a757a78b0..286daa841 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ b/tests/mr/device/adaptor_tests.cpp @@ -129,8 +129,7 @@ TYPED_TEST(AdaptorTest, Equality) } { - rmm::mr::device_memory_resource* device_mr = &this->cuda; - auto other_mr = aligned_resource_adaptor{device_mr}; + auto other_mr = aligned_resource_adaptor{&this->cuda}; EXPECT_FALSE(this->mr->is_equal(other_mr)); } } From 3d2ef083801a70294500d52b444202d8b598b867 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 28 Aug 2024 01:28:42 +0000 Subject: [PATCH 14/24] Remove another unnecessary device_memory_resource* --- include/rmm/device_uvector.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 2dcbc01ff..13f566150 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -48,7 +48,7 @@ namespace RMM_NAMESPACE { * * Example: * @code{.cpp} - * rmm::mr::device_memory_resource * mr = new my_custom_resource(); + * auto mr = new my_custom_resource(); * rmm::cuda_stream_view s{}; * * // Allocates *uninitialized* device memory on stream `s` sufficient for 100 ints using the From 13172c5ab12d30f6573217802b97d2d58d95234d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 28 Aug 2024 01:28:56 +0000 Subject: [PATCH 15/24] Copyrights --- python/rmm/rmm/_lib/_torch_allocator.cpp | 2 +- tests/cuda_stream_tests.cpp | 2 +- tests/device_scalar_tests.cpp | 2 +- tests/mr/device/callback_mr_tests.cpp | 2 +- tests/mr/device/limiting_mr_tests.cpp | 2 +- tests/mr/device/tracking_mr_tests.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/rmm/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/_lib/_torch_allocator.cpp index 64885cc2f..bfe94c2d0 100644 --- a/python/rmm/rmm/_lib/_torch_allocator.cpp +++ b/python/rmm/rmm/_lib/_torch_allocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/tests/cuda_stream_tests.cpp b/tests/cuda_stream_tests.cpp index c4d0a6b9f..ec7e6c3e9 100644 --- a/tests/cuda_stream_tests.cpp +++ b/tests/cuda_stream_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/device_scalar_tests.cpp b/tests/device_scalar_tests.cpp index d0c5e327f..323894a6a 100644 --- a/tests/device_scalar_tests.cpp +++ b/tests/device_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/mr/device/callback_mr_tests.cpp b/tests/mr/device/callback_mr_tests.cpp index 445e0aed5..a56efa60c 100644 --- a/tests/mr/device/callback_mr_tests.cpp +++ b/tests/mr/device/callback_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/mr/device/limiting_mr_tests.cpp b/tests/mr/device/limiting_mr_tests.cpp index 58dbfd5f5..e6cc97029 100644 --- a/tests/mr/device/limiting_mr_tests.cpp +++ b/tests/mr/device/limiting_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp index 45284144f..6eeb6ccc3 100644 --- a/tests/mr/device/tracking_mr_tests.cpp +++ b/tests/mr/device/tracking_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 4884f536fb22e5447ba1898fdf86b54ee2693904 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 29 Aug 2024 01:20:26 +0000 Subject: [PATCH 16/24] Add `to_device_async_resource_ref_checked()` to simplify adaptor constructors. --- include/rmm/resource_ref.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/rmm/resource_ref.hpp b/include/rmm/resource_ref.hpp index 08942a040..6a34a6a47 100644 --- a/include/rmm/resource_ref.hpp +++ b/include/rmm/resource_ref.hpp @@ -65,5 +65,21 @@ using host_device_resource_ref = using host_device_async_resource_ref = cuda::mr::async_resource_ref; +/** + * @brief Convert pointer to memory resource into `device_async_resource_ref`, checking for + * `nullptr` + * + * @tparam Resource The type of the memory resource. + * @param res A pointer to the memory resource. + * @return A `device_async_resource_ref` to the memory resource. + * @throws std::logic_error if the memory resource pointer is null. 
+ */ +template +device_async_resource_ref to_device_async_resource_ref_checked(Resource* res) +{ + RMM_EXPECTS(nullptr != res, "Unexpected null resource pointer."); + return device_async_resource_ref{*res}; +} + /** @} */ // end of group } // namespace RMM_NAMESPACE From 1bcb81ad80a07703ed06561f45374aa37f651531 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 29 Aug 2024 01:24:12 +0000 Subject: [PATCH 17/24] Use to_device_async_resource_ref_checked() and more adaptor tidy / conversion to refs --- .../mr/device/aligned_resource_adaptor.hpp | 7 +- .../rmm/mr/device/arena_memory_resource.hpp | 7 +- .../rmm/mr/device/binning_memory_resource.hpp | 28 ++---- .../failure_callback_resource_adaptor.hpp | 7 +- .../mr/device/fixed_size_memory_resource.hpp | 50 +++++++--- .../mr/device/limiting_resource_adaptor.hpp | 33 +++---- .../mr/device/logging_resource_adaptor.hpp | 93 +++++++++++++++---- include/rmm/mr/device/per_device_resource.hpp | 1 + .../rmm/mr/device/pool_memory_resource.hpp | 5 +- .../mr/device/prefetch_resource_adaptor.hpp | 28 +++--- .../mr/device/statistics_resource_adaptor.hpp | 19 ++-- .../device/thread_safe_resource_adaptor.hpp | 28 +++--- .../mr/device/tracking_resource_adaptor.hpp | 16 ++-- tests/device_check_resource_adaptor.hpp | 2 +- 14 files changed, 193 insertions(+), 131 deletions(-) diff --git a/include/rmm/mr/device/aligned_resource_adaptor.hpp b/include/rmm/mr/device/aligned_resource_adaptor.hpp index 40d9e7c33..f99348906 100644 --- a/include/rmm/mr/device/aligned_resource_adaptor.hpp +++ b/include/rmm/mr/device/aligned_resource_adaptor.hpp @@ -91,10 +91,7 @@ class aligned_resource_adaptor final : public device_memory_resource { explicit aligned_resource_adaptor(Upstream* upstream, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT, std::size_t alignment_threshold = default_alignment_threshold) - : upstream_{[upstream]() { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - return device_async_resource_ref{*upstream}; - }()}, + : upstream_{to_device_async_resource_ref_checked(upstream)}, alignment_{alignment}, alignment_threshold_{alignment_threshold} { @@ -209,7 +206,7 @@ class aligned_resource_adaptor final : public device_memory_resource { } /// The upstream resource used for satisfying allocation requests - device_async_resource_ref upstream_{rmm::mr::get_current_device_resource_ref()}; + device_async_resource_ref upstream_; std::unordered_map pointers_; ///< Map of aligned pointers to upstream pointers. 
std::size_t alignment_; ///< The size used for allocation alignment std::size_t alignment_threshold_; ///< The size above which allocations should be aligned diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 31d26e98d..417b7d2b4 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -116,12 +116,7 @@ class arena_memory_resource final : public device_memory_resource { explicit arena_memory_resource(Upstream* upstream_mr, std::optional arena_size = std::nullopt, bool dump_log_on_failure = false) - : global_arena_{[upstream_mr]() { - RMM_EXPECTS(upstream_mr != nullptr, - "Unexpected null upstream memory resource."); - return device_async_resource_ref{*upstream_mr}; - }(), - arena_size}, + : global_arena_{to_device_async_resource_ref_checked(upstream_mr), arena_size}, dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { diff --git a/include/rmm/mr/device/binning_memory_resource.hpp b/include/rmm/mr/device/binning_memory_resource.hpp index 773035231..7daa77328 100644 --- a/include/rmm/mr/device/binning_memory_resource.hpp +++ b/include/rmm/mr/device/binning_memory_resource.hpp @@ -57,10 +57,7 @@ class binning_memory_resource final : public device_memory_resource { * @param upstream_resource The upstream memory resource used to allocate bin pools. */ explicit binning_memory_resource(Upstream* upstream_resource) - : upstream_mr_{[upstream_resource]() { - RMM_EXPECTS(nullptr != upstream_resource, "Unexpected null upstream pointer."); - return upstream_resource; - }()} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_resource)} { } @@ -79,10 +76,7 @@ class binning_memory_resource final : public device_memory_resource { binning_memory_resource(Upstream* upstream_resource, int8_t min_size_exponent, // NOLINT(bugprone-easily-swappable-parameters) int8_t max_size_exponent) - : upstream_mr_{[upstream_resource]() { - RMM_EXPECTS(nullptr != upstream_resource, "Unexpected null upstream pointer."); - return upstream_resource; - }()} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_resource)} { for (auto i = min_size_exponent; i <= max_size_exponent; i++) { add_bin(1 << i); @@ -102,18 +96,13 @@ class binning_memory_resource final : public device_memory_resource { binning_memory_resource& operator=(binning_memory_resource&&) = delete; /** - * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + * @briefreturn{device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Add a bin allocator to this resource * @@ -132,7 +121,7 @@ class binning_memory_resource final : public device_memory_resource { */ void add_bin(std::size_t allocation_size, device_memory_resource* bin_resource = nullptr) { - allocation_size = rmm::align_up(allocation_size, rmm::CUDA_ALLOCATION_ALIGNMENT); + allocation_size = align_up(allocation_size, CUDA_ALLOCATION_ALIGNMENT); if (nullptr != bin_resource) { resource_bins_.insert({allocation_size, bin_resource}); @@ -153,10 +142,10 @@ class binning_memory_resource final : public device_memory_resource { * @param bytes Requested allocation size in bytes * 
@return Get the resource reference for the requested size. */ - rmm::device_async_resource_ref get_resource_ref(std::size_t bytes) + device_async_resource_ref get_resource_ref(std::size_t bytes) { auto iter = resource_bins_.lower_bound(bytes); - return (iter != resource_bins_.cend()) ? rmm::device_async_resource_ref{iter->second} + return (iter != resource_bins_.cend()) ? device_async_resource_ref{iter->second} : get_upstream_resource(); } @@ -188,7 +177,8 @@ class binning_memory_resource final : public device_memory_resource { get_resource_ref(bytes).deallocate_async(ptr, bytes, stream); } - Upstream* upstream_mr_; // The upstream memory_resource from which to allocate blocks. + device_async_resource_ref + upstream_mr_; // The upstream memory_resource from which to allocate blocks. std::vector>> owned_bin_resources_; diff --git a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp index d8c90b564..264a5a532 100644 --- a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp @@ -126,10 +126,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { failure_callback_resource_adaptor(Upstream* upstream, failure_callback_t callback, void* callback_arg) - : upstream_{[upstream]() { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - return device_async_resource_ref{*upstream}; - }()}, + : upstream_{to_device_async_resource_ref_checked(upstream)}, callback_{std::move(callback)}, callback_arg_{callback_arg} { @@ -207,7 +204,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { } // the upstream resource used for satisfying allocation requests - device_async_resource_ref upstream_{rmm::mr::get_current_device_resource_ref()}; + device_async_resource_ref upstream_; failure_callback_t callback_; void* callback_arg_; }; diff --git a/include/rmm/mr/device/fixed_size_memory_resource.hpp b/include/rmm/mr/device/fixed_size_memory_resource.hpp index 84cb3b0c6..249af77dd 100644 --- a/include/rmm/mr/device/fixed_size_memory_resource.hpp +++ b/include/rmm/mr/device/fixed_size_memory_resource.hpp @@ -65,7 +65,31 @@ class fixed_size_memory_resource /** * @brief Construct a new `fixed_size_memory_resource` that allocates memory from - * `upstream_resource`. + * `upstream_mr`. + * + * When the pool of blocks is all allocated, grows the pool by allocating + * `blocks_to_preallocate` more blocks from `upstream_mr`. + * + * @param upstream_mr The device_async_resource_ref from which to allocate blocks for the pool. + * @param block_size The size of blocks to allocate. + * @param blocks_to_preallocate The number of blocks to allocate to initialize the pool. + */ + explicit fixed_size_memory_resource( + device_async_resource_ref upstream_mr, + // NOLINTNEXTLINE bugprone-easily-swappable-parameters + std::size_t block_size = default_block_size, + std::size_t blocks_to_preallocate = default_blocks_to_preallocate) + : upstream_mr_{upstream_mr}, + block_size_{align_up(block_size, CUDA_ALLOCATION_ALIGNMENT)}, + upstream_chunk_size_{block_size_ * blocks_to_preallocate} + { + // allocate initial blocks and insert into free list + this->insert_blocks(std::move(blocks_from_upstream(cuda_stream_legacy)), cuda_stream_legacy); + } + + /** + * @brief Construct a new `fixed_size_memory_resource` that allocates memory from + * `upstream_mr`. 
* * When the pool of blocks is all allocated, grows the pool by allocating * `blocks_to_preallocate` more blocks from `upstream_mr`. @@ -76,11 +100,12 @@ class fixed_size_memory_resource */ explicit fixed_size_memory_resource( Upstream* upstream_mr, + // NOLINTNEXTLINE bugprone-easily-swappable-parameters std::size_t block_size = default_block_size, std::size_t blocks_to_preallocate = default_blocks_to_preallocate) - : upstream_mr_{upstream_mr}, - block_size_{rmm::align_up(block_size, rmm::CUDA_ALLOCATION_ALIGNMENT)}, - upstream_chunk_size_{block_size * blocks_to_preallocate} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_mr)}, + block_size_{align_up(block_size, CUDA_ALLOCATION_ALIGNMENT)}, + upstream_chunk_size_{block_size_ * blocks_to_preallocate} { // allocate initial blocks and insert into free list this->insert_blocks(std::move(blocks_from_upstream(cuda_stream_legacy)), cuda_stream_legacy); @@ -99,18 +124,13 @@ class fixed_size_memory_resource fixed_size_memory_resource& operator=(fixed_size_memory_resource&&) = delete; /** - * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + * @briefreturn{device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Get the size of blocks allocated by this memory resource. * @@ -200,7 +220,7 @@ class fixed_size_memory_resource { // Deallocating a fixed-size block just inserts it in the free list, which is // handled by the parent class - RMM_LOGGING_ASSERT(rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT) <= block_size_); + RMM_LOGGING_ASSERT(align_up(size, CUDA_ALLOCATION_ALIGNMENT) <= block_size_); return block_type{ptr}; } @@ -254,10 +274,10 @@ class fixed_size_memory_resource } private: - Upstream* upstream_mr_; // The resource from which to allocate new blocks + device_async_resource_ref upstream_mr_; // The resource from which to allocate new blocks - std::size_t const block_size_; // size of blocks this MR allocates - std::size_t const upstream_chunk_size_; // size of chunks allocated from heap MR + std::size_t block_size_; // size of blocks this MR allocates + std::size_t upstream_chunk_size_; // size of chunks allocated from heap MR // blocks allocated from heap: so they can be easily freed std::vector upstream_blocks_; diff --git a/include/rmm/mr/device/limiting_resource_adaptor.hpp b/include/rmm/mr/device/limiting_resource_adaptor.hpp index 2e9e08e22..eba352e78 100644 --- a/include/rmm/mr/device/limiting_resource_adaptor.hpp +++ b/include/rmm/mr/device/limiting_resource_adaptor.hpp @@ -58,11 +58,11 @@ class limiting_resource_adaptor final : public device_memory_resource { */ limiting_resource_adaptor(device_async_resource_ref upstream, std::size_t allocation_limit, - std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) - : allocation_limit_{allocation_limit}, + std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) + : upstream_{upstream}, + allocation_limit_{allocation_limit}, allocated_bytes_(0), - alignment_(alignment), - upstream_{upstream} + alignment_(alignment) { } @@ -78,14 +78,11 @@ class limiting_resource_adaptor final : public device_memory_resource { */ limiting_resource_adaptor(Upstream* upstream, std::size_t allocation_limit, - std::size_t 
alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) - : allocation_limit_{allocation_limit}, + std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) + : upstream_{to_device_async_resource_ref_checked(upstream)}, + allocation_limit_{allocation_limit}, allocated_bytes_(0), - alignment_(alignment), - upstream_{[upstream]() { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - return device_async_resource_ref{*upstream}; - }()} + alignment_(alignment) { } @@ -99,9 +96,9 @@ class limiting_resource_adaptor final : public device_memory_resource { default; ///< @default_move_assignment{limiting_resource_adaptor} /** - * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + * @briefreturn{device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_; } @@ -142,7 +139,7 @@ class limiting_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - auto const proposed_size = rmm::align_up(bytes, alignment_); + auto const proposed_size = align_up(bytes, alignment_); auto const old = allocated_bytes_.fetch_add(proposed_size); if (old + proposed_size <= allocation_limit_) { try { @@ -166,7 +163,7 @@ class limiting_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - std::size_t allocated_size = rmm::align_up(bytes, alignment_); + std::size_t allocated_size = align_up(bytes, alignment_); get_upstream_resource().deallocate_async(ptr, bytes, stream); allocated_bytes_ -= allocated_size; } @@ -186,6 +183,9 @@ class limiting_resource_adaptor final : public device_memory_resource { return get_upstream_resource() == cast->get_upstream_resource(); } + // The upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; + // maximum bytes this allocator is allowed to allocate. std::size_t allocation_limit_; @@ -194,9 +194,6 @@ class limiting_resource_adaptor final : public device_memory_resource { // todo: should be some way to ask the upstream... std::size_t alignment_; - - // The upstream resource used for satisfying allocation requests - device_async_resource_ref upstream_{rmm::mr::get_current_device_resource_ref()}; }; /** diff --git a/include/rmm/mr/device/logging_resource_adaptor.hpp b/include/rmm/mr/device/logging_resource_adaptor.hpp index 6f6cd816a..08fe4dd88 100644 --- a/include/rmm/mr/device/logging_resource_adaptor.hpp +++ b/include/rmm/mr/device/logging_resource_adaptor.hpp @@ -77,10 +77,8 @@ class logging_resource_adaptor final : public device_memory_resource { logging_resource_adaptor(Upstream* upstream, std::string const& filename = get_default_filename(), bool auto_flush = false) - : logger_{make_logger(filename)}, upstream_{upstream} + : logger_{make_logger(filename)}, upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - init_logger(auto_flush); } @@ -99,10 +97,8 @@ class logging_resource_adaptor final : public device_memory_resource { * performance. 
*/ logging_resource_adaptor(Upstream* upstream, std::ostream& stream, bool auto_flush = false) - : logger_{make_logger(stream)}, upstream_{upstream} + : logger_{make_logger(stream)}, upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - init_logger(auto_flush); } @@ -123,10 +119,76 @@ class logging_resource_adaptor final : public device_memory_resource { logging_resource_adaptor(Upstream* upstream, spdlog::sinks_init_list sinks, bool auto_flush = false) - : logger_{make_logger(sinks)}, upstream_{upstream} + : logger_{make_logger(sinks)}, upstream_{to_device_async_resource_ref_checked(upstream)} + { + init_logger(auto_flush); + } + + /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy + * allocation requests and logging information about each allocation/free to + * the file specified by `filename`. + * + * The logfile will be written using CSV formatting. + * + * Clears the contents of `filename` if it already exists. + * + * Creating multiple `logging_resource_adaptor`s with the same `filename` will + * result in undefined behavior. + * + * @throws spdlog::spdlog_ex if opening `filename` failed + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + * @param filename Name of file to write log info. If not specified, retrieves + * the file name from the environment variable "RMM_LOG_FILE". + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. + */ + logging_resource_adaptor(device_async_resource_ref upstream, + std::string const& filename = get_default_filename(), + bool auto_flush = false) + : logger_{make_logger(filename)}, upstream_{upstream} + { + init_logger(auto_flush); + } + + /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy + * allocation requests and logging information about each allocation/free to + * the ostream specified by `stream`. + * + * The logfile will be written using CSV formatting. + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + * @param stream The ostream to write log info. + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. + */ + logging_resource_adaptor(device_async_resource_ref upstream, + std::ostream& stream, + bool auto_flush = false) + : logger_{make_logger(stream)}, upstream_{upstream} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + init_logger(auto_flush); + } + /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy + * allocation requests and logging information about each allocation/free to + * the ostream specified by `stream`. + * + * The logfile will be written using CSV formatting. + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + * @param sinks A list of logging sinks to which log output will be written. + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. 
+ */ + logging_resource_adaptor(device_async_resource_ref upstream, + spdlog::sinks_init_list sinks, + bool auto_flush = false) + : logger_{make_logger(sinks)}, upstream_{upstream} + { init_logger(auto_flush); } @@ -147,11 +209,6 @@ class logging_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Flush logger contents. */ @@ -239,7 +296,7 @@ class logging_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { try { - auto const ptr = upstream_->allocate(bytes, stream); + auto const ptr = get_upstream_resource().allocate_async(bytes, stream); logger_->info("allocate,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); return ptr; } catch (...) { @@ -265,7 +322,7 @@ class logging_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { logger_->info("free,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -279,7 +336,7 @@ class logging_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto const* cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -295,8 +352,8 @@ class logging_resource_adaptor final : public device_memory_resource { std::shared_ptr logger_; ///< spdlog logger object - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests + device_async_resource_ref upstream_; ///< The upstream resource used for satisfying + ///< allocation requests }; /** diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 9167c4fe6..4198397f4 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -191,6 +191,7 @@ namespace detail { // we need to call this function from two places: the thread-safe version of // `set_per_device_resource_ref` and the thread-safe version of `set_per_device_resource`, // both of which take the lock, so we need an implementation that doesn't take the lock. 
+/// @private inline device_async_resource_ref set_per_device_resource_ref_unsafe( cuda_device_id device_id, device_async_resource_ref new_resource_ref) { diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index 4daba5e8e..59a641f05 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -161,10 +161,7 @@ class pool_memory_resource final explicit pool_memory_resource(Upstream* upstream_mr, std::size_t initial_pool_size, std::optional maximum_pool_size = std::nullopt) - : upstream_mr_{[upstream_mr]() { - RMM_EXPECTS(nullptr != upstream_mr, "Unexpected null upstream pointer."); - return device_async_resource_ref{*upstream_mr}; - }()} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_mr)} { RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT), "Error, Initial pool size required to be a multiple of 256 bytes"); diff --git a/include/rmm/mr/device/prefetch_resource_adaptor.hpp b/include/rmm/mr/device/prefetch_resource_adaptor.hpp index 59ce8e036..b38bd0316 100644 --- a/include/rmm/mr/device/prefetch_resource_adaptor.hpp +++ b/include/rmm/mr/device/prefetch_resource_adaptor.hpp @@ -41,6 +41,16 @@ namespace mr { template class prefetch_resource_adaptor final : public device_memory_resource { public: + /** + * @brief Construct a new prefetch resource adaptor using `upstream` to satisfy + * allocation requests. + * + * @throws rmm::logic_error if `upstream == nullptr` + * + * @param upstream The resource_ref used for allocating/deallocating device memory + */ + prefetch_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new prefetch resource adaptor using `upstream` to satisfy * allocation requests. 
@@ -49,9 +59,9 @@ class prefetch_resource_adaptor final : public device_memory_resource { * * @param upstream The resource used for allocating/deallocating device memory */ - prefetch_resource_adaptor(Upstream* upstream) : upstream_{upstream} + prefetch_resource_adaptor(Upstream* upstream) + : upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } prefetch_resource_adaptor() = delete; @@ -71,11 +81,6 @@ class prefetch_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -92,7 +97,7 @@ class prefetch_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); + void* ptr = get_upstream_resource().allocate_async(bytes, stream); rmm::prefetch(ptr, bytes, rmm::get_current_cuda_device(), stream); return ptr; } @@ -106,7 +111,7 @@ class prefetch_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -120,11 +125,12 @@ class prefetch_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; }; /** @} */ // end of group diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp index bb4361049..025c51aa7 100644 --- a/include/rmm/mr/device/statistics_resource_adaptor.hpp +++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp @@ -114,6 +114,12 @@ class statistics_resource_adaptor final : public device_memory_resource { } }; + /** + * @brief Construct a new statistics resource adaptor using `upstream` to satisfy + * allocation requests. + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + */ statistics_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} /** @@ -122,13 +128,10 @@ class statistics_resource_adaptor final : public device_memory_resource { * * @throws rmm::logic_error if `upstream == nullptr` * - * @param upstream The resource used for allocating/deallocating device memory + * @param upstream The resource used for allocating/deallocating device memory. 
*/ statistics_resource_adaptor(Upstream* upstream) - : upstream_{[upstream]() { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - return device_async_resource_ref{*upstream}; - }()} + : upstream_{to_device_async_resource_ref_checked(upstream)} { } @@ -227,7 +230,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_.allocate_async(bytes, stream); + void* ptr = get_upstream_resource().allocate_async(bytes, stream); // increment the stats { @@ -250,7 +253,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_.deallocate_async(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -281,7 +284,7 @@ class statistics_resource_adaptor final : public device_memory_resource { std::stack> counter_stack_{{std::make_pair(counter{}, counter{})}}; std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ // the upstream resource used for satisfying allocation requests - device_async_resource_ref upstream_{rmm::mr::get_current_device_resource_ref()}; + device_async_resource_ref upstream_; }; /** diff --git a/include/rmm/mr/device/thread_safe_resource_adaptor.hpp b/include/rmm/mr/device/thread_safe_resource_adaptor.hpp index 9979d1e08..6881aa19e 100644 --- a/include/rmm/mr/device/thread_safe_resource_adaptor.hpp +++ b/include/rmm/mr/device/thread_safe_resource_adaptor.hpp @@ -45,6 +45,16 @@ class thread_safe_resource_adaptor final : public device_memory_resource { public: using lock_t = std::lock_guard; ///< Type of lock used to synchronize access + /** + * @brief Construct a new thread safe resource adaptor using `upstream` to satisfy + * allocation requests. + * + * All allocations and frees are protected by a mutex lock + * + * @param upstream The resource used for allocating/deallocating device memory. + */ + thread_safe_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new thread safe resource adaptor using `upstream` to satisfy * allocation requests. @@ -55,9 +65,9 @@ class thread_safe_resource_adaptor final : public device_memory_resource { * * @param upstream The resource used for allocating/deallocating device memory. 
*/ - thread_safe_resource_adaptor(Upstream* upstream) : upstream_{upstream} + thread_safe_resource_adaptor(Upstream* upstream) + : upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } thread_safe_resource_adaptor() = delete; @@ -75,11 +85,6 @@ class thread_safe_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -95,7 +100,7 @@ class thread_safe_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { lock_t lock(mtx); - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, stream); } /** @@ -108,7 +113,7 @@ class thread_safe_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { lock_t lock(mtx); - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -122,12 +127,13 @@ class thread_safe_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } std::mutex mutable mtx; // mutex for thread safe access to upstream - Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests + device_async_resource_ref + upstream_; ///< The upstream resource used for satisfying allocation requests }; /** @} */ // end of group diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp index 461b1832d..6a5916e5c 100644 --- a/include/rmm/mr/device/tracking_resource_adaptor.hpp +++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp @@ -88,8 +88,6 @@ class tracking_resource_adaptor final : public device_memory_resource { * @brief Construct a new tracking resource adaptor using `upstream` to satisfy * allocation requests. * - * @throws rmm::logic_error if `upstream == nullptr` - * * @param upstream The resource used for allocating/deallocating device memory * @param capture_stacks If true, capture stacks for allocation calls */ @@ -108,10 +106,9 @@ class tracking_resource_adaptor final : public device_memory_resource { * @param capture_stacks If true, capture stacks for allocation calls */ tracking_resource_adaptor(Upstream* upstream, bool capture_stacks = false) - : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{[upstream]() { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - return device_async_resource_ref{*upstream}; - }()} + : capture_stacks_{capture_stacks}, + allocated_bytes_{0}, + upstream_{to_device_async_resource_ref_checked(upstream)} { } @@ -211,7 +208,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_.allocate_async(bytes, stream); + void* ptr = get_upstream_resource().allocate_async(bytes, stream); // track it. 
{ write_lock_t lock(mtx_); @@ -231,7 +228,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_.deallocate_async(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -284,8 +281,7 @@ class tracking_resource_adaptor final : public device_memory_resource { std::map allocations_; // map of active allocations std::atomic allocated_bytes_; // number of bytes currently allocated std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ - device_async_resource_ref upstream_{ - rmm::mr::get_current_device_resource_ref()}; // the upstream resource used for satisfying + device_async_resource_ref upstream_; // the upstream resource used for satisfying // allocation requests }; diff --git a/tests/device_check_resource_adaptor.hpp b/tests/device_check_resource_adaptor.hpp index 23b0c4bb1..6780f56d7 100644 --- a/tests/device_check_resource_adaptor.hpp +++ b/tests/device_check_resource_adaptor.hpp @@ -65,5 +65,5 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour } rmm::cuda_device_id device_id; - rmm::device_async_resource_ref upstream_{rmm::mr::get_current_device_resource_ref()}; + rmm::device_async_resource_ref upstream_; }; From 56e5eaa6de50bfebfef2a36b2acdc51f551564ff Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 29 Aug 2024 01:24:36 +0000 Subject: [PATCH 18/24] Make test debug builds use -O0 --- tests/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 75b15a90b..97f13b01b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,6 +40,7 @@ function(ConfigureTestInternal TEST_NAME) PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") target_compile_options(${TEST_NAME} PUBLIC $<$:-Wall -Werror -Wno-error=deprecated-declarations>) + target_compile_options(${TEST_NAME} PUBLIC "$<$:-O0>") if(DISABLE_DEPRECATION_WARNING) target_compile_options( @@ -48,6 +49,8 @@ function(ConfigureTestInternal TEST_NAME) PUBLIC $<$:-Wno-deprecated-declarations>) endif() + + if(CODE_COVERAGE) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") From 545323ba325a619b0d4d23fb15195a7cbbe51333 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 29 Aug 2024 01:26:03 +0000 Subject: [PATCH 19/24] Fix quoted include. 
--- tests/mr/device/arena_mr_tests.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 9db623394..bdc0f2438 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -15,7 +15,6 @@ */ #include "../../byte_literals.hpp" -#include "cuda/stream_ref" #include #include @@ -26,6 +25,8 @@ #include #include +#include + #include #include #include From 5598840307c36ed410c450529a1b52c67c9f5b1b Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 29 Aug 2024 02:26:23 +0000 Subject: [PATCH 20/24] cmake style --- tests/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 97f13b01b..28093360a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -49,8 +49,6 @@ function(ConfigureTestInternal TEST_NAME) PUBLIC $<$:-Wno-deprecated-declarations>) endif() - - if(CODE_COVERAGE) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") From c1d7b0d0e4d81acf9dc00a7907004cd548d28d8c Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 4 Sep 2024 07:43:37 +0000 Subject: [PATCH 21/24] Don't take the address when passing MR to resource_ref. --- benchmarks/device_uvector/device_uvector_bench.cu | 6 +++--- include/rmm/mr/device/failure_callback_resource_adaptor.hpp | 2 +- tests/container_multidevice_tests.cu | 6 +++--- tests/mr/device/statistics_mr_tests.cpp | 4 ++-- tests/mr/device/tracking_mr_tests.cpp | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/benchmarks/device_uvector/device_uvector_bench.cu b/benchmarks/device_uvector/device_uvector_bench.cu index e56926827..0eddb1d92 100644 --- a/benchmarks/device_uvector/device_uvector_bench.cu +++ b/benchmarks/device_uvector/device_uvector_bench.cu @@ -40,7 +40,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state) rmm::mr::cuda_memory_resource cuda_mr{}; rmm::mr::pool_memory_resource mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource_ref(&mr); + rmm::mr::set_current_device_resource_ref(mr); for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) rmm::device_uvector vec(state.range(0), rmm::cuda_stream_view{}); @@ -62,7 +62,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state) rmm::mr::cuda_memory_resource cuda_mr{}; rmm::mr::pool_memory_resource mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource_ref(&mr); + rmm::mr::set_current_device_resource_ref(mr); for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) rmm::device_vector vec(state.range(0)); @@ -140,7 +140,7 @@ template void BM_VectorWorkflow(benchmark::State& state) { rmm::mr::cuda_async_memory_resource cuda_async_mr{}; - rmm::mr::set_current_device_resource_ref(&cuda_async_mr); + rmm::mr::set_current_device_resource_ref(cuda_async_mr); rmm::cuda_stream input_stream; std::vector streams(4); diff --git a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp index 264a5a532..a08352d18 100644 --- a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp @@ -84,7 +84,7 @@ using failure_callback_t = std::function; * failure_callback_adaptor mr{ * rmm::mr::get_current_device_resource_ref(), failure_handler, &retried * }; - * rmm::mr::set_current_device_resource_ref(&mr); + * rmm::mr::set_current_device_resource_ref(mr); * } * @endcode * diff 
--git a/tests/container_multidevice_tests.cu b/tests/container_multidevice_tests.cu index 4d00173ac..55432feb0 100644 --- a/tests/container_multidevice_tests.cu +++ b/tests/container_multidevice_tests.cu @@ -71,7 +71,7 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice) rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource_ref(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); { auto buf_1 = []() { @@ -111,7 +111,7 @@ TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice) rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource_ref(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); if constexpr (not std::is_same_v>) { auto buf = TypeParam(128, rmm::cuda_stream_view{}); @@ -134,7 +134,7 @@ TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice) rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource_ref(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); if constexpr (not std::is_same_v>) { auto buf = TypeParam(128, rmm::cuda_stream_view{}); diff --git a/tests/mr/device/statistics_mr_tests.cpp b/tests/mr/device/statistics_mr_tests.cpp index 1fdf01aa5..f796a4c00 100644 --- a/tests/mr/device/statistics_mr_tests.cpp +++ b/tests/mr/device/statistics_mr_tests.cpp @@ -129,7 +129,7 @@ TEST(StatisticsTest, MultiTracking) { auto orig_device_resource = rmm::mr::get_current_device_resource_ref(); statistics_adaptor mr{orig_device_resource}; - rmm::mr::set_current_device_resource_ref(&mr); + rmm::mr::set_current_device_resource_ref(mr); std::vector> allocations; for (std::size_t i = 0; i < num_allocations; ++i) { @@ -140,7 +140,7 @@ TEST(StatisticsTest, MultiTracking) EXPECT_EQ(mr.get_allocations_counter().value, 10); statistics_adaptor inner_mr{rmm::mr::get_current_device_resource_ref()}; - rmm::mr::set_current_device_resource_ref(&inner_mr); + rmm::mr::set_current_device_resource_ref(inner_mr); for (std::size_t i = 0; i < num_more_allocations; ++i) { allocations.emplace_back( diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp index 6eeb6ccc3..acd540ae6 100644 --- a/tests/mr/device/tracking_mr_tests.cpp +++ b/tests/mr/device/tracking_mr_tests.cpp @@ -103,7 +103,7 @@ TEST(TrackingTest, MultiTracking) { auto orig_device_resource = rmm::mr::get_current_device_resource_ref(); tracking_adaptor mr{orig_device_resource, true}; - rmm::mr::set_current_device_resource_ref(&mr); + rmm::mr::set_current_device_resource_ref(mr); std::vector> allocations; for (std::size_t i = 0; i < num_allocations; ++i) { @@ -114,7 +114,7 @@ TEST(TrackingTest, MultiTracking) EXPECT_EQ(mr.get_outstanding_allocations().size(), num_allocations); tracking_adaptor inner_mr{rmm::mr::get_current_device_resource_ref()}; - rmm::mr::set_current_device_resource_ref(&inner_mr); + rmm::mr::set_current_device_resource_ref(inner_mr); for (std::size_t i = 0; i < num_more_allocations; ++i) { allocations.emplace_back( From 4c113827371c4f1b79f7021609d5fcbaac5097dd Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 5 Sep 2024 02:36:24 +0000 Subject: [PATCH 22/24] Fix 
yoda code. --- include/rmm/resource_ref.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rmm/resource_ref.hpp b/include/rmm/resource_ref.hpp index 6a34a6a47..56049522f 100644 --- a/include/rmm/resource_ref.hpp +++ b/include/rmm/resource_ref.hpp @@ -77,7 +77,7 @@ using host_device_async_resource_ref = template device_async_resource_ref to_device_async_resource_ref_checked(Resource* res) { - RMM_EXPECTS(nullptr != res, "Unexpected null resource pointer."); + RMM_EXPECTS(res, "Unexpected null resource pointer."); return device_async_resource_ref{*res}; } From 6d2f09d862254840d726c1e932c25e28797077d2 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 5 Sep 2024 02:36:46 +0000 Subject: [PATCH 23/24] Use resource_ref upstreams in binning_mr --- .../rmm/mr/device/binning_memory_resource.hpp | 56 +++++++++++++++---- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/include/rmm/mr/device/binning_memory_resource.hpp b/include/rmm/mr/device/binning_memory_resource.hpp index 7daa77328..a5ef64665 100644 --- a/include/rmm/mr/device/binning_memory_resource.hpp +++ b/include/rmm/mr/device/binning_memory_resource.hpp @@ -27,6 +27,7 @@ #include #include #include +#include #include namespace RMM_NAMESPACE { @@ -52,7 +53,20 @@ class binning_memory_resource final : public device_memory_resource { * Initially has no bins, so simply uses the upstream_resource until bin resources are added * with `add_bin`. * - * @throws rmm::logic_error if size_base is not a power of two. + * @param upstream_resource The upstream memory resource used to allocate bin pools. + */ + explicit binning_memory_resource(device_async_resource_ref upstream_resource) + : upstream_mr_{upstream_resource} + { + } + + /** + * @brief Construct a new binning memory resource object. + * + * Initially has no bins, so simply uses the upstream_resource until bin resources are added + * with `add_bin`. + * + * @throws rmm::logic_error if upstream_resource is nullptr * * @param upstream_resource The upstream memory resource used to allocate bin pools. */ @@ -73,6 +87,30 @@ class binning_memory_resource final : public device_memory_resource { * @param min_size_exponent The minimum base-2 exponent bin size. * @param max_size_exponent The maximum base-2 exponent bin size. */ + binning_memory_resource(device_async_resource_ref upstream_resource, + int8_t min_size_exponent, // NOLINT(bugprone-easily-swappable-parameters) + int8_t max_size_exponent) + : upstream_mr_{upstream_resource} + { + for (auto i = min_size_exponent; i <= max_size_exponent; i++) { + add_bin(1 << i); + } + } + + /** + * @brief Construct a new binning memory resource object with a range of initial bins. + * + * Constructs a new binning memory resource and adds bins backed by `fixed_size_memory_resource` + * in the range [2^min_size_exponent, 2^max_size_exponent]. For example if `min_size_exponent==18` + * and `max_size_exponent==22`, creates bins of sizes 256KiB, 512KiB, 1024KiB, 2048KiB and + * 4096KiB. + * + * @throws rmm::logic_error if upstream_resource is nullptr + * + * @param upstream_resource The upstream memory resource used to allocate bin pools. + * @param min_size_exponent The minimum base-2 exponent bin size. + * @param max_size_exponent The maximum base-2 exponent bin size. 
+ */ binning_memory_resource(Upstream* upstream_resource, int8_t min_size_exponent, // NOLINT(bugprone-easily-swappable-parameters) int8_t max_size_exponent) @@ -106,8 +144,7 @@ class binning_memory_resource final : public device_memory_resource { /** * @brief Add a bin allocator to this resource * - * Adds `bin_resource` if it is not null; otherwise constructs and adds a - * fixed_size_memory_resource. + * Adds `bin_resource` if provided; otherwise constructs and adds a fixed_size_memory_resource. * * This bin will be used for any allocation smaller than `allocation_size` that is larger than * the next smaller bin's allocation size. @@ -119,14 +156,14 @@ class binning_memory_resource final : public device_memory_resource { * @param allocation_size The maximum size that this bin allocates * @param bin_resource The memory resource for the bin */ - void add_bin(std::size_t allocation_size, device_memory_resource* bin_resource = nullptr) + void add_bin(std::size_t allocation_size, + std::optional bin_resource = std::nullopt) { allocation_size = align_up(allocation_size, CUDA_ALLOCATION_ALIGNMENT); - if (nullptr != bin_resource) { - resource_bins_.insert({allocation_size, bin_resource}); + if (bin_resource.has_value()) { + resource_bins_.insert({allocation_size, bin_resource.value()}); } else if (resource_bins_.count(allocation_size) == 0) { // do nothing if bin already exists - owned_bin_resources_.push_back( std::make_unique>(upstream_mr_, allocation_size)); resource_bins_.insert({allocation_size, owned_bin_resources_.back().get()}); @@ -145,8 +182,7 @@ class binning_memory_resource final : public device_memory_resource { device_async_resource_ref get_resource_ref(std::size_t bytes) { auto iter = resource_bins_.lower_bound(bytes); - return (iter != resource_bins_.cend()) ? device_async_resource_ref{iter->second} - : get_upstream_resource(); + return (iter != resource_bins_.cend()) ? iter->second : get_upstream_resource(); } /** @@ -182,7 +218,7 @@ class binning_memory_resource final : public device_memory_resource { std::vector>> owned_bin_resources_; - std::map resource_bins_; + std::map resource_bins_; }; /** @} */ // end of group From a367af6954cbd7d025b8ab4ba4cef1abfa55a62e Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 5 Sep 2024 02:37:07 +0000 Subject: [PATCH 24/24] Remove incorrect @throws documentation --- include/rmm/mr/device/aligned_resource_adaptor.hpp | 3 +-- include/rmm/mr/device/detail/arena.hpp | 2 -- include/rmm/mr/device/failure_callback_resource_adaptor.hpp | 2 -- include/rmm/mr/device/limiting_resource_adaptor.hpp | 2 -- include/rmm/mr/device/pool_memory_resource.hpp | 1 - include/rmm/mr/device/prefetch_resource_adaptor.hpp | 2 -- 6 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/rmm/mr/device/aligned_resource_adaptor.hpp b/include/rmm/mr/device/aligned_resource_adaptor.hpp index f99348906..4df2c4d2d 100644 --- a/include/rmm/mr/device/aligned_resource_adaptor.hpp +++ b/include/rmm/mr/device/aligned_resource_adaptor.hpp @@ -60,7 +60,6 @@ class aligned_resource_adaptor final : public device_memory_resource { /** * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests. * - * @throws rmm::logic_error if `upstream == nullptr` * @throws rmm::logic_error if `allocation_alignment` is not a power of 2 * * @param upstream The resource used for allocating/deallocating device memory. 
@@ -81,7 +80,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
   * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
   *
   * @throws rmm::logic_error if `upstream == nullptr`
-  * @throws rmm::logic_error if `allocation_alignment` is not a power of 2
+  * @throws rmm::logic_error if `alignment` is not a power of 2
   *
   * @param upstream The resource used for allocating/deallocating device memory.
   * @param alignment The size used for allocation alignment.
diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp
index f6049f441..6f8303c83 100644
--- a/include/rmm/mr/device/detail/arena.hpp
+++ b/include/rmm/mr/device/detail/arena.hpp
@@ -500,8 +500,6 @@ class global_arena final {
  /**
   * @brief Construct a global arena.
   *
-  * @throws rmm::logic_error if `upstream_mr == nullptr`.
-  *
   * @param upstream_mr The memory resource from which to allocate blocks for the pool
   * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory
   * on the current device.
diff --git a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp
index a08352d18..fdb385748 100644
--- a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp
+++ b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp
@@ -100,8 +100,6 @@ class failure_callback_resource_adaptor final : public device_memory_resource {
   * @brief Construct a new `failure_callback_resource_adaptor` using `upstream` to satisfy
   * allocation requests.
   *
-  * @throws rmm::logic_error if `upstream == nullptr`
-  *
   * @param upstream The resource used for allocating/deallocating device memory
   * @param callback Callback function @see failure_callback_t
   * @param callback_arg Extra argument passed to `callback`
diff --git a/include/rmm/mr/device/limiting_resource_adaptor.hpp b/include/rmm/mr/device/limiting_resource_adaptor.hpp
index eba352e78..d19fa3a0a 100644
--- a/include/rmm/mr/device/limiting_resource_adaptor.hpp
+++ b/include/rmm/mr/device/limiting_resource_adaptor.hpp
@@ -50,8 +50,6 @@ class limiting_resource_adaptor final : public device_memory_resource {
   * @brief Construct a new limiting resource adaptor using `upstream` to satisfy
   * allocation requests and limiting the total allocation amount possible.
   *
-  * @throws rmm::logic_error if `upstream == nullptr`
-  *
   * @param upstream The resource used for allocating/deallocating device memory
   * @param allocation_limit Maximum memory allowed for this allocator
   * @param alignment Alignment in bytes for the start of each allocated buffer
diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp
index 59a641f05..f63de21ff 100644
--- a/include/rmm/mr/device/pool_memory_resource.hpp
+++ b/include/rmm/mr/device/pool_memory_resource.hpp
@@ -119,7 +119,6 @@ class pool_memory_resource final
   * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
   * `upstream_mr`.
   *
-  * @throws rmm::logic_error if `upstream_mr == nullptr`
   * @throws rmm::logic_error if `initial_pool_size` is not aligned to a multiple of
   * pool_memory_resource::allocation_alignment bytes.
  * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
diff --git a/include/rmm/mr/device/prefetch_resource_adaptor.hpp b/include/rmm/mr/device/prefetch_resource_adaptor.hpp
index b38bd0316..d3a4c676a 100644
--- a/include/rmm/mr/device/prefetch_resource_adaptor.hpp
+++ b/include/rmm/mr/device/prefetch_resource_adaptor.hpp
@@ -45,8 +45,6 @@ class prefetch_resource_adaptor final : public device_memory_resource {
   * @brief Construct a new prefetch resource adaptor using `upstream` to satisfy
   * allocation requests.
   *
-  * @throws rmm::logic_error if `upstream == nullptr`
-  *
   * @param upstream The resource_ref used for allocating/deallocating device memory
   */
  prefetch_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {}
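To close out, here is an illustrative usage sketch (not part of any patch in this series) pulling together the `binning_memory_resource` changes from patch 23. It assumes the resource_ref-accepting constructor of `fixed_size_memory_resource` from earlier patches in the series and the current RMM header layout; names such as `big_bin` are invented for the example:

#include <rmm/mr/device/binning_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/fixed_size_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr;
  rmm::device_async_resource_ref upstream{cuda_mr};

  // The upstream is passed as a resource_ref, so no raw pointer (and no
  // nullptr check) is involved. Exponents 18..22 create bins of 256KiB..4MiB.
  rmm::mr::binning_memory_resource<rmm::mr::cuda_memory_resource> mr(upstream, 18, 22);

  // Omitting bin_resource lets the binning resource own a new
  // fixed_size_memory_resource bin built on the same upstream.
  mr.add_bin(1 << 23);

  // Alternatively, register an externally owned bin as a resource_ref.
  rmm::mr::fixed_size_memory_resource<rmm::mr::cuda_memory_resource> big_bin{upstream, 1 << 24};
  mr.add_bin(1 << 24, rmm::device_async_resource_ref{big_bin});

  return 0;
}

Because the upstream and the bins are held as `device_async_resource_ref`, the caller keeps ownership of `cuda_mr` and `big_bin` and must keep them alive for the lifetime of `mr`, matching the lifetime rules described for the per-device resource_ref functions earlier in the series.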