diff --git a/rocprim/include/rocprim/device/config_types.hpp b/rocprim/include/rocprim/device/config_types.hpp index 32bdb5cd6..484db8348 100644 --- a/rocprim/include/rocprim/device/config_types.hpp +++ b/rocprim/include/rocprim/device/config_types.hpp @@ -100,10 +100,13 @@ template< unsigned int SharedMemoryPerThread, // Most kernels require block sizes not smaller than warp unsigned int MinBlockSize, + // If kernels require more than MaxBlockSize * SharedMemoryPerThread bytes + // (eg. to store some kind of block-wide state), that size can be specified here + unsigned int ExtraSharedMemory = 0, // Can fit in shared memory? // Although GPUs have 64KiB, 32KiB is used here as a "soft" limit, // because some additional memory may be required in kernels - bool = (MaxBlockSize * SharedMemoryPerThread <= (1u << 15)) + bool = (MaxBlockSize * SharedMemoryPerThread + ExtraSharedMemory <= (1u << 15)) > struct limit_block_size { @@ -112,16 +115,18 @@ struct limit_block_size limit_block_size< detail::next_power_of_two(MaxBlockSize) / 2, SharedMemoryPerThread, - MinBlockSize + MinBlockSize, + ExtraSharedMemory >::value; }; template< unsigned int MaxBlockSize, unsigned int SharedMemoryPerThread, - unsigned int MinBlockSize + unsigned int MinBlockSize, + unsigned int ExtraSharedMemory > -struct limit_block_size +struct limit_block_size { static_assert(MaxBlockSize >= MinBlockSize, "Data is too large, it cannot fit in shared memory"); diff --git a/rocprim/include/rocprim/device/device_partition.hpp b/rocprim/include/rocprim/device/device_partition.hpp index 3b89a3ed1..56394c631 100644 --- a/rocprim/include/rocprim/device/device_partition.hpp +++ b/rocprim/include/rocprim/device/device_partition.hpp @@ -149,15 +149,18 @@ hipError_t partition_impl(void * temporary_storage, using key_type = typename std::iterator_traits::value_type; using value_type = typename std::iterator_traits::value_type; + using offset_scan_state_type = detail::lookback_scan_state; + using offset_scan_state_with_sleep_type = detail::lookback_scan_state; + // Get default config if Config is default_config using config = default_or_custom_config< Config, - default_select_config + // Note: the partition algorithm requires some extra shared memory space for an instance of + // offset_scan_state_type. Pass it's size to default_select_config here so that it can select + // an appropriate block size (one that ensures that we don't run out of shared memory). + default_select_config >; - using offset_scan_state_type = detail::lookback_scan_state; - using offset_scan_state_with_sleep_type = detail::lookback_scan_state; - static constexpr unsigned int block_size = config::block_size; static constexpr unsigned int items_per_thread = config::items_per_thread; static constexpr auto items_per_block = block_size * items_per_thread; diff --git a/rocprim/include/rocprim/device/device_select_config.hpp b/rocprim/include/rocprim/device/device_select_config.hpp index b7c89eb72..7f90abc81 100644 --- a/rocprim/include/rocprim/device/device_select_config.hpp +++ b/rocprim/include/rocprim/device/device_select_config.hpp @@ -76,14 +76,14 @@ struct select_config namespace detail { -template +template struct select_config_803 { static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Key), sizeof(int)); using type = select_config< - limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value, + limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64, ExtraSharedMemory>::value, ::rocprim::max(1u, 13u / item_scale), ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_load_method::block_load_transpose, @@ -92,14 +92,14 @@ struct select_config_803 >; }; -template +template struct select_config_900 { static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Key), sizeof(int)); using type = select_config< - limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value, + limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64, ExtraSharedMemory>::value, ::rocprim::max(1u, 15u / item_scale), ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_load_method::block_load_transpose, @@ -108,14 +108,14 @@ struct select_config_900 >; }; -template +template struct select_config_90a { static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); using type = select_config< - limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value, + limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64, ExtraSharedMemory>::value, ::rocprim::max(1u, 15u / item_scale), ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_load_method::block_load_transpose, @@ -124,14 +124,14 @@ struct select_config_90a >; }; -template +template struct select_config_1030 { static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); using type = select_config< - limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value, + limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32, ExtraSharedMemory>::value, ::rocprim::max(1u, 15u / item_scale), ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_load_method::block_load_transpose, @@ -141,15 +141,15 @@ struct select_config_1030 }; -template +template struct default_select_config : select_arch< TargetArch, - select_arch_case<803, select_config_803>, - select_arch_case<900, select_config_900>, - select_arch_case>, - select_arch_case<1030, select_config_1030>, - select_config_803 + select_arch_case<803, select_config_803>, + select_arch_case<900, select_config_900>, + select_arch_case>, + select_arch_case<1030, select_config_1030>, + select_config_803 > { }; } // end namespace detail diff --git a/test/rocprim/test_device_partition.cpp b/test/rocprim/test_device_partition.cpp index 77f2d5629..627d50e72 100644 --- a/test/rocprim/test_device_partition.cpp +++ b/test/rocprim/test_device_partition.cpp @@ -1464,3 +1464,160 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartitionThreeWay) HIP_CHECK(hipStreamDestroy(stream)); } } + +// This test checks to make sure that the block size is reduced correctly +// when our data size and type are set in a way that we will exceed the shared +// memory limit. Since the block size calculation is done at compile time, +// if the block size is not correctly reduced, this test will fail to compile. +TEST(RocprimDevicePartitionBlockSizeTests, BlockSize) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + // Create a large struct to test with. It must be big enough that when + // we the use default block size (defined in rocprim::default_select_config + // struct as 256), giving one instance to each thread will cause us to hit + // 32 KiB of shared memory (the limit enforced by the rocprim::limit_block_size + // struct's boolean template parameter). Since the device_partition algorithm also + // uses some shared memory to store state, this will cause the total usage to exceed + // the 32 KiB limit. If everything's working correctly, this should be detected in + // the limit_block_size's template logic, and it should reduce the block size. + const size_t test_obj_size = 128; // Choose 128, since 256 * 128 = 2^15 bytes (32 KiB). + struct TestObject + { + unsigned char data[test_obj_size]; + + bool operator==(const TestObject& other) const + { + bool equal = true; + for (size_t i = 0; equal && i < test_obj_size; i++) + equal = data[i] == other.data[i]; + + return equal; + } + }; + + using T = TestObject; // input data type + using U = TestObject; // output data type + const bool debug_synchronous = false; + const hipStream_t stream = 0; // default stream + + auto select_op = [] __host__ __device__ (const T& value) -> bool + { + // The data values are in [0, 255]. Partition on the midpoint. + if(value.data[0] == 128) return true; + return false; + }; + + // Use some power of two and off-by-one-from-power-of-two data sizes. + const std::vector sizes = {256, 257, 511, 512, 1024, 1025}; + + for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + for(auto size : sizes) + { + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector input_data = test_utils::get_random_data(size * test_obj_size, 0, 255, seed_value); + std::vector input(size); + for (size_t i = 0; i < size; i++) + memcpy(input[i].data, input_data.data() + i * test_obj_size, test_obj_size); + + T * d_input; + U * d_output; + unsigned int * d_selected_count_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T))); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, input.size() * sizeof(U))); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK( + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + + // Calculate expected_selected and expected_rejected results on host + std::vector expected_selected; + std::vector expected_rejected; + expected_selected.reserve(input.size()/2); + expected_rejected.reserve(input.size()/2); + for(size_t i = 0; i < input.size(); i++) + { + if(select_op(input[i])) + { + expected_selected.push_back(input[i]); + } + else + { + expected_rejected.push_back(input[i]); + } + } + std::reverse(expected_rejected.begin(), expected_rejected.end()); + + // temp storage + size_t temp_storage_size_bytes; + // Get size of d_temp_storage + HIP_CHECK(rocprim::partition( + nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream, + debug_synchronous)); + + // temp_storage_size_bytes must be >0 + ASSERT_GT(temp_storage_size_bytes, 0); + + // allocate temporary storage + void* d_temp_storage = nullptr; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + // Run + HIP_CHECK(rocprim::partition( + d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream, + debug_synchronous)); + + HIP_CHECK(hipDeviceSynchronize()); + + // Check if number of selected value is as expected_selected + unsigned int selected_count_output = 0; + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); + ASSERT_EQ(selected_count_output, expected_selected.size()); + + // Check if output values are as expected_selected + std::vector output(input.size()); + HIP_CHECK(hipMemcpy(output.data(), + d_output, + output.size() * sizeof(U), + hipMemcpyDeviceToHost)); + + std::vector output_rejected; + for(size_t i = 0; i < expected_rejected.size(); i++) + { + auto j = i + expected_selected.size(); + output_rejected.push_back(output[j]); + } + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected_selected, expected_selected.size())); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_rejected, expected_rejected, expected_rejected.size())); + + hipFree(d_input); + hipFree(d_output); + hipFree(d_selected_count_output); + hipFree(d_temp_storage); + } + } +} \ No newline at end of file