Skip to content

Commit

Permalink
util/gpu_t.cuh: fix launch_coop(f, {0, 0},...).
Browse files Browse the repository at this point in the history
As it turns out, cudaOccupancyMaxPotentialBlockSize doesn't actually
cater to cudaLaunchCooperativeKernel, because the suggested grid size
can be twice the number of SMs, which is unsuitable for cooperative
launches.
  • Loading branch information
dot-asm committed Sep 12, 2024
1 parent b2a94ff commit 41439f3
Showing 1 changed file with 4 additions and 5 deletions.
9 changes: 4 additions & 5 deletions util/gpu_t.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,10 @@ public:
if (gpu_props(gpu_id).sharedMemPerBlock < shared_sz)
CUDA_OK(cudaFuncSetAttribute(f, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_sz));
if (gridDim.x == 0 || blockDim.x == 0) {
int blockSize, minGridSize;

CUDA_OK(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, f));
if (blockDim.x == 0) blockDim.x = blockSize;
if (gridDim.x == 0) gridDim.x = minGridSize;
cudaFuncAttributes attr;
CUDA_OK(cudaFuncGetAttributes(&attr, f));
if (blockDim.x == 0) blockDim.x = attr.maxThreadsPerBlock;
if (gridDim.x == 0) gridDim.x = sm_count();
}
void* va_args[sizeof...(args)] = { &args... };
CUDA_OK(cudaLaunchCooperativeKernel((const void*)f, gridDim, blockDim,
Expand Down

0 comments on commit 41439f3

Please sign in to comment.