From f20e2c596dea1d9ab372b53b0cdefa80e826210a Mon Sep 17 00:00:00 2001 From: dflhw <595869386@qq.com> Date: Tue, 6 Jun 2023 15:40:40 +0800 Subject: [PATCH 1/3] fix arf op's bank conflict when num_orientations is not 1 --- .../common/cuda/active_rotated_filter_cuda_kernel.cuh | 11 +++++++---- mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp | 6 ++++-- .../csrc/pytorch/cuda/active_rotated_filter_cuda.cu | 4 ++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh index 36e41107eb..f502c3c791 100644 --- a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh @@ -15,18 +15,19 @@ __global__ void active_rotated_filter_forward_cuda_kernel( const int nthreads, const scalar_t* weight_data, const int* indices_data, const int num_input_planes, const int num_output_planes, const int num_orientations, const int num_rotations, const int nEntry, - scalar_t* output_data) { + const int kH, const int kW, scalar_t* output_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int l = index % nEntry; int j = (index / nEntry) % num_input_planes; int i = index / nEntry / num_input_planes; int k; + int fmIndex = (l / (kH * kW)) * kH * kW; scalar_t val = *(weight_data + index); for (k = 0; k < num_rotations; k++) { int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; scalar_t* target = output_data + i * (num_rotations * num_input_planes * nEntry) + - k * (num_input_planes * nEntry) + j * (nEntry) + idx; + k * (num_input_planes * nEntry) + j * (nEntry) + idx + fmIndex; *target = val; } } @@ -37,12 +38,14 @@ __global__ void active_rotated_filter_backward_cuda_kernel( const int nthreads, const scalar_t* gradWeight_data, const int* indices_data, const int num_input_planes, const int num_output_planes, const int num_orientations, - const int num_rotations, const int nEntry, scalar_t* weight_data) { + const int num_rotations, const int nEntry, const int kH, + const int kW, scalar_t* weight_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int l = index % nEntry; int j = (index / nEntry) % num_input_planes; int i = index / nEntry / num_input_planes; int k; + int fmIndex = (l / (kH * kW)) * kH * kW; scalar_t* val = weight_data + index; *val = 0; scalar_t tmp = 0; @@ -50,7 +53,7 @@ __global__ void active_rotated_filter_backward_cuda_kernel( int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; scalar_t target = *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) + - k * (num_input_planes * nEntry) + j * (nEntry) + idx); + k * (num_input_planes * nEntry) + j * (nEntry) + idx + fmIndex); tmp = tmp + target; } *val = tmp; diff --git a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp index aa5a8b3d51..89c244e8e8 100644 --- a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp +++ b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp @@ -19,11 +19,12 @@ void active_rotated_filter_forward_cpu_kernel( for (l = 0; l < nEntry; l++) { int weightIndex = i * num_input_planes * nEntry + j * nEntry + l; T val = *(weightData + weightIndex); + int fmIndex = (l / (kH * kW)) * kH * kW; for (k = 0; k < num_rotations; k++) { int index = (int)(*(indicesData + l * num_rotations + k)) - 1; T* target = outputData + i * (num_rotations * num_input_planes * nEntry) + - k * (num_input_planes * nEntry) + j * (nEntry) + index; + k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex; *target = val; } } @@ -48,11 +49,12 @@ void active_rotated_filter_backward_cpu_kernel( int gradInputIndex = i * num_input_planes * nEntry + j * nEntry + l; T* val = gradInputData + gradInputIndex; *val = 0; + int fmIndex = (l / (kH * kW)) * kH * kW; for (k = 0; k < num_rotations; k++) { int index = (int)(*(indicesData + l * num_rotations + k)) - 1; const T* target = gradOutputData + i * (num_rotations * num_input_planes * nEntry) + - k * (num_input_planes * nEntry) + j * (nEntry) + index; + k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex; *val = *val + *target; } } diff --git a/mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu index 27fffb9fae..025e44148f 100644 --- a/mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu +++ b/mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu @@ -24,7 +24,7 @@ void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input, <<>>( output_size, input.data_ptr(), indices.data_ptr(), num_input_planes, num_output_planes, - num_orientations, num_rotations, nEntry, + num_orientations, num_rotations, nEntry, kH, kW, output.data_ptr()); }); AT_CUDA_CHECK(cudaGetLastError()); @@ -51,7 +51,7 @@ void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out, <<>>( output_size, grad_out.data_ptr(), indices.data_ptr(), num_input_planes, num_output_planes, - num_orientations, num_rotations, nEntry, + num_orientations, num_rotations, nEntry, kH, kW, grad_in.data_ptr()); }); AT_CUDA_CHECK(cudaGetLastError()); From 43365c3cd0393339ecfb53d53b08a36296d5828e Mon Sep 17 00:00:00 2001 From: dflhw <595869386@qq.com> Date: Tue, 6 Jun 2023 16:00:50 +0800 Subject: [PATCH 2/3] fix lint --- .../common/cuda/active_rotated_filter_cuda_kernel.cuh | 10 +++++----- mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh index f502c3c791..26118ac621 100644 --- a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh @@ -25,9 +25,9 @@ __global__ void active_rotated_filter_forward_cuda_kernel( scalar_t val = *(weight_data + index); for (k = 0; k < num_rotations; k++) { int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; - scalar_t* target = output_data + - i * (num_rotations * num_input_planes * nEntry) + - k * (num_input_planes * nEntry) + j * (nEntry) + idx + fmIndex; + scalar_t* target = + output_data + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * nEntry) + j * (nEntry) + idx + fmIndex; *target = val; } } @@ -38,8 +38,8 @@ __global__ void active_rotated_filter_backward_cuda_kernel( const int nthreads, const scalar_t* gradWeight_data, const int* indices_data, const int num_input_planes, const int num_output_planes, const int num_orientations, - const int num_rotations, const int nEntry, const int kH, - const int kW, scalar_t* weight_data) { + const int num_rotations, const int nEntry, const int kH, const int kW, + scalar_t* weight_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int l = index % nEntry; int j = (index / nEntry) % num_input_planes; diff --git a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp index 89c244e8e8..d90e55089a 100644 --- a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp +++ b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp @@ -23,8 +23,8 @@ void active_rotated_filter_forward_cpu_kernel( for (k = 0; k < num_rotations; k++) { int index = (int)(*(indicesData + l * num_rotations + k)) - 1; T* target = outputData + - i * (num_rotations * num_input_planes * nEntry) + - k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex; + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex; *target = val; } } From 3c9dce97eb2341fc925343ed2f90f141e69fa8a5 Mon Sep 17 00:00:00 2001 From: dflhw <595869386@qq.com> Date: Tue, 6 Jun 2023 16:05:39 +0800 Subject: [PATCH 3/3] fix lint --- mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp index d90e55089a..c322b4044a 100644 --- a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp +++ b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp @@ -22,8 +22,8 @@ void active_rotated_filter_forward_cpu_kernel( int fmIndex = (l / (kH * kW)) * kH * kW; for (k = 0; k < num_rotations; k++) { int index = (int)(*(indicesData + l * num_rotations + k)) - 1; - T* target = outputData + - i * (num_rotations * num_input_planes * nEntry) + + T* target = + outputData + i * (num_rotations * num_input_planes * nEntry) + k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex; *target = val; }