From f20e2c596dea1d9ab372b53b0cdefa80e826210a Mon Sep 17 00:00:00 2001
From: dflhw <595869386@qq.com>
Date: Tue, 6 Jun 2023 15:40:40 +0800
Subject: [PATCH 1/3] fix arf op's bank conflict when num_orientations is not 1

---
 .../common/cuda/active_rotated_filter_cuda_kernel.cuh | 11 +++++++----
 mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp   |  6 ++++--
 .../csrc/pytorch/cuda/active_rotated_filter_cuda.cu   |  4 ++--
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
index 36e41107eb..f502c3c791 100644
--- a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
@@ -15,18 +15,19 @@ __global__ void active_rotated_filter_forward_cuda_kernel(
     const int nthreads, const scalar_t* weight_data, const int* indices_data,
     const int num_input_planes, const int num_output_planes,
     const int num_orientations, const int num_rotations, const int nEntry,
-    scalar_t* output_data) {
+    const int kH, const int kW, scalar_t* output_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int l = index % nEntry;
     int j = (index / nEntry) % num_input_planes;
     int i = index / nEntry / num_input_planes;
     int k;
+    int fmIndex = (l / (kH * kW)) * kH * kW;
     scalar_t val = *(weight_data + index);
     for (k = 0; k < num_rotations; k++) {
       int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
       scalar_t* target = output_data +
                          i * (num_rotations * num_input_planes * nEntry) +
-                         k * (num_input_planes * nEntry) + j * (nEntry) + idx;
+                         k * (num_input_planes * nEntry) + j * (nEntry) + idx + fmIndex;
       *target = val;
     }
   }
@@ -37,12 +38,14 @@ __global__ void active_rotated_filter_backward_cuda_kernel(
     const int nthreads, const scalar_t* gradWeight_data,
     const int* indices_data, const int num_input_planes,
     const int num_output_planes, const int num_orientations,
-    const int num_rotations, const int nEntry, scalar_t* weight_data) {
+    const int num_rotations, const int nEntry, const int kH,
+    const int kW, scalar_t* weight_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int l = index % nEntry;
     int j = (index / nEntry) % num_input_planes;
     int i = index / nEntry / num_input_planes;
     int k;
+    int fmIndex = (l / (kH * kW)) * kH * kW;
     scalar_t* val = weight_data + index;
     *val = 0;
     scalar_t tmp = 0;
@@ -50,7 +53,7 @@ __global__ void active_rotated_filter_backward_cuda_kernel(
       int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
       scalar_t target =
           *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +
-            k * (num_input_planes * nEntry) + j * (nEntry) + idx);
+            k * (num_input_planes * nEntry) + j * (nEntry) + idx + fmIndex);
       tmp = tmp + target;
     }
     *val = tmp;
diff --git a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
index aa5a8b3d51..89c244e8e8 100644
--- a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
+++ b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
@@ -19,11 +19,12 @@ void active_rotated_filter_forward_cpu_kernel(
       for (l = 0; l < nEntry; l++) {
         int weightIndex = i * num_input_planes * nEntry + j * nEntry + l;
         T val = *(weightData + weightIndex);
+        int fmIndex = (l / (kH * kW)) * kH * kW;
         for (k = 0; k < num_rotations; k++) {
           int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
           T* target = outputData +
                       i * (num_rotations * num_input_planes * nEntry) +
-                      k * (num_input_planes * nEntry) + j * (nEntry) + index;
+                      k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex;
           *target = val;
         }
       }
@@ -48,11 +49,12 @@ void active_rotated_filter_backward_cpu_kernel(
         int gradInputIndex = i * num_input_planes * nEntry + j * nEntry + l;
         T* val = gradInputData + gradInputIndex;
         *val = 0;
+        int fmIndex = (l / (kH * kW)) * kH * kW;
         for (k = 0; k < num_rotations; k++) {
           int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
           const T* target =
               gradOutputData + i * (num_rotations * num_input_planes * nEntry) +
-              k * (num_input_planes * nEntry) + j * (nEntry) + index;
+              k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex;
           *val = *val + *target;
         }
       }
diff --git a/mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu
index 27fffb9fae..025e44148f 100644
--- a/mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu
@@ -24,7 +24,7 @@ void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                 output_size, input.data_ptr<scalar_t>(),
                 indices.data_ptr<int>(), num_input_planes, num_output_planes,
-                num_orientations, num_rotations, nEntry,
+                num_orientations, num_rotations, nEntry, kH, kW,
                 output.data_ptr<scalar_t>());
       });
   AT_CUDA_CHECK(cudaGetLastError());
@@ -51,7 +51,7 @@ void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                 output_size, grad_out.data_ptr<scalar_t>(),
                 indices.data_ptr<int>(), num_input_planes, num_output_planes,
-                num_orientations, num_rotations, nEntry,
+                num_orientations, num_rotations, nEntry, kH, kW,
                 grad_in.data_ptr<scalar_t>());
       });
   AT_CUDA_CHECK(cudaGetLastError());

From 43365c3cd0393339ecfb53d53b08a36296d5828e Mon Sep 17 00:00:00 2001
From: dflhw <595869386@qq.com>
Date: Tue, 6 Jun 2023 16:00:50 +0800
Subject: [PATCH 2/3] fix lint

---
 .../common/cuda/active_rotated_filter_cuda_kernel.cuh  | 10 +++++-----
 mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp    |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
index f502c3c791..26118ac621 100644
--- a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
@@ -25,9 +25,9 @@ __global__ void active_rotated_filter_forward_cuda_kernel(
     scalar_t val = *(weight_data + index);
     for (k = 0; k < num_rotations; k++) {
       int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
-      scalar_t* target = output_data +
-                         i * (num_rotations * num_input_planes * nEntry) +
-                         k * (num_input_planes * nEntry) + j * (nEntry) + idx + fmIndex;
+      scalar_t* target =
+          output_data + i * (num_rotations * num_input_planes * nEntry) +
+          k * (num_input_planes * nEntry) + j * (nEntry) + idx + fmIndex;
       *target = val;
     }
   }
@@ -38,8 +38,8 @@ __global__ void active_rotated_filter_backward_cuda_kernel(
     const int nthreads, const scalar_t* gradWeight_data,
     const int* indices_data, const int num_input_planes,
     const int num_output_planes, const int num_orientations,
-    const int num_rotations, const int nEntry, const int kH,
-    const int kW, scalar_t* weight_data) {
+    const int num_rotations, const int nEntry, const int kH, const int kW,
+    scalar_t* weight_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int l = index % nEntry;
     int j = (index / nEntry) % num_input_planes;
diff --git a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
index 89c244e8e8..d90e55089a 100644
--- a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
+++ b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
@@ -23,8 +23,8 @@ void active_rotated_filter_forward_cpu_kernel(
         for (k = 0; k < num_rotations; k++) {
           int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
           T* target = outputData +
-                      i * (num_rotations * num_input_planes * nEntry) +
-                      k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex;
+              i * (num_rotations * num_input_planes * nEntry) +
+              k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex;
           *target = val;
         }
       }

From 3c9dce97eb2341fc925343ed2f90f141e69fa8a5 Mon Sep 17 00:00:00 2001
From: dflhw <595869386@qq.com>
Date: Tue, 6 Jun 2023 16:05:39 +0800
Subject: [PATCH 3/3] fix lint

---
 mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
index d90e55089a..c322b4044a 100644
--- a/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
+++ b/mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
@@ -22,8 +22,8 @@ void active_rotated_filter_forward_cpu_kernel(
         int fmIndex = (l / (kH * kW)) * kH * kW;
         for (k = 0; k < num_rotations; k++) {
           int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
-          T* target = outputData +
-              i * (num_rotations * num_input_planes * nEntry) +
+          T* target =
+              outputData + i * (num_rotations * num_input_planes * nEntry) +
               k * (num_input_planes * nEntry) + j * (nEntry) + index + fmIndex;
           *target = val;
         }