open-mmlab · zhouzaida · Sep 20, 2023 · Sep 18, 2023 · Sep 19, 2023 · Sep 19, 2023
diff --git a/mmcv/ops/csrc/pytorch/npu/group_points_npu.cpp b/mmcv/ops/csrc/pytorch/npu/group_points_npu.cpp
@@ -0,0 +1,45 @@
+#include "pytorch_npu_helper.hpp"
+
+using namespace NPU_NAME_SPACE;
+using namespace std;
+
+void group_points_forward_npu(int b, int c, int n, int npoints, int nsample,
+                              const Tensor points, const Tensor idx,
+                              Tensor out) {
+  // Grouping is expressed as one row gather: the features are flattened to
+  // (b * n, c) rows and every index is shifted by its batch number times n.
+  // GatherV2 on Ascend therefore only needs axis = 0 and batch_dims = 0.
+  const int64_t batch_dims = 0;
+  c10::SmallVector<int64_t, N> axis = {0};
+
+  // Row offset of each batch, shaped (b, 1, 1) before it is added to idx.
+  at::Tensor batch_offset =
+      at::arange(0, b).to(points.device()).view({-1, 1, 1}).mul(n);
+  at::Tensor flat_indices = batch_offset.add(idx).view({-1});
+
+  // Swap dims 1 and 2 of points, then collapse to a (b * n, c) row table.
+  at::Tensor row_table =
+      NpuUtils::format_contiguous(points.transpose(1, 2)).view({b * n, c});
+
+  // out is handed to GatherV2 as the operator's output tensor.
+  OpCommand gather_cmd;
+  gather_cmd.Name("GatherV2")
+      .Input(row_table)
+      .Input(flat_indices)
+      .Input(axis)
+      .Output(out)
+      .Attr("batch_dims", batch_dims)
+      .Run();
+
+  // Reinterpret out as (b, npoints, nsample, c), reorder the dims to
+  // (b, c, npoints, nsample), and copy the contiguous result back into out.
+  at::Tensor reordered =
+      out.view({b, npoints, nsample, c}).transpose(1, 3).transpose(2, 3);
+  out.copy_(NpuUtils::format_contiguous(reordered));
+}
+
+void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
+                               const Tensor points, const Tensor idx,
+                               Tensor out);  // not defined in this file
+// Presumably makes group_points_forward_npu the NPU backend of the entry point.
+REGISTER_NPU_IMPL(group_points_forward_impl, group_points_forward_npu);
diff --git a/tests/test_ops/test_group_points.py b/tests/test_ops/test_group_points.py
@@ -3,16 +3,25 @@
 import torch
 
 from mmcv.ops import grouping_operation
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE
 
 
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
+])
 @pytest.mark.parametrize('dtype', [torch.half, torch.float, torch.double])
-def test_grouping_points(dtype):
+def test_grouping_points(dtype, device):
     idx = torch.tensor([[[0, 0, 0], [3, 3, 3], [8, 8, 8], [0, 0, 0], [0, 0, 0],
                          [0, 0, 0]],
                         [[0, 0, 0], [6, 6, 6], [9, 9, 9], [0, 0, 0], [0, 0, 0],
-                         [0, 0, 0]]]).int().cuda()
+                         [0, 0, 0]]]).int().to(device)
     features = torch.tensor([[[
         0.5798, -0.7981, -0.9280, -1.3311, 1.3687, 0.9277, -0.4164, -1.8274,
         0.9268, 0.8414
@@ -37,7 +46,7 @@ def test_grouping_points(dtype):
                                   -0.6646, -0.6870, -0.1125, -0.2224, -0.3445,
                                   -1.4049, 0.4990, -0.7037, -0.9924, 0.0386
                               ]]],
-                            dtype=dtype).cuda()
+                            dtype=dtype).to(device)
 
     output = grouping_operation(features, idx)
     expected_output = torch.tensor(
@@ -59,7 +68,7 @@ def test_grouping_points(dtype):
           [[-0.6646, -0.6646, -0.6646], [0.4990, 0.4990, 0.4990],
            [0.0386, 0.0386, 0.0386], [-0.6646, -0.6646, -0.6646],
            [-0.6646, -0.6646, -0.6646], [-0.6646, -0.6646, -0.6646]]]],
-        dtype=dtype).cuda()
+        dtype=dtype).to(device)
     assert torch.allclose(output, expected_output)