Skip to content

Commit

Permalink
[Enhancement] Support torch2.1 on Ascend NPU
Browse files Browse the repository at this point in the history
  • Loading branch information
Ginray committed Aug 19, 2023
1 parent c523359 commit 203960b
Show file tree
Hide file tree
Showing 8 changed files with 27 additions and 26 deletions.
3 changes: 2 additions & 1 deletion mmcv/ops/csrc/common/pytorch_npu_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@
#ifndef PYTORCH_NPU_HELPER_HPP_
#define PYTORCH_NPU_HELPER_HPP_

#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
// #include <torch_npu/csrc/aten/NPUNativeFunctions.h>
#include <torch_npu/csrc/framework/utils/CalcuOpUtil.h>
#include <torch_npu/csrc/framework/utils/OpAdapter.h>
#include <torch_npu/csrc/framework/utils/CustomFunctions.h>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Expand Down
6 changes: 3 additions & 3 deletions mmcv/ops/csrc/pytorch/npu/bbox_overlaps_npu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ void bbox_overlaps_npu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
gtboxesFP32 = bboxes2;
}
if (bboxes2.scalar_type() != at::ScalarType::Float) {
bboxesFP32 = NPUNativeFunctions::npu_dtype_cast(bboxesFP32, at::kFloat);
gtboxesFP32 = NPUNativeFunctions::npu_dtype_cast(gtboxesFP32, at::kFloat);
bboxesFP32 = custom_ops::npu_dtype_cast(bboxesFP32, at::kFloat);
gtboxesFP32 = custom_ops::npu_dtype_cast(gtboxesFP32, at::kFloat);
}
c10::SmallVector<int64_t, SIZE> iousSize = {gtboxesFP32.size(0),
bboxesFP32.size(0)};
Expand All @@ -42,7 +42,7 @@ void bbox_overlaps_npu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
.Attr("aligned", aligned)
.Run();
if (bboxes2.scalar_type() != at::ScalarType::Float) {
iousFP32 = NPUNativeFunctions::npu_dtype_cast(iousFP32, at::kHalf);
iousFP32 = custom_ops::npu_dtype_cast(iousFP32, at::kHalf);
}
iousFP32 = swap_flag ? iousFP32.transpose(0, 1) : iousFP32;
ious.copy_(iousFP32);
Expand Down
26 changes: 13 additions & 13 deletions mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
target_y = at::mul(target_y, -1.0);
target_y = at::add(target_y, 1.0);
} else {
target_y = at_npu::native::NPUNativeFunctions::one_hot(target, n_class);
target_y = at::one_hot(target, n_class);
}
target_y =
at_npu::native::NPUNativeFunctions::npu_dtype_cast(target_y, at::kInt);
at_npu::native::custom_ops::npu_dtype_cast(target_y, at::kInt);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at_npu::native::NPUNativeFunctions::npu_broadcast(weight,
weight_y = at_npu::native::custom_ops::npu_broadcast(weight,
input.sizes());
}
OpCommand cmd;
Expand All @@ -46,17 +46,17 @@ void sigmoid_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
if (n_class == 1) {
target_y = at::reshape(target, input.sizes());
} else {
target_y = at_npu::native::NPUNativeFunctions::one_hot(target, n_class);
target_y = at::one_hot(target, n_class);
target_y = at::mul(target_y, -1.0);
target_y = at::add(target_y, 1.0);
}
target_y =
at_npu::native::NPUNativeFunctions::npu_dtype_cast(target_y, at::kInt);
at_npu::native::custom_ops::npu_dtype_cast(target_y, at::kInt);
at::Tensor grad_up = at::ones_like(input);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at_npu::native::NPUNativeFunctions::npu_broadcast(weight,
weight_y = at_npu::native::custom_ops::npu_broadcast(weight,
input.sizes());
}
OpCommand cmd;
Expand All @@ -81,13 +81,13 @@ void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
int64_t n_class = input.size(1);
at::Tensor target_y =
at_npu::native::NPUNativeFunctions::one_hot(target, n_class);
at::one_hot(target, n_class);
target_y =
at_npu::native::NPUNativeFunctions::npu_dtype_cast(target_y, at::kInt);
at_npu::native::custom_ops::npu_dtype_cast(target_y, at::kInt);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at_npu::native::NPUNativeFunctions::npu_broadcast(weight,
weight_y = at_npu::native::custom_ops::npu_broadcast(weight,
input.sizes());
}
at::Tensor op_output = at::ones_like(input);
Expand All @@ -107,7 +107,7 @@ void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
c10::SmallVector<int64_t, 2> sizes = {n_batch, 1};
at::IntArrayRef offset = at::IntArrayRef(offsets);
at::IntArrayRef size = at::IntArrayRef(sizes);
at_npu::native::NPUNativeFunctions::npu_slice_out(op_output, offset, size,
at_npu::native::custom_ops::npu_slice_out(op_output, offset, size,
output);
}

Expand All @@ -120,14 +120,14 @@ void softmax_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
float gamma, float alpha) {
int64_t n_class = input.size(1);
at::Tensor target_y =
at_npu::native::NPUNativeFunctions::one_hot(target, n_class);
at::one_hot(target, n_class);
target_y =
at_npu::native::NPUNativeFunctions::npu_dtype_cast(target_y, at::kInt);
at_npu::native::custom_ops::npu_dtype_cast(target_y, at::kInt);
at::Tensor grad_up = at::ones_like(input);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at_npu::native::NPUNativeFunctions::npu_broadcast(weight,
weight_y = at_npu::native::custom_ops::npu_broadcast(weight,
input.sizes());
}
OpCommand cmd;
Expand Down
2 changes: 1 addition & 1 deletion mmcv/ops/csrc/pytorch/npu/fused_bias_leakyrelu_npu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Tensor fused_bias_leakyrelu_npu(const Tensor &input, const Tensor &bias,
}
}
at::Tensor bias_tmp = at::reshape(bias, input_size_tmp);
at::Tensor bias_ = at_npu::native::NPUNativeFunctions::npu_broadcast(
at::Tensor bias_ = at_npu::native::custom_ops::npu_broadcast(
bias_tmp, input.sizes());
OpCommand cmd;
cmd.Name("FusedBiasLeakyRelu")
Expand Down
2 changes: 1 addition & 1 deletion mmcv/ops/csrc/pytorch/npu/nms_npu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Tensor nms_npu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
auto outputsizeInt = outputsizeBool.to(at::ScalarType::Int);
auto countLen = at::sum(outputsizeInt, at::ScalarType::Int);
at::Tensor actual_output = output.slice(0, 0, countLen.item().toLong());
actual_output = at_npu::native::NPUNativeFunctions::npu_dtype_cast(
actual_output = at_npu::native::custom_ops::npu_dtype_cast(
actual_output, at::kLong);
return actual_output;
}
Expand Down
6 changes: 3 additions & 3 deletions mmcv/ops/csrc/pytorch/npu/nms_rotated_npu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ Tensor nms_rotated_npu(const Tensor dets, const Tensor scores,
at::Tensor detsCast = dets;
at::Tensor scoresCast = scores;
if (originDtype != at::ScalarType::Float) {
detsCast = NPUNativeFunctions::npu_dtype_cast(dets, at::kFloat);
scoresCast = NPUNativeFunctions::npu_dtype_cast(scores, at::kFloat);
detsCast = custom_ops::npu_dtype_cast(dets, at::kFloat);
scoresCast = custom_ops::npu_dtype_cast(scores, at::kFloat);
}
c10::SmallVector<int64_t, SIZE> selectedIndexSize = {dets.size(0)};
at::Tensor selectedBox = OpPreparation::ApplyTensor(dets);
Expand All @@ -27,6 +27,6 @@ Tensor nms_rotated_npu(const Tensor dets, const Tensor scores,
.Output(selectedIndex)
.Attr("iou_threshold", (float)iou_threshold)
.Run();
selectedIndex = NPUNativeFunctions::npu_dtype_cast(selectedIndex, at::kLong);
selectedIndex = custom_ops::npu_dtype_cast(selectedIndex, at::kLong);
return selectedIndex;
}
2 changes: 1 addition & 1 deletion mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ int hard_voxelize_forward_npu(const at::Tensor &points, at::Tensor &voxels,
const int max_points, const int max_voxels,
const int NDim = 3) {
at::Tensor voxel_num_tmp = OpPreparation::ApplyTensor(points, {1});
at::Tensor voxel_num = at_npu::native::NPUNativeFunctions::npu_dtype_cast(
at::Tensor voxel_num = at_npu::native::custom_ops::npu_dtype_cast(
voxel_num_tmp, at::kInt);

at::Tensor voxel_size_cpu = at::from_blob(
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def get_extensions():
cuda_args = os.getenv('MMCV_CUDA_ARGS')
extra_compile_args = {
'nvcc': [cuda_args, '-std=c++14'] if cuda_args else ['-std=c++14'],
'cxx': ['-std=c++14'],
'cxx': ['-std=c++17'],
}
if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
define_macros += [('MMCV_WITH_CUDA', None)]
Expand Down Expand Up @@ -201,13 +201,13 @@ def get_extensions():
extra_compile_args = {'cxx': []}

if platform.system() != 'Windows':
extra_compile_args['cxx'] = ['-std=c++14']
extra_compile_args['cxx'] = ['-std=c++17']
else:
# In Windows, PyTorch >= 2.0 builds extensions with C++17,
# so match it here; the old workaround that forced C++14
# no longer applies now that torch 2.1 requires C++17.
if parse_version(torch.__version__) >= parse_version('2.0.0'):
extra_compile_args['cxx'] = ['/std:c++14']
extra_compile_args['cxx'] = ['/std:c++17']

include_dirs = []
library_dirs = []
Expand Down

0 comments on commit 203960b

Please sign in to comment.