diff --git a/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp b/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp
index 187216fb01..60dab44521 100644
--- a/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp
+++ b/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp
@@ -1,6 +1,16 @@
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
 #include "pytorch_device_registry.hpp"
+#ifdef MMCV_WITH_DIOPI
+#include <diopi/diopirt.h>
+#include <diopi/functions.h>
+#include <diopi/functions_mmcv.h>
+
+#include "csrc_dipu/diopirt/diopirt_impl.h"
+
+using dipu::diopi_helper::toDiopiScalar;
+using dipu::diopi_helper::toDiopiTensorHandle;
+#endif
 
 void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                         const int mode, const bool aligned, const int offset) {
@@ -8,7 +18,40 @@ void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        aligned, offset);
 }
 
+#ifdef MMCV_WITH_DIOPI
+void bbox_overlaps_diopi(const Tensor bboxes1, const Tensor bboxes2,
+                         Tensor ious, const int mode, const bool aligned,
+                         const int offset) {
+  auto bboxes1_p = toDiopiTensorHandle(bboxes1);
+  diopiDevice_t device;
+  diopiGetTensorDevice(bboxes1_p, &device);
+  if (device == diopi_host) {
+    bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
+    return;
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  auto bboxes2_p = toDiopiTensorHandle(bboxes2);
+  auto ious_p = toDiopiTensorHandle(ious);
+  if (reinterpret_cast<void *>(diopiBboxOverlapsMmcv) != nullptr) {
+    auto ret = diopiBboxOverlapsMmcv(ch, ious_p, bboxes1_p, bboxes2_p, mode,
+                                     offset, aligned);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING) << "Fallback to cpu: mmcv ext op bbox_overlaps";
+  auto bboxes1_cpu = bboxes1.cpu();
+  auto bboxes2_cpu = bboxes2.cpu();
+  auto ious_cpu = ious.cpu();
+  bbox_overlaps_impl(bboxes1_cpu, bboxes2_cpu, ious_cpu, mode, aligned, offset);
+  ious.copy_(ious_cpu);
+}
+#endif
+
 void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                    const int mode, const bool aligned, const int offset) {
+#ifdef MMCV_WITH_DIOPI
+  bbox_overlaps_diopi(bboxes1, bboxes2, ious, mode, aligned, offset);
+#else
   bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
+#endif
 }
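Every DIOPI wrapper added in this patch follows the same three-step dispatch, so it is worth spelling out once. The sketch below is a condensed, hypothetical op: `my_op_diopi`, `my_op_impl`, and `diopiMyOpMmcv` are placeholder names, not symbols from this patch; only the DIPU helpers the diff itself uses are assumed.

```cpp
// Condensed sketch of the dispatch pattern shared by the wrappers in this
// patch. my_op_diopi, my_op_impl and diopiMyOpMmcv are hypothetical names.
void my_op_diopi(Tensor in, Tensor out) {
  auto in_p = toDiopiTensorHandle(in);
  diopiDevice_t device;
  diopiGetTensorDevice(in_p, &device);
  if (device == diopi_host) {  // 1. host tensors keep the original impl path
    my_op_impl(in, out);
    return;
  }
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto out_p = toDiopiTensorHandle(out);
  // 2. DIOPI_ATTR_WEAK (defined in setup.py) lets an unimplemented kernel
  //    resolve to nullptr, so taking its address is a runtime feature probe.
  if (reinterpret_cast<void *>(diopiMyOpMmcv) != nullptr) {
    if (diopiMyOpMmcv(ch, out_p, in_p) == diopiSuccess) return;
  }
  // 3. last resort: round-trip through the CPU implementation and copy the
  //    results back onto the device tensor.
  LOG(WARNING) << "Fallback to cpu: mmcv ext op my_op";
  auto in_cpu = in.cpu();
  auto out_cpu = out.cpu();
  my_op_impl(in_cpu, out_cpu);
  out.copy_(out_cpu);
}
```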
diff --git a/mmcv/ops/csrc/pytorch/focal_loss.cpp b/mmcv/ops/csrc/pytorch/focal_loss.cpp
index ed0e218653..51568ead34 100644
--- a/mmcv/ops/csrc/pytorch/focal_loss.cpp
+++ b/mmcv/ops/csrc/pytorch/focal_loss.cpp
@@ -1,6 +1,16 @@
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
 #include "pytorch_device_registry.hpp"
+#ifdef MMCV_WITH_DIOPI
+#include <diopi/diopirt.h>
+#include <diopi/functions.h>
+#include <diopi/functions_mmcv.h>
+
+#include "csrc_dipu/diopirt/diopirt_impl.h"
+
+using dipu::diopi_helper::toDiopiScalar;
+using dipu::diopi_helper::toDiopiTensorHandle;
+#endif
 
 void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                      Tensor output, float gamma, float alpha) {
@@ -29,15 +39,92 @@ void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                        buff, grad_input, gamma, alpha);
 }
 
+#ifdef MMCV_WITH_DIOPI
+void sigmoid_focal_loss_forward_diopi(Tensor input, Tensor target,
+                                      Tensor weight, Tensor output, float gamma,
+                                      float alpha) {
+  auto input_p = toDiopiTensorHandle(input);
+  diopiDevice_t device;
+  diopiGetTensorDevice(input_p, &device);
+  if (device == diopi_host) {
+    sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma,
+                                    alpha);
+    return;
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  auto target_p = toDiopiTensorHandle(target);
+  auto weight_p = toDiopiTensorHandle(weight);
+  auto output_p = toDiopiTensorHandle(output);
+  if (reinterpret_cast<void *>(diopiSigmoidFocalLossMmcv) != nullptr) {
+    auto ret = diopiSigmoidFocalLossMmcv(ch, output_p, input_p, target_p,
+                                         weight_p, gamma, alpha);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING)
+      << "Fallback to cpu: mmcv ext op sigmoid_focal_loss_forward_impl";
+  auto input_cpu = input.cpu();
+  auto target_cpu = target.cpu();
+  auto weight_cpu = weight.cpu();
+  auto output_cpu = output.cpu();
+  sigmoid_focal_loss_forward_impl(input_cpu, target_cpu, weight_cpu, output_cpu,
+                                  gamma, alpha);
+  output.copy_(output_cpu);
+  return;
+}
+
+void sigmoid_focal_loss_backward_diopi(Tensor input, Tensor target,
+                                       Tensor weight, Tensor grad_input,
+                                       float gamma, float alpha) {
+  auto input_p = toDiopiTensorHandle(input);
+  diopiDevice_t device;
+  diopiGetTensorDevice(input_p, &device);
+  if (device == diopi_host) {
+    sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
+                                     alpha);
+    return;
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  auto target_p = toDiopiTensorHandle(target);
+  auto weight_p = toDiopiTensorHandle(weight);
+  auto grad_input_p = toDiopiTensorHandle(grad_input);
+  if (reinterpret_cast<void *>(diopiSigmoidFocalLossBackwardMmcv) != nullptr) {
+    auto ret = diopiSigmoidFocalLossBackwardMmcv(
+        ch, grad_input_p, input_p, target_p, weight_p, gamma, alpha);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING)
+      << "Fallback to cpu: mmcv ext op sigmoid_focal_loss_backward_impl";
+  auto input_cpu = input.cpu();
+  auto target_cpu = target.cpu();
+  auto weight_cpu = weight.cpu();
+  auto grad_input_cpu = grad_input.cpu();
+  sigmoid_focal_loss_backward_impl(input_cpu, target_cpu, weight_cpu,
+                                   grad_input_cpu, gamma, alpha);
+  grad_input.copy_(grad_input_cpu);
+  return;
+}
+#endif
+
 void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                 Tensor output, float gamma, float alpha) {
+#ifdef MMCV_WITH_DIOPI
+  sigmoid_focal_loss_forward_diopi(input, target, weight, output, gamma, alpha);
+#else
   sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
+#endif
 }
 
 void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                  Tensor grad_input, float gamma, float alpha) {
+#ifdef MMCV_WITH_DIOPI
+  sigmoid_focal_loss_backward_diopi(input, target, weight, grad_input, gamma,
+                                    alpha);
+#else
   sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
                                    alpha);
+#endif
 }
 
 void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
diff --git a/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp b/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp
index 12b538a05e..2b9493dbc9 100644
--- a/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp
+++ b/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp
@@ -1,6 +1,16 @@
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
 #include "pytorch_device_registry.hpp"
+#ifdef MMCV_WITH_DIOPI
+#include <diopi/diopirt.h>
+#include <diopi/functions.h>
+#include <diopi/functions_mmcv.h>
+
+#include "csrc_dipu/diopirt/diopirt_impl.h"
+
+using dipu::diopi_helper::toDiopiScalar;
+using dipu::diopi_helper::toDiopiTensorHandle;
+#endif
 
 void modulated_deformable_im2col_impl(
     const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
@@ -45,7 +55,7 @@ void modulated_deformable_col2im_coord_impl(
       dilation_w, deformable_group, grad_offset, grad_mask);
 }
 
-void modulated_deform_conv_forward(
+void modulated_deform_conv_forward_fallthrough(
     Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
     Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
     const int stride_h, const int stride_w, const int pad_h, const int pad_w,
@@ -123,7 +133,7 @@ void modulated_deform_conv_forward(
   }
 }
 
-void modulated_deform_conv_backward(
+void modulated_deform_conv_backward_fallthrough(
     Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
     Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
     Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
@@ -235,3 +245,165 @@ void modulated_deform_conv_backward(
                        grad_output.size(2), grad_output.size(3),
                        grad_output.size(4)});
 }
+
+#ifdef MMCV_WITH_DIOPI
+void modulated_deform_conv_forward_diopi(
+    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
+    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
+    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+    const int dilation_h, const int dilation_w, const int group,
+    const int deformable_group, const bool with_bias) {
+  auto input_p = toDiopiTensorHandle(input);
+  diopiDevice_t device;
+  diopiGetTensorDevice(input_p, &device);
+  if (device == diopi_host) {
+    modulated_deform_conv_forward_fallthrough(
+        input, weight, bias, ones, offset, mask, output, columns, kernel_h,
+        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
+        group, deformable_group, with_bias);
+    return;
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  auto weight_p = toDiopiTensorHandle(weight);
+  auto bias_p = toDiopiTensorHandle(bias);
+  auto ones_p = toDiopiTensorHandle(ones);
+  auto offset_p = toDiopiTensorHandle(offset);
+  auto mask_p = toDiopiTensorHandle(mask);
+  auto output_p = toDiopiTensorHandle(output);
+  auto columns_p = toDiopiTensorHandle(columns);
+  if (reinterpret_cast<void *>(diopiModulatedDeformConvMmcv) != nullptr) {
+    auto ret = diopiModulatedDeformConvMmcv(
+        ch, output_p, columns_p, ones_p, input_p, weight_p, bias_p, offset_p,
+        mask_p, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+        dilation_h, dilation_w, group, deformable_group, with_bias);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING) << "Fallback to cpu: mmcv ext op modulated_deform_conv_forward";
+  auto input_cpu = input.cpu();
+  auto weight_cpu = weight.cpu();
+  auto bias_cpu = bias.cpu();
+  auto ones_cpu = ones.cpu();
+  auto offset_cpu = offset.cpu();
+  auto mask_cpu = mask.cpu();
+  auto output_cpu = output.cpu();
+  auto columns_cpu = columns.cpu();
+  modulated_deform_conv_forward_fallthrough(
+      input_cpu, weight_cpu, bias_cpu, ones_cpu, offset_cpu, mask_cpu,
+      output_cpu, columns_cpu, kernel_h, kernel_w, stride_h, stride_w, pad_h,
+      pad_w, dilation_h, dilation_w, group, deformable_group, with_bias);
+  output.copy_(output_cpu);
+  return;
+}
+
+void modulated_deform_conv_backward_diopi(
+    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
+    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
+    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
+    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
+    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
+    const bool with_bias) {
+  auto input_p = toDiopiTensorHandle(input);
+  diopiDevice_t device;
+  diopiGetTensorDevice(input_p, &device);
+  if (device == diopi_host) {
+    modulated_deform_conv_backward_fallthrough(
+        input, weight, bias, ones, offset, mask, columns, grad_input,
+        grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h,
+        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
+        group, deformable_group, with_bias);
+    return;
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  auto weight_p = toDiopiTensorHandle(weight);
+  auto bias_p = toDiopiTensorHandle(bias);
+  auto ones_p = toDiopiTensorHandle(ones);
+  auto offset_p = toDiopiTensorHandle(offset);
+  auto mask_p = toDiopiTensorHandle(mask);
+  auto columns_p = toDiopiTensorHandle(columns);
+  auto grad_input_p = toDiopiTensorHandle(grad_input);
+  auto grad_weight_p = toDiopiTensorHandle(grad_weight);
+  auto grad_bias_p = toDiopiTensorHandle(grad_bias);
+  auto grad_offset_p = toDiopiTensorHandle(grad_offset);
+  auto grad_mask_p = toDiopiTensorHandle(grad_mask);
+  auto grad_output_p = toDiopiTensorHandle(grad_output);
+
+  if (reinterpret_cast<void *>(diopiModulatedDeformConvBackwardMmcv) !=
+      nullptr) {
+    auto ret = diopiModulatedDeformConvBackwardMmcv(
+        ch, grad_input_p, grad_weight_p, grad_bias_p, grad_offset_p,
+        grad_mask_p, input_p, weight_p, bias_p, ones_p, offset_p, mask_p,
+        columns_p, grad_output_p, kernel_h, kernel_w, stride_h, stride_w, pad_h,
+        pad_w, dilation_h, dilation_w, group, deformable_group, with_bias);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING) << "Fallback to cpu: mmcv ext op modulated_deform_conv_backward";
+  auto input_cpu = input.cpu();
+  auto weight_cpu = weight.cpu();
+  auto bias_cpu = bias.cpu();
+  auto ones_cpu = ones.cpu();
+  auto offset_cpu = offset.cpu();
+  auto mask_cpu = mask.cpu();
+  auto columns_cpu = columns.cpu();
+  auto grad_input_cpu = grad_input.cpu();
+  auto grad_weight_cpu = grad_weight.cpu();
+  auto grad_bias_cpu = grad_bias.cpu();
+  auto grad_offset_cpu = grad_offset.cpu();
+  auto grad_mask_cpu = grad_mask.cpu();
+  auto grad_output_cpu = grad_output.cpu();
+  modulated_deform_conv_backward_fallthrough(
+      input_cpu, weight_cpu, bias_cpu, ones_cpu, offset_cpu, mask_cpu,
+      columns_cpu, grad_input_cpu, grad_weight_cpu, grad_bias_cpu,
+      grad_offset_cpu, grad_mask_cpu, grad_output_cpu, kernel_h, kernel_w,
+      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
+      deformable_group, with_bias);
+  grad_input.copy_(grad_input_cpu);
+  grad_weight.copy_(grad_weight_cpu);
+  grad_bias.copy_(grad_bias_cpu);
+  grad_offset.copy_(grad_offset_cpu);
+  grad_mask.copy_(grad_mask_cpu);
+  return;
+}
+#endif
+
+void modulated_deform_conv_forward(
+    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
+    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
+    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+    const int dilation_h, const int dilation_w, const int group,
+    const int deformable_group, const bool with_bias) {
+#ifdef MMCV_WITH_DIOPI
+  modulated_deform_conv_forward_diopi(
+      input, weight, bias, ones, offset, mask, output, columns, kernel_h,
+      kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
+      deformable_group, with_bias);
+#else
+  modulated_deform_conv_forward_fallthrough(
+      input, weight, bias, ones, offset, mask, output, columns, kernel_h,
+      kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
+      deformable_group, with_bias);
+#endif
+}
+
+void modulated_deform_conv_backward(
+    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
+    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
+    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
+    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
+    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
+    const bool with_bias) {
+#ifdef MMCV_WITH_DIOPI
+  modulated_deform_conv_backward_diopi(
+      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
+      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
+      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
+      deformable_group, with_bias);
+#else
+  modulated_deform_conv_backward_fallthrough(
+      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
+      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
+      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
+      deformable_group, with_bias);
+#endif
+}
diff --git a/mmcv/ops/csrc/pytorch/nms.cpp b/mmcv/ops/csrc/pytorch/nms.cpp
index 199d8af236..ad7988420e 100644
--- a/mmcv/ops/csrc/pytorch/nms.cpp
+++ b/mmcv/ops/csrc/pytorch/nms.cpp
@@ -1,6 +1,16 @@
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
 #include "pytorch_device_registry.hpp"
+#ifdef MMCV_WITH_DIOPI
+#include <diopi/diopirt.h>
+#include <diopi/functions.h>
+#include <diopi/functions_mmcv.h>
+
+#include "csrc_dipu/diopirt/diopirt_impl.h"
+
+using dipu::diopi_helper::toDiopiScalar;
+using dipu::diopi_helper::toDiopiTensorHandle;
+#endif
 
 Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
   return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);
@@ -18,8 +28,41 @@ std::vector<std::vector<int> > nms_match_impl(Tensor dets,
   return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
 }
 
+#ifdef MMCV_WITH_DIOPI
+Tensor nms_diopi(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
+  auto boxes_p = toDiopiTensorHandle(boxes);
+  diopiDevice_t device;
+  diopiGetTensorDevice(boxes_p, &device);
+  if (device == diopi_host) {
+    return nms_impl(boxes, scores, iou_threshold, offset);
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  Tensor out;
+  auto outp = toDiopiTensorHandle(out);
+  diopiTensorHandle_t *outhandle = &outp;
+  auto scores_p = toDiopiTensorHandle(scores);
+  if (reinterpret_cast<void *>(diopiNmsMmcv) != nullptr) {
+    auto ret =
+        diopiNmsMmcv(ch, outhandle, boxes_p, scores_p, iou_threshold, offset);
+    if (ret == diopiSuccess) {
+      auto tensorhandle = reinterpret_cast<Tensor *>(*outhandle);
+      return *tensorhandle;
+    }
+  }
+  LOG(WARNING) << "Fallback to cpu: mmcv ext op nms";
+  auto boxes_cpu = boxes.cpu();
+  auto scores_cpu = scores.cpu();
+  return nms_impl(boxes_cpu, scores_cpu, iou_threshold, offset);
+}
+#endif
+
 Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
+#ifdef MMCV_WITH_DIOPI
+  return nms_diopi(boxes, scores, iou_threshold, offset);
+#else
   return nms_impl(boxes, scores, iou_threshold, offset);
+#endif
 }
 
 Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
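nms differs from the in-place ops above: the number of kept boxes is only known after suppression, so the DIOPI kernel allocates the result tensor itself. The wrapper passes a `diopiTensorHandle_t *` out-parameter and reinterprets the filled handle back into an `at::Tensor`. A minimal sketch of that round trip under the same assumptions as before (`diopiMyAllocOpMmcv` is a hypothetical kernel name, not part of this patch):

```cpp
// Sketch of the "DIOPI allocates the output" calling convention used by nms.
// diopiMyAllocOpMmcv is a hypothetical kernel name.
Tensor call_alloc_style_op(diopiContextHandle_t ch, const Tensor &boxes) {
  Tensor out;  // empty placeholder; the kernel writes a fresh handle
  diopiTensorHandle_t outp = toDiopiTensorHandle(out);
  diopiTensorHandle_t *outhandle = &outp;
  auto boxes_p = toDiopiTensorHandle(boxes);
  if (diopiMyAllocOpMmcv(ch, outhandle, boxes_p) == diopiSuccess) {
    // the returned handle wraps an at::Tensor allocated on the device
    return *reinterpret_cast<Tensor *>(*outhandle);
  }
  return out;  // caller decides how to fall back
}
```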
diff --git a/mmcv/ops/csrc/pytorch/roi_align.cpp b/mmcv/ops/csrc/pytorch/roi_align.cpp
index 6e7077397d..798a22b418 100644
--- a/mmcv/ops/csrc/pytorch/roi_align.cpp
+++ b/mmcv/ops/csrc/pytorch/roi_align.cpp
@@ -1,6 +1,16 @@
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
 #include "pytorch_device_registry.hpp"
+#ifdef MMCV_WITH_DIOPI
+#include <diopi/diopirt.h>
+#include <diopi/functions.h>
+#include <diopi/functions_mmcv.h>
+
+#include "csrc_dipu/diopirt/diopirt_impl.h"
+
+using dipu::diopi_helper::toDiopiScalar;
+using dipu::diopi_helper::toDiopiTensorHandle;
+#endif
 
 void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                             Tensor argmax_y, Tensor argmax_x,
@@ -22,20 +32,111 @@ void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                        spatial_scale, sampling_ratio, pool_mode, aligned);
 }
 
+#ifdef MMCV_WITH_DIOPI
+void roi_align_forward_diopi(Tensor input, Tensor rois, Tensor output,
+                             Tensor argmax_y, Tensor argmax_x,
+                             int aligned_height, int aligned_width,
+                             float spatial_scale, int sampling_ratio,
+                             int pool_mode, bool aligned) {
+  auto input_p = toDiopiTensorHandle(input);
+  diopiDevice_t device;
+  diopiGetTensorDevice(input_p, &device);
+  if (device == diopi_host) {
+    roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
+                           aligned_height, aligned_width, spatial_scale,
+                           sampling_ratio, pool_mode, aligned);
+    return;
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  auto rois_p = toDiopiTensorHandle(rois);
+  auto out_p = toDiopiTensorHandle(output);
+  auto argmax_y_p = toDiopiTensorHandle(argmax_y);
+  auto argmax_x_p = toDiopiTensorHandle(argmax_x);
+  if (reinterpret_cast<void *>(diopiRoiAlignMmcv) != nullptr) {
+    auto ret = diopiRoiAlignMmcv(
+        ch, out_p, argmax_y_p, argmax_x_p, input_p, rois_p, aligned_height,
+        aligned_width, sampling_ratio, pool_mode, spatial_scale, aligned);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING) << "Fallback to cpu: mmcv ext op roi_align_forward";
+  auto input_cpu = input.cpu();
+  auto rois_cpu = rois.cpu();
+  auto out_cpu = output.cpu();
+  auto argmax_y_cpu = argmax_y.cpu();
+  auto argmax_x_cpu = argmax_x.cpu();
+  roi_align_forward_impl(input_cpu, rois_cpu, out_cpu, argmax_y_cpu,
+                         argmax_x_cpu, aligned_height, aligned_width,
+                         spatial_scale, sampling_ratio, pool_mode, aligned);
+  output.copy_(out_cpu);
+}
+
+void roi_align_backward_diopi(Tensor grad_output, Tensor rois, Tensor argmax_y,
+                              Tensor argmax_x, Tensor grad_input,
+                              int aligned_height, int aligned_width,
+                              float spatial_scale, int sampling_ratio,
+                              int pool_mode, bool aligned) {
+  auto grad_output_ = toDiopiTensorHandle(grad_output);
+  diopiDevice_t device;
+  diopiGetTensorDevice(grad_output_, &device);
+  if (device == diopi_host) {
+    roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
+                            aligned_height, aligned_width, spatial_scale,
+                            sampling_ratio, pool_mode, aligned);
+    return;
+  }
+  auto rois_ = toDiopiTensorHandle(rois);
+  auto argmax_y_ = toDiopiTensorHandle(argmax_y);
+  auto argmax_x_ = toDiopiTensorHandle(argmax_x);
+  auto grad_input_ = toDiopiTensorHandle(grad_input);
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  if (reinterpret_cast<void *>(diopiRoiAlignBackwardMmcv) != nullptr) {
+    auto ret = diopiRoiAlignBackwardMmcv(ch, grad_input_, grad_output_, rois_,
+                                         argmax_y_, argmax_x_, aligned_height,
+                                         aligned_width, sampling_ratio,
+                                         pool_mode, spatial_scale, aligned);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING) << "Fallback to cpu: mmcv ext op roi_align_backward";
+  auto grad_output_cpu = grad_output.cpu();
+  auto rois_cpu = rois.cpu();
+  auto argmax_y_cpu = argmax_y.cpu();
+  auto argmax_x_cpu = argmax_x.cpu();
+  auto grad_input_cpu = grad_input.cpu();
+  roi_align_backward_impl(grad_output_cpu, rois_cpu, argmax_y_cpu, argmax_x_cpu,
+                          grad_input_cpu, aligned_height, aligned_width,
+                          spatial_scale, sampling_ratio, pool_mode, aligned);
+  grad_input.copy_(grad_input_cpu);
+}
+#endif
+
 void roi_align_forward(Tensor input, Tensor rois, Tensor output,
                        Tensor argmax_y, Tensor argmax_x, int aligned_height,
                        int aligned_width, float spatial_scale,
                        int sampling_ratio, int pool_mode, bool aligned) {
+#ifdef MMCV_WITH_DIOPI
+  roi_align_forward_diopi(input, rois, output, argmax_y, argmax_x,
+                          aligned_height, aligned_width, spatial_scale,
+                          sampling_ratio, pool_mode, aligned);
+#else
   roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
                          aligned_height, aligned_width, spatial_scale,
                          sampling_ratio, pool_mode, aligned);
+#endif
 }
 
 void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
                         Tensor argmax_x, Tensor grad_input, int aligned_height,
                         int aligned_width, float spatial_scale,
                         int sampling_ratio, int pool_mode, bool aligned) {
+#ifdef MMCV_WITH_DIOPI
+  roi_align_backward_diopi(grad_output, rois, argmax_y, argmax_x, grad_input,
+                           aligned_height, aligned_width, spatial_scale,
+                           sampling_ratio, pool_mode, aligned);
+#else
   roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
                           aligned_height, aligned_width, spatial_scale,
                           sampling_ratio, pool_mode, aligned);
+#endif
 }
diff --git a/mmcv/ops/csrc/pytorch/voxelization.cpp b/mmcv/ops/csrc/pytorch/voxelization.cpp
index 7946be6178..7567351159 100644
--- a/mmcv/ops/csrc/pytorch/voxelization.cpp
+++ b/mmcv/ops/csrc/pytorch/voxelization.cpp
@@ -1,6 +1,16 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 #include "pytorch_cpp_helper.hpp"
 #include "pytorch_device_registry.hpp"
+#ifdef MMCV_WITH_DIOPI
+#include <diopi/diopirt.h>
+#include <diopi/functions.h>
+#include <diopi/functions_mmcv.h>
+
+#include "csrc_dipu/diopirt/diopirt_impl.h"
+
+using dipu::diopi_helper::toDiopiScalar;
+using dipu::diopi_helper::toDiopiTensorHandle;
+#endif
 
 int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                                at::Tensor &coors,
@@ -33,6 +43,132 @@ void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
                        coors_range, NDim);
 }
 
+#ifdef MMCV_WITH_DIOPI
+void hard_voxelize_forward_diopi(const at::Tensor &points,
+                                 const at::Tensor &voxel_size,
+                                 const at::Tensor &coors_range,
+                                 at::Tensor &voxels, at::Tensor &coors,
+                                 at::Tensor &num_points_per_voxel,
+                                 at::Tensor &voxel_num, const int max_points,
+                                 const int max_voxels, const int NDim = 3,
+                                 const bool deterministic = true) {
+  auto points_p = toDiopiTensorHandle(points);
+  diopiDevice_t device;
+  diopiGetTensorDevice(points_p, &device);
+  if (device == diopi_host) {
+    int64_t *voxel_num_data = voxel_num.data_ptr<int64_t>();
+    std::vector<float> voxel_size_v(
+        voxel_size.data_ptr<float>(),
+        voxel_size.data_ptr<float>() + voxel_size.numel());
+    std::vector<float> coors_range_v(
+        coors_range.data_ptr<float>(),
+        coors_range.data_ptr<float>() + coors_range.numel());
+
+    if (deterministic) {
+      *voxel_num_data = hard_voxelize_forward_impl(
+          points, voxels, coors, num_points_per_voxel, voxel_size_v,
+          coors_range_v, max_points, max_voxels, NDim);
+    } else {
+      TORCH_CHECK(
+          deterministic,
+          "nondeterministic hard_voxelize_forward is not supported on host!");
+    }
+    return;
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  auto voxel_size_p = toDiopiTensorHandle(voxel_size);
+  auto coors_range_p = toDiopiTensorHandle(coors_range);
+  auto voxels_p = toDiopiTensorHandle(voxels);
+  auto coors_p = toDiopiTensorHandle(coors);
+  auto num_points_per_voxel_p = toDiopiTensorHandle(num_points_per_voxel);
+  auto voxel_num_p = toDiopiTensorHandle(voxel_num);
+  if (reinterpret_cast<void *>(diopiHardVoxelizeMmcv) != nullptr) {
+    auto ret = diopiHardVoxelizeMmcv(
+        ch, voxels_p, coors_p, num_points_per_voxel_p, voxel_num_p, points_p,
+        voxel_size_p, coors_range_p, max_points, max_voxels, NDim,
+        deterministic);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING) << "Fallback to cpu: mmcv ext op hard_voxelize_forward";
+  auto points_cpu = points.cpu();
+  auto voxel_size_cpu = voxel_size.cpu();
+  auto coors_range_cpu = coors_range.cpu();
+  auto voxels_cpu = voxels.cpu();
+  auto coors_cpu = coors.cpu();
+  auto num_points_per_voxel_cpu = num_points_per_voxel.cpu();
+  auto voxel_num_cpu = voxel_num.cpu();
+
+  int64_t *voxel_num_data_cpu = voxel_num_cpu.data_ptr<int64_t>();
+  std::vector<float> voxel_size_v_cpu(
+      voxel_size_cpu.data_ptr<float>(),
+      voxel_size_cpu.data_ptr<float>() + voxel_size_cpu.numel());
+  std::vector<float> coors_range_v_cpu(
+      coors_range_cpu.data_ptr<float>(),
+      coors_range_cpu.data_ptr<float>() + coors_range_cpu.numel());
+
+  if (deterministic) {
+    *voxel_num_data_cpu = hard_voxelize_forward_impl(
+        points_cpu, voxels_cpu, coors_cpu, num_points_per_voxel_cpu,
+        voxel_size_v_cpu, coors_range_v_cpu, max_points, max_voxels, NDim);
+  } else {
+    puts("nondeterministic hard_voxelize_forward is not supported on host!");
+    abort();
+  }
+  voxels.copy_(voxels_cpu);
+  coors.copy_(coors_cpu);
+  num_points_per_voxel.copy_(num_points_per_voxel_cpu);
+  voxel_num.copy_(voxel_num_cpu);
+  return;
+}
+
+void dynamic_voxelize_forward_diopi(const at::Tensor &points,
+                                    const at::Tensor &voxel_size,
+                                    const at::Tensor &coors_range,
+                                    at::Tensor &coors, const int NDim = 3) {
+  auto points_p = toDiopiTensorHandle(points);
+  diopiDevice_t device;
+  diopiGetTensorDevice(points_p, &device);
+  if (device == diopi_host) {
+    std::vector<float> voxel_size_v(
+        voxel_size.data_ptr<float>(),
+        voxel_size.data_ptr<float>() + voxel_size.numel());
+    std::vector<float> coors_range_v(
+        coors_range.data_ptr<float>(),
+        coors_range.data_ptr<float>() + coors_range.numel());
+    dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,
+                                  NDim);
+    return;
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  auto voxel_size_p = toDiopiTensorHandle(voxel_size);
+  auto coors_range_p = toDiopiTensorHandle(coors_range);
+  auto coors_p = toDiopiTensorHandle(coors);
+  if (reinterpret_cast<void *>(diopiDynamicVoxelizeMmcv) != nullptr) {
+    auto ret = diopiDynamicVoxelizeMmcv(ch, coors_p, points_p, voxel_size_p,
+                                        coors_range_p, NDim);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING) << "Fallback to cpu: mmcv ext op dynamic_voxelize_forward";
+  auto points_cpu = points.cpu();
+  auto voxel_size_cpu = voxel_size.cpu();
+  auto coors_range_cpu = coors_range.cpu();
+  auto coors_cpu = coors.cpu();
+
+  std::vector<float> voxel_size_v_cpu(
+      voxel_size_cpu.data_ptr<float>(),
+      voxel_size_cpu.data_ptr<float>() + voxel_size_cpu.numel());
+  std::vector<float> coors_range_v_cpu(
+      coors_range_cpu.data_ptr<float>(),
+      coors_range_cpu.data_ptr<float>() + coors_range_cpu.numel());
+  dynamic_voxelize_forward_impl(points_cpu, coors_cpu, voxel_size_v_cpu,
+                                coors_range_v_cpu, NDim);
+  coors.copy_(coors_cpu);
+  return;
+}
+#endif
+
 void hard_voxelize_forward(const at::Tensor &points,
                            const at::Tensor &voxel_size,
                            const at::Tensor &coors_range, at::Tensor &voxels,
@@ -40,6 +176,11 @@ void hard_voxelize_forward(const at::Tensor &points,
                            at::Tensor &voxel_num, const int max_points,
                            const int max_voxels, const int NDim = 3,
                            const bool deterministic = true) {
+#ifdef MMCV_WITH_DIOPI
+  hard_voxelize_forward_diopi(points, voxel_size, coors_range, voxels, coors,
+                              num_points_per_voxel, voxel_num, max_points,
+                              max_voxels, NDim, deterministic);
+#else
   int64_t *voxel_num_data = voxel_num.data_ptr<int64_t>();
   std::vector<float> voxel_size_v(
       voxel_size.data_ptr<float>(),
@@ -57,12 +198,16 @@ void hard_voxelize_forward(const at::Tensor &points,
         points, voxels, coors, num_points_per_voxel, voxel_size_v,
         coors_range_v, max_points, max_voxels, NDim);
   }
+#endif
 }
 
 void dynamic_voxelize_forward(const at::Tensor &points,
                               const at::Tensor &voxel_size,
                               const at::Tensor &coors_range, at::Tensor &coors,
                               const int NDim = 3) {
+#ifdef MMCV_WITH_DIOPI
+  dynamic_voxelize_forward_diopi(points, voxel_size, coors_range, coors, NDim);
+#else
   std::vector<float> voxel_size_v(
       voxel_size.data_ptr<float>(),
       voxel_size.data_ptr<float>() + voxel_size.numel());
@@ -71,4 +216,5 @@ void dynamic_voxelize_forward(const at::Tensor &points,
       coors_range.data_ptr<float>() + coors_range.numel());
   dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,
                                 NDim);
+#endif
 }
diff --git a/setup.py b/setup.py
index a95f460196..c10dd6eec4 100644
--- a/setup.py
+++ b/setup.py
@@ -210,6 +210,8 @@ def get_extensions():
         extra_compile_args['cxx'] = ['/std:c++14']
 
     include_dirs = []
+    library_dirs = []
+    libraries = []
 
     extra_objects = []
     extra_link_args = []
@@ -221,7 +223,34 @@ def get_extensions():
         except ImportError:
             pass
 
-    if is_rocm_pytorch or torch.cuda.is_available() or os.getenv(
+    if os.getenv('MMCV_WITH_DIOPI', '0') == '1':
+        import mmengine  # NOQA: F401
+        from mmengine.utils.version_utils import digit_version
+        assert digit_version(mmengine.__version__) >= digit_version(
+            '0.7.4'), f'mmengine >= 0.7.4 is required \
+                but {mmengine.__version__} is installed'
+
+        print(f'Compiling {ext_name} with CPU and DIPU')
+        define_macros += [('MMCV_WITH_DIOPI', None)]
+        define_macros += [('DIOPI_ATTR_WEAK', None)]
+        op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
+            glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp')
+        extension = CppExtension
+        include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
+        dipu_root = os.getenv('DIPU_ROOT')
+        diopi_path = os.getenv('DIOPI_PATH')
+        dipu_path = os.getenv('DIPU_PATH')
+        vendor_include_dirs = os.getenv('VENDOR_INCLUDE_DIRS')
+        nccl_include_dirs = os.getenv('NCCL_INCLUDE_DIRS')
+        include_dirs.append(dipu_root)
+        include_dirs.append(diopi_path + '/include')
+        include_dirs.append(dipu_path + '/dist/include')
+        include_dirs.append(vendor_include_dirs)
+        if nccl_include_dirs:
+            include_dirs.append(nccl_include_dirs)
+        library_dirs += [dipu_root]
+        libraries += ['torch_dipu']
+    elif is_rocm_pytorch or torch.cuda.is_available() or os.getenv(
             'FORCE_CUDA', '0') == '1':
         if is_rocm_pytorch:
             define_macros += [('MMCV_WITH_HIP', None)]
@@ -398,6 +427,8 @@ def get_mluops_version(file_path):
             define_macros=define_macros,
             extra_objects=extra_objects,
             extra_compile_args=extra_compile_args,
+            library_dirs=library_dirs,
+            libraries=libraries,
             extra_link_args=extra_link_args)
         extensions.append(ext_ops)
     return extensions
diff --git a/tests/test_ops/test_modulated_deform_conv.py b/tests/test_ops/test_modulated_deform_conv.py
index ee29e73eb9..b7e48edef0 100644
--- a/tests/test_ops/test_modulated_deform_conv.py
+++ b/tests/test_ops/test_modulated_deform_conv.py
@@ -7,6 +7,8 @@
 from mmengine.utils import digit_version
 from mmengine.utils.dl_utils import TORCH_VERSION
 
+from mmcv.utils import IS_CUDA_AVAILABLE
+
 try:
     # If PyTorch version >= 1.6.0 and fp16 is enabled, torch.cuda.amp.autocast
     # would be imported and used; we should test if our modules support it.
@@ -111,13 +113,28 @@ def _test_amp_mdconv(self, input_dtype=torch.float):
         assert numpy.allclose(dcn.conv_offset.bias.grad.cpu().detach().numpy(),
                               dcn_offset_b_grad, 1e-2)
 
-    def test_mdconv(self):
-        self._test_mdconv(torch.double, device='cpu')
-        self._test_mdconv(torch.float, device='cpu')
-        self._test_mdconv(torch.double)
-        self._test_mdconv(torch.float)
+    @pytest.mark.parametrize('device', [
+        'cpu',
+        pytest.param(
+            'cuda',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    ])
+    def test_mdconv_float(self, device):
+        self._test_mdconv(dtype=torch.float, device=device)
+
+    @pytest.mark.parametrize('device', [
+        'cpu',
+        pytest.param(
+            'cuda',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    ])
+    def test_mdconv_double(self, device):
+        self._test_mdconv(dtype=torch.double, device=device)
+
+    def test_mdconv_half(self):
         self._test_mdconv(torch.half)
-
     # test amp when torch version >= '1.6.0', the type of
     # input data for mdconv might be torch.float or torch.half
     if (TORCH_VERSION != 'parrots'
diff --git a/tests/test_ops/test_roi_align.py b/tests/test_ops/test_roi_align.py
index 46a5183f30..dcd2103461 100644
--- a/tests/test_ops/test_roi_align.py
+++ b/tests/test_ops/test_roi_align.py
@@ -93,15 +93,7 @@ def _test_roialign_allclose(device, dtype):
         x.grad.data.type(torch.float).cpu().numpy(), np_grad, atol=1e-3)
 
 
-@pytest.mark.parametrize('dtype', [
-    torch.float,
-    pytest.param(
-        torch.double,
-        marks=pytest.mark.skipif(
-            IS_MLU_AVAILABLE or IS_NPU_AVAILABLE,
-            reason='MLU and NPU do not support for 64-bit floating point')),
-    torch.half
-])
+@pytest.mark.parametrize('dtype', [torch.float, torch.half])
 @pytest.mark.parametrize('device', [
     'cpu',
     pytest.param(
@@ -117,8 +109,17 @@ def _test_roialign_allclose(device, dtype):
         marks=pytest.mark.skipif(
             not IS_NPU_AVAILABLE, reason='requires NPU support'))
 ])
-def test_roialign(device, dtype):
-    # check double only
-    if dtype is torch.double:
-        _test_roialign_gradcheck(device=device, dtype=dtype)
+def test_roialign_float(device, dtype):
     _test_roialign_allclose(device=device, dtype=dtype)
+
+
+@pytest.mark.parametrize('device', [
+    'cpu',
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+])
+def test_roialign_float64(device):
+    _test_roialign_allclose(device=device, dtype=torch.double)
+    _test_roialign_gradcheck(device=device, dtype=torch.double)
diff --git a/tests/test_ops/test_voxelization.py b/tests/test_ops/test_voxelization.py
index cd01eb46e6..78282a8ad0 100644
--- a/tests/test_ops/test_voxelization.py
+++ b/tests/test_ops/test_voxelization.py
@@ -139,12 +139,20 @@ def test_voxelization_nondeterministic():
     assert len(coors_set) == len(coors) == len(coors_all_set)
 
 
-@pytest.mark.parametrize('device_type', [
-    pytest.param(
-        'mlu',
-        marks=pytest.mark.skipif(
-            not IS_MLU_AVAILABLE, reason='requires MLU support'))
-])
+@pytest.mark.parametrize(
+    'device_type',
+    [
+        pytest.param(
+            # this is only used for the dipu device testing case;
+            # dipu will mock to cuda automatically on an mlu physical device.
+            'cuda:0',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support'))
+    ])
def test_voxelization_mlu(device_type):
     voxel_size = [0.5, 0.5, 0.5]
     point_cloud_range = [0, -40, -3, 70.4, 40, 1]