Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature]Sync replace #2750

Merged
merged 14 commits into from
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
431 changes: 0 additions & 431 deletions mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu

This file was deleted.

695 changes: 0 additions & 695 deletions mmcv/ops/csrc/common/mlu/iou3d_utils.hpp

This file was deleted.

2,094 changes: 0 additions & 2,094 deletions mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu

This file was deleted.

483 changes: 0 additions & 483 deletions mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu

This file was deleted.

553 changes: 0 additions & 553 deletions mmcv/ops/csrc/common/mlu/nms_utils.hpp

This file was deleted.

493 changes: 0 additions & 493 deletions mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu

This file was deleted.

649 changes: 0 additions & 649 deletions mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu

This file was deleted.

136 changes: 35 additions & 101 deletions mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,114 +10,30 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/

#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_dram,
const int input_box_num, const float iou_threshold,
void *workspace, void *output_size, void *output);

// Pick how many jobs to actually launch for the kernel.
// Starting from the hardware job capability, halve the job count (never
// dropping below 4) until each core handles at least 256 boxes; with fewer
// than 256 boxes per core the real IO bandwidth would be very low.
int selectType(uint32_t use_job, int box_num_per_core) {
  // Each halving of the job count doubles the boxes per remaining core.
  for (; box_num_per_core < 256 && use_job >= 4; use_job /= 2) {
    box_num_per_core *= 2;
  }
  return use_job;
}
// Decide the launch dimensions (k_dim) and task type (k_type) for the
// IoU3D NMS kernel from the device's job-limit capability and the number
// of input boxes. Always returns CNNL_STATUS_SUCCESS; on an unsupported
// job limit it warns and keeps the default UNION1 configuration.
// NOTE(review): core_num_per_class is accepted but never written here —
// presumably kept for interface compatibility with callers.
static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
                               int &core_num_per_class,
                               const int input_box_num) {
  const uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  const uint32_t job_limit = getJobLimitCapability();

  // Boxes each core would process if every available job were launched
  // (ceiling division), then shrink the job count so each core gets
  // enough work for decent IO bandwidth.
  const int box_num_per_core = (input_box_num + job_limit - 1) / job_limit;
  const int use_job = selectType(job_limit, box_num_per_core);

  // Default launch configuration: one full cluster as a UNION1 task.
  k_dim->x = core_dim;
  k_dim->y = 1;
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_UNION1;

  const bool supported_limit = job_limit == CN_KERNEL_CLASS_BLOCK ||
                               job_limit == CN_KERNEL_CLASS_UNION ||
                               job_limit == CN_KERNEL_CLASS_UNION2 ||
                               job_limit == CN_KERNEL_CLASS_UNION4 ||
                               job_limit == CN_KERNEL_CLASS_UNION8 ||
                               job_limit == CN_KERNEL_CLASS_UNION16;
  if (!supported_limit) {
    LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
                 << " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
  } else if (use_job < 4) {
    // Too few jobs for a union task: run a single BLOCK task.
    k_dim->x = 1;
    *k_type = CNRT_FUNC_TYPE_BLOCK;
  } else if (use_job == 4) {
    // Exactly one cluster's worth of jobs: keep the UNION1 default.
    k_dim->x = core_dim;
    *k_type = CNRT_FUNC_TYPE_UNION1;
  } else {
    // Larger job counts map directly onto the UNIONx function types.
    k_dim->x = use_job;
    *k_type = (cnrtFunctionType_t)use_job;
  }
  return CNNL_STATUS_SUCCESS;
}
#include "mlu_common_helper.h"

void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
float iou_threshold) {
// dimension parameters check
TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
boxes.dim(), "D");
TORCH_CHECK(boxes.size(1) == 7,
"boxes should have 7 elements in dimension 1, got ",
boxes.size(1));

// data type check
TORCH_CHECK(
boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
"data type of boxes should be Float or Half, got ", boxes.scalar_type());

if (boxes.numel() == 0) {
return;
}
const size_t max_input_num = 2147483648; // 2^31, 2G num
TORCH_CHECK(boxes.numel() < max_input_num,
"boxes.numel() should be less than 2147483648, got ",
boxes.numel());
int input_box_num = boxes.size(0);

cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDim3_t k_dim;
cnrtJobType_t k_type;

int core_num_per_class;
policyFunc(&k_dim, &k_type, core_num_per_class, input_box_num);

// transpose boxes (n, 7) to (7, n) for better performance
auto boxes_t = boxes.transpose(0, 1);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);

auto output = at::empty({input_box_num}, boxes.options().dtype(at::kLong));
int input_box_num = boxes.size(0);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
auto output = keep.to(boxes.options().dtype(at::kInt));
auto output_size = at::empty({1}, boxes.options().dtype(at::kInt));

// workspace
const int info_num = 7; // x, y,z, dx, dy, dz,angle
size_t space_size = 0;
if (boxes.scalar_type() == at::kHalf) {
space_size = input_box_num * sizeof(int16_t) * info_num +
input_box_num * sizeof(float) + sizeof(float);
} else {
space_size = input_box_num * sizeof(float) * (info_num + 1) + sizeof(float);
}
MluOpTensorDescriptor boxes_desc, output_desc;
boxes_desc.set(boxes_);
output_desc.set(output);

auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
// workspace
size_t workspace_size = 0;
auto handle = mluOpGetCurrentHandle();
mluOpGetNmsWorkspaceSize(handle, boxes_desc.desc(), NULL, &workspace_size);
auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));

// get compute queue
auto queue = torch_mlu::getCurQueue();

auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
auto boxes_ptr = boxes_impl->cnnlMalloc();
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
Expand All @@ -127,11 +43,29 @@ void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
auto output_size_impl = torch_mlu::getMluTensorImpl(keep_num);
auto output_size_ptr = output_size_impl->cnnlMalloc();

uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
CNLOG(INFO) << "Launch Kernel KernelIou3d<<<Union" << k_type / core_dim
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelIou3d(k_dim, k_type, queue, data_type_input, boxes_ptr, input_box_num,
iou_threshold, workspace_ptr, output_size_ptr, output_ptr);
// nms desc
mluOpNmsDescriptor_t nms_desc;
const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
const float soft_nms_sigma = 0.0;
const float confidence_threshold = 0.0;
const int input_layout = 0;
const bool pad_to_max_output_size = false;
const int max_output_size = input_box_num;
const float offset = 0.0;

mluOpCreateNmsDescriptor(&nms_desc);
mluOpSetNmsDescriptor(nms_desc, box_mode, output_mode, algo, method_mode,
iou_threshold, soft_nms_sigma, max_output_size,
confidence_threshold, offset, input_layout,
pad_to_max_output_size);

mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr, NULL, NULL,
workspace_ptr, workspace_size, output_desc.desc(), output_ptr,
output_size_ptr);
mluOpDestroyNmsDescriptor(nms_desc);
}

void iou3d_nms3d_forward_mlu(const Tensor boxes, Tensor &keep, Tensor &keep_num,
Expand Down
4 changes: 2 additions & 2 deletions mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
#include "pytorch_device_registry.hpp"

#define MLUOP_MAJOR 0
#define MLUOP_MINOR 5
#define MLUOP_PATCHLEVEL 302
#define MLUOP_MINOR 6
#define MLUOP_PATCHLEVEL 0

mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);
Expand Down
Loading