diff --git a/csrc/backend_ops/onnxruntime/roi_align/roi_align.cpp b/csrc/backend_ops/onnxruntime/roi_align/roi_align.cpp
deleted file mode 100644
index 78cd13c922..0000000000
--- a/csrc/backend_ops/onnxruntime/roi_align/roi_align.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
-// modified from
-// https://github.com/facebookresearch/maskrcnn-benchmark/blob/main/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp
-#include "roi_align.h"
-
-#include "ort_utils.h"
-
-namespace mmdeploy {
-// implementation taken from Caffe2
-struct PreCalc {
-  int pos1;
-  int pos2;
-  int pos3;
-  int pos4;
-  float w1;
-  float w2;
-  float w3;
-  float w4;
-};
-
-void pre_calc_for_bilinear_interpolate(const int height, const int width, const int pooled_height,
-                                       const int pooled_width, const int iy_upper,
-                                       const int ix_upper, float roi_start_h, float roi_start_w,
-                                       float bin_size_h, float bin_size_w, int roi_bin_grid_h,
-                                       int roi_bin_grid_w, std::vector<PreCalc> &pre_calc) {
-  int pre_calc_index = 0;
-  for (int ph = 0; ph < pooled_height; ph++) {
-    for (int pw = 0; pw < pooled_width; pw++) {
-      for (int iy = 0; iy < iy_upper; iy++) {
-        const float yy = roi_start_h + ph * bin_size_h +
-                         static_cast<float>(iy + .5f) * bin_size_h /
-                             static_cast<float>(roi_bin_grid_h);  // e.g., 0.5, 1.5
-        for (int ix = 0; ix < ix_upper; ix++) {
-          const float xx =
-              roi_start_w + pw * bin_size_w +
-              static_cast<float>(ix + .5f) * bin_size_w / static_cast<float>(roi_bin_grid_w);
-
-          float x = xx;
-          float y = yy;
-          // deal with: inverse elements are out of feature map boundary
-          if (y < -1.0 || y > height || x < -1.0 || x > width) {
-            // empty
-            PreCalc pc;
-            pc.pos1 = 0;
-            pc.pos2 = 0;
-            pc.pos3 = 0;
-            pc.pos4 = 0;
-            pc.w1 = 0;
-            pc.w2 = 0;
-            pc.w3 = 0;
-            pc.w4 = 0;
-            pre_calc[pre_calc_index] = pc;
-            pre_calc_index += 1;
-            continue;
-          }
-
-          if (y <= 0) {
-            y = 0;
-          }
-          if (x <= 0) {
-            x = 0;
-          }
-
-          int y_low = (int)y;
-          int x_low = (int)x;
-          int y_high;
-          int x_high;
-
-          if (y_low >= height - 1) {
-            y_high = y_low = height - 1;
-            y = (float)y_low;
-          } else {
-            y_high = y_low + 1;
-          }
-
-          if (x_low >= width - 1) {
-            x_high = x_low = width - 1;
-            x = (float)x_low;
-          } else {
-            x_high = x_low + 1;
-          }
-
-          float ly = y - y_low;
-          float lx = x - x_low;
-          float hy = 1. - ly, hx = 1. - lx;
-          float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-          // save weights and indices
-          PreCalc pc;
-          pc.pos1 = y_low * width + x_low;
-          pc.pos2 = y_low * width + x_high;
-          pc.pos3 = y_high * width + x_low;
-          pc.pos4 = y_high * width + x_high;
-          pc.w1 = w1;
-          pc.w2 = w2;
-          pc.w3 = w3;
-          pc.w4 = w4;
-          pre_calc[pre_calc_index] = pc;
-
-          pre_calc_index += 1;
-        }
-      }
-    }
-  }
-}
-
-void ROIAlignForwardCPU(const int nthreads, const float *input, const float *rois, float *output,
-                        float *argmax_y, float *argmax_x, const int pooled_height,
-                        const int pooled_width, const float spatial_scale, const int sampling_ratio,
-                        const int pool_mode,  // 0 - max pool, 1 - avg pool
-                        const bool aligned, const int channels, const int height, const int width) {
-  int n_rois = nthreads / channels / pooled_width / pooled_height;
-  // (n, c, ph, pw) is an element in the pooled output
-  // can be parallelized using omp
-  // #pragma omp parallel for num_threads(32)
-  for (int n = 0; n < n_rois; n++) {
-    int index_n = n * channels * pooled_width * pooled_height;
-
-    const float *offset_rois = rois + n * 5;
-    int roi_batch_ind = offset_rois[0];
-
-    // Do not use rounding; this implementation detail is critical
-    float offset = aligned ? (float)0.5 : (float)0.0;
-    float roi_start_w = offset_rois[1] * spatial_scale - offset;
-    float roi_start_h = offset_rois[2] * spatial_scale - offset;
-    float roi_end_w = offset_rois[3] * spatial_scale - offset;
-    float roi_end_h = offset_rois[4] * spatial_scale - offset;
-
-    float roi_width = roi_end_w - roi_start_w;
-    float roi_height = roi_end_h - roi_start_h;
-    if (aligned) {
-      /*AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
-                 "ROIs in ROIAlign cannot have non-negative size!");*/
-      assert(roi_width >= 0 && roi_height >= 0);
-    } else {  // for backward-compatibility only
-      roi_width = std::max(roi_width, (float)1.);
-      roi_height = std::max(roi_height, (float)1.);
-    }
-    float bin_size_h = static_cast<float>(roi_height) / static_cast<float>(pooled_height);
-    float bin_size_w = static_cast<float>(roi_width) / static_cast<float>(pooled_width);
-
-    // We use roi_bin_grid to sample the grid and mimic integral
-    int roi_bin_grid_h =
-        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height);  // e.g., = 2
-    int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
-
-    // When the grid is empty, output zeros == 0/1, instead of NaN.
-    const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4
-
-    // we want to precalculate indices and weights shared by all channels,
-    // this is the key point of optimization
-    std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
-    pre_calc_for_bilinear_interpolate(height, width, pooled_height, pooled_width, roi_bin_grid_h,
-                                      roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h,
-                                      bin_size_w, roi_bin_grid_h, roi_bin_grid_w, pre_calc);
-
-    for (int c = 0; c < channels; c++) {
-      int index_n_c = index_n + c * pooled_width * pooled_height;
-      const float *offset_input = input + (roi_batch_ind * channels + c) * height * width;
-      int pre_calc_index = 0;
-
-      for (int ph = 0; ph < pooled_height; ph++) {
-        for (int pw = 0; pw < pooled_width; pw++) {
-          int index = index_n_c + ph * pooled_width + pw;
-
-          float output_val = 0.;
-          float maxval = -10000;
-          float maxidx_y = -1.f, maxidx_x = -1.f;
-          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-            const float y =
-                roi_start_h + ph * bin_size_h +
-                static_cast<float>(iy + .5f) * bin_size_h / static_cast<float>(roi_bin_grid_h);
-            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-              const float x =
-                  roi_start_w + pw * bin_size_w +
-                  static_cast<float>(ix + .5f) * bin_size_w / static_cast<float>(roi_bin_grid_w);
-              PreCalc pc = pre_calc[pre_calc_index];
-              float val = pc.w1 * offset_input[pc.pos1] + pc.w2 * offset_input[pc.pos2] +
-                          pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
-              if (val > maxval) {
-                maxval = val;
-                maxidx_y = y;
-                maxidx_x = x;
-              }
-              output_val += val;
-              pre_calc_index += 1;
-            }
-          }
-          if (pool_mode == 0) {
-            // We do max pooling inside a bin
-            output[index] = maxval;
-            argmax_y[index] = maxidx_y;
-            argmax_x[index] = maxidx_x;
-          } else if (pool_mode == 1) {
-            // We do average (integral) pooling inside a bin
-            output[index] = output_val / count;
-          }  // if
-        }    // for pw
-      }      // for ph
-    }        // for c
-  }          // for n
-}
-
-void MMCVRoiAlignKernel::Compute(OrtKernelContext *context) {
-  // Setup inputs
-  const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
-  const float *X_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
-  const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
-  const float *rois =
-      reinterpret_cast<const float *>(ort_.GetTensorData<const float *>(input_rois));
-
-  // Setup output
-  OrtTensorDimensions out_dimensions(ort_, input_X);
-  OrtTensorDimensions roi_dimensions(ort_, input_rois);
-
-  int batch_size = out_dimensions.data()[0];
-  int input_channels = out_dimensions.data()[1];
-  int input_height = out_dimensions.data()[2];
-  int input_width = out_dimensions.data()[3];
-
-  out_dimensions.data()[0] = roi_dimensions.data()[0];
-  out_dimensions.data()[2] = aligned_height_;
-  out_dimensions.data()[3] = aligned_width_;
-
-  OrtValue *output =
-      ort_.KernelContext_GetOutput(context, 0, out_dimensions.data(), out_dimensions.size());
-  float *out = ort_.GetTensorMutableData<float>(output);
-  OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
-  ort_.ReleaseTensorTypeAndShapeInfo(output_info);
-
-  // TODO: forward here
-  int output_size = out_dimensions.data()[0];
-  for (auto i = 1; i < out_dimensions.size(); ++i) {
-    output_size *= out_dimensions.data()[i];
-  }
-
-  int poolMod = 1;
-  if (pool_mode_ == "max") poolMod = 0;
-
-  float *argmax_x = nullptr, *argmax_y = nullptr;
-  if (poolMod == 0) {
-    argmax_y = new float[output_size];
-    argmax_x = new float[output_size];
-  }
-
-  ROIAlignForwardCPU(output_size, X_data, rois, out, argmax_y, argmax_x, aligned_height_,
-                     aligned_width_, spatial_scale_, sampling_ratio_, poolMod, aligned_,
-                     input_channels, input_height, input_width);
-
-  if (argmax_x) delete argmax_x;
-  if (argmax_y) delete argmax_y;
-}
-
-REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVRoiAlignCustomOp);
-}  // namespace mmdeploy
diff --git a/csrc/backend_ops/onnxruntime/roi_align/roi_align.h b/csrc/backend_ops/onnxruntime/roi_align/roi_align.h
deleted file mode 100644
index 0c7afa67da..0000000000
--- a/csrc/backend_ops/onnxruntime/roi_align/roi_align.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-#ifndef ONNXRUNTIME_ROI_ALIGN_H
-#define ONNXRUNTIME_ROI_ALIGN_H
-
-#include <assert.h>
-#include <onnxruntime_cxx_api.h>
-
-#include <cmath>
-#include <mutex>
-#include <string>
-#include <vector>
-
-namespace mmdeploy {
-struct MMCVRoiAlignKernel {
- public:
-  MMCVRoiAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info) : ort_(ort) {
-    aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
-    aligned_height_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
-    aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
-    pool_mode_ = ort_.KernelInfoGetAttribute<std::string>(info, "mode");
-    sampling_ratio_ = ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
-    spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
-  }
-
-  void Compute(OrtKernelContext* context);
-
- private:
-  Ort::CustomOpApi ort_;
-
-  int aligned_height_;
-  int aligned_width_;
-  float spatial_scale_;
-  int sampling_ratio_;
-  std::string pool_mode_;
-  int aligned_;
-};
-
-struct MMCVRoiAlignCustomOp : Ort::CustomOpBase<MMCVRoiAlignCustomOp, MMCVRoiAlignKernel> {
-  void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
-    return new MMCVRoiAlignKernel(api, info);
-  }
-  const char* GetName() const { return "MMCVRoiAlign"; }
-
-  size_t GetInputTypeCount() const { return 2; }
-  ONNXTensorElementDataType GetInputType(size_t) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-  }
-
-  size_t GetOutputTypeCount() const { return 1; }
-  ONNXTensorElementDataType GetOutputType(size_t) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-  }
-
-  // force cpu
-  const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
-};
-}  // namespace mmdeploy
-
-#endif  // ONNXRUNTIME_ROI_ALIGN_H
diff --git a/docs/en/backends/onnxruntime.md b/docs/en/backends/onnxruntime.md
index a47569d802..181e655094 100644
--- a/docs/en/backends/onnxruntime.md
+++ b/docs/en/backends/onnxruntime.md
@@ -57,7 +57,6 @@ make -j$(nproc)
 
 | Operator                                                                     |  CPU  |  GPU  | MMDeploy Releases |
 | :--------------------------------------------------------------------------- | :---: | :---: | :---------------- |
-| [RoIAlign](../ops/onnxruntime.md#roialign)                                   |   Y   |   N   | master            |
 | [grid_sampler](../ops/onnxruntime.md#grid_sampler)                           |   Y   |   N   | master            |
 | [MMCVModulatedDeformConv2d](../ops/onnxruntime.md#mmcvmodulateddeformconv2d) |   Y   |   N   | master            |
 
diff --git a/docs/en/ops/onnxruntime.md b/docs/en/ops/onnxruntime.md
index 2e4d741e0d..51791ebc9f 100644
--- a/docs/en/ops/onnxruntime.md
+++ b/docs/en/ops/onnxruntime.md
@@ -3,64 +3,21 @@
 <!-- TOC -->
 
 - [ONNX Runtime Ops](#onnx-runtime-ops)
-  - [RoIAlign](#roialign)
+  - [grid_sampler](#grid_sampler)
     - [Description](#description)
     - [Parameters](#parameters)
     - [Inputs](#inputs)
     - [Outputs](#outputs)
     - [Type Constraints](#type-constraints)
-  - [grid_sampler](#grid_sampler)
+  - [MMCVModulatedDeformConv2d](#mmcvmodulateddeformconv2d)
     - [Description](#description-1)
     - [Parameters](#parameters-1)
     - [Inputs](#inputs-1)
     - [Outputs](#outputs-1)
     - [Type Constraints](#type-constraints-1)
-  - [MMCVModulatedDeformConv2d](#mmcvmodulateddeformconv2d)
-    - [Description](#description-2)
-    - [Parameters](#parameters-2)
-    - [Inputs](#inputs-2)
-    - [Outputs](#outputs-2)
-    - [Type Constraints](#type-constraints-2)
 
 <!-- TOC -->
 
-### RoIAlign
-
-#### Description
-
-Perform RoIAlign on output feature, used in bbox_head of most two-stage detectors.
-
-#### Parameters
-
-| Type    | Parameter        | Description                                                                                                   |
-| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- |
-| `int`   | `output_height`  | height of output roi                                                                                          |
-| `int`   | `output_width`   | width of output roi                                                                                           |
-| `float` | `spatial_scale`  | used to scale the input boxes                                                                                 |
-| `int`   | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. |
-| `str`   | `mode`           | pooling mode in each bin. `avg` or `max`                                                                      |
-| `int`   | `aligned`        | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly.         |
-
-#### Inputs
-
-<dl>
-<dt><tt>input</tt>: T</dt>
-<dd>Input feature map; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the numbers of channels, H and W are the height and width of the data.</dd>
-<dt><tt>rois</tt>: T</dt>
-<dd>RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are the coordinate system of input.</dd>
-</dl>
-
-#### Outputs
-
-<dl>
-<dt><tt>feat</tt>: T</dt>
-<dd>RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].<dd>
-</dl>
-
-#### Type Constraints
-
-- T:tensor(float32)
-
 ### grid_sampler
 
 #### Description
diff --git a/mmdeploy/codebase/mmdet/models/roi_heads/single_level_roi_extractor.py b/mmdeploy/codebase/mmdet/models/roi_heads/single_level_roi_extractor.py
index de0ea22dad..7c262c1dca 100644
--- a/mmdeploy/codebase/mmdet/models/roi_heads/single_level_roi_extractor.py
+++ b/mmdeploy/codebase/mmdet/models/roi_heads/single_level_roi_extractor.py
@@ -137,14 +137,13 @@ def single_roi_extractor__forward(ctx,
         device=target_lvls.device)
     target_lvls = torch.cat((_tmp, _tmp, target_lvls))
     for i in range(num_levels):
-        # use the roi align in torhcvision to accelerate the inference
-        # roi_align in MMCV is same as torchvision when pool mode is 'avg'
-        if backend == Backend.TORCHSCRIPT or self.roi_layers[
-                i].pool_mode == 'avg':
-            self.roi_layers[i].use_torchvision = True
         mask = target_lvls == i
         inds = mask.nonzero(as_tuple=False).squeeze(1)
-        roi_feats_t = self.roi_layers[i](feats[i], rois[inds])
+        rois_t = rois[inds]
+        # use the roi align in torchvision
+        if backend == Backend.TORCHSCRIPT:
+            self.roi_layers[i].use_torchvision = True
+        roi_feats_t = self.roi_layers[i](feats[i], rois_t)
         roi_feats[inds] = roi_feats_t
     # slice to recover original size
     roi_feats = roi_feats[num_levels * 2:]
diff --git a/mmdeploy/mmcv/ops/roi_align.py b/mmdeploy/mmcv/ops/roi_align.py
index c6da740fe5..33cd7342d5 100644
--- a/mmdeploy/mmcv/ops/roi_align.py
+++ b/mmdeploy/mmcv/ops/roi_align.py
@@ -1,10 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import List
 
+import torch
 from torch import Tensor
 
 from mmdeploy.core import SYMBOLIC_REWRITER
-from mmdeploy.utils import Backend, get_backend
+from mmdeploy.utils import Backend, get_backend, get_ir_config
 
 
 # Here using mmcv.ops.roi_align.__self__ to find
@@ -17,7 +18,10 @@ def roi_align_default(ctx, g, input: Tensor, rois: Tensor,
                       sampling_ratio: int, pool_mode: str, aligned: bool):
     """Rewrite symbolic function for default backend.
 
-    Replace onnx::RoiAlign with mmdeploy::MMCVRoiAlign.
+    Replace onnx::RoiAlign with mmcv::MMCVRoiAlign for PPLNN. For ONNXRuntime,
+    the alignment offset is applied to the rois before onnx::RoiAlign for
+    opset versions lower than 16. By default, onnx::RoiAlign is replaced with
+    mmdeploy::MMCVRoiAlign.
 
     Args:
         ctx (ContextCaller): The context with additional information.
@@ -40,6 +44,58 @@ def roi_align_default(ctx, g, input: Tensor, rois: Tensor,
     backend = get_backend(ctx.cfg)
     if backend == Backend.PPLNN:
         domain = 'mmcv'
+    elif backend == Backend.ONNXRUNTIME:
+        from torch.onnx.symbolic_opset9 import _cast_Long
+        from torch.onnx.symbolic_opset11 import add, select, squeeze
+        batch_indices = _cast_Long(
+            g,
+            squeeze(
+                g,
+                select(
+                    g, rois, 1,
+                    g.op(
+                        'Constant',
+                        value_t=torch.tensor([0], dtype=torch.long))), 1),
+            False)
+        rois = select(
+            g, rois, 1,
+            g.op(
+                'Constant',
+                value_t=torch.tensor([1, 2, 3, 4], dtype=torch.long)))
+        ir_cfg = get_ir_config(ctx.cfg)
+        opset_version = ir_cfg.get('opset_version', 11)
+        if opset_version < 16:
+            # preprocess rois to be compatible with opsets below 16;
+            # from opset 16 on, `aligned` is implemented inside onnxruntime.
+            if aligned is True:
+                rois = add(
+                    g, rois,
+                    g.op(
+                        'Constant',
+                        value_t=torch.tensor([-0.5 / spatial_scale],
+                                             dtype=torch.float)))
+            return g.op(
+                'RoiAlign',
+                input,
+                rois,
+                batch_indices,
+                output_height_i=output_size[0],
+                output_width_i=output_size[1],
+                spatial_scale_f=spatial_scale,
+                sampling_ratio_i=sampling_ratio,
+                mode_s=pool_mode)
+        else:
+            return g.op(
+                'RoiAlign',
+                input,
+                rois,
+                batch_indices,
+                output_height_i=output_size[0],
+                output_width_i=output_size[1],
+                spatial_scale_f=spatial_scale,
+                sampling_ratio_i=sampling_ratio,
+                mode_s=pool_mode,
+                aligned_i=aligned)
     else:
         domain = 'mmdeploy'
     return g.op(
diff --git a/tests/test_codebase/test_mmdet/test_mmdet_models.py b/tests/test_codebase/test_mmdet/test_mmdet_models.py
index 706cc05be6..b3a1a94173 100644
--- a/tests/test_codebase/test_mmdet/test_mmdet_models.py
+++ b/tests/test_codebase/test_mmdet/test_mmdet_models.py
@@ -546,7 +546,6 @@ def test_single_roi_extractor(backend_type: Backend):
         wrapped_model=single_roi_extractor,
         model_inputs=model_inputs,
         deploy_cfg=deploy_cfg)
-
     if isinstance(backend_outputs, dict):
         backend_outputs = backend_outputs.values()
     for model_output, backend_output in zip(model_outputs[0], backend_outputs):
diff --git a/tests/test_ops/test_ops.py b/tests/test_ops/test_ops.py
index 7a5784b0ac..72ff750d1e 100644
--- a/tests/test_ops/test_ops.py
+++ b/tests/test_ops/test_ops.py
@@ -16,7 +16,7 @@
 TEST_NCNN = TestNCNNExporter()
 
 
-@pytest.mark.parametrize('backend', [TEST_ONNXRT, TEST_TENSORRT])
+@pytest.mark.parametrize('backend', [TEST_TENSORRT])
 @pytest.mark.parametrize('pool_h,pool_w,spatial_scale,sampling_ratio',
                          [(2, 2, 1.0, 2), (4, 4, 2.0, 4)])
 def test_roi_align(backend,