diff --git a/src/operator/numpy/np_pad_op-inl.h b/src/operator/numpy/np_pad_op-inl.h
index be514b27a6b5..79baba756845 100644
--- a/src/operator/numpy/np_pad_op-inl.h
+++ b/src/operator/numpy/np_pad_op-inl.h
@@ -540,13 +540,22 @@ struct min_pad {
   }
 };
 
-
-template <typename xpu, int req>
+template <typename xpu, int req, int ndim>
 struct pad_grad {
-  template<typename DType>
-  MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *a){
-    using namespace mxnet_op;
-    KERNEL_ASSIGN(out[i], req, 1);
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *a,
+                                  const index_t* ishape,
+                                  const index_t* oshape,
+                                  mshadow::Shape<ndim*2> width) {
+    auto j = uunravel<ndim>(i, oshape);
+    size_t m;
+    index_t* indexwidth = width.shape_;
+    index_t* indexshape = j.shape_;
+    for (m = 0; m < ndim; m++) {
+      indexshape[m] = indexshape[m] + indexwidth[m * 2];
+    }
+    index_t l = rravel<ndim>(j, ishape);
+    KERNEL_ASSIGN(out[i], req, a[l]);
   }
 };
 
@@ -720,20 +729,43 @@ void NumpyPadOpImpl(const TBlob& in_data,
 template <typename xpu>
 void NumpyPadOpBackImpl(const TBlob& in_data,
                         const TBlob& out_data,
+                        index_t* ishape,
+                        index_t* oshape,
                         index_t dsize,
+                        const NumpyPadParam& param,
                         const std::vector<OpReqType>& req,
                         mxnet_op::Stream<xpu> *s) {
-  using namespace mxnet_op;
-  using namespace mshadow;
-  MSHADOW_TYPE_SWITCH_WITH_BOOL(out_data.type_flag_, DType, {
-    MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-      Kernel<pad_grad<xpu, req_type>, xpu>::Launch(
-        s, dsize, out_data.dptr<DType>(), in_data.dptr<DType>());
-    });
-  });
+  using namespace mxnet_op;
+  using namespace mshadow;
+  int mode = param.mode;
+  int ndim = in_data.ndim();
+  if (mode != 0) {
+    LOG(FATAL) << "Other modes are not supported. ";
+  }
+  MXNET_NDIM_SWITCH(ndim, NDim, {
+    mshadow::Shape<NDim*2> width;
+    int dimcounter = 0;
+    index_t* odptr = reinterpret_cast<index_t*>(oshape);
+    if (ndim == 1) {
+      width[0] = param.pad_width[0][0];
+      width[1] = param.pad_width[1][0];
+    } else {
+      for (dimcounter = 0; dimcounter < NDim; dimcounter++) {
+        width[dimcounter*2] = param.pad_width[dimcounter][0];
+        width[dimcounter*2 + 1] = param.pad_width[dimcounter][1];
+      }
+    }
+    index_t* idptr = reinterpret_cast<index_t*>(ishape);
+    MSHADOW_TYPE_SWITCH_WITH_BOOL(out_data.type_flag_, DType, {
+      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+        Kernel<pad_grad<xpu, req_type, NDim>, xpu>::Launch(
+          s, dsize, out_data.dptr<DType>(), in_data.dptr<DType>(),
+          idptr, odptr, width);
+      });
+    });
+  })
 }
-
 template <typename xpu>
 void NumpyPadOpForward(const nnvm::NodeAttrs& attrs,
                        const OpContext& ctx,
@@ -746,7 +778,8 @@ void NumpyPadOpForward(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
-  CHECK_EQ(req[0], kWriteTo);
+  CHECK(req[0] != kNullOp);
+  CHECK(req[0] != kWriteInplace);
   Stream<xpu> *s = ctx.get_stream<xpu>();
   const TBlob& in_data = inputs[0];
   const TBlob& out_data = outputs[0];
@@ -792,15 +825,51 @@ void NumpyPadOpBackward(const nnvm::NodeAttrs& attrs,
                         const std::vector<TBlob>& inputs,
                         const std::vector<OpReqType>& req,
                         const std::vector<TBlob>& outputs) {
-  using namespace mxnet_op;
-  using namespace mshadow;
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  const TBlob& in_data = inputs[0];
-  const TBlob& out_data = outputs[0];
-  NumpyPadOpBackImpl<xpu>(in_data, out_data,
-                          out_data.Size(), req, s);
+  MXNET_NDIM_SWITCH(inputs[0].ndim(), NDim, {
+    using namespace mxnet_op;
+    using namespace mshadow;
+    CHECK_EQ(inputs.size(), 1U);
+    CHECK_EQ(outputs.size(), 1U);
+    CHECK_EQ(req.size(), 1U);
+    CHECK(req[0] != kNullOp);
+    CHECK(req[0] != kWriteInplace);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const TBlob& in_data = inputs[0];
+    const TBlob& out_data = outputs[0];
+    size_t ts = in_data.ndim();
+    size_t count;
+    mshadow::Shape<NDim> inshape;
+    for (count = 0; count < ts; count++) {
+      inshape[count] = static_cast<index_t>((in_data.shape_)[count]);
+    }
+
+    Tensor<xpu, 1, index_t> tsp = ctx.requested[0].
+        get_space_typed<xpu, 1, index_t>(Shape1(2*ts), s);
+    Tensor<cpu, 1, index_t> ta(reinterpret_cast<index_t*>(inshape.shape_),
+                               Shape1(ts), ctx.get_stream<cpu>());
+    Tensor<xpu, 1, index_t> ti(reinterpret_cast<index_t*>(tsp.dptr_),
+                               Shape1(ts), ctx.get_stream<xpu>());
+    mshadow::Copy(ti, ta, ctx.get_stream<xpu>());
+
+    mshadow::Shape<NDim> outshape;
+    for (count = 0; count < ts; count++) {
+      outshape[count] = static_cast<index_t>((out_data.shape_)[count]);
+    }
+    index_t* wcp = tsp.dptr_;
+    wcp += ts;
+    Tensor<cpu, 1, index_t> tb(reinterpret_cast<index_t*>(outshape.shape_),
+                               Shape1(ts), ctx.get_stream<cpu>());
+    Tensor<xpu, 1, index_t> to(reinterpret_cast<index_t*>(wcp), Shape1(ts),
+                               ctx.get_stream<xpu>());
+    mshadow::Copy(to, tb, ctx.get_stream<xpu>());
+    const NumpyPadParam& param = nnvm::get<NumpyPadParam>(attrs.parsed);
+
+    index_t* wt = reinterpret_cast<index_t*>(to.dptr_);
+    index_t* wi = reinterpret_cast<index_t*>(ti.dptr_);
+
+    NumpyPadOpBackImpl<xpu>(in_data, out_data, wi,
+                            wt, out_data.Size(), param, req, s);
+  })
 }
 
 }  // namespace op
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index cc97821fba94..16e25b234ed5 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -8431,19 +8431,28 @@ def hybrid_forward(self,F,A,**kwargs):
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol = rtol, atol = atol)
 
             # test gradient
-            mx_out.backward()
-            np_backward = np.ones(shape)
-            assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=rtol, atol=atol)
-
-            # test imperative once again
-
-            if(m != 'constant'):
-                np_out = _np.pad(x.asnumpy(), pw, mode=m)
-                mx_out = np.pad(x, pw, mode=m)
-            else:
-                np_out = _np.pad(x.asnumpy(), pw, constant_values=0, mode=m)
-                mx_out = np.pad(x, pw, mode=m, constant_values=0)
-            assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+            if m == "constant":
+                ctx = mx.context.current_context()
+                x = mx.np.random.uniform(-1.0, 1.0, size=shape)
+                x = mx.np.array(x, ctx=ctx)
+                for grad_req in ['write', 'add']:
+                    x.attach_grad(grad_req)
+                    if grad_req == 'add':
+                        init_grad = mx.np.random.uniform(-1.0, 1.0, size=shape, ctx=ctx)
+                        x.grad[:] = init_grad
+                    with mx.autograd.record():
+                        mx_out = mx.np.pad(x, pad_width=pw, mode="constant")
+                        out_grad = mx.np.random.normal(0, 1, mx_out.shape)
+                        out_grad = mx.np.array(out_grad, ctx=ctx)
+                        loss = mx_out * out_grad
+                        loss = loss.sum()
+                        loss.backward()
+                    gt_in_grad = mx.np.pad(mx.np.ones_like(x.grad), pad_width=pw, mode="constant") * mx.np.array(out_grad, ctx=ctx)
+                    mx_grad = x.grad
+                    if grad_req == 'add':
+                        assert_almost_equal(mx.np.pad(mx_grad - init_grad, pad_width=pw, mode="constant"), gt_in_grad.asnumpy(), rtol=rtol, atol=atol)
+                    else:
+                        assert_almost_equal(mx.np.pad(mx_grad, pad_width=pw, mode="constant"), gt_in_grad.asnumpy(), rtol=rtol, atol=atol)
 
 
 @with_seed()
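
For reference, and not part of the patch itself: with mode="constant", the gradient of np.pad with respect to its input is the padded output gradient sliced back to the input shape, which is what the rewritten pad_grad kernel computes per element by offsetting each coordinate of the unpadded gradient by the leading pad width before indexing the padded gradient. A minimal NumPy sketch of that reference behaviour follows; the helper name pad_grad_reference is hypothetical and chosen only for illustration, and pad_width is assumed to use numpy.pad's ((before, after), ...) layout.

    # Illustration only: reference semantics of the constant-mode pad gradient.
    # `pad_grad_reference` is a hypothetical helper, not an MXNet API.
    import numpy as np

    def pad_grad_reference(out_grad, pad_width):
        # Slice the padded output gradient back to the unpadded input shape:
        # grad_in[idx] = grad_out[idx + pad_before] for every input coordinate,
        # mirroring the per-element index shift done by the pad_grad kernel.
        slices = tuple(slice(before, dim - after)
                       for (before, after), dim in zip(pad_width, out_grad.shape))
        return out_grad[slices]

    x = np.random.uniform(size=(2, 3))
    pw = ((1, 2), (3, 4))
    y = np.pad(x, pw, mode="constant")
    # d(sum(y))/dx is all ones; slicing ones_like(y) back recovers ones_like(x).
    grad_x = pad_grad_reference(np.ones_like(y), pw)
    assert grad_x.shape == x.shape and np.all(grad_x == 1.0)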