From f0c72642b04e4e7ab4ff92a4a3fa4c56835e11af Mon Sep 17 00:00:00 2001
From: "Yuan, Pengxin"
Date: Thu, 24 Jan 2019 17:47:26 -0800
Subject: [PATCH 01/10] mkl_func test with erf & log ops, build success

---
 src/operator/mkl_functions-inl.h              | 126 ++++++++++++++++++
 src/operator/tensor/elemwise_unary_op.h       |  76 ++++++-----
 .../tensor/elemwise_unary_op_basic.cc         |  30 ++++-
 3 files changed, 193 insertions(+), 39 deletions(-)
 create mode 100644 src/operator/mkl_functions-inl.h

diff --git a/src/operator/mkl_functions-inl.h b/src/operator/mkl_functions-inl.h
new file mode 100644
index 000000000000..c31accf967c4
--- /dev/null
+++ b/src/operator/mkl_functions-inl.h
@@ -0,0 +1,126 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * Copyright (c) 2018 by Contributors
 * \file mkl_functions-inl.h
 * \brief Wrapper functions for MKL (VML) vector math routines
 * \author
*/
#ifndef MXNET_OPERATOR_MKL_FUNCTIONS_H_
#define MXNET_OPERATOR_MKL_FUNCTIONS_H_

#if MSHADOW_USE_MKL == 1
#include "mkl.h"

namespace mxnet {
namespace op {
namespace mkl_func {

MSHADOW_XINLINE
static bool check_size(const size_t n) {
  const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ?
INT_MAX : LLONG_MAX; + return (n <= MKL_INT_MAX); +} + +MSHADOW_XINLINE +static bool check_type(const int t) { + return (t == mshadow::kFloat32 || t == mshadow::kFloat64); +} + +#define MXNET_MKL_UNARY_MATH_FUNC(name, func) \ + struct name : public mxnet_op::tunable { \ + template \ + MSHADOW_XINLINE static void Map(const index_t n, const DType *src, float *dst) { \ + vs##func(static_cast(n), reinterpret_cast(src), dst); \ + } \ + MSHADOW_XINLINE static void Map(const index_t n, const double *src, double *dst) { \ + vd##func(static_cast(n), src, dst); \ + } \ + } + + +#define MXNET_MKL_BINARY_MATH_FUNC(name, func) \ + struct name : public mxnet_op::tunable { \ + template \ + MSHADOW_XINLINE static void Map(const index_t n, \ + const DType *a, \ + const DType *b, \ + float *c) { \ + vs##func(static_cast(n), \ + reinterpret_cast(a), \ + reinterpret_cast(b), \ + c); \ + } \ + MSHADOW_XINLINE static void Map(const index_t n, \ + const double *a, \ + const double *b, \ + double *c) { \ + vd##func(static_cast(n), a, b, c); \ + } \ + } + +MXNET_MKL_UNARY_MATH_FUNC(erf, Erf); +MXNET_MKL_UNARY_MATH_FUNC(exp, Exp); +MXNET_MKL_UNARY_MATH_FUNC(exp2, Exp2); +MXNET_MKL_UNARY_MATH_FUNC(exp10, Exp10); +MXNET_MKL_UNARY_MATH_FUNC(expm1, Expm1); +MXNET_MKL_UNARY_MATH_FUNC(log, Ln); +MXNET_MKL_UNARY_MATH_FUNC(log2, Log2); +MXNET_MKL_UNARY_MATH_FUNC(log10, Log10); +MXNET_MKL_UNARY_MATH_FUNC(log1p, Log1p); + +MXNET_MKL_UNARY_MATH_FUNC(sin, Sin); +MXNET_MKL_UNARY_MATH_FUNC(cos, Cos); +MXNET_MKL_UNARY_MATH_FUNC(tan, Tan); +MXNET_MKL_UNARY_MATH_FUNC(asin, Asin); +MXNET_MKL_UNARY_MATH_FUNC(acos, Acos); +MXNET_MKL_UNARY_MATH_FUNC(atan, Atan); + +MXNET_MKL_UNARY_MATH_FUNC(sinh, Sinh); +MXNET_MKL_UNARY_MATH_FUNC(cosh, Cosh); +MXNET_MKL_UNARY_MATH_FUNC(tanh, Tanh); +MXNET_MKL_UNARY_MATH_FUNC(asinh, Asinh); +MXNET_MKL_UNARY_MATH_FUNC(acosh, Acosh); +MXNET_MKL_UNARY_MATH_FUNC(atanh, Atanh); + +MXNET_MKL_UNARY_MATH_FUNC(sqrt, Sqrt); +MXNET_MKL_UNARY_MATH_FUNC(abs, Abs); +MXNET_MKL_UNARY_MATH_FUNC(cbrt, Cbrt); +MXNET_MKL_UNARY_MATH_FUNC(round, Round); +MXNET_MKL_UNARY_MATH_FUNC(ceil, Ceil); +MXNET_MKL_UNARY_MATH_FUNC(floor, Floor); +MXNET_MKL_UNARY_MATH_FUNC(trunc, Trunc); + +MXNET_MKL_UNARY_MATH_FUNC(lgamma, LGamma); +MXNET_MKL_UNARY_MATH_FUNC(tgamma, TGamma); +MXNET_MKL_UNARY_MATH_FUNC(square, Sqr); + +MXNET_MKL_BINARY_MATH_FUNC(add, Add); +MXNET_MKL_BINARY_MATH_FUNC(sub, Sub); +MXNET_MKL_BINARY_MATH_FUNC(mul, Mul); +MXNET_MKL_BINARY_MATH_FUNC(pow, Pow); +MXNET_MKL_BINARY_MATH_FUNC(hypot, Hypot); + + +} // namespace mkl_func +} // namespace op +} // namespace mxnet +#endif // MSHADOW_USE_MKL == 1 +#endif // MXNET_OPERATOR_MKL_FUNCTIONS_H_ diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 3085f6d2256a..9338fc60352b 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -35,9 +35,10 @@ #include "../mxnet_op.h" #include "../elemwise_op_common.h" #include "../../ndarray/ndarray_function.h" + #if MSHADOW_USE_MKL == 1 -#include "mkl.h" -#endif +#include "../mkl_functions-inl.h" +#endif // MSHADOW_USE_MKL == 1 namespace mxnet { namespace op { @@ -354,42 +355,27 @@ class UnaryOp : public OpBase { } #if MSHADOW_USE_MKL == 1 - static inline void MKLLog(MKL_INT size, const float* pIn, float* pOut) { - vsLn(size, pIn, pOut); - } - - static inline void MKLLog(MKL_INT size, const double* pIn, double* pOut) { - vdLn(size, pIn, pOut); - } -#endif - - template - static void LogCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - 
const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (req[0] == kNullOp) return; - // if defined MSHADOW_USE_MKL then call mkl log when req is KWriteTo, type_flag - // is mshadow::kFloat32 or mshadow::kFloat64 and data size less than or equal MKL_INT_MAX -#if MSHADOW_USE_MKL == 1 - auto type_flag = inputs[0].type_flag_; - const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ? INT_MAX : LLONG_MAX; - size_t input_size = inputs[0].Size(); - if (req[0] == kWriteTo && - input_size <= MKL_INT_MAX && - (type_flag == mshadow::kFloat32 || type_flag == mshadow::kFloat64)) { - MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, { - MKLLog(input_size, inputs[0].dptr(), outputs[0].dptr()); - }); - } else { - Compute(attrs, ctx, inputs, req, outputs); - } -#else - Compute(attrs, ctx, inputs, req, outputs); -#endif +template +static void MKL_Compute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs){ + if (req[0] == kNullOp) return; + auto type_flag = inputs[0].type_flag_; + size_t input_size = inputs[0].Size(); + if (req[0] == kWriteTo && + mkl_func::check_size(input_size) && + mkl_func::check_type(type_flag)){ + // set DType as float or double according to type_flag + MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, { + MKL_OP::Map(input_size, inputs[0].dptr(), outputs[0].dptr()); + }); + } else { + Compute(attrs, ctx, inputs, req, outputs); } -}; + } +#endif // MSHADOW_USE_MKL == 1 /*! \brief Map legacy unary_bwd to backward_grad */ template @@ -562,6 +548,22 @@ struct ReshapeLikeParam : public dmlc::Parameter { }) \ .add_argument("data", "NDArray-or-Symbol", "The input array.") +/*! \bried MKL Unary compute. + * With this macro means mxnet compile with MKL to accelerate math function with mkl. + * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. + */ +#define MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ + NNVM_REGISTER_OP(__name$) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferType", ElemwiseType<1, 1>) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{0, 0}}; \ + }) \ + .add_argument("data", "NDArray-or-Symbol", "The input array.") + /*! \brief Unary compute, with FComputeEx for csr and rsp available */ #define MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$) \ MXNET_OPERATOR_REGISTER_UNARY(__name$) \ diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 4aaf4dfd33c4..084820f6bffe 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -899,8 +899,19 @@ The storage type of ``cbrt`` output depends upon the input storage type: MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_cbrt, unary_bwd); - // erf +#if MSHADOW_USE_MKL == 1 +MXNET_MKL_OPERATOR_REGISTER_UNARY(erf) +.describe(R"code(Returns element-wise gauss error function of the input. + +Example:: + + erf([0, -1., 10.]) = [0., -0.8427, 1.] + +)code" ADD_FILELINE) +.set_attr("FCompute", UnaryOp::MKL_Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_erf"}); +#else MXNET_OPERATOR_REGISTER_UNARY(erf) .describe(R"code(Returns element-wise gauss error function of the input. 
@@ -911,6 +922,7 @@ Example:: )code" ADD_FILELINE) .set_attr("FCompute", UnaryOp::Compute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_erf"}); +#endif // MSHADOW_USE_MKL == 1 MXNET_OPERATOR_REGISTER_BINARY(_backward_erf) .set_attr("FCompute", @@ -970,6 +982,18 @@ The storage type of ``exp`` output is always dense .set_attr("FGradient", ElemwiseGradUseOut{"_mul"}); // log +#if MSHADOW_USE_MKL == 1 +MXNET_MKL_OPERATOR_REGISTER_UNARY(log) +.describe(R"code(Returns element-wise Natural logarithmic value of the input. + +The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x`` + +The storage type of ``log`` output is always dense + +)code" ADD_FILELINE) +.set_attr("FCompute", UnaryOp::MKL_Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_log"}); +#else MXNET_OPERATOR_REGISTER_UNARY(log) MXNET_ADD_SPARSE_OP_ALIAS(log) .describe(R"code(Returns element-wise Natural logarithmic value of the input. @@ -979,8 +1003,10 @@ The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x`` The storage type of ``log`` output is always dense )code" ADD_FILELINE) -.set_attr("FCompute", UnaryOp::LogCompute) +.set_attr("FCompute", UnaryOp::Compute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_log"}); +#endif // MSHADOW_USE_MKL == 1 + // log10 MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(log10, cpu, mshadow_op::log10) From 931177741a1b84799141f03cc31e08a9830783f3 Mon Sep 17 00:00:00 2001 From: Tao Lv Date: Fri, 25 Jan 2019 10:11:20 +0800 Subject: [PATCH 02/10] fix lint and build issues --- src/operator/mkl_functions-inl.h | 9 +++--- src/operator/tensor/elemwise_unary_op.h | 40 ++++++++++++------------- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/operator/mkl_functions-inl.h b/src/operator/mkl_functions-inl.h index c31accf967c4..f3615f4ad17e 100644 --- a/src/operator/mkl_functions-inl.h +++ b/src/operator/mkl_functions-inl.h @@ -23,8 +23,8 @@ * \brief * \author */ -#ifndef MXNET_OPERATOR_MKL_FUNCTIONS_H_ -#define MXNET_OPERATOR_MKL_FUNCTIONS_H_ +#ifndef MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_ +#define MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_ #if MSHADOW_USE_MKL == 1 #include "mkl.h" @@ -54,7 +54,6 @@ static bool check_type(const int t) { vd##func(static_cast(n), src, dst); \ } \ } - #define MXNET_MKL_BINARY_MATH_FUNC(name, func) \ struct name : public mxnet_op::tunable { \ @@ -74,7 +73,7 @@ static bool check_type(const int t) { double *c) { \ vd##func(static_cast(n), a, b, c); \ } \ - } + } MXNET_MKL_UNARY_MATH_FUNC(erf, Erf); MXNET_MKL_UNARY_MATH_FUNC(exp, Exp); @@ -123,4 +122,4 @@ MXNET_MKL_BINARY_MATH_FUNC(hypot, Hypot); } // namespace op } // namespace mxnet #endif // MSHADOW_USE_MKL == 1 -#endif // MXNET_OPERATOR_MKL_FUNCTIONS_H_ +#endif // MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_ diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 9338fc60352b..00c381f77589 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -355,27 +355,28 @@ class UnaryOp : public OpBase { } #if MSHADOW_USE_MKL == 1 -template -static void MKL_Compute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs){ - if (req[0] == kNullOp) return; - auto type_flag = inputs[0].type_flag_; - size_t input_size = inputs[0].Size(); - if (req[0] == kWriteTo && - mkl_func::check_size(input_size) && - mkl_func::check_type(type_flag)){ - // set DType as float or double according to type_flag - 
MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, { - MKL_OP::Map(input_size, inputs[0].dptr(), outputs[0].dptr()); - }); - } else { - Compute(attrs, ctx, inputs, req, outputs); + template + static void MKL_Compute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) return; + auto type_flag = inputs[0].type_flag_; + size_t input_size = inputs[0].Size(); + if (req[0] == kWriteTo && + mkl_func::check_size(input_size) && + mkl_func::check_type(type_flag)) { + // set DType as float or double according to type_flag + MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, { + MKL_OP::Map(input_size, inputs[0].dptr(), outputs[0].dptr()); + }); + } else { + Compute(attrs, ctx, inputs, req, outputs); + } } - } #endif // MSHADOW_USE_MKL == 1 +}; /*! \brief Map legacy unary_bwd to backward_grad */ template @@ -586,7 +587,6 @@ struct ReshapeLikeParam : public dmlc::Parameter { #define MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$) \ MXNET_OPERATOR_REGISTER_UNARY(__name$) \ .set_attr("FCompute<" #__xpu$ ">", UnaryOp::Compute<__xpu$, __kernel$>) - } // namespace op } // namespace mxnet From a79f7dbdd3dfb37e21be0a20dfc411a78136be50 Mon Sep 17 00:00:00 2001 From: "Wu, Shufan" Date: Fri, 22 Feb 2019 05:18:29 -0800 Subject: [PATCH 03/10] Try to add support to sparse array --- src/operator/tensor/elemwise_unary_op.h | 57 ++++++++++++++++- .../tensor/elemwise_unary_op_basic.cc | 39 ++++++++++++ tests/python/gpu/test_operator_gpu.py | 63 +++++++++++++++++++ 3 files changed, 158 insertions(+), 1 deletion(-) diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 00c381f77589..0aa49119b094 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -265,6 +265,27 @@ class UnaryOp : public OpBase { } } +#if MSHADOW_USE_MKL == 1 + template + static void MKL_ComputeEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U) + << "Invalid input, only one input is allowed"; + CHECK_EQ(outputs.size(), 1U) + << "Invalid output, only one output is allowed"; + CHECK_NE(inputs[0].storage_type(), kDefaultStorage) + << "Operation requires a sparse output storage type"; + CHECK_NE(outputs[0].storage_type(), kDefaultStorage) + << "Operation requires a sparse output storage type"; + if (inputs[0].storage_shape().Size()) { + MapToFCompute(attrs, ctx, inputs, req, outputs, MKL_Compute); + } + } +#endif + template static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -371,6 +392,12 @@ class UnaryOp : public OpBase { MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, { MKL_OP::Map(input_size, inputs[0].dptr(), outputs[0].dptr()); }); + } else if (req[0] == kWriteInplace && + mkl_func::check_size(input_size) && + mkl_func::check_type(type_flag)) { + MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, { + MKL_OP::Map(input_size, inputs[0].dptr(), inputs[0].dptr()); + }); } else { Compute(attrs, ctx, inputs, req, outputs); } @@ -565,6 +592,34 @@ struct ReshapeLikeParam : public dmlc::Parameter { }) \ .add_argument("data", "NDArray-or-Symbol", "The input array.") +#if MSHADOW_USE_MKL == 1 + /*! \bried MKL Unary compute. + * * With this macro means mxnet compile with MKL to accelerate math function with mkl. + * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. 
+ */ + #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \ + MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ + MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ + .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \ + .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ + .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>) + + /*! \bried MKL Unary compute. + * * With this macro means mxnet compile with MKL to accelerate math function with mkl. + * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. + */ + #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \ + MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ + MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ + .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>) \ + .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ + .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kerbel$>) + + #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$) \ + MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ + .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) +#endif + /*! \brief Unary compute, with FComputeEx for csr and rsp available */ #define MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$) \ MXNET_OPERATOR_REGISTER_UNARY(__name$) \ @@ -582,7 +637,7 @@ struct ReshapeLikeParam : public dmlc::Parameter { .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::ComputeEx<__xpu$, __kernel$>) /*! \brief Unary compute, dense result. - * FInferStorageType attr is not set using this macro. By default DefaultStorageType is used. + * * FInferStorageType attr is not set using this macro. By default DefaultStorageType is used. */ #define MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$) \ MXNET_OPERATOR_REGISTER_UNARY(__name$) \ diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 084820f6bffe..c26f3949143f 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -812,6 +812,26 @@ The storage type of ``fix`` output depends upon the input storage type: .set_attr("FGradient", MakeZeroGradNodes); // square +#if MSHADOW_USE_MKL == 1 +MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(square, cpu, mshadow_op::square, mkl_func::square) +.describe(R"code(Returns element-wise squared value of the input. + +.. math:: + square(x) = x^2 + +Example:: + + square([2, 3, 4]) = [4, 9, 16] + +The storage type of ``square`` output depends upon the input storage type: + + - square(default) = default + - square(row_sparse) = row_sparse + - square(csr) = csr + +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_square"}); +#else MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(square, cpu, mshadow_op::square) .describe(R"code(Returns element-wise squared value of the input. 
@@ -830,6 +850,7 @@ The storage type of ``square`` output depends upon the input storage type: )code" ADD_FILELINE) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_square"}); +#endif MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_square, unary_bwd); @@ -965,6 +986,23 @@ MXNET_OPERATOR_REGISTER_BINARY(_backward_rcbrt) unary_bwd>); // exp +#if MSHADOW_USE_MKL == 1 +MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(exp, cpu, mshadow_op::exp, mkl_func::exp) +MXNET_ADD_SPARSE_OP_ALIAS(exp) +.describe(R"code(Returns element-wise exponential value of the input. + +.. math:: + exp(x) = e^x \approx 2.718^x + +Example:: + + exp([0, 1, 2]) = [1., 2.71828175, 7.38905621] + +The storage type of ``exp`` output is always dense + +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseOut{"_mul"}); +#else MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(exp, cpu, mshadow_op::exp) MXNET_ADD_SPARSE_OP_ALIAS(exp) .describe(R"code(Returns element-wise exponential value of the input. @@ -980,6 +1018,7 @@ The storage type of ``exp`` output is always dense )code" ADD_FILELINE) .set_attr("FGradient", ElemwiseGradUseOut{"_mul"}); +#endif // log #if MSHADOW_USE_MKL == 1 diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 4dbf82edd3f5..2007b93ef5ee 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -2121,6 +2121,69 @@ def test_context_num_gpus(): # Test that num_gpus reports at least one GPU, as the test is run on a GPU host. assert mx.context.num_gpus() > 0 +def math_log(shape, dtype, check_value): + np_x = np.random.rand(shape[0], shape[1]) + x = mx.nd.array(np_x, dtype=dtype) + mx.nd.waitall() + y = mx.nd.log(data=x) + y.wait_to_read() + if check_value: + x_ = x.as_in_context(mx.cpu()) + mx.nd.waitall() + y_ = mx.nd.log(data=x_) + y_.wait_to_read() + assert_almost_equal(y.asnumpy(), y_.asnumpy()) + +def math_erf(shape, dtype, check_value): + np_x = np.random.rand(shape[0], shape[1]) + x = mx.nd.array(np_x, dtype=dtype) + mx.nd.waitall() + y = mx.nd.erf(data=x) + y.wait_to_read() + if check_value: + x_ = x.as_in_context(mx.cpu()) + mx.nd.waitall() + y_ = mx.nd.erf(data=x_) + y_.wait_to_read() + assert_almost_equal(y.asnumpy(), y_.asnumpy()) + +def math_square(shape, dtype, check_value): + np_x = np.random.rand(shape[0], shape[1]) + x = mx.nd.array(np_x, dtype=dtype) + mx.nd.waitall() + y = mx.nd.square(data=x) + y.wait_to_read() + if check_value: + x_ = x.as_in_context(mx.cpu()) + mx.nd.waitall() + y_ = mx.nd.square(data=x_) + y_.wait_to_read() + assert_almost_equal(y.asnumpy(), y_.asnumpy()) + +def run_math(op, shape, dtype="float32", check_value=True): + run_num = 10 + for i in range(run_num): + if op == 'log': + math_log(shape=shape, dtype=dtype, check_value=check_value) + elif op == 'erf': + math_erf(shape=shape, dtype=dtype, check_value=check_value) + elif op == 'square': + math_square(shape=shape, dtype=dtype, check_value=check_value) + +@with_seed() +def test_math(): + ops = ['log', 'erf', 'square'] + check_value= True + lshape = 1000 + rshapes = [1, 10, 100, 1000, 10000] + dtypes = ["float32", "float64"] + for rshape in rshapes: + shape = (lshape, rshape) + print("shape:(%d, %d), " % (lshape, rshape), end="") + for dtype in dtypes: + for op in ops: + run_math(op, shape, dtype, check_value=check_value) + if __name__ == '__main__': import nose nose.runmodule() From 015fd0a6ae68a0b0c6d15de5aad8cfc4dd7cbadd Mon Sep 17 00:00:00 2001 From: Tao Lv Date: Sun, 3 Mar 2019 12:57:40 +0800 Subject: [PATCH 04/10] 
fix build --- src/operator/tensor/elemwise_unary_op.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 0aa49119b094..600803c953f6 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -385,19 +385,13 @@ class UnaryOp : public OpBase { if (req[0] == kNullOp) return; auto type_flag = inputs[0].type_flag_; size_t input_size = inputs[0].Size(); - if (req[0] == kWriteTo && + if ((req[0] == kWriteTo || req[0] == kWriteInplace) && mkl_func::check_size(input_size) && mkl_func::check_type(type_flag)) { // set DType as float or double according to type_flag MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, { MKL_OP::Map(input_size, inputs[0].dptr(), outputs[0].dptr()); }); - } else if (req[0] == kWriteInplace && - mkl_func::check_size(input_size) && - mkl_func::check_type(type_flag)) { - MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, { - MKL_OP::Map(input_size, inputs[0].dptr(), inputs[0].dptr()); - }); } else { Compute(attrs, ctx, inputs, req, outputs); } @@ -584,7 +578,7 @@ struct ReshapeLikeParam : public dmlc::Parameter { NNVM_REGISTER_OP(__name$) \ .set_num_inputs(1) \ .set_num_outputs(1) \ - .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ .set_attr("FInferType", ElemwiseType<1, 1>) \ .set_attr("FInplaceOption", \ [](const NodeAttrs& attrs){ \ From 672be6a715b59a6c01f745a039d758c8ecf325fe Mon Sep 17 00:00:00 2001 From: Tao Lv Date: Thu, 18 Apr 2019 14:39:26 +0800 Subject: [PATCH 05/10] add functions --- src/operator/mkl_functions-inl.h | 151 +++++++++++++++++++----- src/operator/tensor/elemwise_unary_op.h | 48 ++++---- 2 files changed, 146 insertions(+), 53 deletions(-) diff --git a/src/operator/mkl_functions-inl.h b/src/operator/mkl_functions-inl.h index f3615f4ad17e..b224d08ab126 100644 --- a/src/operator/mkl_functions-inl.h +++ b/src/operator/mkl_functions-inl.h @@ -44,36 +44,31 @@ static bool check_type(const int t) { return (t == mshadow::kFloat32 || t == mshadow::kFloat64); } -#define MXNET_MKL_UNARY_MATH_FUNC(name, func) \ - struct name : public mxnet_op::tunable { \ - template \ - MSHADOW_XINLINE static void Map(const index_t n, const DType *src, float *dst) { \ - vs##func(static_cast(n), reinterpret_cast(src), dst); \ - } \ - MSHADOW_XINLINE static void Map(const index_t n, const double *src, double *dst) { \ - vd##func(static_cast(n), src, dst); \ - } \ - } +#define MXNET_MKL_UNARY_MATH_FUNC(name, func) \ +struct name { \ + MSHADOW_XINLINE static void Vectorize(const index_t n, const float *src, float *dst) { \ + vs##func(static_cast(n), src, dst); \ + } \ + MSHADOW_XINLINE static void Vectorize(const index_t n, const double *src, double *dst) { \ + vd##func(static_cast(n), src, dst); \ + } \ +}; -#define MXNET_MKL_BINARY_MATH_FUNC(name, func) \ - struct name : public mxnet_op::tunable { \ - template \ - MSHADOW_XINLINE static void Map(const index_t n, \ - const DType *a, \ - const DType *b, \ - float *c) { \ - vs##func(static_cast(n), \ - reinterpret_cast(a), \ - reinterpret_cast(b), \ - c); \ - } \ - MSHADOW_XINLINE static void Map(const index_t n, \ - const double *a, \ - const double *b, \ - double *c) { \ - vd##func(static_cast(n), a, b, c); \ - } \ - } +#define MXNET_MKL_BINARY_MATH_FUNC(name, func) \ +struct name { \ + MSHADOW_XINLINE static void Vectorize(const index_t n, \ + const float *a, \ + const float *b, \ + float *c) { \ + vs##func(static_cast(n), a, b, c); \ + } 
\ + MSHADOW_XINLINE static void Vectorize(const index_t n, \ + const double *a, \ + const double *b, \ + double *c) { \ + vd##func(static_cast(n), a, b, c); \ + } \ +}; MXNET_MKL_UNARY_MATH_FUNC(erf, Erf); MXNET_MKL_UNARY_MATH_FUNC(exp, Exp); @@ -118,6 +113,104 @@ MXNET_MKL_BINARY_MATH_FUNC(pow, Pow); MXNET_MKL_BINARY_MATH_FUNC(hypot, Hypot); +template +MSHADOW_XINLINE static void sub_(index_t n, DType *in, DType b, DType *dst) { + for (index_t i = 0; i < n; i++) + dst[i] = in[i] - b; +} + +template +MSHADOW_XINLINE static void div_(index_t n, DType *in, DType b, DType *dst) { + for (index_t i = 0; i < n; i++) + dst[i] = in[i] / b; +} + +template +MSHADOW_XINLINE static void sum_(index_t n, DType *in, DType *dst) { + // dst[0] = cblas_sasum(n, in, 1); + DType sum = 0.0f; + for (index_t i = 0; i < n; i++) + sum += in[i]; + + dst[0] = sum; +} + +template +MSHADOW_XINLINE static void max_(int n, DType * __restrict__ in, DType *dst) { + dst[0] = in[0]; + for (int i = 1; i < n; i++) + dst[0] = (dst[0] < in[i]) ? in[i] : dst[0]; +} + +// LayerNorm on the last dimension +template +MSHADOW_XINLINE static void LayerNormLastDim(const index_t m, + const index_t n, + const DType *a, + const DType *b, + const DType *ws, + const DType *gamma, + const DType *beta, + const DType *mean, + const DType *var, + const DType eps) { +#pragma omp parallel for + for (index_t i = 0; i < m; i++) { + DType* in_offset = a + i * n; + DType* out_offset = b + i * n; + DType* ws_offset = ws + i * n; + + sum_(n, in_offset, &(mean[i])); + mean[i] /= n; + sub_(n, in_offset, mean[i], out_offset); + square(n, out_offset, ws_offset); + sum_(n, ws_offset, &(var[i])); + var[i] = sqrt(var[i] / n + eps); + + mul(n, out_offset, gamma, out_offset); + div_(n, out_offset, var[i], out_offset); + add(n, out_offset, beta, out_offset); + } +} + +// softmax on the last dimension +template +MSHADOW_XINLINE static void SoftmaxLastDim(const index_t m, + const index_t n, + const DType *a, + const DType *b) { +#pragma omp paralle for + for (index_t i = 0; i < m; i++) { + DType* in_offset = a + i * n; + DType* out_offset = b + i * n; + + exp(n, in_offset, out_offset); + float sum = 0.0f; + sum_(n, out_offset, &sum); + div_(n, out_offset, sum, out_offset); + } +} + +template +MSHADOW_XINLINE static void LogSoftmaxLastDim(const index_t m, + const index_t n, + const DType *a, + const DType *b) { +#pragma parallel for + for (index_t i = 0; i < m; i++) { + DType* in_offset = a + i * n; + DType* out_offset = b + i * n; + + DType b, logsum; + max_(n, in_offset, &b); + sub_(n, in_offset, b, out_offset); + exp(n, out_offset, out_offset); + sum_(n, out_offset, &logsum); + logsum = b + logf(logsum); + sub_(n, in_offset, logsum, out_offset); + } +} + } // namespace mkl_func } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 600803c953f6..d2d221bbd628 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -390,7 +390,7 @@ class UnaryOp : public OpBase { mkl_func::check_type(type_flag)) { // set DType as float or double according to type_flag MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, { - MKL_OP::Map(input_size, inputs[0].dptr(), outputs[0].dptr()); + MKL_OP::Vectorize(input_size, inputs[0].dptr(), outputs[0].dptr()); }); } else { Compute(attrs, ctx, inputs, req, outputs); @@ -562,7 +562,7 @@ struct ReshapeLikeParam : public dmlc::Parameter { NNVM_REGISTER_OP(__name$) \ .set_num_inputs(1) \ .set_num_outputs(1) \ - 
.set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ .set_attr("FInferType", ElemwiseType<1, 1>) \ .set_attr("FInplaceOption", \ [](const NodeAttrs& attrs){ \ @@ -578,7 +578,7 @@ struct ReshapeLikeParam : public dmlc::Parameter { NNVM_REGISTER_OP(__name$) \ .set_num_inputs(1) \ .set_num_outputs(1) \ - .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ .set_attr("FInferType", ElemwiseType<1, 1>) \ .set_attr("FInplaceOption", \ [](const NodeAttrs& attrs){ \ @@ -591,27 +591,27 @@ struct ReshapeLikeParam : public dmlc::Parameter { * * With this macro means mxnet compile with MKL to accelerate math function with mkl. * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. */ - #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \ - MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ - MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ - .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \ - .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ - .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>) - - /*! \bried MKL Unary compute. - * * With this macro means mxnet compile with MKL to accelerate math function with mkl. - * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. - */ - #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \ - MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ - MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ - .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>) \ - .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ - .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kerbel$>) - - #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$) \ - MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ - .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) +#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \ + MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ + MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ + .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \ + .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ + .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>) + +/*! \bried MKL Unary compute. + * * With this macro means mxnet compile with MKL to accelerate math function with mkl. + * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. +*/ +#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \ + MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ + MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ + .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>)\ + .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ + .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kerbel$>) + +#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$)\ + MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ + .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) #endif /*! 
\brief Unary compute, with FComputeEx for csr and rsp available */ From c69a25c7ec5694a7002a3e44bc2e6821156246d4 Mon Sep 17 00:00:00 2001 From: shufan wu Date: Sat, 9 Mar 2019 16:16:55 +0800 Subject: [PATCH 06/10] Fix review comments --- src/operator/tensor/elemwise_unary_op.h | 42 +++++++++---------- .../tensor/elemwise_unary_op_basic.cc | 35 ++++------------ 2 files changed, 30 insertions(+), 47 deletions(-) diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index d2d221bbd628..26938404018c 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -591,27 +591,27 @@ struct ReshapeLikeParam : public dmlc::Parameter { * * With this macro means mxnet compile with MKL to accelerate math function with mkl. * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. */ -#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \ - MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ - MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ - .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \ - .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ - .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>) - -/*! \bried MKL Unary compute. - * * With this macro means mxnet compile with MKL to accelerate math function with mkl. - * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. -*/ -#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \ - MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ - MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ - .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>)\ - .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ - .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kerbel$>) - -#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$)\ - MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ - .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) + #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \ + MXNET_OPERATOR_REGISTER_UNARY(__name$) \ + MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ + .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \ + .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ + .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>) + + /*! \bried MKL Unary compute. + * * With this macro means mxnet compile with MKL to accelerate math function with mkl. + * * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. 
+ */ + #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \ + MXNET_OPERATOR_REGISTER_UNARY(__name$) \ + MXNET_ADD_SPARSE_OP_ALIAS(__name$) \ + .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>) \ + .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \ + .set_attr("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kerbel$>) + + #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$) \ + MXNET_OPERATOR_REGISTER_UNARY(__name$) \ + .set_attr("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) #endif /*! \brief Unary compute, with FComputeEx for csr and rsp available */ diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index dc876a02a2b4..8e37a488eebc 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -929,9 +929,9 @@ The storage type of ``cbrt`` output depends upon the input storage type: MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_cbrt, unary_bwd); + // erf -#if MSHADOW_USE_MKL == 1 -MXNET_MKL_OPERATOR_REGISTER_UNARY(erf) +MXNET_OPERATOR_REGISTER_UNARY(erf) .describe(R"code(Returns element-wise gauss error function of the input. Example:: @@ -939,20 +939,13 @@ Example:: erf([0, -1., 10.]) = [0., -0.8427, 1.] )code" ADD_FILELINE) +#if MSHADOW_USE_MKL == 1 .set_attr("FCompute", UnaryOp::MKL_Compute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_erf"}); #else -MXNET_OPERATOR_REGISTER_UNARY(erf) -.describe(R"code(Returns element-wise gauss error function of the input. - -Example:: - - erf([0, -1., 10.]) = [0., -0.8427, 1.] - -)code" ADD_FILELINE) .set_attr("FCompute", UnaryOp::Compute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_erf"}); #endif // MSHADOW_USE_MKL == 1 +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_erf"}); + MXNET_OPERATOR_REGISTER_BINARY(_backward_erf) .set_attr("FCompute", @@ -1030,18 +1023,6 @@ The storage type of ``exp`` output is always dense #endif // log -#if MSHADOW_USE_MKL == 1 -MXNET_MKL_OPERATOR_REGISTER_UNARY(log) -.describe(R"code(Returns element-wise Natural logarithmic value of the input. - -The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x`` - -The storage type of ``log`` output is always dense - -)code" ADD_FILELINE) -.set_attr("FCompute", UnaryOp::MKL_Compute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_log"}); -#else MXNET_OPERATOR_REGISTER_UNARY(log) MXNET_ADD_SPARSE_OP_ALIAS(log) .describe(R"code(Returns element-wise Natural logarithmic value of the input. 
@@ -1051,10 +1032,12 @@ The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x`` The storage type of ``log`` output is always dense )code" ADD_FILELINE) +#if MSHADOW_USE_MKL == 1 +.set_attr("FCompute", UnaryOp::MKL_Compute) +#else .set_attr("FCompute", UnaryOp::Compute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_log"}); #endif // MSHADOW_USE_MKL == 1 - +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_log"}); // log10 MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(log10, cpu, mshadow_op::log10) From 2c5c20c694f440a8a9b9335a634a76d70402f03d Mon Sep 17 00:00:00 2001 From: shufan wu Date: Sat, 9 Mar 2019 16:34:04 +0800 Subject: [PATCH 07/10] remove unecessary code --- src/operator/tensor/elemwise_unary_op.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 26938404018c..82a9aa9fe7a1 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -570,22 +570,6 @@ struct ReshapeLikeParam : public dmlc::Parameter { }) \ .add_argument("data", "NDArray-or-Symbol", "The input array.") -/*! \bried MKL Unary compute. - * With this macro means mxnet compile with MKL to accelerate math function with mkl. - * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function. - */ -#define MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \ - NNVM_REGISTER_OP(__name$) \ - .set_num_inputs(1) \ - .set_num_outputs(1) \ - .set_attr("FInferShape", ElemwiseShape<1, 1>) \ - .set_attr("FInferType", ElemwiseType<1, 1>) \ - .set_attr("FInplaceOption", \ - [](const NodeAttrs& attrs){ \ - return std::vector >{{0, 0}}; \ - }) \ - .add_argument("data", "NDArray-or-Symbol", "The input array.") - #if MSHADOW_USE_MKL == 1 /*! \bried MKL Unary compute. * * With this macro means mxnet compile with MKL to accelerate math function with mkl. 
From b1b635558f68a1fec7f1bc287e7d2a1e9f5e7265 Mon Sep 17 00:00:00 2001 From: shufan wu Date: Sun, 10 Mar 2019 09:22:18 +0800 Subject: [PATCH 08/10] Update test case --- tests/python/gpu/test_operator_gpu.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index b037218dc07f..08902563c9b0 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -2200,7 +2200,7 @@ def test_context_num_gpus(): assert mx.context.num_gpus() > 0 def math_log(shape, dtype, check_value): - np_x = np.random.rand(shape[0], shape[1]) + np_x = np.random.rand(*tuple(shape)) x = mx.nd.array(np_x, dtype=dtype) mx.nd.waitall() y = mx.nd.log(data=x) @@ -2213,7 +2213,7 @@ def math_log(shape, dtype, check_value): assert_almost_equal(y.asnumpy(), y_.asnumpy()) def math_erf(shape, dtype, check_value): - np_x = np.random.rand(shape[0], shape[1]) + np_x = np.random.rand(*tuple(shape)) x = mx.nd.array(np_x, dtype=dtype) mx.nd.waitall() y = mx.nd.erf(data=x) @@ -2226,7 +2226,7 @@ def math_erf(shape, dtype, check_value): assert_almost_equal(y.asnumpy(), y_.asnumpy()) def math_square(shape, dtype, check_value): - np_x = np.random.rand(shape[0], shape[1]) + np_x = np.random.rand(*tuple(shape)) x = mx.nd.array(np_x, dtype=dtype) mx.nd.waitall() y = mx.nd.square(data=x) @@ -2252,12 +2252,9 @@ def run_math(op, shape, dtype="float32", check_value=True): def test_math(): ops = ['log', 'erf', 'square'] check_value= True - lshape = 1000 - rshapes = [1, 10, 100, 1000, 10000] + shape_lst = [[1000], [100,1000], [10,100,100], [10,100,100,100]] dtypes = ["float32", "float64"] - for rshape in rshapes: - shape = (lshape, rshape) - print("shape:(%d, %d), " % (lshape, rshape), end="") + for shape in shape_lst: for dtype in dtypes: for op in ops: run_math(op, shape, dtype, check_value=check_value) From f96c34a326c70a846fb7a3bca58d712af59d1817 Mon Sep 17 00:00:00 2001 From: shufan wu Date: Mon, 11 Mar 2019 11:56:17 +0800 Subject: [PATCH 09/10] minor fix --- tests/python/gpu/test_operator_gpu.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 08902563c9b0..533d7b0ed655 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -2202,40 +2202,28 @@ def test_context_num_gpus(): def math_log(shape, dtype, check_value): np_x = np.random.rand(*tuple(shape)) x = mx.nd.array(np_x, dtype=dtype) - mx.nd.waitall() y = mx.nd.log(data=x) - y.wait_to_read() if check_value: x_ = x.as_in_context(mx.cpu()) - mx.nd.waitall() y_ = mx.nd.log(data=x_) - y_.wait_to_read() assert_almost_equal(y.asnumpy(), y_.asnumpy()) def math_erf(shape, dtype, check_value): np_x = np.random.rand(*tuple(shape)) x = mx.nd.array(np_x, dtype=dtype) - mx.nd.waitall() y = mx.nd.erf(data=x) - y.wait_to_read() if check_value: x_ = x.as_in_context(mx.cpu()) - mx.nd.waitall() y_ = mx.nd.erf(data=x_) - y_.wait_to_read() assert_almost_equal(y.asnumpy(), y_.asnumpy()) def math_square(shape, dtype, check_value): np_x = np.random.rand(*tuple(shape)) x = mx.nd.array(np_x, dtype=dtype) - mx.nd.waitall() y = mx.nd.square(data=x) - y.wait_to_read() if check_value: x_ = x.as_in_context(mx.cpu()) - mx.nd.waitall() y_ = mx.nd.square(data=x_) - y_.wait_to_read() assert_almost_equal(y.asnumpy(), y_.asnumpy()) def run_math(op, shape, dtype="float32", check_value=True): From 06c51e974cbaa70939e6ac82c98e5ee1b9a6bdf3 Mon Sep 17 
00:00:00 2001
From: shufan wu
Date: Thu, 18 Apr 2019 22:02:25 +0800
Subject: [PATCH 10/10] move the position of MKL_Compute

---
 src/operator/tensor/elemwise_unary_op.h | 43 ++++++++++++-------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 82a9aa9fe7a1..279efcf97084 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -266,6 +266,27 @@ class UnaryOp : public OpBase {
   }
 
 #if MSHADOW_USE_MKL == 1
+  template<typename OP, typename MKL_OP>
+  static void MKL_Compute(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+    if (req[0] == kNullOp) return;
+    auto type_flag = inputs[0].type_flag_;
+    size_t input_size = inputs[0].Size();
+    if ((req[0] == kWriteTo || req[0] == kWriteInplace) &&
+        mkl_func::check_size(input_size) &&
+        mkl_func::check_type(type_flag)) {
+      // set DType as float or double according to type_flag
+      MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
+        MKL_OP::Vectorize(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
+      });
+    } else {
+      Compute<cpu, OP>(attrs, ctx, inputs, req, outputs);
+    }
+  }
+
   template<typename OP, typename MKL_OP>
   static void MKL_ComputeEx(const nnvm::NodeAttrs& attrs,
                             const OpContext& ctx,
@@ -375,28 +396,6 @@ class UnaryOp : public OpBase {
     }
   }
 
-#if MSHADOW_USE_MKL == 1
-  template<typename OP, typename MKL_OP>
-  static void MKL_Compute(const nnvm::NodeAttrs& attrs,
-                          const OpContext& ctx,
-                          const std::vector<TBlob>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<TBlob>& outputs) {
-    if (req[0] == kNullOp) return;
-    auto type_flag = inputs[0].type_flag_;
-    size_t input_size = inputs[0].Size();
-    if ((req[0] == kWriteTo || req[0] == kWriteInplace) &&
-        mkl_func::check_size(input_size) &&
-        mkl_func::check_type(type_flag)) {
-      // set DType as float or double according to type_flag
-      MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
-        MKL_OP::Vectorize(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
-      });
-    } else {
-      Compute<cpu, OP>(attrs, ctx, inputs, req, outputs);
-    }
-  }
-#endif  // MSHADOW_USE_MKL == 1
 };
 
 /*! \brief Map legacy unary_bwd to backward_grad */
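
Reviewer note (not part of the patch series): the GPU test added above compares mx.nd.log/erf/square on GPU against the CPU path, so it would not catch a wrong result from the MKL-accelerated CPU kernels themselves. A minimal CPU-only sanity check against NumPy/SciPy references is sketched below. It assumes an MKL-enabled MXNet build and an installed SciPy; the helper name check_mkl_math is illustrative only.

# Illustrative sketch, not part of this patch series.
# Compares the MKL-accelerated CPU log/erf/square against NumPy/SciPy references.
import numpy as np
import mxnet as mx
from scipy.special import erf as scipy_erf

def check_mkl_math(shape=(1000, 100), dtype="float32", rtol=1e-5, atol=1e-6):
    # keep inputs positive so log is well defined
    np_x = np.random.rand(*shape).astype(dtype) + 0.5
    x = mx.nd.array(np_x, ctx=mx.cpu(), dtype=dtype)
    cases = {
        "log": (mx.nd.log, np.log),
        "erf": (mx.nd.erf, scipy_erf),
        "square": (mx.nd.square, np.square),
    }
    for name, (mx_op, np_op) in cases.items():
        y = mx_op(x).asnumpy()
        np.testing.assert_allclose(y, np_op(np_x), rtol=rtol, atol=atol)
        print("%s ok for shape %s, dtype %s" % (name, shape, dtype))

if __name__ == "__main__":
    for dt in ("float32", "float64"):
        check_mkl_math(dtype=dt)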