Optimize ERF with MKL math function #5

Open · wants to merge 12 commits into base: master
218 changes: 218 additions & 0 deletions src/operator/mkl_functions-inl.h
@@ -0,0 +1,218 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2018 by Contributors
* \file mkl_functions-inl.h
* \brief MKL (VML) backed element-wise math functions and last-dimension helpers (LayerNorm, softmax, log-softmax)
* \author
*/
#ifndef MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_
#define MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_

#if MSHADOW_USE_MKL == 1
#include <climits>
#include <cmath>

#include "mkl.h"

namespace mxnet {
namespace op {
namespace mkl_func {

MSHADOW_XINLINE
static bool check_size(const size_t n) {
const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ? INT_MAX : LLONG_MAX;
return (n <= MKL_INT_MAX);
}

MSHADOW_XINLINE
static bool check_type(const int t) {
return (t == mshadow::kFloat32 || t == mshadow::kFloat64);
}

#define MXNET_MKL_UNARY_MATH_FUNC(name, func) \
struct name { \
MSHADOW_XINLINE static void Vectorize(const index_t n, const float *src, float *dst) { \
vs##func(static_cast<MKL_INT>(n), src, dst); \
} \
MSHADOW_XINLINE static void Vectorize(const index_t n, const double *src, double *dst) { \
vd##func(static_cast<MKL_INT>(n), src, dst); \
} \
};

#define MXNET_MKL_BINARY_MATH_FUNC(name, func) \
struct name { \
MSHADOW_XINLINE static void Vectorize(const index_t n, \
const float *a, \
const float *b, \
float *c) { \
vs##func(static_cast<MKL_INT>(n), a, b, c); \
} \
MSHADOW_XINLINE static void Vectorize(const index_t n, \
const double *a, \
const double *b, \
double *c) { \
vd##func(static_cast<MKL_INT>(n), a, b, c); \
} \
};

MXNET_MKL_UNARY_MATH_FUNC(erf, Erf);
MXNET_MKL_UNARY_MATH_FUNC(exp, Exp);
MXNET_MKL_UNARY_MATH_FUNC(exp2, Exp2);
MXNET_MKL_UNARY_MATH_FUNC(exp10, Exp10);
MXNET_MKL_UNARY_MATH_FUNC(expm1, Expm1);
MXNET_MKL_UNARY_MATH_FUNC(log, Ln);
MXNET_MKL_UNARY_MATH_FUNC(log2, Log2);
MXNET_MKL_UNARY_MATH_FUNC(log10, Log10);
MXNET_MKL_UNARY_MATH_FUNC(log1p, Log1p);

MXNET_MKL_UNARY_MATH_FUNC(sin, Sin);
MXNET_MKL_UNARY_MATH_FUNC(cos, Cos);
MXNET_MKL_UNARY_MATH_FUNC(tan, Tan);
MXNET_MKL_UNARY_MATH_FUNC(asin, Asin);
MXNET_MKL_UNARY_MATH_FUNC(acos, Acos);
MXNET_MKL_UNARY_MATH_FUNC(atan, Atan);

MXNET_MKL_UNARY_MATH_FUNC(sinh, Sinh);
MXNET_MKL_UNARY_MATH_FUNC(cosh, Cosh);
MXNET_MKL_UNARY_MATH_FUNC(tanh, Tanh);
MXNET_MKL_UNARY_MATH_FUNC(asinh, Asinh);
MXNET_MKL_UNARY_MATH_FUNC(acosh, Acosh);
MXNET_MKL_UNARY_MATH_FUNC(atanh, Atanh);

MXNET_MKL_UNARY_MATH_FUNC(sqrt, Sqrt);
MXNET_MKL_UNARY_MATH_FUNC(abs, Abs);
MXNET_MKL_UNARY_MATH_FUNC(cbrt, Cbrt);
MXNET_MKL_UNARY_MATH_FUNC(round, Round);
MXNET_MKL_UNARY_MATH_FUNC(ceil, Ceil);
MXNET_MKL_UNARY_MATH_FUNC(floor, Floor);
MXNET_MKL_UNARY_MATH_FUNC(trunc, Trunc);

MXNET_MKL_UNARY_MATH_FUNC(lgamma, LGamma);
MXNET_MKL_UNARY_MATH_FUNC(tgamma, TGamma);
MXNET_MKL_UNARY_MATH_FUNC(square, Sqr);

MXNET_MKL_BINARY_MATH_FUNC(add, Add);
MXNET_MKL_BINARY_MATH_FUNC(sub, Sub);
MXNET_MKL_BINARY_MATH_FUNC(mul, Mul);
MXNET_MKL_BINARY_MATH_FUNC(pow, Pow);
MXNET_MKL_BINARY_MATH_FUNC(hypot, Hypot);
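// For reference, a sketch of what the macro above generates (not additional code):
// MXNET_MKL_UNARY_MATH_FUNC(erf, Erf) expands to roughly
//   struct erf {
//     MSHADOW_XINLINE static void Vectorize(const index_t n, const float *src, float *dst) {
//       vsErf(static_cast<MKL_INT>(n), src, dst);
//     }
//     MSHADOW_XINLINE static void Vectorize(const index_t n, const double *src, double *dst) {
//       vdErf(static_cast<MKL_INT>(n), src, dst);
//     }
//   };
// so a caller that has resolved the element type can invoke, e.g.,
//   mkl_func::erf::Vectorize(num_elements, in_ptr, out_ptr);
// (num_elements, in_ptr, and out_ptr are placeholder names.)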


template <typename DType>
MSHADOW_XINLINE static void sub_(index_t n, const DType *in, DType b, DType *dst) {
  for (index_t i = 0; i < n; i++)
    dst[i] = in[i] - b;
}

template <typename DType>
MSHADOW_XINLINE static void div_(index_t n, const DType *in, DType b, DType *dst) {
  for (index_t i = 0; i < n; i++)
    dst[i] = in[i] / b;
}

template <typename DType>
MSHADOW_XINLINE static void sum_(index_t n, const DType *in, DType *dst) {
  DType sum = static_cast<DType>(0);
  for (index_t i = 0; i < n; i++)
    sum += in[i];

  dst[0] = sum;
}

template <typename DType>
MSHADOW_XINLINE static void max_(index_t n, const DType * __restrict__ in, DType *dst) {
  dst[0] = in[0];
  for (index_t i = 1; i < n; i++)
    dst[0] = (dst[0] < in[i]) ? in[i] : dst[0];
}

// LayerNorm on the last dimension:
//   b[i, :] = gamma * (a[i, :] - mean[i]) / sqrt(variance_i + eps) + beta
// mean[i] receives the per-row mean; var[i] receives sqrt(variance_i + eps).
template <typename DType>
MSHADOW_XINLINE static void LayerNormLastDim(const index_t m,
                                             const index_t n,
                                             const DType *a,
                                             DType *b,
                                             DType *ws,
                                             const DType *gamma,
                                             const DType *beta,
                                             DType *mean,
                                             DType *var,
                                             const DType eps) {
#pragma omp parallel for
  for (index_t i = 0; i < m; i++) {
    const DType *in_offset = a + i * n;
    DType *out_offset = b + i * n;
    DType *ws_offset = ws + i * n;

    // per-row mean
    sum_(n, in_offset, &(mean[i]));
    mean[i] /= n;
    // centered input, then per-row standard deviation
    sub_(n, in_offset, mean[i], out_offset);
    square::Vectorize(n, out_offset, ws_offset);
    sum_(n, ws_offset, &(var[i]));
    var[i] = std::sqrt(var[i] / n + eps);

    // scale, normalize, and shift
    mul::Vectorize(n, out_offset, gamma, out_offset);
    div_(n, out_offset, var[i], out_offset);
    add::Vectorize(n, out_offset, beta, out_offset);
  }
}

// Softmax on the last dimension:
//   b[i, :] = exp(a[i, :]) / sum(exp(a[i, :]))
// Note: no max-subtraction is done, so very large inputs can overflow in exp().
template <typename DType>
MSHADOW_XINLINE static void SoftmaxLastDim(const index_t m,
                                           const index_t n,
                                           const DType *a,
                                           DType *b) {
#pragma omp parallel for
  for (index_t i = 0; i < m; i++) {
    const DType *in_offset = a + i * n;
    DType *out_offset = b + i * n;

    exp::Vectorize(n, in_offset, out_offset);
    DType sum = static_cast<DType>(0);
    sum_(n, out_offset, &sum);
    div_(n, out_offset, sum, out_offset);
  }
}

// LogSoftmax on the last dimension (max-subtracted for numerical stability):
//   b[i, :] = a[i, :] - (max_i + log(sum(exp(a[i, :] - max_i))))
template <typename DType>
MSHADOW_XINLINE static void LogSoftmaxLastDim(const index_t m,
                                              const index_t n,
                                              const DType *a,
                                              DType *b) {
#pragma omp parallel for
  for (index_t i = 0; i < m; i++) {
    const DType *in_offset = a + i * n;
    DType *out_offset = b + i * n;

    DType row_max, logsum;
    max_(n, in_offset, &row_max);
    sub_(n, in_offset, row_max, out_offset);
    exp::Vectorize(n, out_offset, out_offset);
    sum_(n, out_offset, &logsum);
    logsum = row_max + std::log(logsum);
    sub_(n, in_offset, logsum, out_offset);
  }
}

} // namespace mkl_func
} // namespace op
} // namespace mxnet
#endif // MSHADOW_USE_MKL == 1
#endif // MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_
116 changes: 75 additions & 41 deletions src/operator/tensor/elemwise_unary_op.h
@@ -35,9 +35,10 @@
#include "../mxnet_op.h"
#include "../elemwise_op_common.h"
#include "../../ndarray/ndarray_function.h"

#if MSHADOW_USE_MKL == 1

Review comment:

I have not set USE_MKL before. Just curious: is blas=MKL also tested in mxnet CI?

Owner Author:
MSHADOW_USE_MKL is widely used in mshadow and mxnet to indicate that MKL is used as the BLAS library. Yes, USE_BLAS=mkl is built and tested in CI:
https://github.com/apache/incubator-mxnet/blob/master/ci/docker/runtime_functions.sh#L375
https://github.com/apache/incubator-mxnet/blob/master/ci/docker/runtime_functions.sh#L553
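For example, a build along the lines of make USE_BLAS=mkl (exact flags depend on the local setup) enables the MSHADOW_USE_MKL code paths touched in this PR.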

#include "mkl.h"
#endif
#include "../mkl_functions-inl.h"
#endif // MSHADOW_USE_MKL == 1

namespace mxnet {
namespace op {
@@ -264,6 +265,48 @@ class UnaryOp : public OpBase {
}
}

#if MSHADOW_USE_MKL == 1
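/*! \brief Unary compute that calls the MKL vector math kernel MKL_OP::Vectorize when the
* request type, tensor size, and dtype (float32/float64) allow it, and falls back to
* Compute<cpu, OP> otherwise.
*/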
template<typename OP, typename MKL_OP>
static void MKL_Compute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
if (req[0] == kNullOp) return;
auto type_flag = inputs[0].type_flag_;
size_t input_size = inputs[0].Size();
if ((req[0] == kWriteTo || req[0] == kWriteInplace) &&
mkl_func::check_size(input_size) &&
mkl_func::check_type(type_flag)) {
// set DType as float or double according to type_flag
MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
MKL_OP::Vectorize(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
});
} else {
Compute<cpu, OP>(attrs, ctx, inputs, req, outputs);
}
}

/*! \brief FComputeEx dispatch for a single non-default-storage (csr/rsp) input and output;
* maps the stored values to MKL_Compute<OP, MKL_OP> via MapToFCompute.
*/
template<typename OP, typename MKL_OP>
static void MKL_ComputeEx(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 1U)
<< "Invalid input, only one input is allowed";
CHECK_EQ(outputs.size(), 1U)
<< "Invalid output, only one output is allowed";
CHECK_NE(inputs[0].storage_type(), kDefaultStorage)
<< "Operation requires a sparse input storage type";
CHECK_NE(outputs[0].storage_type(), kDefaultStorage)
<< "Operation requires a sparse output storage type";
if (inputs[0].storage_shape().Size()) {
MapToFCompute<cpu>(attrs, ctx, inputs, req, outputs, MKL_Compute<OP, MKL_OP>);
}
}
#endif

template<typename xpu, typename op>
static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs,
const OpContext &ctx,
@@ -353,42 +396,6 @@
}
}

#if MSHADOW_USE_MKL == 1
static inline void MKLLog(MKL_INT size, const float* pIn, float* pOut) {
vsLn(size, pIn, pOut);
}

static inline void MKLLog(MKL_INT size, const double* pIn, double* pOut) {
vdLn(size, pIn, pOut);
}
#endif

template<typename xpu, typename OP>
static void LogCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
if (req[0] == kNullOp) return;
// if defined MSHADOW_USE_MKL then call mkl log when req is KWriteTo, type_flag
// is mshadow::kFloat32 or mshadow::kFloat64 and data size less than or equal MKL_INT_MAX
#if MSHADOW_USE_MKL == 1
auto type_flag = inputs[0].type_flag_;
const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ? INT_MAX : LLONG_MAX;
size_t input_size = inputs[0].Size();
if (req[0] == kWriteTo &&
input_size <= MKL_INT_MAX &&
(type_flag == mshadow::kFloat32 || type_flag == mshadow::kFloat64)) {
MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
MKLLog(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
});
} else {
Compute<xpu, OP>(attrs, ctx, inputs, req, outputs);
}
#else
Compute<xpu, OP>(attrs, ctx, inputs, req, outputs);
#endif
}
};

/*! \brief Map legacy unary_bwd to backward_grad */
@@ -554,14 +561,42 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
NNVM_REGISTER_OP(__name$) \
.set_num_inputs(1) \
.set_num_outputs(1) \
.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>) \
.set_attr<nnvm::FInplaceOption>("FInplaceOption", \
[](const NodeAttrs& attrs){ \
return std::vector<std::pair<int, int> >{{0, 0}}; \
}) \
.add_argument("data", "NDArray-or-Symbol", "The input array.")

#if MSHADOW_USE_MKL == 1
/*! \brief MKL-accelerated unary compute, with FComputeEx for csr and rsp available.
* Used when MXNet is compiled with MKL (MSHADOW_USE_MKL == 1) to accelerate element-wise math functions.
* Registers FCompute with UnaryOp::MKL_Compute and FComputeEx with UnaryOp::MKL_ComputeEx.
*/
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)

/*! \brief MKL-accelerated unary compute, with FComputeEx for rsp available.
* Used when MXNet is compiled with MKL (MSHADOW_USE_MKL == 1) to accelerate element-wise math functions.
* Registers FCompute with UnaryOp::MKL_Compute and FComputeEx with UnaryOp::MKL_ComputeEx.
*/
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)

/*! \brief MKL-accelerated unary compute, dense result.
* FInferStorageType attr is not set using this macro. By default DefaultStorageType is used.
*/
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)
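// A usage sketch (hypothetical call site, not part of this diff): registering an MKL-backed
// erf operator would look roughly like
//   MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(erf, cpu, mshadow_op::erf, mkl_func::erf)
//   .describe("Returns element-wise gauss error function of the input.");
// where mkl_func::erf is defined in mkl_functions-inl.h above.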
#endif

/*! \brief Unary compute, with FComputeEx for csr and rsp available */
#define MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
@@ -579,12 +614,11 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::ComputeEx<__xpu$, __kernel$>)

/*! \brief Unary compute, dense result.
* FInferStorageType attr is not set using this macro. By default DefaultStorageType is used.
*/
#define MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::Compute<__xpu$, __kernel$>)

} // namespace op
} // namespace mxnet
