Optimize ERF with MKL math function #5

Open · wants to merge 12 commits into base: master
218 changes: 218 additions & 0 deletions src/operator/mkl_functions-inl.h
@@ -0,0 +1,218 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2018 by Contributors
* \file mkl_functions-inl.h
* \brief MKL (VML) backed element-wise math functions and last-dimension helpers (LayerNorm, softmax, log-softmax)
* \author
*/
#ifndef MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_
#define MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_

#if MSHADOW_USE_MKL == 1
#include <climits>
#include <cmath>

#include "mkl.h"

namespace mxnet {
namespace op {
namespace mkl_func {

MSHADOW_XINLINE
static bool check_size(const size_t n) {
const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ? INT_MAX : LLONG_MAX;
return (n <= MKL_INT_MAX);
}

MSHADOW_XINLINE
static bool check_type(const int t) {
return (t == mshadow::kFloat32 || t == mshadow::kFloat64);
}

#define MXNET_MKL_UNARY_MATH_FUNC(name, func) \
struct name { \
MSHADOW_XINLINE static void Vectorize(const index_t n, const float *src, float *dst) { \
vs##func(static_cast<MKL_INT>(n), src, dst); \
} \
MSHADOW_XINLINE static void Vectorize(const index_t n, const double *src, double *dst) { \
vd##func(static_cast<MKL_INT>(n), src, dst); \
} \
};

#define MXNET_MKL_BINARY_MATH_FUNC(name, func) \
struct name { \
MSHADOW_XINLINE static void Vectorize(const index_t n, \
const float *a, \
const float *b, \
float *c) { \
vs##func(static_cast<MKL_INT>(n), a, b, c); \
} \
MSHADOW_XINLINE static void Vectorize(const index_t n, \
const double *a, \
const double *b, \
double *c) { \
vd##func(static_cast<MKL_INT>(n), a, b, c); \
} \
};

MXNET_MKL_UNARY_MATH_FUNC(erf, Erf);
MXNET_MKL_UNARY_MATH_FUNC(exp, Exp);
MXNET_MKL_UNARY_MATH_FUNC(exp2, Exp2);
MXNET_MKL_UNARY_MATH_FUNC(exp10, Exp10);
MXNET_MKL_UNARY_MATH_FUNC(expm1, Expm1);
MXNET_MKL_UNARY_MATH_FUNC(log, Ln);
MXNET_MKL_UNARY_MATH_FUNC(log2, Log2);
MXNET_MKL_UNARY_MATH_FUNC(log10, Log10);
MXNET_MKL_UNARY_MATH_FUNC(log1p, Log1p);

MXNET_MKL_UNARY_MATH_FUNC(sin, Sin);
MXNET_MKL_UNARY_MATH_FUNC(cos, Cos);
MXNET_MKL_UNARY_MATH_FUNC(tan, Tan);
MXNET_MKL_UNARY_MATH_FUNC(asin, Asin);
MXNET_MKL_UNARY_MATH_FUNC(acos, Acos);
MXNET_MKL_UNARY_MATH_FUNC(atan, Atan);

MXNET_MKL_UNARY_MATH_FUNC(sinh, Sinh);
MXNET_MKL_UNARY_MATH_FUNC(cosh, Cosh);
MXNET_MKL_UNARY_MATH_FUNC(tanh, Tanh);
MXNET_MKL_UNARY_MATH_FUNC(asinh, Asinh);
MXNET_MKL_UNARY_MATH_FUNC(acosh, Acosh);
MXNET_MKL_UNARY_MATH_FUNC(atanh, Atanh);

MXNET_MKL_UNARY_MATH_FUNC(sqrt, Sqrt);
MXNET_MKL_UNARY_MATH_FUNC(abs, Abs);
MXNET_MKL_UNARY_MATH_FUNC(cbrt, Cbrt);
MXNET_MKL_UNARY_MATH_FUNC(round, Round);
MXNET_MKL_UNARY_MATH_FUNC(ceil, Ceil);
MXNET_MKL_UNARY_MATH_FUNC(floor, Floor);
MXNET_MKL_UNARY_MATH_FUNC(trunc, Trunc);

MXNET_MKL_UNARY_MATH_FUNC(lgamma, LGamma);
MXNET_MKL_UNARY_MATH_FUNC(tgamma, TGamma);
MXNET_MKL_UNARY_MATH_FUNC(square, Sqr);

MXNET_MKL_BINARY_MATH_FUNC(add, Add);
MXNET_MKL_BINARY_MATH_FUNC(sub, Sub);
MXNET_MKL_BINARY_MATH_FUNC(mul, Mul);
MXNET_MKL_BINARY_MATH_FUNC(pow, Pow);
MXNET_MKL_BINARY_MATH_FUNC(hypot, Hypot);
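// For reference, a sketch of what the macro above generates (not additional code):
// MXNET_MKL_UNARY_MATH_FUNC(erf, Erf) expands to roughly
//   struct erf {
//     MSHADOW_XINLINE static void Vectorize(const index_t n, const float *src, float *dst) {
//       vsErf(static_cast<MKL_INT>(n), src, dst);
//     }
//     MSHADOW_XINLINE static void Vectorize(const index_t n, const double *src, double *dst) {
//       vdErf(static_cast<MKL_INT>(n), src, dst);
//     }
//   };
// so a caller that has resolved the element type can invoke, e.g.,
//   mkl_func::erf::Vectorize(num_elements, in_ptr, out_ptr);
// (num_elements, in_ptr, and out_ptr are placeholder names.)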


template <typename DType>
MSHADOW_XINLINE static void sub_(index_t n, const DType *in, DType b, DType *dst) {
  for (index_t i = 0; i < n; i++)
    dst[i] = in[i] - b;
}

template <typename DType>
MSHADOW_XINLINE static void div_(index_t n, const DType *in, DType b, DType *dst) {
  for (index_t i = 0; i < n; i++)
    dst[i] = in[i] / b;
}

template <typename DType>
MSHADOW_XINLINE static void sum_(index_t n, const DType *in, DType *dst) {
  DType sum = static_cast<DType>(0);
  for (index_t i = 0; i < n; i++)
    sum += in[i];

  dst[0] = sum;
}

template <typename DType>
MSHADOW_XINLINE static void max_(index_t n, const DType * __restrict__ in, DType *dst) {
  dst[0] = in[0];
  for (index_t i = 1; i < n; i++)
    dst[0] = (dst[0] < in[i]) ? in[i] : dst[0];
}

// LayerNorm on the last dimension:
//   b[i, :] = gamma * (a[i, :] - mean[i]) / sqrt(variance_i + eps) + beta
// mean[i] receives the per-row mean; var[i] receives sqrt(variance_i + eps).
template <typename DType>
MSHADOW_XINLINE static void LayerNormLastDim(const index_t m,
                                             const index_t n,
                                             const DType *a,
                                             DType *b,
                                             DType *ws,
                                             const DType *gamma,
                                             const DType *beta,
                                             DType *mean,
                                             DType *var,
                                             const DType eps) {
#pragma omp parallel for
  for (index_t i = 0; i < m; i++) {
    const DType *in_offset = a + i * n;
    DType *out_offset = b + i * n;
    DType *ws_offset = ws + i * n;

    // per-row mean
    sum_(n, in_offset, &(mean[i]));
    mean[i] /= n;
    // centered input, then per-row standard deviation
    sub_(n, in_offset, mean[i], out_offset);
    square::Vectorize(n, out_offset, ws_offset);
    sum_(n, ws_offset, &(var[i]));
    var[i] = std::sqrt(var[i] / n + eps);

    // scale, normalize, and shift
    mul::Vectorize(n, out_offset, gamma, out_offset);
    div_(n, out_offset, var[i], out_offset);
    add::Vectorize(n, out_offset, beta, out_offset);
  }
}

// Softmax on the last dimension:
//   b[i, :] = exp(a[i, :]) / sum(exp(a[i, :]))
// Note: no max-subtraction is done, so very large inputs can overflow in exp().
template <typename DType>
MSHADOW_XINLINE static void SoftmaxLastDim(const index_t m,
                                           const index_t n,
                                           const DType *a,
                                           DType *b) {
#pragma omp parallel for
  for (index_t i = 0; i < m; i++) {
    const DType *in_offset = a + i * n;
    DType *out_offset = b + i * n;

    exp::Vectorize(n, in_offset, out_offset);
    DType sum = static_cast<DType>(0);
    sum_(n, out_offset, &sum);
    div_(n, out_offset, sum, out_offset);
  }
}

// LogSoftmax on the last dimension (max-subtracted for numerical stability):
//   b[i, :] = a[i, :] - (max_i + log(sum(exp(a[i, :] - max_i))))
template <typename DType>
MSHADOW_XINLINE static void LogSoftmaxLastDim(const index_t m,
                                              const index_t n,
                                              const DType *a,
                                              DType *b) {
#pragma omp parallel for
  for (index_t i = 0; i < m; i++) {
    const DType *in_offset = a + i * n;
    DType *out_offset = b + i * n;

    DType row_max, logsum;
    max_(n, in_offset, &row_max);
    sub_(n, in_offset, row_max, out_offset);
    exp::Vectorize(n, out_offset, out_offset);
    sum_(n, out_offset, &logsum);
    logsum = row_max + std::log(logsum);
    sub_(n, in_offset, logsum, out_offset);
  }
}

} // namespace mkl_func
} // namespace op
} // namespace mxnet
#endif // MSHADOW_USE_MKL == 1
#endif // MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_
116 changes: 75 additions & 41 deletions src/operator/tensor/elemwise_unary_op.h
@@ -35,9 +35,10 @@
#include "../mxnet_op.h"
#include "../elemwise_op_common.h"
#include "../../ndarray/ndarray_function.h"

#if MSHADOW_USE_MKL == 1

Review comment:

I have not set USE_MKL before. Just curious: is blas=MKL also tested in mxnet CI?

Owner Author:
MSHADOW_USE_MKL is widely used in mshadow and mxnet to indicate that MKL is used as the BLAS library. Yes, USE_BLAS=mkl is built and tested in CI:
https://github.com/apache/incubator-mxnet/blob/master/ci/docker/runtime_functions.sh#L375
https://github.com/apache/incubator-mxnet/blob/master/ci/docker/runtime_functions.sh#L553
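For example, a build along the lines of make USE_BLAS=mkl (exact flags depend on the local setup) enables the MSHADOW_USE_MKL code paths touched in this PR.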

#include "mkl.h"
#endif
#include "../mkl_functions-inl.h"
#endif // MSHADOW_USE_MKL == 1

namespace mxnet {
namespace op {
@@ -264,6 +265,48 @@ class UnaryOp : public OpBase {
}
}

#if MSHADOW_USE_MKL == 1
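/*! \brief Unary compute that calls the MKL vector math kernel MKL_OP::Vectorize when the
* request type, tensor size, and dtype (float32/float64) allow it, and falls back to
* Compute<cpu, OP> otherwise.
*/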
template<typename OP, typename MKL_OP>
static void MKL_Compute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
if (req[0] == kNullOp) return;
auto type_flag = inputs[0].type_flag_;
size_t input_size = inputs[0].Size();
if ((req[0] == kWriteTo || req[0] == kWriteInplace) &&
mkl_func::check_size(input_size) &&
mkl_func::check_type(type_flag)) {
// set DType as float or double according to type_flag
MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
MKL_OP::Vectorize(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
});
} else {
Compute<cpu, OP>(attrs, ctx, inputs, req, outputs);
}
}

/*! \brief FComputeEx dispatch for a single non-default-storage (csr/rsp) input and output;
* maps the stored values to MKL_Compute<OP, MKL_OP> via MapToFCompute.
*/
template<typename OP, typename MKL_OP>
static void MKL_ComputeEx(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 1U)
<< "Invalid input, only one input is allowed";
CHECK_EQ(outputs.size(), 1U)
<< "Invalid output, only one output is allowed";
CHECK_NE(inputs[0].storage_type(), kDefaultStorage)
<< "Operation requires a sparse input storage type";
CHECK_NE(outputs[0].storage_type(), kDefaultStorage)
<< "Operation requires a sparse output storage type";
if (inputs[0].storage_shape().Size()) {
MapToFCompute<cpu>(attrs, ctx, inputs, req, outputs, MKL_Compute<OP, MKL_OP>);
}
}
#endif

template<typename xpu, typename op>
static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs,
const OpContext &ctx,
@@ -353,42 +396,6 @@
}
}

#if MSHADOW_USE_MKL == 1
static inline void MKLLog(MKL_INT size, const float* pIn, float* pOut) {
vsLn(size, pIn, pOut);
}

static inline void MKLLog(MKL_INT size, const double* pIn, double* pOut) {
vdLn(size, pIn, pOut);
}
#endif

template<typename xpu, typename OP>
static void LogCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
if (req[0] == kNullOp) return;
// if defined MSHADOW_USE_MKL then call mkl log when req is KWriteTo, type_flag
// is mshadow::kFloat32 or mshadow::kFloat64 and data size less than or equal MKL_INT_MAX
#if MSHADOW_USE_MKL == 1
auto type_flag = inputs[0].type_flag_;
const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ? INT_MAX : LLONG_MAX;
size_t input_size = inputs[0].Size();
if (req[0] == kWriteTo &&
input_size <= MKL_INT_MAX &&
(type_flag == mshadow::kFloat32 || type_flag == mshadow::kFloat64)) {
MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
MKLLog(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
});
} else {
Compute<xpu, OP>(attrs, ctx, inputs, req, outputs);
}
#else
Compute<xpu, OP>(attrs, ctx, inputs, req, outputs);
#endif
}
};

/*! \brief Map legacy unary_bwd to backward_grad */
@@ -554,14 +561,42 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
NNVM_REGISTER_OP(__name$) \
.set_num_inputs(1) \
.set_num_outputs(1) \
.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>) \
.set_attr<nnvm::FInplaceOption>("FInplaceOption", \
[](const NodeAttrs& attrs){ \
return std::vector<std::pair<int, int> >{{0, 0}}; \
}) \
.add_argument("data", "NDArray-or-Symbol", "The input array.")

#if MSHADOW_USE_MKL == 1
/*! \brief MKL-accelerated unary compute, with FComputeEx for csr and rsp available.
* Used when MXNet is compiled with MKL (MSHADOW_USE_MKL == 1) to accelerate element-wise math functions.
* Registers FCompute with UnaryOp::MKL_Compute and FComputeEx with UnaryOp::MKL_ComputeEx.
*/
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)

/*! \brief MKL-accelerated unary compute, with FComputeEx for rsp available.
* Used when MXNet is compiled with MKL (MSHADOW_USE_MKL == 1) to accelerate element-wise math functions.
* Registers FCompute with UnaryOp::MKL_Compute and FComputeEx with UnaryOp::MKL_ComputeEx.
*/
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)

/*! \brief MKL-accelerated unary compute, dense result.
* FInferStorageType attr is not set using this macro. By default DefaultStorageType is used.
*/
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)
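// A usage sketch (hypothetical call site, not part of this diff): registering an MKL-backed
// erf operator would look roughly like
//   MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(erf, cpu, mshadow_op::erf, mkl_func::erf)
//   .describe("Returns element-wise gauss error function of the input.");
// where mkl_func::erf is defined in mkl_functions-inl.h above.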
#endif

/*! \brief Unary compute, with FComputeEx for csr and rsp available */
#define MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
@@ -579,12 +614,11 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::ComputeEx<__xpu$, __kernel$>)

/*! \brief Unary compute, dense result.
* FInferStorageType attr is not set using this macro. By default DefaultStorageType is used.
*/
#define MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::Compute<__xpu$, __kernel$>)

} // namespace op
} // namespace mxnet
