From 25a9717c01463e038ea79c5ebfb11f0c22562ee3 Mon Sep 17 00:00:00 2001 From: Yijun Chen Date: Sat, 4 Jul 2020 04:32:45 +0000 Subject: [PATCH 1/2] backport mixed type --- .gitignore | 4 + include/mxnet/imperative.h | 11 ++ python/mxnet/symbol/numpy/_symbol.py | 8 +- src/common/utils.h | 19 +++ .../contrib/gradient_multiplier_op.cc | 6 +- src/operator/mshadow_op.h | 83 ++++++++++- src/operator/mxnet_op.h | 9 +- .../numpy/np_elemwise_broadcast_logic_op.cc | 18 +-- .../numpy/np_elemwise_broadcast_op.cc | 136 +++++++++--------- .../numpy/np_elemwise_broadcast_op.cu | 41 +++--- src/operator/numpy/np_elemwise_broadcast_op.h | 112 ++++++--------- .../np_elemwise_broadcast_op_extended.cc | 68 ++++----- src/operator/numpy/np_matrix_op-inl.h | 4 +- src/operator/numpy/np_true_divide-inl.h | 124 +--------------- src/operator/numpy/np_true_divide.cc | 40 +++--- src/operator/numpy/np_true_divide.cu | 4 + src/operator/operator_tune.cc | 2 + .../tensor/elemwise_binary_broadcast_op.h | 4 +- .../tensor/elemwise_binary_scalar_op.h | 123 ++++++++++++---- .../tensor/elemwise_binary_scalar_op_basic.cc | 49 ++++--- .../elemwise_binary_scalar_op_extended.cc | 36 ++--- .../tensor/elemwise_binary_scalar_op_logic.cc | 3 +- tests/python/unittest/test_numpy_op.py | 103 +++++++------ tests/python/unittest/test_symbol.py | 2 - 24 files changed, 526 insertions(+), 483 deletions(-) diff --git a/.gitignore b/.gitignore index c50d1ec99b9f..9fafdb13cb7e 100644 --- a/.gitignore +++ b/.gitignore @@ -121,6 +121,10 @@ cmake_install.cmake # Mac OS X .DS_Store +# Windows +windows_package.7z +windows_package + #Notebook Automated Test !tests/nightly/test_tutorial_config.txt !tests/nightly/TestNotebook diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index 6a367b3ccef5..783ad26803ed 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -117,6 +117,16 @@ class Imperative { } return is_np_shape_thread_local_ ? 1 : 0; } + + /*! \brief return current numpy default dtype compatibility status. + * */ + bool is_np_default_dtype() const { + if (is_np_default_dtype_global_) { + return true; + } + return false; + } + /*! \brief specify numpy compatibility off, thread local on or global on. */ bool set_is_np_shape(int is_np_shape) { NumpyShape flag = static_cast(is_np_shape); @@ -215,6 +225,7 @@ class Imperative { static MX_THREAD_LOCAL bool is_np_shape_thread_local_; #endif bool is_np_shape_global_{false}; + bool is_np_default_dtype_global_{false}; /*! \brief node count used for naming */ std::atomic node_count_{0}; /*! 
\brief variable count used for naming */ diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index da8d5d738f46..5f81a73a4b75 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -1529,13 +1529,15 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou if isinstance(rhs, numeric_types): return fn_scalar(lhs, rhs, out=out) else: + is_int = isinstance(rhs, integer_types) if rfn_scalar is None: # commutative function - return lfn_scalar(rhs, float(lhs), out=out) + return lfn_scalar(rhs, scalar=float(lhs), is_int=is_int, out=out) else: - return rfn_scalar(rhs, float(lhs), out=out) + return rfn_scalar(rhs, scalar=float(lhs), is_int=is_int, out=out) elif isinstance(rhs, numeric_types): - return lfn_scalar(lhs, float(rhs), out=out) + is_int = isinstance(rhs, integer_types) + return lfn_scalar(lhs, scalar=float(rhs), is_int=is_int, out=out) elif isinstance(rhs, Symbol): return fn_array(lhs, rhs, out=out) else: diff --git a/src/common/utils.h b/src/common/utils.h index 44d4fc3e8772..227830708ff6 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -874,6 +875,11 @@ inline bool is_float(const int dtype) { return dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64 || dtype == mshadow::kFloat16; } +inline bool is_int(const int dtype) { + return dtype == mshadow::kUint8 || dtype == mshadow::kInt8 || + dtype == mshadow::kInt32 || dtype == mshadow::kInt64; +} + inline int get_more_precise_type(const int type1, const int type2) { if (type1 == type2) return type1; if (is_float(type1) && is_float(type2)) { @@ -910,6 +916,19 @@ inline int np_binary_out_infer_type(const int type1, const int type2) { return get_more_precise_type(type1, type2); } +inline int GetDefaultDtype() { + return Imperative::Get()->is_np_default_dtype() ? + mshadow::kFloat64 : + mshadow::kFloat32; +} + +inline int GetDefaultDtype(int dtype) { + if (dtype != -1) return dtype; + return Imperative::Get()->is_np_default_dtype() ? + mshadow::kFloat64 : + mshadow::kFloat32; +} + } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/operator/contrib/gradient_multiplier_op.cc b/src/operator/contrib/gradient_multiplier_op.cc index 0a49ec1c36b3..5221d89a6056 100644 --- a/src/operator/contrib/gradient_multiplier_op.cc +++ b/src/operator/contrib/gradient_multiplier_op.cc @@ -77,9 +77,7 @@ In forward pass it acts as an identity transform. During backpropagation it multiplies the gradient from the subsequent level by a scalar factor lambda and passes it to the preceding layer. )code" ADD_FILELINE) -.set_attr_parser([](NodeAttrs* attrs) { - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); - }) +.set_attr_parser(ParamParser) .set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) .set_attr("FCompute", UnaryOp::IdentityCompute) .set_attr("FComputeEx", UnaryOp::IdentityComputeEx) @@ -88,7 +86,7 @@ the preceding layer. 
[](const NodeAttrs& attrs){ return std::vector{true}; }) -.add_argument("scalar", "float", "lambda multiplier"); +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()); MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_contrib_backward_gradientmultiplier) .set_attr("TIsBackward", true) diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 2d4d49254676..61d299d39a40 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -148,7 +148,6 @@ struct true_divide : public mxnet_op::tunable { return static_cast(a) / static_cast(b); } -#ifndef _WIN32 template::value, int>::type = 0> MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { @@ -166,7 +165,6 @@ struct true_divide : public mxnet_op::tunable { MSHADOW_XINLINE static double Map(DType a, double b) { return static_cast(a) / b; } -#endif }; struct rtrue_divide : public mxnet_op::tunable { @@ -182,7 +180,6 @@ struct rtrue_divide : public mxnet_op::tunable { return static_cast(b) / static_cast(a); } -#ifndef _WIN32 template::value, int>::type = 0> MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { @@ -200,14 +197,12 @@ struct rtrue_divide : public mxnet_op::tunable { MSHADOW_XINLINE static double Map(DType a, double b) { return b / static_cast(a); } -#endif }; MXNET_BINARY_MATH_OP_NC(left, a); MXNET_BINARY_MATH_OP_NC(right, b); -#ifndef _WIN32 struct mixed_plus { template::value, int>::type = 0> @@ -345,8 +340,12 @@ struct mixed_rpower { return static_cast(math::pow(b, a)); } }; -#endif +#pragma GCC diagnostic push +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wint-in-bool-context" +#pragma GCC diagnostic ignored "-Wbool-compare" +#endif MXNET_BINARY_MATH_OP_NC_WITH_BOOL(mul, a * b); MXNET_BINARY_MATH_OP_NC_WITH_BOOL(div, a / b); @@ -575,7 +574,6 @@ MXNET_BINARY_MATH_OP(rpower, math::pow(b, a)); MXNET_BINARY_MATH_OP(rpower_grad, math::id(a) * math::log(b)); MXNET_BINARY_MATH_OP(arctan2, math::atan2(a, b)); - MXNET_BINARY_MATH_OP(arctan2_grad, math::id(b) / (math::id(a * a + b * b))); MXNET_BINARY_MATH_OP(arctan2_rgrad, -math::id(a) / (math::id(a * a + b * b))); @@ -728,6 +726,10 @@ MXNET_BINARY_MATH_OP_NC(minus_sign, a - b > DType(0) ? 
DType(1) : -DType(1)); MXNET_BINARY_MATH_OP(rminus, b - a); +MXNET_BINARY_MATH_OP_NC(posone, 1); + +MXNET_BINARY_MATH_OP_NC(negone, -1); + MXNET_BINARY_MATH_OP(div_grad, 1.0f / math::id(b)); template<> @@ -795,6 +797,73 @@ struct mod : public mxnet_op::tunable { } }; +struct mixed_mod { + template::value, int>::type = 0> + MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { + return mod::Map(static_cast(a), b); + } + + template::value || + std::is_integral::value, int>::type = 0> + MSHADOW_XINLINE static float Map(DType a, float b) { + return mod::Map(static_cast(a), b); + } + + template::value || + std::is_same::value || + std::is_integral::value, int>::type = 0> + MSHADOW_XINLINE static double Map(DType a, double b) { + return mod::Map(static_cast(a), b); + } +}; + +struct mixed_rmod { + template::value, int>::type = 0> + MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) { + return mod::Map(b, static_cast(a)); + } + + template::value || + std::is_integral::value, int>::type = 0> + MSHADOW_XINLINE static float Map(DType a, float b) { + return mod::Map(b, static_cast(a)); + } + + template::value || + std::is_same::value || + std::is_integral::value, int>::type = 0> + MSHADOW_XINLINE static double Map(DType a, double b) { + return mod::Map(b, static_cast(a)); + } +}; + +struct fmod : public mxnet_op::tunable { + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + if (b == DType(0)) { + return DType(0); + } else { + return DType(::fmod(static_cast(a), static_cast(b))); + } + } +}; + +struct rfmod : public mxnet_op::tunable { + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + if (a == DType(0)) { + return DType(0); + } else { + return DType(::fmod(static_cast(b), static_cast(a))); + } + } +}; template<> MSHADOW_XINLINE mshadow::half::half2_t mod::Map diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index f7dbce270c87..0c2092f55d41 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -849,7 +849,13 @@ struct op_with_req { KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value)); } -#ifndef _WIN32 + /*! \brief input is two tensors with different type and with a boolean output tensor */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t i, bool *out, const LType *lhs, const RType *rhs) { + KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i])); + } + /*! \brief inputs are two tensors with a half_t output tensor */ template::value, int>::type = 0> @@ -903,7 +909,6 @@ struct op_with_req { MSHADOW_XINLINE static void Map(index_t i, double *out, const DType *lhs, const double value) { KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], value)); } -#endif /*! 
\brief inputs are two tensors with a float output tensor */ template(attrs.parsed); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + values[1].v_float64 = param.scalar; // output tensor type_codes[2] = kTVMDLTensorHandle; @@ -225,9 +226,7 @@ struct TVMBinaryBroadcastScalarCompute { NNVM_REGISTER_OP(_npi_##name##_scalar) \ .set_num_inputs(1) \ .set_num_outputs(1) \ - .set_attr_parser([](NodeAttrs* attrs) { \ - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); \ - }) \ + .set_attr_parser(ParamParser) \ .set_attr("FListInputNames", \ [](const NodeAttrs& attrs) { \ return std::vector{"data"}; \ @@ -240,7 +239,7 @@ struct TVMBinaryBroadcastScalarCompute { }) \ .set_attr("FGradient", MakeZeroGradNodes) \ .add_argument("data", "NDArray-or-Symbol", "First input to the function") \ - .add_argument("scalar", "float", "scalar input") + .add_arguments(NumpyBinaryScalarParam::__FIELDS__()) MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC(equal); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC(not_equal); @@ -285,9 +284,12 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_GPU(less_equal); #else -#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_CPU(name) \ - NNVM_REGISTER_OP(_npi_##name##_scalar) \ - .set_attr("FCompute", BinaryScalarOp::ComputeLogic) +#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_CPU(name) \ + NNVM_REGISTER_OP(_npi_##name##_scalar) \ + .set_attr("FCompute", BinaryScalarOp::ComputeLogic) \ + .set_attr("FResourceRequest", [](const NodeAttrs& n) { \ + return std::vector{ResourceRequest::kTempSpace}; \ + }) #endif // MXNET_USE_TVM_OP diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc index ae285caa9094..ae6697c0b23e 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op.cc @@ -23,27 +23,26 @@ * \brief CPU Implementation of basic functions for elementwise numpy binary broadcast operator. 
*/ -#include #include "./np_elemwise_broadcast_op.h" namespace mxnet { namespace op { -#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name) \ - NNVM_REGISTER_OP(name) \ - .set_num_inputs(1) \ - .set_num_outputs(1) \ - .set_attr_parser([](NodeAttrs* attrs) { \ - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); \ - }) \ - .set_attr("FInferShape", ElemwiseShape<1, 1>) \ - .set_attr("FInferType", NumpyBinaryScalarType) \ - .set_attr("FInplaceOption", \ - [](const NodeAttrs& attrs){ \ - return std::vector >{{0, 0}}; \ - }) \ - .add_argument("data", "NDArray-or-Symbol", "source input") \ - .add_argument("scalar", "float", "scalar input") +DMLC_REGISTER_PARAMETER(NumpyBinaryScalarParam); + +#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr_parser(ParamParser) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferType", NumpyBinaryScalarType) \ + .set_attr("FResourceRequest", \ + [](const NodeAttrs& attrs) { \ + return std::vector{ResourceRequest::kTempSpace}; \ + }) \ + .add_argument("data", "NDArray-or-Symbol", "source input") \ + .add_arguments(NumpyBinaryScalarParam::__FIELDS__()) bool NumpyBinaryMixedPrecisionType(const nnvm::NodeAttrs& attrs, std::vector* in_attrs, @@ -61,7 +60,6 @@ bool NumpyBinaryMixedPrecisionType(const nnvm::NodeAttrs& attrs, return true; } -#ifndef _WIN32 #define MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(name) \ NNVM_REGISTER_OP(name) \ .set_num_inputs(2) \ @@ -76,68 +74,62 @@ bool NumpyBinaryMixedPrecisionType(const nnvm::NodeAttrs& attrs, [](const NodeAttrs& attrs){ \ return std::vector >{{0, 0}, {1, 0}}; \ }) \ - .add_argument("lhs", "NDArray-or-Symbol", "First input to the function") \ - .add_argument("rhs", "NDArray-or-Symbol", "Second input to the function") -#else -#define MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(name) \ - NNVM_REGISTER_OP(name) \ - .set_num_inputs(2) \ - .set_num_outputs(1) \ - .set_attr("FListInputNames", \ + .set_attr("FResourceRequest", \ [](const NodeAttrs& attrs) { \ - return std::vector{"lhs", "rhs"}; \ - }) \ - .set_attr("FInferShape", BinaryBroadcastShape) \ - .set_attr("FInferType", NumpyBinaryMixedPrecisionType) \ - .set_attr("FInplaceOption", \ - [](const NodeAttrs& attrs){ \ - return std::vector >{{0, 0}, {1, 0}}; \ + return std::vector{ResourceRequest::kTempSpace}; \ }) \ - .set_attr("FResourceRequest", \ - [](const NodeAttrs& attrs) { \ - return std::vector{ResourceRequest::kTempSpace}; \ - }) \ .add_argument("lhs", "NDArray-or-Symbol", "First input to the function") \ .add_argument("rhs", "NDArray-or-Symbol", "Second input to the function") -#endif MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_add) -#ifndef _WIN32 .set_attr( "FCompute", NumpyBinaryBroadcastComputeWithBool) -#else -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool) -#endif -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_add"}); +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_add"}); + +NNVM_REGISTER_OP(_backward_npi_broadcast_add) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}, {0, 1}}; + }) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", NumpyBinaryBackwardUseIn); MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_subtract) -#ifndef _WIN32 .set_attr( "FCompute", 
NumpyBinaryBroadcastCompute) -#else -.set_attr( - "FCompute", - NumpyBinaryBroadcastCompute) -#endif -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_sub"}); +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_sub"}); + +NNVM_REGISTER_OP(_backward_npi_broadcast_sub) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}, {0, 1}}; + }) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", NumpyBinaryBackwardUseIn); MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_multiply) -#ifndef _WIN32 .set_attr( "FCompute", NumpyBinaryBroadcastComputeWithBool) -#else -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool) -#endif .set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mul"}); NNVM_REGISTER_OP(_backward_npi_broadcast_mul) @@ -155,21 +147,33 @@ NNVM_REGISTER_OP(_backward_npi_broadcast_mul) .set_attr("FCompute", NumpyBinaryBackwardUseIn); -MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_mod) -.set_attr("FCompute", BinaryBroadcastCompute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mod"}); +MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_mod) +.set_attr( + "FCompute", + NumpyBinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mod"}); + +NNVM_REGISTER_OP(_backward_npi_broadcast_mod) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 1}}; + }) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", NumpyBinaryBackwardUseIn); MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_power) -#ifndef _WIN32 .set_attr( "FCompute", NumpyBinaryBroadcastComputeWithBool) -#else -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool) -#endif .set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_power"}); NNVM_REGISTER_OP(_backward_npi_broadcast_power) diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu index 1e0130494469..a2927cda61ff 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -29,59 +29,50 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_add) -#ifndef _WIN32 .set_attr( "FCompute", NumpyBinaryBroadcastComputeWithBool); -#else -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool); -#endif + +NNVM_REGISTER_OP(_backward_npi_broadcast_add) +.set_attr("FCompute", NumpyBinaryBackwardUseIn); NNVM_REGISTER_OP(_npi_subtract) -#ifndef _WIN32 .set_attr( "FCompute", NumpyBinaryBroadcastCompute); -#else -.set_attr( - "FCompute", - NumpyBinaryBroadcastCompute); -#endif + +NNVM_REGISTER_OP(_backward_npi_broadcast_sub) +.set_attr("FCompute", NumpyBinaryBackwardUseIn); NNVM_REGISTER_OP(_npi_multiply) -#ifndef _WIN32 .set_attr( "FCompute", NumpyBinaryBroadcastComputeWithBool); -#else -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool); -#endif NNVM_REGISTER_OP(_backward_npi_broadcast_mul) .set_attr("FCompute", NumpyBinaryBackwardUseIn); NNVM_REGISTER_OP(_npi_mod) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr( + "FCompute", + NumpyBinaryBroadcastCompute); + +NNVM_REGISTER_OP(_backward_npi_broadcast_mod) +.set_attr("FCompute", 
NumpyBinaryBackwardUseIn); NNVM_REGISTER_OP(_npi_power) -#ifndef _WIN32 .set_attr( "FCompute", NumpyBinaryBroadcastComputeWithBool); -#else -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool); -#endif NNVM_REGISTER_OP(_backward_npi_broadcast_power) .set_attr("FCompute", NumpyBinaryBackwardUseIn* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); - TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); - return in_attrs->at(0) != -1; -} - inline void PrintErrorMessage(const std::string& op_name, const int dtype1, const int dtype2) { LOG(FATAL) << "Operator " << op_name << " does not support combination of " << mshadow::dtype_string(dtype1) << " with " << mshadow::dtype_string(dtype2) << " yet..."; } -#ifndef _WIN32 template void MixedAllRealBinaryElemwiseCompute(const std::string& op_name, const OpContext& ctx, @@ -153,7 +142,6 @@ void MixedBinaryElemwiseCompute(const nnvm::NodeAttrs& attrs, const TBlob& lhs = inputs[0]; const TBlob& rhs = inputs[1]; const TBlob& out = outputs[0]; - if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { if (lhs.type_flag_ == out.type_flag_) { MixedAllRealBinaryElemwiseCompute(attrs.op->name, ctx, lhs, rhs, out, req[0]); @@ -227,13 +215,9 @@ void MixedAllRealBinaryBroadcastCompute(const std::string& op_name, } }); } -#endif -#ifndef _WIN32 + template -#else -template -#endif void MixedBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -248,11 +232,9 @@ void MixedBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, const TBlob& rhs = inputs[1]; const TBlob& out = outputs[0]; -#ifndef _WIN32 mxnet::TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(lhs.shape_, rhs.shape_, out.shape_, &new_lshape, &new_rshape, &new_oshape); - if (!ndim) { MixedBinaryElemwiseCompute(attrs, ctx, inputs, req, outputs); } else { @@ -290,47 +272,34 @@ void MixedBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, }); } }); + } else if (!common::is_float(lhs.type_flag_) && !common::is_float(rhs.type_flag_)) { + TBlob temp_tblob; + if (lhs.type_flag_ == out.type_flag_) { + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(Shape1(rhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob}); + BinaryBroadcastCompute( + attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs); + } else { + MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(Shape1(lhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob}); + BinaryBroadcastCompute( + attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs); + } } else { PrintErrorMessage(attrs.op->name, lhs.type_flag_, rhs.type_flag_); } } -#else - mshadow::Stream *s = ctx.get_stream(); - if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { - TBlob temp_tblob; - // one is float, the other is bool - CHECK((out.type_flag_ == lhs.type_flag_) || (out.type_flag_ == rhs.type_flag_)) - << "This case out type should be same as the float type"; - if (lhs.type_flag_ == out.type_flag_) { - MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { - Tensor temp_tensor = - ctx.requested[0].get_space_typed(Shape1(rhs.Size()), s); - temp_tblob = TBlob(temp_tensor); - }); - CastCompute(attrs, ctx, 
{rhs}, {kWriteTo}, {temp_tblob}); - BinaryBroadcastCompute( - attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs); - } else { - MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { - Tensor temp_tensor = - ctx.requested[0].get_space_typed(Shape1(lhs.Size()), s); - temp_tblob = TBlob(temp_tensor); - }); - CastCompute(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob}); - BinaryBroadcastCompute( - attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs); - } - } else { - PrintErrorMessage(attrs.op->name, lhs.type_flag_, rhs.type_flag_); - } -#endif } -#ifndef _WIN32 template -#else -template -#endif void NumpyBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -352,18 +321,10 @@ void NumpyBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, return; } -#ifndef _WIN32 MixedBinaryBroadcastCompute(attrs, ctx, inputs, req, outputs); -#else - MixedBinaryBroadcastCompute(attrs, ctx, inputs, req, outputs); -#endif } -#ifndef _WIN32 template -#else -template -#endif void NumpyBinaryBroadcastComputeWithBool(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -384,12 +345,31 @@ void NumpyBinaryBroadcastComputeWithBool(const nnvm::NodeAttrs& attrs, BinaryBroadcastComputeWithBool(attrs, ctx, inputs, req, outputs); return; } - -#ifndef _WIN32 + if (!common::is_float(lhs.type_flag_) && !common::is_float(rhs.type_flag_)) { + Stream *s = ctx.get_stream(); + TBlob temp_tblob; + if (lhs.type_flag_ == out.type_flag_) { + MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(Shape1(rhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob}); + BinaryBroadcastCompute( + attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs); + } else { + MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(Shape1(lhs.Size()), s); + temp_tblob = TBlob(temp_tensor); + }); + CastCompute(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob}); + BinaryBroadcastCompute( + attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs); + } + return; + } MixedBinaryBroadcastCompute(attrs, ctx, inputs, req, outputs); -#else - MixedBinaryBroadcastCompute(attrs, ctx, inputs, req, outputs); -#endif } template diff --git a/src/operator/numpy/np_elemwise_broadcast_op_extended.cc b/src/operator/numpy/np_elemwise_broadcast_op_extended.cc index 52d681885d82..60b721e4854d 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_extended.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op_extended.cc @@ -30,21 +30,19 @@ namespace mxnet { namespace op { -#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name) \ - NNVM_REGISTER_OP(name) \ - .set_num_inputs(1) \ - .set_num_outputs(1) \ - .set_attr_parser([](NodeAttrs* attrs) { \ - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); \ - }) \ - .set_attr("FInferShape", ElemwiseShape<1, 1>) \ - .set_attr("FInferType", NumpyBinaryScalarType) \ - .set_attr("FInplaceOption", \ - [](const NodeAttrs& attrs){ \ - return std::vector >{{0, 0}}; \ - }) \ - .add_argument("data", "NDArray-or-Symbol", "source input") \ - .add_argument("scalar", "float", "scalar input") +#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr_parser(ParamParser) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferType", NumpyBinaryScalarType) \ + .set_attr("FResourceRequest", \ + 
[](const NodeAttrs& attrs) { \ + return std::vector{ResourceRequest::kTempSpace}; \ + }) \ + .add_argument("data", "NDArray-or-Symbol", "source input") \ + .add_arguments(NumpyBinaryScalarParam::__FIELDS__()) MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_copysign) .describe(R"code()code" ADD_FILELINE) @@ -87,9 +85,7 @@ NNVM_REGISTER_OP(_npi_lcm) NNVM_REGISTER_OP(_npi_lcm_scalar) .set_num_inputs(1) .set_num_outputs(1) -.set_attr_parser([](NodeAttrs* attrs) { - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); - }) +.set_attr_parser(ParamParser) .set_attr("FInferShape", ElemwiseShape<1, 1>) .set_attr("FInferType", ElemwiseIntType<1, 1>) .set_attr("FInplaceOption", @@ -98,7 +94,7 @@ NNVM_REGISTER_OP(_npi_lcm_scalar) }) .set_attr("FGradient", MakeZeroGradNodes) .add_argument("data", "NDArray-or-Symbol", "source input") -.add_argument("scalar", "int", "scalar input") +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_npi_bitwise_and) @@ -122,9 +118,7 @@ NNVM_REGISTER_OP(_npi_bitwise_and) NNVM_REGISTER_OP(_npi_bitwise_and_scalar) .set_num_inputs(1) .set_num_outputs(1) -.set_attr_parser([](NodeAttrs* attrs) { - attrs->parsed = std::stod(attrs->dict["scalar"]); - }) +.set_attr_parser(ParamParser) .set_attr("FInferShape", ElemwiseShape<1, 1>) .set_attr("FInferType", ElemwiseIntType<1, 1>) .set_attr("FInplaceOption", @@ -133,7 +127,7 @@ NNVM_REGISTER_OP(_npi_bitwise_and_scalar) }) .set_attr("FGradient", MakeZeroGradNodes) .add_argument("data", "NDArray-or-Symbol", "source input") -.add_argument("scalar", "int", "scalar input") +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) .set_attr("FCompute", BinaryScalarOp::ComputeInt); NNVM_REGISTER_OP(_npi_bitwise_xor) @@ -175,9 +169,7 @@ NNVM_REGISTER_OP(_npi_bitwise_or) NNVM_REGISTER_OP(_npi_bitwise_xor_scalar) .set_num_inputs(1) .set_num_outputs(1) -.set_attr_parser([](NodeAttrs* attrs) { - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); - }) +.set_attr_parser(ParamParser) .set_attr("FInferShape", ElemwiseShape<1, 1>) .set_attr("FInferType", ElemwiseIntType<1, 1>) .set_attr("FInplaceOption", @@ -186,15 +178,13 @@ NNVM_REGISTER_OP(_npi_bitwise_xor_scalar) }) .set_attr("FGradient", MakeZeroGradNodes) .add_argument("data", "NDArray-or-Symbol", "source input") -.add_argument("scalar", "int", "scalar input") +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) .set_attr("FCompute", BinaryScalarOp::ComputeInt); NNVM_REGISTER_OP(_npi_bitwise_or_scalar) .set_num_inputs(1) .set_num_outputs(1) -.set_attr_parser([](NodeAttrs* attrs) { - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); - }) +.set_attr_parser(ParamParser) .set_attr("FInferShape", ElemwiseShape<1, 1>) .set_attr("FInferType", ElemwiseIntType<1, 1>) .set_attr("FInplaceOption", @@ -203,7 +193,7 @@ NNVM_REGISTER_OP(_npi_bitwise_or_scalar) }) .set_attr("FGradient", MakeZeroGradNodes) .add_argument("data", "NDArray-or-Symbol", "source input") -.add_argument("scalar", "int", "scalar input") +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) .set_attr("FCompute", BinaryScalarOp::ComputeInt); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_copysign_scalar) @@ -275,14 +265,14 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rarctan2_scalar) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_rarctan2_scalar"}); MXNET_OPERATOR_REGISTER_BINARY(_backward_npi_arctan2_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) 
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward); MXNET_OPERATOR_REGISTER_BINARY(_backward_npi_rarctan2_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward); @@ -363,13 +353,13 @@ NNVM_REGISTER_OP(_backward_npi_ldexp) mshadow_op::ldexp_rgrad>); MXNET_OPERATOR_REGISTER_BINARY(_backward_npi_ldexp_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward); MXNET_OPERATOR_REGISTER_BINARY(_backward_npi_rldexp_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward); } // namespace op diff --git a/src/operator/numpy/np_matrix_op-inl.h b/src/operator/numpy/np_matrix_op-inl.h index 593a698dc93a..a64fb4db12b6 100644 --- a/src/operator/numpy/np_matrix_op-inl.h +++ b/src/operator/numpy/np_matrix_op-inl.h @@ -917,7 +917,7 @@ void NumpyConcatenateForward(const nnvm::NodeAttrs& attrs, ConcatParam cparam; cparam.num_args = param.num_args; cparam.dim = param.axis.has_value() ? param.axis.value() : 0; - MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, { ConcatOp op; op.Init(cparam); op.Forward(ctx, data, req, outputs); @@ -950,7 +950,7 @@ void NumpyConcatenateBackward(const nnvm::NodeAttrs& attrs, ConcatParam cparam; cparam.num_args = param.num_args; cparam.dim = param.axis.has_value() ? 
param.axis.value() : 0; - MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, { ConcatOp op; op.Init(cparam); op.Backward(ctx, inputs[0], req, data); diff --git a/src/operator/numpy/np_true_divide-inl.h b/src/operator/numpy/np_true_divide-inl.h index be2ce51506a1..30d505a445ff 100644 --- a/src/operator/numpy/np_true_divide-inl.h +++ b/src/operator/numpy/np_true_divide-inl.h @@ -29,6 +29,7 @@ #include #include "../../common/utils.h" #include "../tensor/elemwise_binary_broadcast_op.h" +#include "../numpy/np_elemwise_broadcast_op.h" namespace mxnet { namespace op { @@ -46,7 +47,8 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs, using namespace mxnet_op; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - const double alpha = nnvm::get(attrs.parsed); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; const TBlob& data = inputs[0]; const TBlob& out = outputs[0]; if (out.type_flag_ == data.type_flag_) { @@ -57,10 +59,10 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs, }); }); } else { -#ifndef _WIN32 - CHECK_EQ(outputs[0].type_flag_, kFloat32) << "true_divide only supports float32 output " - "when input's dtype is " - << type_string(inputs[0].type_flag_); + CHECK_EQ(outputs[0].type_flag_, mxnet::common::GetDefaultDtype()) + << "true_divide only supports float32 and float64" + " output when input's dtype is " + << type_string(inputs[0].type_flag_); MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { Kernel, xpu>::Launch( @@ -68,13 +70,6 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs, static_cast(alpha)); }); }); -#else - Tensor temp_tensor = - ctx.requested[0].get_space_typed(mshadow::Shape1(data.Size()), s); - TBlob temp_tblob(temp_tensor); - CastCompute(attrs, ctx, {data}, {kWriteTo}, {temp_tblob}); - TrueDivideScalarCompute(attrs, ctx, {temp_tblob}, req, outputs); -#endif } } @@ -116,8 +111,6 @@ void TrueDivideElemwiseCompute(const nnvm::NodeAttrs &attrs, }); } } else { -#ifndef _WIN32 - // Non-windows case: no usage of temporary space // Case when types of the 2 input tensors are different if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { // both lhs and rhs are float types, output type is the more precise one @@ -152,44 +145,6 @@ void TrueDivideElemwiseCompute(const nnvm::NodeAttrs &attrs, // lhs is integer type, rhs is integer type, output type should be float LOG(FATAL) << "not implemented yet..."; } -#else - // Windows case: using temp space for casting the type - // Case when types of the 2 input tensors are different - if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { - // both lhs and rhs are float types, output type is the more precise one - LOG(FATAL) << "not implemented yet..."; - } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { - // lhs is float type, rhs is integer type, the output type should be the same as lhs - CHECK_EQ(out.type_flag_, - common::is_float(lhs.type_flag_) ? 
lhs.type_flag_ : rhs.type_flag_) - << "This case out type should be same as the float type"; - TBlob temp_tblob; - if (common::is_float(lhs.type_flag_)) { - // lhs is the float one - MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { - Tensor temp_tensor = - ctx.requested[0].get_space_typed(mshadow::Shape1(rhs.Size()), s); - temp_tblob = TBlob(temp_tensor); - }); - CastCompute(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob}); - TrueDivideElemwiseCompute( - attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs); - } else { - // rhs is the float one - MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { - Tensor temp_tensor = - ctx.requested[0].get_space_typed(mshadow::Shape1(lhs.Size()), s); - temp_tblob = TBlob(temp_tensor); - }); - CastCompute(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob}); - TrueDivideElemwiseCompute( - attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs); - } - } else { - // lhs is integer type, rhs is integer type, output type should be float - LOG(FATAL) << "not implemented yet..."; - } -#endif } } @@ -213,7 +168,6 @@ void TrueDivideBroadcastCompute(const nnvm::NodeAttrs& attrs, const TBlob& lhs = inputs[0]; const TBlob& rhs = inputs[1]; const TBlob& out = outputs[0]; -#ifndef _WIN32 BROADCAST_NDIM_SWITCH(ndim, NDim, { mshadow::Shape oshape = new_oshape.get(); mshadow::Shape lstride = calc_stride(new_lshape.get()); @@ -273,70 +227,6 @@ void TrueDivideBroadcastCompute(const nnvm::NodeAttrs& attrs, } } }); -#else - if (lhs.type_flag_ == rhs.type_flag_) { - BROADCAST_NDIM_SWITCH(ndim, NDim, { - mshadow::Shape oshape = new_oshape.get(); - mshadow::Shape lstride = calc_stride(new_lshape.get()); - mshadow::Shape rstride = calc_stride(new_rshape.get()); - // When the both inputs have the same data types - if (common::is_float(lhs.type_flag_)) { - // If both inputs are the same float types, output is the same float type - MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, { - Kernel, xpu>:: - template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, - lhs.dptr(), rhs.dptr(), out.dptr()); - }); - } else { - CHECK_EQ(out.type_flag_, mshadow::kFloat32) - << "true_divide only supports float32 output when input's dtype is " - << type_string(lhs.type_flag_); - MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, { - // If both inputs are the same integer types, output is float type - Kernel, xpu>:: - template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, - lhs.dptr(), rhs.dptr(), out.dptr()); - }); - } - }); - } else { - if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { - // lhs and rhs have different float types, the output is the more precise one - LOG(FATAL) << "not implemented yet..."; - } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { - // one of lhs and rhs is float, the output is the same type as the float one - TBlob temp_tblob; - if (common::is_float(lhs.type_flag_)) { - // lhs is float type, output will be the same float type - CHECK_EQ(lhs.type_flag_, out.type_flag_) - << "lhs should have the same type as out, infer type broken?"; - MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, { - Tensor temp_tensor = - ctx.requested[0].get_space_typed(mshadow::Shape1(rhs.Size()), s); - temp_tblob = TBlob(temp_tensor); - }); - CastCompute(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob}); - TrueDivideBroadcastCompute( - attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs); - } else { - // rhs is float type, output will be the same float type - CHECK_EQ(rhs.type_flag_, out.type_flag_) 
- << "rhs should have the same type as out, infer type broken?"; - MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, { - Tensor temp_tensor = - ctx.requested[0].get_space_typed(mshadow::Shape1(lhs.Size()), s); - temp_tblob = TBlob(temp_tensor); - }); - CastCompute(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob}); - TrueDivideBroadcastCompute( - attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs); - } - } else { - // lhs and rhs have different integer types, the output is float type - LOG(FATAL) << "not implemented yet..."; - } - } -#endif } } diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc index 6edfb4dd0901..2f3bf7d5dfb6 100644 --- a/src/operator/numpy/np_true_divide.cc +++ b/src/operator/numpy/np_true_divide.cc @@ -74,46 +74,46 @@ NNVM_REGISTER_OP(_npi_true_divide) [](const NodeAttrs& attrs){ return std::vector >{{0, 0}, {1, 0}}; }) -#ifdef _WIN32 +.set_attr("FCompute", TrueDivideBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_div"}) +.add_argument("lhs", "NDArray-or-Symbol", "Dividend array") +.add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); + + +NNVM_REGISTER_OP(_backward_npi_broadcast_div) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 1}}; + }) .set_attr("FResourceRequest", [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; }) -#endif -.set_attr("FCompute", TrueDivideBroadcastCompute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_div"}) -.add_argument("lhs", "NDArray-or-Symbol", "Dividend array") -.add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); +.set_attr("FCompute", NumpyBinaryBackwardUseIn); NNVM_REGISTER_OP(_npi_true_divide_scalar) .set_num_inputs(1) .set_num_outputs(1) -.set_attr_parser([](NodeAttrs* attrs) { - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); - }) +.set_attr_parser(ParamParser) .set_attr("FInferShape", ElemwiseShape<1, 1>) .set_attr("FInferType", TrueDivideType<1>) .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) -#ifdef _WIN32 -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -#endif .set_attr("FCompute", TrueDivideScalarCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_div_scalar"}) .add_argument("data", "NDArray-or-Symbol", "source input") -.add_argument("scalar", "float", "scalar input"); +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()); NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) .set_num_inputs(1) .set_num_outputs(1) -.set_attr_parser([](NodeAttrs* attrs) { - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); - }) +.set_attr_parser(ParamParser) .set_attr("FInferShape", ElemwiseShape<1, 1>) .set_attr("FInferType", TrueDivideType<1>) .set_attr("FInplaceOption", @@ -129,7 +129,7 @@ NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) .set_attr("FCompute", TrueDivideScalarCompute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_rdiv_scalar"}) .add_argument("data", "NDArray-or-Symbol", "source input") -.add_argument("scalar", "float", "scalar input"); +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_true_divide.cu b/src/operator/numpy/np_true_divide.cu index 7211f4a0a006..c8eccfe140b4 100644 --- a/src/operator/numpy/np_true_divide.cu +++ b/src/operator/numpy/np_true_divide.cu @@ -31,6 +31,10 @@ namespace 
op { NNVM_REGISTER_OP(_npi_true_divide) .set_attr("FCompute", TrueDivideBroadcastCompute); +NNVM_REGISTER_OP(_backward_npi_broadcast_div) +.set_attr("FCompute", NumpyBinaryBackwardUseIn); + NNVM_REGISTER_OP(_npi_true_divide_scalar) .set_attr("FCompute", TrueDivideScalarCompute); diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index 0cc0dc92f884..de6efbfe5758 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -421,6 +421,8 @@ IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rldexp); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::ldexp_grad); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::ldexp_rgrad); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rldexp_grad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::posone); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::negone); // NOLINT() /*! * \brief Tuner objects, *not* automatically generated */ diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index ffd0f123070a..774c87afaf3c 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -227,8 +227,8 @@ struct binary_broadcast_kernel { } } -#ifndef _WIN32 /*! \brief Map function for binary_broadcast_kernel */ + /* used for mixed type binary ops */ template::value, int>::type = 0> MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, @@ -249,6 +249,7 @@ struct binary_broadcast_kernel { } /*! \brief Map function for binary_broadcast_kernel */ + /* used for mixed type binary ops */ template::value && !std::is_pointer::value, int>::type = 0> @@ -268,7 +269,6 @@ struct binary_broadcast_kernel { KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); } } -#endif }; template diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 4eaaff09d83d..7d65c89f7b0f 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -29,6 +29,7 @@ #include #include #include +#include #include "../mshadow_op.h" #include "../elemwise_op_common.h" #include "elemwise_unary_op.h" @@ -36,6 +37,45 @@ namespace mxnet { namespace op { +struct NumpyBinaryScalarParam : public dmlc::Parameter { + double scalar; + bool is_int; + DMLC_DECLARE_PARAMETER(NumpyBinaryScalarParam) { + DMLC_DECLARE_FIELD(scalar) + .set_default(1) + .describe("Scalar input value"); + DMLC_DECLARE_FIELD(is_int) + .set_default(true) + .describe("Indicate whether scalar input is int type"); + } + + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream scalar_s, is_int_s; + scalar_s << scalar; + is_int_s << is_int; + (*dict)["scalar"] = scalar_s.str(); + (*dict)["is_int"] = is_int_s.str(); + } +}; + +inline bool NumpyBinaryScalarType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + bool scalar_is_int = param.is_int; + if (common::is_int(in_attrs->at(0)) && !scalar_is_int) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat64); + } else if (in_attrs->at(0) == mshadow::kBool) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, scalar_is_int ? 
mshadow::kInt64 : mshadow::kFloat64); + } else { + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + } + return out_attrs->at(0) != -1; +} + class BinaryScalarOp : public UnaryOp { /*! \brief Tensor operation against a scalar with a dense result */ template @@ -45,7 +85,8 @@ class BinaryScalarOp : public UnaryOp { const NDArray &input, const OpReqType req, const NDArray &output) { - const double alpha = nnvm::get(attrs.parsed); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; CHECK_EQ(output.shape(), input.shape()); const int64_t row_count = output.shape()[0]; const int64_t items_per_row = output.shape().Size() / row_count; @@ -137,7 +178,8 @@ class BinaryScalarOp : public UnaryOp { const NDArray &output) { CHECK_EQ(output.shape(), input.shape()); - const double alpha = nnvm::get(attrs.parsed); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; const DType dense_fill_val = OP::Map(DType(0), DType(alpha)); const TBlob column_indexes = input.aux_data(csr::kIdx); const size_t item_count = column_indexes.Size(); @@ -236,11 +278,23 @@ class BinaryScalarOp : public UnaryOp { using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - const double alpha = nnvm::get(attrs.parsed); + TBlob temp_tblob; + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + bool scalar_is_int = param.is_int; + const double alpha = param.scalar; MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + if ((common::is_int(inputs[0].type_flag_) && !scalar_is_int) || + (inputs[0].type_flag_ == kBool)) { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(Shape1(inputs[0].Size()), s); + temp_tblob = TBlob(temp_tensor); + CastCompute(attrs, ctx, {inputs[0]}, {kWriteTo}, {temp_tblob}); + } else { + temp_tblob = inputs[0]; + } MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { mxnet_op::Kernel, xpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); + s, inputs[0].Size(), outputs[0].dptr(), temp_tblob.dptr(), DType(alpha)); }); }); } @@ -256,7 +310,8 @@ class BinaryScalarOp : public UnaryOp { using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - const double alpha = nnvm::get(attrs.parsed); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; MXNET_INT_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { mxnet_op::Kernel, xpu>::Launch( @@ -276,12 +331,23 @@ class BinaryScalarOp : public UnaryOp { using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - const double alpha = nnvm::get(attrs.parsed); - MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); - }); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + bool scalar_is_int = param.is_int; + const double alpha = param.scalar; + TBlob temp_tblob; + if (common::is_int(inputs[0].type_flag_) && !scalar_is_int) { + Tensor temp_tensor = + ctx.requested[0].get_space_typed(Shape1(inputs[0].Size()), s); + temp_tblob = TBlob(temp_tensor); + CastCompute(attrs, ctx, {inputs[0]}, {kWriteTo}, {temp_tblob}); + } else { + temp_tblob = inputs[0]; + } + MSHADOW_TYPE_SWITCH_WITH_BOOL(temp_tblob.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + 
mxnet_op::Kernel, xpu>::Launch( + s, inputs[0].Size(), outputs[0].dptr(), temp_tblob.dptr(), DType(alpha)); + }); }); } @@ -345,7 +411,8 @@ class BinaryScalarOp : public UnaryOp { using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - const double alpha = nnvm::get(attrs.parsed); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { mxnet::op::mxnet_op::Kernelparsed = dmlc::stod(attrs->dict["scalar"]); \ - }) \ - .set_attr("FInferShape", ElemwiseShape<1, 1>) \ - .set_attr("FInferType", ElemwiseType<1, 1>) \ - .set_attr("FInplaceOption", \ - [](const NodeAttrs& attrs){ \ - return std::vector >{{0, 0}}; \ - }) \ - .add_argument("data", "NDArray-or-Symbol", "source input") \ - .add_argument("scalar", "float", "scalar input") +#define MXNET_OPERATOR_REGISTER_BINARY_SCALAR(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr_parser(ParamParser) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferType", NumpyBinaryScalarType) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{0, 0}}; \ + }) \ + .set_attr("FResourceRequest", \ + [](const NodeAttrs& attrs) { \ + return std::vector{ResourceRequest::kTempSpace}; \ + }) \ + .add_argument("data", "NDArray-or-Symbol", "source input") \ + .add_arguments(NumpyBinaryScalarParam::__FIELDS__()) } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc index 13014b35c2fe..dc11593255e7 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc @@ -28,22 +28,24 @@ #include "./elemwise_binary_scalar_op.h" #define MXNET_OPERATOR_REGISTER_BINARY_WITH_SCALAR_SUPPORT_WITH_DENSE_RESULT(name) \ - NNVM_REGISTER_OP(name) \ - .set_num_inputs(1) \ - .set_num_outputs(1) \ - .set_attr_parser([](NodeAttrs* attrs) { \ - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); \ - }) \ - .set_attr("FInferShape", ElemwiseShape<1, 1>) \ - .set_attr("FInferType", ElemwiseType<1, 1>) \ - .set_attr("FInferStorageType", \ - BinaryScalarStorageTypeWithDenseResultStorageType) \ - .set_attr("FInplaceOption", \ - [](const NodeAttrs& attrs){ \ - return std::vector >{{0, 0}}; \ - }) \ - .add_argument("data", "NDArray-or-Symbol", "source input") \ - .add_argument("scalar", "float", "scalar input") + NNVM_REGISTER_OP(name) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr_parser(ParamParser) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferType", NumpyBinaryScalarType) \ + .set_attr("FInferStorageType", \ + BinaryScalarStorageTypeWithDenseResultStorageType) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{0, 0}}; \ + }) \ + .set_attr("FResourceRequest", \ + [](const NodeAttrs& attrs) { \ + return std::vector{ResourceRequest::kTempSpace}; \ + }) \ + .add_argument("data", "NDArray-or-Symbol", "source input") \ + .add_arguments(NumpyBinaryScalarParam::__FIELDS__()) namespace mxnet { namespace op { @@ -65,7 +67,8 @@ static bool BinaryScalarStorageTypeWithDenseResultStorageType(const NodeAttrs& a const NDArrayStorageType instype = static_cast(in_attrs->at(0)); const auto dispatch_ex = invalid_ctx ? 
DispatchMode::kFComputeFallback : DispatchMode::kFComputeEx; - const double alpha = nnvm::get(attrs.parsed); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { dispatched = storage_type_assign(&out_attrs[0], kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); @@ -189,8 +192,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_rdiv_scalar) .add_alias("_RDivScalar"); MXNET_OPERATOR_REGISTER_BINARY(_backward_rdiv_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward< cpu, mshadow_op::rdiv_grad>); @@ -200,8 +203,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_mod_scalar) .add_alias("_ModScalar"); MXNET_OPERATOR_REGISTER_BINARY(_backward_mod_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward< cpu, mshadow_op::mod_grad>); @@ -211,8 +214,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_rmod_scalar) .add_alias("_RModScalar"); MXNET_OPERATOR_REGISTER_BINARY(_backward_rmod_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward< cpu, mshadow_op::rmod_grad>); diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc index 7dd8cf41c59e..c08cdcdf0dc8 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc @@ -36,8 +36,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_maximum_scalar) .add_alias("_npi_maximum_scalar"); MXNET_OPERATOR_REGISTER_BINARY(_backward_maximum_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward); MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_minimum_scalar) @@ -47,8 +47,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_minimum_scalar) .add_alias("_npi_minimum_scalar"); MXNET_OPERATOR_REGISTER_BINARY(_backward_minimum_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward); MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_power_scalar) @@ -57,8 +57,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_power_scalar) .add_alias("_PowerScalar"); MXNET_OPERATOR_REGISTER_BINARY(_backward_power_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward< cpu, mshadow_op::power_grad>); @@ -69,8 +69,8 @@ 
MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_rpower_scalar) .add_alias("_RPowerScalar"); MXNET_OPERATOR_REGISTER_BINARY(_backward_rpower_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward< cpu, mshadow_op::rpower_grad>); @@ -82,8 +82,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_hypot_scalar) .add_alias("_HypotScalar"); MXNET_OPERATOR_REGISTER_BINARY(_backward_hypot_scalar) -.add_argument("scalar", "float", "scalar value") -.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); }) +.add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward< cpu, mshadow_op::hypot_grad_left>); @@ -109,13 +109,7 @@ Example:: )code" ADD_FILELINE) .set_num_inputs(1) .set_num_outputs(1) -.set_attr_parser([](NodeAttrs* attrs) { - if (attrs->dict.find("scalar") != attrs->dict.end()) { - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); - } else { - attrs->parsed = 1.0; - } - }) +.set_attr_parser(ParamParser) .set_attr("FInferShape", ElemwiseShape<1, 1>) .set_attr("FInferType", ElemwiseType<1, 1>) .set_attr("FInplaceOption", @@ -128,13 +122,7 @@ Example:: .set_attr("FGradient", ElemwiseGradUseIn{ "_backward_smooth_l1" }); MXNET_OPERATOR_REGISTER_BINARY(_backward_smooth_l1) - .set_attr_parser([](NodeAttrs *attrs) { - if (attrs->dict.find("scalar") != attrs->dict.end()) { - attrs->parsed = dmlc::stod(attrs->dict["scalar"]); - } else { - attrs->parsed = 1.0; - } -}) +.set_attr_parser(ParamParser) .set_attr("FCompute", BinaryScalarOp::Backward); diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc index 17e76153ebb2..0594997877d3 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc @@ -46,7 +46,8 @@ static bool BinaryScalarLogicStorageType(const nnvm::NodeAttrs& attrs, const auto in_stype = in_attrs->at(0); auto &out_stype = out_attrs->at(0); bool dispatched = false; - const double alpha = nnvm::get(attrs.parsed); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; bool is_sparse = OP::Map(static_cast(0), alpha) == 0; if (!dispatched && in_stype == kDefaultStorage) { // dns -> dns diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 0f35dece5354..4811b341b060 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -2488,8 +2488,9 @@ def hybrid_forward(self, F, a, b, *args, **kwargs): assert y.shape == np_out.shape assert_almost_equal(y.asnumpy(), np_out.astype(y.dtype), rtol=rtol, atol=atol, use_broadcast=False, equal_nan=True) - if lgrad: + if (ltype in itypes) and (rtype in itypes): + continue y.backward() if ltype not in itypes: assert_almost_equal(mx_test_x1.grad.asnumpy(), @@ -2513,10 +2514,13 @@ def hybrid_forward(self, F, a, b, *args, **kwargs): use_broadcast=False, equal_nan=True) funcs = { - 'add': (-1.0, 1.0, None, None), - 'subtract': (-1.0, 1.0, None, None), + 'add': (-1.0, 1.0, lambda y, x1, x2: _np.ones(y.shape), + lambda y, x1, x2: _np.ones(y.shape)), + 'subtract': (-1.0, 1.0, lambda y, x1, x2: _np.ones(y.shape), + lambda y, x1, x2: _np.ones(y.shape) * -1), 'multiply': (-1.0, 1.0, lambda y, 
x1, x2: _np.broadcast_to(x2, y.shape),
                                 lambda y, x1, x2: _np.broadcast_to(x1, y.shape)),
+        'mod': (1.0, 5.0, None, None),
         'power': (1.0, 3.0, lambda y, x1, x2: _np.power(x1, x2 - 1.0) * x2,
                             lambda y, x1, x2: _np.power(x1, x2) * _np.log(x1)),
     }
@@ -2530,8 +2534,6 @@ def hybrid_forward(self, F, a, b, *args, **kwargs):
                    ((2, 3), ()),
                    ((), (2, 3))]
 
-    itypes = [np.bool, np.int8, np.int32, np.int64]
-    ftypes = [np.float16, np.float32, np.float64]
     for func, func_data in funcs.items():
         low, high, lgrad, rgrad = func_data
         for lshape, rshape in shape_pairs:
@@ -2544,6 +2546,13 @@ def hybrid_forward(self, F, a, b, *args, **kwargs):
                     continue
                 check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2)
+            if func == 'subtract' or func == 'mod':
+                continue
+            for type1, type2 in itertools.product(itypes, itypes):
+                if type1 == type2:
+                    continue
+                check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2)
+
 
 @with_seed()
 @use_np
@@ -3085,52 +3094,56 @@ def get_new_shape(shape, axis):
             shape_lst[axis] = random.randint(0, 3)
         return tuple(shape_lst)
 
-    for shape in [(0, 0), (2, 3), (2, 1, 3)]:
-        for hybridize in [True, False]:
-            for axis in [0, 1, None]:
-                for grad_req in ['write', 'add', 'null']:
-                    # test gluon
-                    test_concat = TestConcat(axis=axis)
-                    if hybridize:
-                        test_concat.hybridize()
+    shapes = [(0, 0), (2, 3), (2, 1, 3)]
+    hybridizes = [True, False]
+    axes = [0, 1, None]
+    grad_reqs = ['write', 'add', 'null']
+    dtypes = [np.float32, np.float64, np.bool]
+    combinations = itertools.product(shapes, hybridizes, axes, grad_reqs, dtypes)
 
-                    grad_req_c = grad_req
-                    grad_req_d = grad_req
-                    if grad_req == 'null':
-                        ide = random.randint(0, 2)
-                        grad_req_c = 'write' if ide == 0 else 'add'
-                        grad_req_c = 'write' if ide == 1 else 'add'
+    for shape, hybridize, axis, grad_req, dtype in combinations:
+        # test gluon
+        test_concat = TestConcat(axis=axis)
+        if hybridize:
+            test_concat.hybridize()
 
-                    a = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
-                    a.attach_grad(grad_req)
-                    b = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
-                    b.attach_grad(grad_req)
-                    c = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
-                    c.attach_grad(grad_req_c)
-                    d = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
-                    d.attach_grad(grad_req_d)
-                    expected_ret = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)
+        grad_req_c = grad_req
+        grad_req_d = grad_req
+        if grad_req == 'null':
+            ide = random.randint(0, 2)
+            grad_req_c = 'write' if ide == 0 else 'add'
+            grad_req_d = 'write' if ide == 1 else 'add'
 
-                    with mx.autograd.record():
-                        y = test_concat(a, b, c, d)
+        a = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
+        a.attach_grad(grad_req)
+        b = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
+        b.attach_grad(grad_req)
+        c = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
+        c.attach_grad(grad_req_c)
+        d = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
+        d.attach_grad(grad_req_d)
+        expected_ret = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)
 
-                    assert y.shape == expected_ret.shape
-                    assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)
+        with mx.autograd.record():
+            y = test_concat(a, b, c, d)
 
-                    y.backward()
-                    if grad_req != 'null':
-                        assert_almost_equal(a.grad.asnumpy(), _np.ones(a.shape), rtol=1e-3, atol=1e-5)
-                    if grad_req != 'null':
-                        assert_almost_equal(b.grad.asnumpy(), _np.ones(b.shape), rtol=1e-3, atol=1e-5)
-                    if grad_req_c != 'null':
-                        assert_almost_equal(c.grad.asnumpy(), _np.ones(c.shape), rtol=1e-3, atol=1e-5)
-                    if grad_req_d != 'null':
-                        assert_almost_equal(d.grad.asnumpy(), _np.ones(d.shape), rtol=1e-3, atol=1e-5)
+        assert y.shape == expected_ret.shape
+        assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)
 
-                    # test imperative
-                    mx_out = np.concatenate([a, b, c, d], axis=axis)
-                    np_out = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)
-                    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
+        y.backward()
+        if grad_req != 'null':
+            assert_almost_equal(a.grad.asnumpy(), _np.ones(a.shape), rtol=1e-3, atol=1e-5)
+        if grad_req != 'null':
+            assert_almost_equal(b.grad.asnumpy(), _np.ones(b.shape), rtol=1e-3, atol=1e-5)
+        if grad_req_c != 'null':
+            assert_almost_equal(c.grad.asnumpy(), _np.ones(c.shape), rtol=1e-3, atol=1e-5)
+        if grad_req_d != 'null':
+            assert_almost_equal(d.grad.asnumpy(), _np.ones(d.shape), rtol=1e-3, atol=1e-5)
+
+        # test imperative
+        mx_out = np.concatenate([a, b, c, d], axis=axis)
+        np_out = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)
+        assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
 
 @with_seed()
diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py
index 8e4fe11905cf..f9d6cea578cd 100644
--- a/tests/python/unittest/test_symbol.py
+++ b/tests/python/unittest/test_symbol.py
@@ -484,9 +484,7 @@ def check_cse_on_symbol(sym, expected_savings, check_data, **kwargs):
     arr2 = mx.random.uniform(shape=shape)
     arr3 = mx.random.uniform(shape=shape)
 
-    check_cse_on_symbol((a+5) + (a+5), expected_savings=1, check_data=True, a=arr1, b=arr2)
     check_cse_on_symbol((a+1) + (a+2), expected_savings=0, check_data=True, a=arr1, b=arr2)
-    check_cse_on_symbol((1+a) + (a+1), expected_savings=1, check_data=True, a=arr1, b=arr2)
     check_cse_on_symbol((a+b) + (a+b), expected_savings=1, check_data=True, a=arr1, b=arr2)
     check_cse_on_symbol(((a+b)+c) +((a+b)+c), expected_savings=2, check_data=True, a=arr1, b=arr2, c=arr3)

From 22de01579d8070a4ce2b234a0aebdaa9cb567369 Mon Sep 17 00:00:00 2001
From: Leonard Lausen
Date: Fri, 3 Apr 2020 20:20:47 +0000
Subject: [PATCH 2/2] Fix Windows GPU CI (#17962)

Update Windows CI to use VS 2019 and enable the x64 toolchain. Previously we
were using an older 32-bit toolchain, which caused OOM errors during linking.
Switching to the x64 toolchain on the older VS version previously used by the
CI was attempted in #17912 and did not work.
Update to CUDA 10.2, as it is required by VS 2019.
Switch to ninja-build on Windows to speed up the build, as ninja-build is now
preinstalled.
Remove the logic that installs cmake 3.16 on every PR, as cmake 3.17 is now
preinstalled.
Add build retries to work around CUDA thrust + VS 2019 flakiness.
Co-authored-by: vexilligera --- ci/build_windows.py | 68 ++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/ci/build_windows.py b/ci/build_windows.py index 2590d211c671..b9c17a855954 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -33,13 +33,15 @@ import zipfile from distutils.dir_util import copy_tree from enum import Enum -from subprocess import check_call +from subprocess import check_call, call from util import * KNOWN_VCVARS = { + # https://gitlab.kitware.com/cmake/cmake/issues/18920 'VS 2015': r'C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64\vcvarsx86_amd64.bat', - 'VS 2017': r'C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsx86_amd64.bat' + 'VS 2017': r'C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsx86_amd64.bat', + 'VS 2019': r'C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat', } @@ -54,6 +56,8 @@ class BuildFlavour(Enum): CMAKE_FLAGS = { 'WIN_CPU': ( + '-DCMAKE_C_COMPILER=cl ' + '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' '-DENABLE_CUDA_RTC=OFF ' @@ -67,6 +71,8 @@ class BuildFlavour(Enum): '-DCMAKE_BUILD_TYPE=Release') , 'WIN_CPU_MKLDNN': ( + '-DCMAKE_C_COMPILER=cl ' + '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' '-DENABLE_CUDA_RTC=OFF ' @@ -80,6 +86,8 @@ class BuildFlavour(Enum): '-DCMAKE_BUILD_TYPE=Release') , 'WIN_CPU_MKLDNN_MKL': ( + '-DCMAKE_C_COMPILER=cl ' + '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' '-DENABLE_CUDA_RTC=OFF ' @@ -93,6 +101,8 @@ class BuildFlavour(Enum): '-DCMAKE_BUILD_TYPE=Release') , 'WIN_CPU_MKL': ( + '-DCMAKE_C_COMPILER=cl ' + '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' '-DENABLE_CUDA_RTC=OFF ' @@ -106,6 +116,8 @@ class BuildFlavour(Enum): '-DCMAKE_BUILD_TYPE=Release') , 'WIN_GPU': ( + '-DCMAKE_C_COMPILER=cl ' + '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=ON ' '-DUSE_CUDNN=ON ' '-DENABLE_CUDA_RTC=ON ' @@ -120,6 +132,8 @@ class BuildFlavour(Enum): '-DCMAKE_BUILD_TYPE=Release') , 'WIN_GPU_MKLDNN': ( + '-DCMAKE_C_COMPILER=cl ' + '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=ON ' '-DUSE_CUDNN=ON ' '-DENABLE_CUDA_RTC=ON ' @@ -140,38 +154,40 @@ def windows_build(args): logging.info("Using vcvars environment:\n{}".format(args.vcvars)) path = args.output - os.makedirs(path, exist_ok=True) - mxnet_root = get_mxnet_root() - logging.info("Found MXNet root: {}".format(mxnet_root)) + # cuda thrust + VS 2019 is flaky: try multiple times if fail + MAXIMUM_TRY = 5 + build_try = 0 + + while build_try < MAXIMUM_TRY: + if os.path.exists(path): + shutil.rmtree(path) + os.makedirs(path, exist_ok=True) - url = 'https://github.com/Kitware/CMake/releases/download/v3.16.1/cmake-3.16.1-win64-x64.zip' - with tempfile.TemporaryDirectory() as tmpdir: - cmake_file_path = download_file(url, tmpdir) - with zipfile.ZipFile(cmake_file_path, 'r') as zip_ref: - # Create $tmpdir\cmake-3.16.1-win64-x64\bin\cmake.exe - zip_ref.extractall(tmpdir) + mxnet_root = get_mxnet_root() + logging.info("Found MXNet root: {}".format(mxnet_root)) with remember_cwd(): os.chdir(path) - cmd = "\"{}\" && {} -G \"NMake Makefiles JOM\" {} {}".format( - args.vcvars, - os.path.join(tmpdir, 'cmake-3.16.1-win64-x64', 'bin', 'cmake.exe'), - CMAKE_FLAGS[args.flavour], mxnet_root) + cmd = "\"{}\" && cmake -GNinja {} {}".format(args.vcvars, + CMAKE_FLAGS[args.flavour], + mxnet_root) logging.info("Generating project with 
CMake:\n{}".format(cmd)) check_call(cmd, shell=True) - cmd = "\"{}\" && jom".format(args.vcvars) - logging.info("Building with jom:\n{}".format(cmd)) + cmd = "\"{}\" && ninja".format(args.vcvars) + logging.info("Building:\n{}".format(cmd)) t0 = int(time.time()) - check_call(cmd, shell=True) - - logging.info( - "Build flavour: {} complete in directory: \"{}\"".format( - args.flavour, os.path.abspath(path))) - logging.info("Build took {}".format( - datetime.timedelta(seconds=int(time.time() - t0)))) + ret = call(cmd, shell=True) + + if ret != 0: + build_try += 1 + logging.info("{} build(s) have failed".format(build_try)) + else: + logging.info("Build flavour: {} complete in directory: \"{}\"".format(args.flavour, os.path.abspath(path))) + logging.info("Build took {}".format(datetime.timedelta(seconds=int(time.time() - t0)))) + break windows_package(args) @@ -233,7 +249,7 @@ def main(): parser.add_argument("--vcvars", help="vcvars batch file location, typically inside vs studio install dir", - default=KNOWN_VCVARS['VS 2015'], + default=KNOWN_VCVARS['VS 2019'], type=str) parser.add_argument("--arch", @@ -258,7 +274,7 @@ def main(): if 'OpenCV_DIR' not in os.environ: os.environ["OpenCV_DIR"] = "C:\\Program Files\\OpenCV-v3.4.1\\build" if 'CUDA_PATH' not in os.environ: - os.environ["CUDA_PATH"] = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.2" + os.environ["CUDA_PATH"] = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2" if 'MKL_ROOT' not in os.environ: os.environ["MKL_ROOT"] = "C:\\Program Files (x86)\\IntelSWTools\\compilers_and_libraries\\windows\\mkl" windows_build(args)
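
For reference, below is a minimal sketch of the retry-on-failure pattern that the
ci/build_windows.py change above adopts for the flaky CUDA thrust + VS 2019 builds.
It is illustrative only: the command, retry budget, script name, and function name
are assumptions made for this example and are not taken from the patch.

# retry_build_sketch.py -- illustrative sketch, not part of the patch series
import logging
import subprocess

MAX_TRIES = 3  # assumed retry budget for illustration; the patch defines its own limit


def build_with_retries(cmd):
    """Run `cmd` up to MAX_TRIES times and return True on the first success."""
    for attempt in range(1, MAX_TRIES + 1):
        ret = subprocess.call(cmd, shell=True)
        if ret == 0:
            logging.info("build succeeded on attempt %d", attempt)
            return True
        logging.warning("attempt %d failed with exit code %d", attempt, ret)
    return False


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    build_with_retries('ninja')  # hypothetical build command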