Fused clip grad (#9582)

Eager multi-tensor fused_clip_grad kernel --------- Co-authored-by: oneflow-ci-bot <ci-bot@oneflow.org>
Oneflow-Inc · Jun 9, 2023 · f72ebf6 · f72ebf6
1 parent 1cbe5f5
commit f72ebf6
Show file tree

Hide file tree

Showing 10 changed files with 408 additions and 2 deletions.
diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml
@@ -3482,3 +3482,7 @@
     "Tensor (Tensor x, Scalar other) => ZetaTensorScalar",
   ]
   bind_python: True
+
+- name: "fused_clip_grad"
+  signature: "Tensor (TensorTuple model_diff, Float max_norm, Float norm_type) => FusedClipGrad"
+  bind_python: True
diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp
@@ -5345,6 +5345,30 @@ class FusedScaleMaskBiasSoftmaxGradFunctor {
   std::shared_ptr<OpExpr> op_;
 };
 
+class FusedClipGradFunctor {
+ public:
+  FusedClipGradFunctor() {
+    op_.resize(kMaxInputCount /*the maximum number of inputs*/);
+    for (int n = 0; n < op_.size(); ++n) {
+      op_[n] = CHECK_JUST(
+          one::OpBuilder("fused_clip_grad").Input("model_diff", n + 1).Output("out").Build());
+    }
+  }
+
+  Maybe<Tensor> operator()(const TensorTuple& model_diff, const float& max_norm,
+                           const float& norm_type) const {
+    auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("max_norm", "norm_type");
+    attrs.SetAllAttrs(max_norm, norm_type);
+    const int64_t input_size = model_diff.size();
+    CHECK_LE_OR_RETURN(input_size, kMaxInputCount)
+        << Error::RuntimeError() << "model_diff size should not be greater than 128";
+    return JUST(OpInterpUtil::Dispatch<Tensor>(*op_[input_size - 1], model_diff, attrs));
+  }
+
+ private:
+  std::vector<std::shared_ptr<OpExpr>> op_;
+};
+
 class NonContiguousBinaryOpFunctor {
  public:
   NonContiguousBinaryOpFunctor() {
@@ -5530,6 +5554,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
   m.add_functor<impl::NonContiguousBinaryOpFunctor>("NonContiguousBinaryOp");
   m.add_functor<impl::NonContiguousBinaryOpGradFunctor>("NonContiguousBinaryOpGrad");
   m.add_functor<impl::MultiTensorYoloV5WeightUpdateFunctor>("MultiTensorYoloV5WeightUpdate");
+  m.add_functor<impl::FusedClipGradFunctor>("FusedClipGrad");
 }
 
 }  // namespace functional

diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td
@@ -3336,6 +3336,24 @@ def OneFlow_FusedCodegeexQkvReshapeOp : OneFlow_BaseOp<"fused_codegeex_qkv_resha
   let has_data_type_infer_fn = 1;
 }
 
+def OneFlow_FusedClipGradOp : OneFlow_BaseOp<"fused_clip_grad", [NoGrad, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
+  let input = (ins
+    Variadic<OneFlow_Tensor>:$model_diff
+  );
+  let output = (outs
+    OneFlow_Tensor:$out
+  );
+  let attrs = (ins
+    DefaultValuedAttr<F32Attr, "2.">:$max_norm,
+    DefaultValuedAttr<F32Attr, "1.">:$norm_type
+  );
+  let has_logical_tensor_desc_infer_fn = 1;
+  let has_physical_tensor_desc_infer_fn = 1;
+  let has_get_sbp_fn = 1;
+  let has_data_type_infer_fn = 1;
+  let has_input_arg_modify_fn = 1;
+}
+
 def OneFlow_NonContiguousBinaryOp : OneFlow_BaseOp<"noncontiguous_binary_op", [NoMemoryEffect, SupportNonContiguous, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
   let input = (ins
     OneFlow_Tensor:$lhs,

diff --git a/oneflow/user/kernels/fused_clip_grad.cu b/oneflow/user/kernels/fused_clip_grad.cu
@@ -0,0 +1,114 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/ep/cuda/cuda_stream.h"
+#include "oneflow/core/device/cuda_util.h"
+#include "oneflow/user/kernels/fused_clip_grad.h"
+
+namespace oneflow {
+
+namespace {
+
+constexpr int64_t kMultiReduceScaleMulPackSize = 64;
+
+template<typename T>
+struct MultiClipGradParamPack {
+  MultiClipGradParam<T> params[kMultiReduceScaleMulPackSize];
+  size_t size;
+};
+
+size_t InferFusedClipGradTempStorageSize(user_op::InferContext* ctx) {
+  auto input_size = ctx->input_size("model_diff");
+  if (input_size == 0) { return 0; }
+  int64_t max_elem_cnt = 0;
+  int64_t pack_size = 0;
+  int32_t num_blocks = 0;
+  for (size_t i = 0; i < input_size; ++i) {
+    int64_t elem_cnt = ctx->InputShape("model_diff", i).elem_cnt();
+    max_elem_cnt = std::max(max_elem_cnt, elem_cnt);
+    pack_size++;
+    if (pack_size == kMultiReduceScaleMulPackSize || i == input_size - 1) {
+      CHECK_LT(max_elem_cnt, std::numeric_limits<int32_t>::max());
+      num_blocks += BlocksNum4ThreadsNum(static_cast<int32_t>(max_elem_cnt));
+      max_elem_cnt = 0;
+      pack_size = 0;
+    }
+  }
+  CHECK_LT(num_blocks, kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock)
+      << "Too much blocks needed for computing " << ctx->op_name() << ", should be less than "
+      << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock
+      << ", but got " << num_blocks;
+  size_t elem_size = GetSizeOfDataType(ctx->InputDType("model_diff", 0));
+  return GetCudaAlignedSize(num_blocks * elem_size * 2);
+}
+
+template<typename T>
+__global__ void MultiBlockClipGradGpu(MultiClipGradParamPack<T> pack_params, T* scale,
+                                      const float norm_type, const float max_norm,
+                                      const ClipGradType clip_grad_type,
+                                      const bool scale_writable) {
+  T t = *scale;
+  if (clip_grad_type == ClipGradType::ZeroType) {
+    t = static_cast<T>(t > 0);
+  } else if (clip_grad_type == ClipGradType::PowerType) {
+    t = std::pow(t, 1.f / norm_type);
+  }
+  if (scale_writable && blockDim.x * blockIdx.x + threadIdx.x == 0) { *scale = t; }
+  t = max_norm / (t + 1e-6);
+  if (t >= 1.) { return; }
+  for (int i = 0; i < pack_params.size; ++i) {
+    auto& param = pack_params.params[i];
+    CUDA_1D_KERNEL_LOOP(j, param.size) { param.data[j] *= t; }
+  }
+}
+
+}  // namespace
+
+template<typename T>
+struct MultiClipGrad<DeviceType::kCUDA, T> {
+  void operator()(ep::Stream* stream, std::vector<MultiClipGradParam<T>>& params, T* scale,
+                  const float norm_type, const float max_norm, const ClipGradType clip_grad_type) {
+    int32_t total_num_blocks = 0;
+    for (size_t i = 0; i < params.size(); i += kMultiReduceScaleMulPackSize) {
+      MultiClipGradParamPack<T> pack_params{};
+      size_t max_elem_cnt = 0;
+      pack_params.size = std::min<size_t>(kMultiReduceScaleMulPackSize, params.size() - i);
+      for (size_t j = 0; j < pack_params.size; ++j) {
+        pack_params.params[j] = params[i + j];
+        max_elem_cnt = std::max<size_t>(max_elem_cnt, pack_params.params[j].size);
+      }
+      int32_t num_blocks = BlocksNum4ThreadsNum(max_elem_cnt);
+      bool scale_writable = static_cast<bool>(i + kMultiReduceScaleMulPackSize >= params.size());
+      MultiBlockClipGradGpu<T>
+          <<<num_blocks, kCudaThreadsNumPerBlock, 0, stream->As<ep::CudaStream>()->cuda_stream()>>>(
+              pack_params, scale, norm_type, max_norm, clip_grad_type, scale_writable);
+      total_num_blocks += num_blocks;
+    }
+  }
+};
+
+#define REGISTER_FUSED_CLIP_GRAD_KERNEL(device, dtype)                                         \
+  REGISTER_USER_KERNEL("fused_clip_grad")                                                      \
+      .SetCreateFn<FusedClipGradKernel<device, dtype>>()                                       \
+      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                    \
+                       && (user_op::HobDataType("model_diff", 0) == GetDataType<dtype>::value) \
+                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))       \
+      .SetInferTmpSizeFn(InferFusedClipGradTempStorageSize);
+
+REGISTER_FUSED_CLIP_GRAD_KERNEL(DeviceType::kCUDA, float);
+REGISTER_FUSED_CLIP_GRAD_KERNEL(DeviceType::kCUDA, double);
+
+}  // namespace oneflow
diff --git a/oneflow/user/kernels/fused_clip_grad.h b/oneflow/user/kernels/fused_clip_grad.h
@@ -0,0 +1,99 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#ifndef ONEFLOW_USER_KERNELS_FUSED_CLIP_GRAD_H_
+#define ONEFLOW_USER_KERNELS_FUSED_CLIP_GRAD_H_
+
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/kernel/cuda_graph_support.h"
+#include "oneflow/user/kernels/multi_reduce_kernel_util.h"
+#include "oneflow/user/kernels/fused_clip_grad_util.h"
+
+namespace oneflow {
+
+template<DeviceType device_type, typename T>
+class FusedClipGradKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport {
+ public:
+  FusedClipGradKernel() = default;
+  ~FusedClipGradKernel() override = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+    T* out_ptr = out->mut_dptr<T>();
+    T* temp = (ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0))->mut_dptr<T>();
+    const int32_t input_size = ctx->input_size("model_diff");
+    const float max_norm = ctx->Attr<float>("max_norm");
+    const float norm_type = ctx->Attr<float>("norm_type");
+
+    std::vector<MultiReduceParam<T>> params;
+    params.resize(input_size);
+    for (size_t i = 0; i < input_size; ++i) {
+      const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("model_diff", i);
+      params[i].size = x->shape_view().elem_cnt();
+      params[i].data = x->dptr<T>();
+    }
+    if (norm_type == 0) {
+      PowByZero<T> func{};
+      MultiReduce<device_type, T, decltype(func), BinaryAdd<T>> reduce_add{};
+      reduce_add(ctx->stream(), func, params, GetZeroVal<T>(), out_ptr, temp);
+    } else if (norm_type == INFINITY) {
+      Abs<T> func{};
+      MultiReduce<device_type, T, decltype(func), BinaryMax<T>> reduce_max{};
+      reduce_max(ctx->stream(), func, params, GetZeroVal<T>(), out_ptr, temp);
+    } else if (norm_type == -INFINITY) {
+      Abs<T> func{};
+      MultiReduce<device_type, T, decltype(func), BinaryMin<T>> reduce_min{};
+      reduce_min(ctx->stream(), func, params, std::numeric_limits<T>::max(), out_ptr, temp);
+    } else if (norm_type == 1) {
+      Abs<T> func{};
+      MultiReduce<device_type, T, decltype(func), BinaryAdd<T>> reduce_sum{};
+      reduce_sum(ctx->stream(), func, params, GetZeroVal<T>(), out_ptr, temp);
+    } else if (norm_type == 2) {
+      Square<T> func{};
+      MultiReduce<device_type, T, decltype(func), BinaryAdd<T>> reduce_sum{};
+      reduce_sum(ctx->stream(), func, params, GetZeroVal<T>(), out_ptr, temp);
+    } else {
+      AbsPow<T> func{norm_type};
+      MultiReduce<device_type, T, decltype(func), BinaryAdd<T>> reduce_sum{};
+      reduce_sum(ctx->stream(), func, params, GetZeroVal<T>(), out_ptr, temp);
+    }
+
+    std::vector<MultiClipGradParam<T>> mut_params;
+    mut_params.resize(input_size);
+    for (size_t i = 0; i < input_size; ++i) {
+      user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("model_diff", i);
+      mut_params[i].size = x->shape_view().elem_cnt();
+      mut_params[i].data = x->mut_dptr<T>();
+    }
+    MultiClipGrad<device_type, T> multi_clip_grad{};
+    if (norm_type == 0) {
+      multi_clip_grad(ctx->stream(), mut_params, out_ptr, norm_type, max_norm,
+                      ClipGradType::ZeroType);
+    } else if (std::abs(norm_type) == INFINITY || norm_type == 1) {
+      multi_clip_grad(ctx->stream(), mut_params, out_ptr, norm_type, max_norm,
+                      ClipGradType::OtherType);
+    } else {
+      multi_clip_grad(ctx->stream(), mut_params, out_ptr, norm_type, max_norm,
+                      ClipGradType::PowerType);
+    }
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; }
+};
+
+}  // namespace oneflow
+
+#endif  // ONEFLOW_USER_KERNELS_FUSED_CLIP_GRAD_H_
diff --git a/oneflow/user/kernels/fused_clip_grad_util.h b/oneflow/user/kernels/fused_clip_grad_util.h
@@ -0,0 +1,46 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#ifndef ONEFLOW_USER_KERNELS_FUSED_CLIP_GRAD_UTIL_H_
+#define ONEFLOW_USER_KERNELS_FUSED_CLIP_GRAD_UTIL_H_
+
+#include "oneflow/core/common/data_type.h"
+#include "oneflow/core/common/device_type.h"
+#include "oneflow/core/common/device_type.pb.h"
+#include "oneflow/core/ep/include/stream.h"
+
+namespace oneflow {
+
+template<typename T>
+struct MultiClipGradParam {
+  T* data;
+  size_t size;
+};
+
+enum ClipGradType : int {
+  ZeroType,
+  PowerType,
+  OtherType,
+};
+
+template<DeviceType device_type, typename T>
+struct MultiClipGrad {
+  void operator()(ep::Stream* stream, std::vector<MultiClipGradParam<T>>& params, T* scale,
+                  const float norm_type, const float max_norm, const ClipGradType clip_grad_type);
+};
+
+}  // namespace oneflow
+
+#endif  // ONEFLOW_USER_KERNELS_FUSED_CLIP_GRAD_UTIL_H_