From 39903f72b5b3904126b07e818b0f6bebfb2c8c4c Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 6 Jun 2022 16:23:34 +0800 Subject: [PATCH 01/22] Replace ReduceAmax/Amax.part.cu with KP (#43202) --- .../reduce_ops/reduce_amax_op.part.cu | 19 ++-- .../reduce_ops/reduce_amin_op.part.cu | 19 ++-- paddle/fluid/operators/reduce_ops/reduce_op.h | 96 ++++++++++++++++++- paddle/phi/kernels/funcs/broadcast_function.h | 19 +++- .../phi/kernels/gpu/frobenius_norm_kernel.cu | 22 ++++- 5 files changed, 150 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu index 18c846bc2b4699..ed6df1e558bed6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu @@ -12,15 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_amax_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +template +using CUDAReduceMaxGradKernel = + ops::ReduceCudaAMaxAMinGradKernel; +REGISTER_OP_CUDA_KERNEL(reduce_amax_grad, CUDAReduceMaxGradKernel, + CUDAReduceMaxGradKernel, + CUDAReduceMaxGradKernel, + CUDAReduceMaxGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu index c7a26049634ce6..69854da3c4f259 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu @@ -12,15 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_amin_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +template +using CUDAReduceMinGradKernel = + ops::ReduceCudaAMaxAMinGradKernel; +REGISTER_OP_CUDA_KERNEL(reduce_amin_grad, CUDAReduceMinGradKernel, + CUDAReduceMinGradKernel, + CUDAReduceMinGradKernel, + CUDAReduceMinGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 322ef1fdff67ab..ff7429f75ebe3a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" #include "paddle/phi/kernels/funcs/math_function.h" - // only can include the headers in paddle/phi/api dirs #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" @@ -655,6 +654,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { bool reduce_all = context.Attr("reduce_all"); std::vector dims = context.Attr>("dim"); auto* in_x = context.Input("X"); + auto* d_out = context.Input(framework::GradVarName("Out")); auto* d_x = context.Output(framework::GradVarName("X")); @@ -685,12 +685,106 @@ class ReduceCudaGradKernel : public framework::OpKernel { if (out_dtype <= 0) { pt_out_dtype = d_out->dtype(); } + using MPType = typename kps::details::MPTypeTrait::Type; phi::ReduceGrad>( dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype, TransformOp(reduce_num)); } }; + +template +struct EqualFunctor { + inline T initial() { return static_cast(0.0f); } + + inline HOSTDEVICE T operator()(const T a, const T b) const { + return static_cast(a == b); + } +}; + +template +struct DivideFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } +}; + +template class TransformOp> +class ReduceCudaAMaxAMinGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + std::vector dims = context.Attr>("dim"); + auto* in_x = context.Input("X"); + auto* out_y = context.Input("Out"); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + auto out_dtype = context.Attr("in_dtype"); + auto pt_out_dtype = framework::TransToPhiDataType( + static_cast(out_dtype)); + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + auto& dev_ctx = context.cuda_device_context(); + + // make new tensor reduce_out + phi::DenseTensor new_y(out_y->type()); + new_y.ShareDataWith(*out_y); + new_y.Resize(phi::make_ddim(update_dims)); + + // make new tensor d_out + phi::DenseTensor new_dout(d_out->type()); + new_dout.ShareDataWith(*d_out); + new_dout.Resize(phi::make_ddim(update_dims)); + d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype()); + + auto new_in = paddle::experimental::MakePhiDenseTensor(*in_x); + auto new_in_tensor = new_in.get(); + + auto new_dx = paddle::experimental::MakePhiDenseTensor(*d_x); + auto new_dx_tensor = new_dx.get(); + + // make equal_out + phi::DenseTensor* equal_out = new phi::DenseTensor(); + equal_out->Resize(in_x->dims()); + dev_ctx.template Alloc(equal_out); + auto equal_out_tensor = *equal_out; + + // make new tensor equal_count + phi::DenseTensor* equal_count = new phi::DenseTensor(); + equal_count->Resize(phi::make_ddim(update_dims)); + dev_ctx.template Alloc(equal_count); + + // compute + // 1. equal_out = Equal(x, y) + std::vector equal_inputs = {&new_y, new_in_tensor}; + std::vector equal_outputs = {&equal_out_tensor}; + phi::funcs::BroadcastKernel( + dev_ctx, equal_inputs, &equal_outputs, 0, EqualFunctor()); + // 2. 
equal_count = reduceSum(equal_out) + using MPType = typename kps::details::MPTypeTrait::Type; + phi::funcs::ReduceKernel>( + dev_ctx, equal_out_tensor, equal_count, + kps::IdentityFunctor(), reduce_dims, false); + + // 3. dx = Div(dout, equal_out) + std::vector grad_inputs = {&equal_out_tensor, + equal_count}; + std::vector grad_outputs = {new_dx_tensor}; + phi::funcs::BroadcastKernel( + dev_ctx, grad_inputs, &grad_outputs, 0, DivideFunctor()); + delete equal_out; + delete equal_count; + } +}; #endif #endif diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 88b87c07c7615c..74e48f39185485 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -605,7 +605,22 @@ void ElementwiseCompute(const GPUContext &dev_ctx, dev_ctx, ins, &outs, axis, func); } -#endif +template +void DefaultElementwiseOperator(const DeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + int axis = -1) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + dev_ctx.template Alloc(z); + funcs::ElementwiseCompute(dev_ctx, x, y, axis, Functor(), z); +} + +#else template +void FrobeniusNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); + std::vector ins = {out}; + std::vector outs = {out}; + auto functor = funcs::CudaSqrtFunctor(); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); +} + +} // namespace phi PD_REGISTER_KERNEL( frobenius_norm, GPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {} From 398b96c6b887298e1e721b4e62d52480b37d6f63 Mon Sep 17 00:00:00 2001 From: SmirnovKol <31559413+SmirnovKol@users.noreply.github.com> Date: Mon, 6 Jun 2022 18:31:52 +0800 Subject: [PATCH 02/22] Update optimizer.py (#43201) --- python/paddle/optimizer/optimizer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index e3e7257f75705b..ec367c7c710eda 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -338,9 +338,6 @@ def set_state_dict(self, state_dict): adam.set_state_dict(opti_state_dict) ''' - if isinstance(self._learning_rate, LRScheduler): - self._learning_rate.set_dict(state_dict["LR_Scheduler"]) - if isinstance(self._learning_rate, LRScheduler): self._learning_rate.set_state_dict(state_dict["LR_Scheduler"]) From c22e1123091d7b6592b07a9f6acb1c8c108e271b Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Mon, 6 Jun 2022 19:06:42 +0800 Subject: [PATCH 03/22] [AutoParallel] fix gradient merge optimize parse (#43169) * fix gradient merge * bug fix * update annotation --- .../auto_parallel/parallelizer_v2.py | 6 +- .../passes/auto_parallel_gradient_merge.py | 70 ++++++++------- .../distributed_passes/CMakeLists.txt | 2 +- ...test_auto_parallel_gradient_merge_pass.py} | 88 +++++-------------- 4 files changed, 66 insertions(+), 100 deletions(-) rename python/paddle/fluid/tests/unittests/distributed_passes/{test_dist_gradient_merge_pass.py => test_auto_parallel_gradient_merge_pass.py} (72%) diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index ce543988ea4e1b..f02eb38f45877b 100644 --- 
a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -148,7 +148,7 @@ def _apply_pre_optimization(self, main_program, startup_program, loss, config) auto_parallel_recompute_pass.apply([main_program], [startup_program], - self._dist_context) + self._pass_context) def _apply_post_optimization(self, main_program, startup_program, rank, params_grads): @@ -162,7 +162,7 @@ def _apply_post_optimization(self, main_program, startup_program, rank, auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", config) auto_parallel_sharding_pass.apply([main_program], [startup_program], - self._dist_context) + self._pass_context) if self._strategy.gradient_merge: config = copy.deepcopy(self._strategy.gradient_merge_configs) @@ -172,4 +172,4 @@ def _apply_post_optimization(self, main_program, startup_program, rank, "auto_parallel_gradient_merge_pass", config) auto_parallel_gradient_merge_pass.apply([main_program], [startup_program], - self._dist_context) + self._pass_context) diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index bc40dad8ac0d9a..394d71706c4c49 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -18,10 +18,10 @@ import paddle from paddle.framework import core +from paddle.fluid import layers from paddle.fluid.framework import program_guard, device_guard -from paddle.fluid import unique_name, layers -from paddle.fluid.clip import append_gradient_clip_ops from .pass_base import PassBase, PassType, register_pass +from paddle.distributed.fleet.meta_optimizers.common import OpRole from paddle.distributed.auto_parallel.utils import set_var_dist_attr from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping from paddle.distributed.auto_parallel.process_group import get_world_process_group @@ -29,16 +29,8 @@ world_process_group = get_world_process_group() -def _is_the_backward_op(op): - OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() - OpRole = core.op_proto_and_checker_maker.OpRole - return OP_ROLE_KEY in op.attr_names and \ - int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Backward) - - def _is_the_optimizer_op(op): OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() - OpRole = core.op_proto_and_checker_maker.OpRole return OP_ROLE_KEY in op.attr_names and \ int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize) @@ -47,13 +39,13 @@ def _remove_and_get_optimizer_op(main_program, dist_context): # 1 create tmp block # 2 mv optimizer op from global program to tmp block # 3 del the op from dist_context - from paddle.distributed.fleet.meta_optimizers.common import OpRole main_block = main_program.global_block() temp_block = main_program._create_block() removed_op_idx = [] optimize_ops_desc = [] + skip_ops = ["increment", "elementwise_mod", "equal"] for idx, op in enumerate(main_block.ops): - if _is_the_optimizer_op(op): + if _is_the_optimizer_op(op) and op.type not in skip_ops: # append optimizer op to tmp block new_op_desc = temp_block.desc.append_op() new_op_desc.copy_from(op.desc) @@ -111,8 +103,17 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): set_var_dist_attr(dist_context, cond_var, [-1], world_process_group.ranks) with device_guard("cpu"): - # step_var = (step_var + 1) % k_step - layers.increment(x=step_var, value=1.0, in_place=True) + 
# step_var += 1 + increment_op = main_block.append_op(type='increment', + inputs={'X': [step_var]}, + outputs={'Out': [step_var]}, + attrs={ + 'step': float(1.0), + 'op_role': OpRole.Optimize + }) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + increment_op, world_process_group.ranks, [-1], dist_context) + # step_var %= k_step elementwise_mod_op = main_block.append_op(type='elementwise_mod', inputs={ 'X': step_var, @@ -121,18 +122,19 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): outputs={'Out': step_var}, attrs={ 'axis': -1, - 'use_mkldnn': False + 'use_mkldnn': False, + 'op_role': OpRole.Optimize }) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( elementwise_mod_op, world_process_group.ranks, [-1], dist_context) - # cond_var = (step_var == 0) equal_op = main_block.append_op(type='equal', inputs={ 'X': step_var, 'Y': zero_var }, - outputs={'Out': cond_var}) + outputs={'Out': cond_var}, + attrs={'op_role': OpRole.Optimize}) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( equal_op, world_process_group.ranks, [-1], dist_context) @@ -154,7 +156,9 @@ def _append_gradient_merge_backward_op( _remove_op_role_var(param, grad) - param_to_gradient_merge = {} + # {grad.name: gradient_merge_var.name} to rename opt inputs + grad_to_gradient_merge = {} + # {param: gradient_merge_var} to insert scale op and fill_constant op new_params_to_grads = [] # step2: create gradient_merge var and init with 0 for param, grad in params_grads: @@ -168,7 +172,6 @@ def _append_gradient_merge_backward_op( shape=param_var.shape, dtype=param_var.dtype, persistable=True) - param_to_gradient_merge[param_name] = gradient_merge_var ref_process_mesh = ref_dist_attr.process_mesh ref_dims_mapping = ref_dist_attr.dims_mapping @@ -197,17 +200,19 @@ def _append_gradient_merge_backward_op( outputs={'Out': gradient_merge_var}, attrs={ 'axis': -1, - 'use_mkldnn': False + 'use_mkldnn': False, + 'op_role': OpRole.Optimize }) new_params_to_grads.append([param, gradient_merge_var]) + grad_to_gradient_merge[grad.name] = gradient_merge_var.name naive_set_dist_op_attr_for_program_by_mesh_and_mapping( new_grad_op, ref_process_mesh, ref_dims_mapping, dist_context) - return new_params_to_grads, param_to_gradient_merge + return new_params_to_grads, grad_to_gradient_merge def _create_cond_block_and_update_optimizer( main_program, cond_var, new_params_to_grads: List[Tuple[Any, Any]], - param_to_gradient_merge: Dict[str, Any], optimize_ops_desc: List[Any], + grad_to_gradient_merge: Dict[str, str], optimize_ops_desc: List[Any], k_steps, avg): def true_apply_gradient(): @@ -229,7 +234,7 @@ def true_apply_gradient(): 'bias_after_scale': False }) new_grad.op._set_attr(op_maker.kOpRoleAttrName(), - op_maker.OpRole.Optimize) + OpRole.Optimize) # append optimizer ops for op_desc in optimize_ops_desc: @@ -238,14 +243,14 @@ def true_apply_gradient(): #update input/output for input_name in new_op_desc.input_arg_names(): - if input_name in new_params_to_grads: - new_op_desc._rename_input(input_name, - new_params_to_grads[input_name]) + if input_name in grad_to_gradient_merge: + new_op_desc._rename_input( + input_name, grad_to_gradient_merge[input_name]) for output_name in new_op_desc.output_arg_names(): - if output_name in new_params_to_grads: - new_op_desc._rename_output(output_name, - new_params_to_grads[output_name]) + if output_name in grad_to_gradient_merge: + new_op_desc._rename_output( + output_name, grad_to_gradient_merge[output_name]) # remove op_role_var if 
new_op_desc.has_attr(op_maker.kOpRoleVarAttrName()): @@ -271,6 +276,8 @@ def true_apply_gradient(): op_maker.OpRole.Optimize) layers.cond(cond_var, true_fn=true_apply_gradient, false_fn=None) + cond_op = main_program.global_block().ops[-1] + cond_op._set_attr('op_role', OpRole.Optimize) def parse_program(main_program, startup_program, params_grads, k_steps, avg, @@ -285,14 +292,14 @@ def parse_program(main_program, startup_program, params_grads, k_steps, avg, main_program._rollback() # 3 append gradient merge backward op to main_program - new_params_to_grads, param_to_gradient_merge = _append_gradient_merge_backward_op( + new_params_to_grads, grad_to_gradient_merge = _append_gradient_merge_backward_op( main_program, startup_program, params_grads, cond_var.name, dist_context) # 4 create ConditionalBlock and append gradient merge optimizer ops _create_cond_block_and_update_optimizer(main_program, cond_var, new_params_to_grads, - param_to_gradient_merge, + grad_to_gradient_merge, optimize_ops_desc, k_steps, avg) @@ -303,7 +310,6 @@ def __init__(self): super(GradientMergePass, self).__init__() self.set_attr("k_steps", -1) self.set_attr("avg", True) - self.set_attr("inner_optimizer", None) def _check_self(self): if self.get_attr("k_steps") < 1: diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt index c68cebaa25b22d..29e528edce914a 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -14,12 +14,12 @@ if((NOT WITH_GPU) list(REMOVE_ITEM TEST_OPS "test_dist_fuse_momentum_pass") list(REMOVE_ITEM TEST_OPS "test_dist_fuse_relu_depthwise_conv_pass") list(REMOVE_ITEM TEST_OPS "test_dist_fuse_sgd_pass") - list(REMOVE_ITEM TEST_OPS "test_dist_gradient_merge_pass") list(REMOVE_ITEM TEST_OPS "test_dist_inplace_addto_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_amp_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_recompute_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_sharding_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_fp16_pass") + list(REMOVE_ITEM TEST_OPS "test_auto_parallel_gradient_merge_pass") endif() foreach(TEST_OP ${TEST_OPS}) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py similarity index 72% rename from python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py rename to python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py index f856059402efb0..50e18718201865 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py @@ -25,20 +25,14 @@ import paddle.utils as utils import paddle.static as static import paddle.nn.functional as F +import paddle.distributed.fleet as fleet import paddle.distributed.auto_parallel as auto -from paddle.fluid.initializer import NumpyArrayInitializer -from paddle.distributed.passes import new_pass, PassManager, PassContext -import paddle.distributed.fleet as fleet -from dist_pass_test_base import DistPassTestBase -from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.fluid.initializer import NumpyArrayInitializer +from 
auto_parallel_pass_test_base import AutoPallelPassTestBase logging.getLogger().setLevel(logging.INFO) paddle.enable_static() -_global_parallel_strategy = None -_global_process_mesh = None - -#np.set_printoptions(suppress=True) class MLPLayer(nn.Layer): @@ -103,13 +97,11 @@ def forward(self, input): def mlp_forward(input, label, hidden_size): - if _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - + auto.shard_tensor(input, + dist_attr={ + "process_mesh": [0], + "dims_mapping": [-1, -1] + }) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, initializer_range=0.02) @@ -119,40 +111,33 @@ def mlp_forward(input, label, hidden_size): return loss -class TestGradientMergePass(DistPassTestBase): +class TestGradientMergePass(AutoPallelPassTestBase): def init(self): - self._params_grads = None - self._config = {"k_steps": 4, "avg": True} - #self._config["dist_context"] = DistributedContext() - - def apply_passes(self, main_prog, startup_prog): - #self._config["params_grads"] = self._params_grads - #pass_context = PassContext() - #auto_parallel_gradient_merge_pass = new_pass( - # "auto_parallel_gradient_merge_pass", self._config) - #auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog], - # pass_context) + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + + def apply_passes(self): dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True dist_strategy.gradient_merge = True dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} - dist_strategy.semi_auto = True fleet.init(is_collective=True, strategy=dist_strategy) def test_result(self): no_pass_rets = self._distributed_launch(model=None, apply_pass=False, gpus=[0], - gradient_merge=False, batch_size=32, + hidden_size=128, max_step=2) pass_rets = self._distributed_launch(model=None, apply_pass=True, gpus=[0], - gradient_merge=True, batch_size=8, + hidden_size=128, max_step=8) - """ # avg loss for gradient_merge pass avg_loss = 0 pass_avg_ret_list = [] @@ -167,40 +152,16 @@ def test_result(self): for no_pass_ret, pass_ret in zip(no_pass_rets[0], pass_avg_ret_list): print(f"no_pass_ret={no_pass_ret}, pass_ret={pass_ret}") self.assertTrue( - np.isclose( - no_pass_ret, - pass_ret, - rtol=self.rtol, - atol=self.atol, - equal_nan=self.equal_nan)) - """ - - def get_model(self, place, gradient_merge, batch_size, max_step): - paddle.seed(2021) - random.seed(2021) - np.random.seed(2021) + np.isclose(no_pass_ret, + pass_ret, + rtol=self.rtol, + atol=self.atol, + equal_nan=self.equal_nan)) - hidden_size = 128 - - global _global_parallel_strategy - global _global_process_mesh - world_size = paddle.distributed.get_world_size() - if world_size == 1: - _global_parallel_strategy = "dp" - _global_process_mesh = auto.ProcessMesh([0]) - elif world_size == 2: - _global_parallel_strategy = "dp" - _global_process_mesh = auto.ProcessMesh([0, 1]) + def get_model(self, place, batch_size, hidden_size, max_step): train_program = static.Program() startup_program = static.Program() - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True - #if gradient_merge: - # dist_strategy.gradient_merge = True - # dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} - fleet.init(is_collective=True, strategy=dist_strategy) - with static.program_guard(train_program, startup_program), \ utils.unique_name.guard(): input = static.data(name="input", @@ -212,8 +173,7 @@ def 
get_model(self, place, gradient_merge, batch_size, max_step): input.stop_gradient = False loss = mlp_forward(input, label, hidden_size) - optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01) - #optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer) _, self._params_grads, dist_startup_prog, dist_main_prog = optimizer.minimize( loss, startup_program) From 607a1d65de8f9c01a7e17e95160928030c45553b Mon Sep 17 00:00:00 2001 From: heliqi <1101791222@qq.com> Date: Mon, 6 Jun 2022 06:59:27 -0500 Subject: [PATCH 04/22] [inference]Resolve protobuf of ORT Backend conflict (#43159) * modify paddle2onnx cmake * modify paddle2onnx cmake * modify export interface * modify paddle2onnx export * paddle2onnx add mac windows * modify paddle2onnx mac windows cmake * modify paddle2onnx mac windows cmake * modify paddle2onnx cmake support windows * modify paddle2onnx cmake support windows * modify paddle2onnx cmake support windows Co-authored-by: xiegegege --- cmake/external/paddle2onnx.cmake | 102 +++++++++--------- cmake/external/protobuf.cmake | 9 +- cmake/inference_lib.cmake | 16 +-- .../eager/auto_code_generator/CMakeLists.txt | 2 +- .../inference/api/onnxruntime_predictor.cc | 27 +++-- paddle/fluid/pybind/CMakeLists.txt | 2 +- python/setup.py.in | 10 +- 7 files changed, 84 insertions(+), 84 deletions(-) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index 8252b2a73e9432..75e2c42cb5a291 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -25,82 +25,82 @@ include(ExternalProject) set(PADDLE2ONNX_PROJECT "extern_paddle2onnx") set(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) +set(PADDLE2ONNX_SOURCE_DIR + ${THIRD_PARTY_PATH}/paddle2onnx/src/${PADDLE2ONNX_PROJECT}) set(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx) set(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE) -set(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git) -set(PADDLE2ONNX_TAG cpp) -set(LIBDIR "lib") +set(PADDLE2ONNX_LIB_DIR + "${PADDLE2ONNX_INSTALL_DIR}/lib" + CACHE PATH "onnxruntime lib directory." FORCE) set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}") include_directories(${PADDLE2ONNX_INC_DIR} )# For PADDLE2ONNX code to include internal headers. if(WIN32) + set(PADDLE2ONNX_SOURCE_LIB + "${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.dylib" + CACHE FILEPATH "Paddle2ONNX source library." FORCE) set(PADDLE2ONNX_LIB - "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" - CACHE FILEPATH "paddle2onnx static library." FORCE) - set(PADDLE2ONNX_SHARED_LIB - "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" - CACHE FILEPATH "paddle2onnx shared library." FORCE) + "${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.dll" + CACHE FILEPATH "paddle2onnx library." FORCE) + set(PADDLE2ONNX_COMPILE_LIB + "${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.lib" + CACHE FILEPATH "paddle2onnx compile library." FORCE) elseif(APPLE) + set(PADDLE2ONNX_SOURCE_LIB + "${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.dylib" + CACHE FILEPATH "Paddle2ONNX source library." FORCE) set(PADDLE2ONNX_LIB - "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" + "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." 
FORCE) + set(PADDLE2ONNX_COMPILE_LIB + "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib" + CACHE FILEPATH "paddle2onnx compile library." FORCE) else() + set(PADDLE2ONNX_SOURCE_LIB + "${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.so" + CACHE FILEPATH "Paddle2ONNX source library." FORCE) set(PADDLE2ONNX_LIB - "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" + "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE) + set(PADDLE2ONNX_COMPILE_LIB + "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so" + CACHE FILEPATH "paddle2onnx compile library." FORCE) endif(WIN32) -# The protoc path is required to compile onnx. -string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE}) -list(POP_BACK PROTOC_BIN_PATH) -list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH) - -set(PADDLE2ONNX_OPTIONAL_ARGS - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_STANDARD=14 - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} - -DWITH_STATIC=OFF - -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT} - -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS}) - -if(WITH_PYTHON) - set(PADDLE2ONNX_OPTIONAL_ARGS - ${PADDLE2ONNX_OPTIONAL_ARGS} - -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE} - -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR} - -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY}) +if(WIN32) + set(PADDLE2ONNX_URL + "https://github.com/PaddlePaddle/Paddle2ONNX/releases/download/v0.9.7/paddle2onnx-win-x64-0.9.7.zip" + ) +elseif(APPLE) + set(PADDLE2ONNX_URL + "https://github.com/PaddlePaddle/Paddle2ONNX/releases/download/v0.9.7/paddle2onnx-osx-x86_64-0.9.7.tgz" + ) +else() + set(PADDLE2ONNX_URL + "https://github.com/PaddlePaddle/Paddle2ONNX/releases/download/v0.9.7/paddle2onnx-linux-x64-0.9.7.tgz" + ) endif() ExternalProject_Add( ${PADDLE2ONNX_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} - GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY} - GIT_TAG ${PADDLE2ONNX_TAG} - DEPENDS protobuf + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${PADDLE2ONNX_URL} PREFIX ${PADDLE2ONNX_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" UPDATE_COMMAND "" - CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB}) + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${PADDLE2ONNX_SOURCE_DIR}/lib + ${PADDLE2ONNX_LIB_DIR} && ${CMAKE_COMMAND} -E copy_directory + ${PADDLE2ONNX_SOURCE_DIR}/include ${PADDLE2ONNX_INC_DIR} + BUILD_BYPRODUCTS ${PADDLE2ONNX_COMPILE_LIB}) add_library(paddle2onnx STATIC IMPORTED GLOBAL) -set_property(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB}) +set_property(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION + ${PADDLE2ONNX_COMPILE_LIB}) add_dependencies(paddle2onnx ${PADDLE2ONNX_PROJECT}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 1368081b58fdad..7c5de92362db48 100755 --- a/cmake/external/protobuf.cmake +++ 
b/cmake/external/protobuf.cmake @@ -234,10 +234,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") endif() - if(WITH_ONNXRUNTIME) - set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) - set(PROTOBUF_TAG v3.18.0) - elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) set(PROTOBUF_TAG v3.8.0) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) @@ -319,9 +316,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST) endif() endfunction() -if(WITH_ONNXRUNTIME) - set(PROTOBUF_VERSION 3.18.0) -elseif(WITH_ASCEND OR WITH_ASCEND_CL) +if(WITH_ASCEND OR WITH_ASCEND_CL) set(PROTOBUF_VERSION 3.8.0) elseif(WITH_IPU) set(PROTOBUF_VERSION 3.6.1) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index bf69ddc8fb49ad..14ae8efb5b4f84 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -148,18 +148,10 @@ function(copy_part_of_thrid_party TARGET DST) DSTS ${dst_dir} ${dst_dir}) set(dst_dir "${DST}/third_party/install/paddle2onnx") - if(WIN32) - copy( - ${TARGET} - SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} - ${PADDLE2ONNX_LIB} - DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib) - else() - copy( - ${TARGET} - SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB} - DSTS ${dst_dir}/include ${dst_dir}/lib) - endif() + copy( + ${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB_DIR} + DSTS ${dst_dir}/include ${dst_dir}) endif() set(dst_dir "${DST}/third_party/install/gflags") diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index 8c067074d6efd3..aff7f057f4601c 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -103,7 +103,7 @@ if(WIN32) list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/onnxruntime.dll) add_custom_command( OUTPUT ${eager_generator_path}/paddle2onnx.dll - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_LIB} ${eager_generator_path} DEPENDS paddle2onnx) list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/paddle2onnx.dll) diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index e42e395ce90f8b..93a96863053e55 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -74,8 +74,14 @@ bool CheckConvertToONNX(const AnalysisConfig &config) { config.model_dir(), config.prog_file(), config.params_file()); return false; } - return paddle2onnx::IsExportable(config.prog_file(), config.params_file(), - config.model_from_memory()); + if (config.model_from_memory()) { + return paddle2onnx::IsExportable( + config.prog_file().data(), config.prog_file().size(), + config.params_file().data(), config.params_file().size()); + } else { + return paddle2onnx::IsExportable(config.prog_file().c_str(), + config.params_file().c_str()); + } } bool ONNXRuntimePredictor::Init() { @@ -89,9 +95,16 @@ bool ONNXRuntimePredictor::Init() { place_ = paddle::platform::CPUPlace(); } - std::string onnx_proto; - paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, - config_.model_from_memory()); + char *onnx_proto = nullptr; + int out_size; + if (config_.model_from_memory()) { + 
paddle2onnx::Export(config_.prog_file().data(), config_.prog_file().size(), + config_.params_file().data(), + config_.params_file().size(), &onnx_proto, &out_size); + } else { + paddle2onnx::Export(config_.prog_file().c_str(), + config_.params_file().c_str(), &onnx_proto, &out_size); + } Ort::SessionOptions session_options; if (config_.ort_optimization_enabled()) { @@ -118,7 +131,7 @@ bool ONNXRuntimePredictor::Init() { "will be " "generated."; } - session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + session_ = {env_, onnx_proto, static_cast(out_size), session_options}; binding_ = std::make_shared(session_); Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, @@ -153,6 +166,8 @@ bool ONNXRuntimePredictor::Init() { allocator.Free(output_name); } + delete onnx_proto; + onnx_proto = nullptr; return true; } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index bf74d1184322cd..a99dded4d5af1a 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -335,7 +335,7 @@ if(WITH_PYTHON) if(WITH_ONNXRUNTIME) add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_LIB} ${CMAKE_CURRENT_BINARY_DIR} DEPENDS paddle2onnx) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll) diff --git a/python/setup.py.in b/python/setup.py.in index ca1768c9462f04..bb6416038f1981 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -531,15 +531,13 @@ if '${WITH_MKLDNN}' == 'ON': if '${WITH_ONNXRUNTIME}' == 'ON': shutil.copy('${ONNXRUNTIME_SHARED_LIB}', libs_path) + shutil.copy('${PADDLE2ONNX_LIB}', libs_path) if os.name == 'nt': - shutil.copy('${PADDLE2ONNX_SHARED_LIB}', libs_path) package_data['paddle.libs']+=['paddle2onnx.dll', 'onnxruntime.dll'] + elif sys.platform == 'darwin': + package_data['paddle.libs']+=['libpaddle2onnx.dylib', 'libonnxruntime.1.10.0.dylib'] else: - shutil.copy('${PADDLE2ONNX_LIB}', libs_path) - if sys.platform == 'darwin': - package_data['paddle.libs']+=['libpaddle2onnx.dylib', 'libonnxruntime.1.10.0.dylib'] - else: - package_data['paddle.libs']+=['libpaddle2onnx.so', 'libonnxruntime.so.1.10.0'] + package_data['paddle.libs']+=['libpaddle2onnx.so', 'libonnxruntime.so.1.10.0'] if '${WITH_XPU}' == 'ON': # only change rpath in Release mode, From 3af98de560659956c98882e37d039b1349b4d0c2 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 6 Jun 2022 20:03:56 +0800 Subject: [PATCH 05/22] format CMakeLists.txt;add cmakelint hook and its config file (#43222) --- .pre-commit-config.yaml | 116 +++++- CMakeLists.txt | 733 ++++++++++++++++++++--------------- tools/codestyle/.cmakelintrc | 1 + 3 files changed, 536 insertions(+), 314 deletions(-) create mode 100644 tools/codestyle/.cmakelintrc diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b588cbeb91dcd..bf9aa6e915a466 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: remove-crlf files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - repo: https://github.com/google/yapf - sha: v0.32.0 + rev: v0.32.0 hooks: - id: yapf files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ @@ -74,3 +74,117 @@ repos: (?x)^( paddle/fluid/operators/CMakeLists.txt )$ + +- repo: https://github.com/cmake-lint/cmake-lint + rev: 1.4.2 + hooks: + - id: cmakelint + args: [--config=./tools/codestyle/.cmakelintrc] + # exclude files which need to 
be fixed + exclude: | + (?x)^( + cmake/generic.cmake| + CMakeLists.txt| + paddle/fluid/pybind/CMakeLists.txt| + python/paddle/fluid/tests/unittests/CMakeLists.txt| + paddle/fluid/eager/auto_code_generator/CMakeLists.txt| + paddle/fluid/framework/CMakeLists.txt| + paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt| + cmake/third_party.cmake| + paddle/fluid/inference/tests/infer_ut/CMakeLists.txt| + cmake/configure.cmake| + paddle/fluid/inference/api/demo_ci/CMakeLists.txt| + cmake/flags.cmake| + cmake/inference_lib.cmake| + cmake/external/protobuf.cmake| + cmake/system.cmake| + cmake/cudnn.cmake| + cmake/external/mkldnn.cmake| + cmake/unity_build.cmake| + paddle/fluid/framework/fleet/CMakeLists.txt| + paddle/fluid/inference/CMakeLists.txt| + paddle/fluid/inference/tests/api/CMakeLists.txt| + paddle/fluid/operators/CMakeLists.txt| + paddle/phi/api/lib/CMakeLists.txt| + cmake/external/gflags.cmake| + cmake/external/lite.cmake| + cmake/external/poplar.cmake| + cmake/python_module.cmake| + python/paddle/fluid/tests/unittests/asp/CMakeLists.txt| + cmake/cuda.cmake| + cmake/FindNumPy.cmake| + cmake/phi.cmake| + paddle/fluid/framework/ir/CMakeLists.txt| + paddle/fluid/platform/CMakeLists.txt| + python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt| + python/paddle/tests/CMakeLists.txt| + cmake/ccache.cmake| + cmake/coveralls.cmake| + cmake/external/glog.cmake| + cmake/external/onnxruntime.cmake| + cmake/external/openblas.cmake| + cmake/external/xpu.cmake| + cmake/hip.cmake| + paddle/fluid/distributed/CMakeLists.txt| + paddle/fluid/framework/details/CMakeLists.txt| + paddle/fluid/imperative/CMakeLists.txt| + paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt| + paddle/fluid/inference/api/CMakeLists.txt| + paddle/fluid/operators/controlflow/CMakeLists.txt| + python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt| + cmake/cblas.cmake| + cmake/coverallsGcovJsons.cmake| + cmake/external/brpc.cmake| + cmake/external/cryptopp.cmake| + cmake/external/gtest.cmake| + cmake/external/llvm.cmake| + cmake/external/utf8proc.cmake| + cmake/external/warpctc.cmake| + cmake/external/zlib.cmake| + cmake/FindGperftools.cmake| + cmake/operators.cmake| + cmake/tensorrt.cmake| + paddle/fluid/inference/api/details/CMakeLists.txt| + python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt| + cmake/external/arm_brpc.cmake| + cmake/external/concurrentqueue.cmake| + cmake/external/eigen.cmake| + cmake/external/mklml.cmake| + cmake/external/paddle2onnx.cmake| + cmake/miopen.cmake| + cmake/nccl.cmake| + cmake/simd.cmake| + paddle/fluid/distributed/fleet_executor/CMakeLists.txt| + paddle/fluid/eager/api/generated/fluid_generated/forwards/CMakeLists.txt| + paddle/fluid/framework/io/CMakeLists.txt| + paddle/fluid/imperative/tests/CMakeLists.txt| + paddle/fluid/inference/analysis/CMakeLists.txt| + paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake| + paddle/fluid/memory/allocation/CMakeLists.txt| + paddle/fluid/memory/CMakeLists.txt| + paddle/fluid/operators/cinn/CMakeLists.txt| + paddle/fluid/operators/collective/CMakeLists.txt| + paddle/fluid/operators/ipu/CMakeLists.txt| + paddle/fluid/operators/jit/CMakeLists.txt| + paddle/fluid/operators/pscore/CMakeLists.txt| + paddle/fluid/platform/device/ipu/CMakeLists.txt| + paddle/fluid/platform/dynload/CMakeLists.txt| + paddle/infrt/external_kernels/CMakeLists.txt| + paddle/infrt/kernel/phi/CMakeLists.txt| + paddle/phi/backends/dynload/CMakeLists.txt| + paddle/phi/CMakeLists.txt| + paddle/phi/kernels/CMakeLists.txt| + 
paddle/phi/tests/core/CMakeLists.txt| + python/CMakeLists.txt| + python/paddle/fluid/contrib/slim/tests/CMakeLists.txt| + python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt| + python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt| + python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt| + python/paddle/fluid/tests/unittests/fft/CMakeLists.txt| + python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt| + python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt| + python/paddle/fluid/tests/unittests/npu/CMakeLists.txt| + python/paddle/fluid/tests/unittests/ps/CMakeLists.txt| + python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt| + python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt + )$ diff --git a/CMakeLists.txt b/CMakeLists.txt index 70eb5f11ea168a..ba438a74718f25 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,12 +13,12 @@ # limitations under the License if(APPLE AND WITH_ARM) - # cmake 3.19.2 version starts to support M1 - cmake_minimum_required(VERSION 3.19.2) - cmake_policy(VERSION 3.19.2) + # cmake 3.19.2 version starts to support M1 + cmake_minimum_required(VERSION 3.19.2) + cmake_policy(VERSION 3.19.2) else(APPLE AND WITH_ARM) - cmake_minimum_required(VERSION 3.15) - cmake_policy(VERSION 3.10) + cmake_minimum_required(VERSION 3.15) + cmake_policy(VERSION 3.10) endif(APPLE AND WITH_ARM) # use to get_property location of static lib # https://cmake.org/cmake/help/v3.0/policy/CMP0026.html?highlight=cmp0026 @@ -31,9 +31,12 @@ include(system) # Note(zhouwei): Ninja Generator will set CMAKE_BUILD_TYPE to Debug if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" - FORCE) + set(CMAKE_BUILD_TYPE + "Release" + CACHE + STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) endif() project(paddle CXX C) @@ -42,157 +45,185 @@ project(paddle CXX C) # TODO(Shibo Tao): remove find_package(CUDA) completely. 
find_package(CUDA QUIET) find_package(MKL CONFIG QUIET) -option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) -option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) -option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) -option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) -option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) -option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF) -option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) -option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) +option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) +option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) +option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) +option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) +option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) +option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF) +option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) +option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 -option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) -option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) -option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) +option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) +option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) # Note(zhouwei): It use option above, so put here include(init) -include(generic) # simplify cmake module -include(experimental) # experimental build options +include(generic) # simplify cmake module +include(experimental) # experimental build options -if (WITH_GPU AND WITH_XPU) - message(FATAL_ERROR "Error when compile GPU and XPU at the same time") +if(WITH_GPU AND WITH_XPU) + message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() -if (WITH_GPU AND WITH_XPU_KP) - message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time") +if(WITH_GPU AND WITH_XPU_KP) + message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time") endif() -if (WITH_GPU AND WITH_ASCEND) - message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") +if(WITH_GPU AND WITH_ASCEND) + message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() -if (WITH_GPU AND WITH_ROCM) - message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") +if(WITH_GPU AND WITH_ROCM) + message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() -if (WITH_GPU AND WITH_MLU) - message(FATAL_ERROR "Error when compile GPU and MLU at the same time") +if(WITH_GPU AND WITH_MLU) + message(FATAL_ERROR "Error when compile GPU and MLU at the same time") endif() if(WITH_GPU AND NOT APPLE) - enable_language(CUDA) - message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: " - "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}") + enable_language(CUDA) + message(STATUS 
"CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: " + "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}") endif() message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " - "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " - "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") message(STATUS "AR tools: ${CMAKE_AR}") # MUSL build turn off warnings if(WITH_MUSL) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy" + ) endif() if(APPLE AND WITH_ARM) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") - set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") + set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") endif() if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) - if(WITH_ARM_BRPC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - endif() + if(WITH_ARM_BRPC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + endif() endif() if(WIN32) - option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) - - set(CMAKE_SUPPRESS_REGENERATION ON) - set(CMAKE_STATIC_LIBRARY_PREFIX lib) - - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj") - - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline") + option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) + + set(CMAKE_SUPPRESS_REGENERATION ON) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj") + + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline") + endif() + + if(MSVC_STATIC_CRT) + message( + STATUS + "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019" + ) + foreach( + flag_var + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} 
MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) + endif() + + # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally + add_definitions("-DNOMINMAX") + # windows build turn off warnings, use parallel compiling. + foreach( + flag_var + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO) + string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") + + # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling, + # For Visual Studio generators, /MP should be added. + # For other generators like Ninja, it is not need to add /MP. + if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU) + math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") endif() - - if (MSVC_STATIC_CRT) - message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019") - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE - CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif() - endforeach(flag_var) + endforeach(flag_var) + foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) + set(${flag_var} "${${flag_var}} /w") + endforeach(flag_var) + + # Windows Remove /Zi, /ZI for Release, MinSizeRel builds + foreach(flag_var + CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL) + if(${flag_var} MATCHES "/Z[iI]") + string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}") endif() + endforeach(flag_var) + + set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838" + ) + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838" + ) + + foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS + CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) + set(${flag_var} + "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() + endforeach(flag_var) - # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally - add_definitions("-DNOMINMAX") - # windows build turn off warnings, use parallel compiling. - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE - CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - - # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling, - # For Visual Studio generators, /MP should be added. - # For other generators like Ninja, it is not need to add /MP. 
- if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU) - math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") - set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") - endif() - endforeach(flag_var) - foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) - set(${flag_var} "${${flag_var}} /w") - endforeach(flag_var) - - # Windows Remove /Zi, /ZI for Release, MinSizeRel builds - foreach(flag_var - CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL) - if(${flag_var} MATCHES "/Z[iI]") - string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}") - endif() - endforeach(flag_var) - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") + if(WITH_WIN_DUMP_DBG) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") - foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) - set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") - if(MSVC_STATIC_CRT) - set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") - endif() + foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS + CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) + set(${flag_var} "${${flag_var}} /DEBUG /OPT:REF /OPT:ICF") endforeach(flag_var) - if (WITH_WIN_DUMP_DBG) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") - - foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) - set(${flag_var} "${${flag_var}} /DEBUG /OPT:REF /OPT:ICF") - endforeach(flag_var) - - add_definitions("-DWITH_WIN_DUMP_DBG") - endif() + add_definitions("-DWITH_WIN_DUMP_DBG") + endif() else(WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations" + ) endif(WIN32) find_package(Git REQUIRED) @@ -200,7 +231,7 @@ find_package(Git REQUIRED) # config GIT_URL with github mirrors to speed up dependent repos clone option(GIT_URL "Git URL to clone dependent repos" ${GIT_URL}) if(NOT GIT_URL) - set(GIT_URL "https://github.com") + set(GIT_URL "https://github.com") endif() find_package(Threads REQUIRED) @@ -208,60 +239,83 @@ find_package(Threads REQUIRED) include(simd) ################################ Exposed Configurations ####################################### -option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) -option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) -option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) -option(WITH_SYSTEM_BLAS "Use system blas library" OFF) -option(WITH_DISTRIBUTE "Compile with distributed support" OFF) -option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) -option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) +option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) +option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) +option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) +option(WITH_MKL "Compile PaddlePaddle with MKL support." 
${AVX_FOUND}) +option(WITH_SYSTEM_BLAS "Use system blas library" OFF) +option(WITH_DISTRIBUTE "Compile with distributed support" OFF) +option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) +option(ON_INFER "Turn on inference optimization and inference-lib generation" + OFF) ################################ Internal Configurations ####################################### -option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) -option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) -option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) -option(WITH_INCREMENTAL_COVERAGE "Generate coverage reports only for incremental code" OFF) -OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) -option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) -option(WITH_PSLIB "Compile with pslib support" OFF) -option(WITH_BOX_PS "Compile with box_ps support" OFF) -option(WITH_XBYAK "Compile with xbyak support" ON) -option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) -option(WITH_HETERPS "Compile with heterps" OFF}) -option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) -option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) -option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) -option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) -option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) -option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) -option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) -option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) -option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) -option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF) -option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) -option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) -option(WITH_SW "Compile PaddlePaddle with sw support" OFF) -option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) -option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) -option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) -option(WITH_STRIP "Strip so files of Whl packages" OFF) -option(NEW_RELEASE_PYPI "PaddlePaddle next-level release strategy for pypi cubin package" OFF) -option(NEW_RELEASE_ALL "PaddlePaddle next-level release strategy for all arches cubin package" OFF) -option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) -option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) -option(WITH_POCKETFFT "Compile with pocketfft support" ON) -option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) -option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) -option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) -option(WITH_FLPS "FL PS mode" OFF) +option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) +option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" + OFF) +option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) +option(WITH_INCREMENTAL_COVERAGE + "Generate coverage reports only for incremental code" OFF) +option(WITH_LIBXSMM "Compile with libxsmm" OFF) +option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) +option(WITH_PSLIB "Compile with 
pslib support" OFF) +option(WITH_BOX_PS "Compile with box_ps support" OFF) +option(WITH_XBYAK "Compile with xbyak support" ON) +option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) +option(WITH_HETERPS "Compile with heterps" OFF}) +option(WITH_INFERENCE_API_TEST + "Test fluid inference C++ high-level api interface" OFF) +option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) +option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) +option( + SANITIZER_TYPE + "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" + OFF) +option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) +option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) +option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) +option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) +option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) +option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF) +option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) +option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) +option(WITH_SW "Compile PaddlePaddle with sw support" OFF) +option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) +option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) +option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) +option(NEW_RELEASE_PYPI + "PaddlePaddle next-level release strategy for pypi cubin package" OFF) +option(NEW_RELEASE_ALL + "PaddlePaddle next-level release strategy for all arches cubin package" + OFF) +option(NEW_RELEASE_JIT + "PaddlePaddle next-level release strategy for backup jit package" OFF) +option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) +option(WITH_POCKETFFT "Compile with pocketfft support" ON) +option(WITH_RECORD_BUILDTIME + "Compile PaddlePaddle with record all targets build time" OFF) +option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) +option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) +option(WITH_FLPS "FL PS mode" OFF) if(WITH_RECORD_BUILDTIME) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") -else() - include(ccache) # set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache + set_property( + GLOBAL + PROPERTY + RULE_LAUNCH_COMPILE + "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}" + ) + set_property( + GLOBAL + PROPERTY + RULE_LAUNCH_LINK + "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}" + ) +else() + include(ccache + )# set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache endif() unset(WITH_RECORD_BUILDTIME CACHE) @@ -271,191 +325,237 @@ if(NOT PY_VERSION) endif() set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) - # the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined. 
Default: OFF -if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$") +if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES + "^(Address|Leak|Memory|Thread|Undefined)$") message("Choose the correct type of sanitizer") return() endif() -if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER) -set(WITH_CUSTOM_DEVICE ON) +if(LINUX + AND NOT WITH_CUSTOM_DEVICE + AND NOT ON_INFER) + set(WITH_CUSTOM_DEVICE ON) endif() if(WIN32) - if(WITH_DISTRIBUTE) - MESSAGE(WARNING - "Disable DISTRIBUTE when compiling for Windows. Force WITH_DISTRIBUTE=OFF.") - set(WITH_DISTRIBUTE OFF CACHE STRING - "Disable DISTRIBUTE when compiling for Windows" FORCE) - endif() - if(WITH_NCCL) - MESSAGE(WARNING - "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable NCCL when compiling for Windows" FORCE) - endif() -endif() - -if (NOT WITH_GPU AND WITH_NCCL) - MESSAGE(WARNING - "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable NCCL when compiling without GPU" FORCE) + if(WITH_DISTRIBUTE) + message( + WARNING + "Disable DISTRIBUTE when compiling for Windows. Force WITH_DISTRIBUTE=OFF." + ) + set(WITH_DISTRIBUTE + OFF + CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) + endif() + if(WITH_NCCL) + message( + WARNING "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.") + set(WITH_NCCL + OFF + CACHE STRING "Disable NCCL when compiling for Windows" FORCE) + endif() +endif() + +if(NOT WITH_GPU AND WITH_NCCL) + message( + WARNING "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.") + set(WITH_NCCL + OFF + CACHE STRING "Disable NCCL when compiling without GPU" FORCE) endif() # force WITH_XPU on when WITH_XPU_KP -if (WITH_XPU_KP AND NOT WITH_XPU) - MESSAGE(WARNING - "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.") - set(WITH_XPU ON CACHE STRING - "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE) +if(WITH_XPU_KP AND NOT WITH_XPU) + message( + WARNING + "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.") + set(WITH_XPU + ON + CACHE STRING "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE) endif() -if (NOT WITH_XPU AND WITH_XPU_BKCL) - MESSAGE(WARNING - "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.") - set(WITH_XPU_BKCL OFF CACHE STRING - "Disable BKCL when compiling without XPU" FORCE) +if(NOT WITH_XPU AND WITH_XPU_BKCL) + message( + WARNING "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.") + set(WITH_XPU_BKCL + OFF + CACHE STRING "Disable BKCL when compiling without XPU" FORCE) endif() -if (NOT WITH_MLU AND WITH_CNCL) - MESSAGE(WARNING - "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.") - set(WITH_MLU OFF CACHE STRING - "Disable CNCL when compiling without MLU" FORCE) +if(NOT WITH_MLU AND WITH_CNCL) + message( + WARNING "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.") + set(WITH_MLU + OFF + CACHE STRING "Disable CNCL when compiling without MLU" FORCE) endif() if(WITH_NCCL) - add_definitions("-DPADDLE_WITH_NCCL") - include(nccl) + add_definitions("-DPADDLE_WITH_NCCL") + include(nccl) else() - if(WITH_GPU) - MESSAGE(WARNING "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used.") - endif() + if(WITH_GPU) + message( + WARNING + "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used." 
+ ) + endif() endif() if(WITH_BRPC_RDMA) - message(STATUS "Use brpc with rdma.") - if(NOT WITH_DISTRIBUTE) - message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") - endif() + message(STATUS "Use brpc with rdma.") + if(NOT WITH_DISTRIBUTE) + message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") + endif() endif() - if(WITH_GPU) - include(cuda) - # lite subgraph compilation depends on CUDNN_ROOT, - # so include(cudnn) needs to be in front of include(third_party/lite) - include(cudnn) # set cudnn libraries, must before configure - include(tensorrt) - # there is no official support of nccl, cupti in windows - if(NOT WIN32) - include(cupti) - endif() + include(cuda) + # lite subgraph compilation depends on CUDNN_ROOT, + # so include(cudnn) needs to be in front of include(third_party/lite) + include(cudnn) # set cudnn libraries, must before configure + include(tensorrt) + # there is no official support of nccl, cupti in windows + if(NOT WIN32) + include(cupti) + endif() endif() if(WITH_MLU) - include(neuware) + include(neuware) endif() if(WITH_ROCM) - include(hip) - include(miopen) # set miopen libraries, must before configure + include(hip) + include(miopen) # set miopen libraries, must before configure endif(WITH_ROCM) if(WITH_XPU_KP) - include(xpu_kp) + include(xpu_kp) endif() -if (NOT WITH_ROCM AND WITH_RCCL) - MESSAGE(WARNING - "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") - set(WITH_RCCL OFF CACHE STRING - "Disable RCCL when compiling without ROCM" FORCE) +if(NOT WITH_ROCM AND WITH_RCCL) + message( + WARNING "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") + set(WITH_RCCL + OFF + CACHE STRING "Disable RCCL when compiling without ROCM" FORCE) endif() if(WITH_RCCL) - add_definitions("-DPADDLE_WITH_RCCL") - include(rccl) + add_definitions("-DPADDLE_WITH_RCCL") + include(rccl) else() - if(WITH_ROCM) - MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.") - endif() + if(WITH_ROCM) + message( + WARNING + "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used." + ) + endif() endif() if(WITH_HETERPS AND WITH_PSLIB) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() if(WITH_DISTRIBUTE) - if(LINUX) - set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) - endif() - if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC) - # disable WITH_PSCORE for NPU before include third_party - MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") - set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) - endif() - if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496) - # TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496 - MESSAGE(WARNING "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496. Force WITH_PSCORE=OFF.") - set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496" FORCE) - endif() -endif() - -include(third_party) # download, build, install third_party, Contains about 20+ dependencies - -include(flags) # set paddle compile flags + if(LINUX) + set(WITH_GLOO + ON + CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." 
FORCE) + endif() + if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC) + # disable WITH_PSCORE for NPU before include third_party + message( + WARNING + "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") + set(WITH_PSCORE + OFF + CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) + endif() + if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496) + # TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496 + message( + WARNING + "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496. Force WITH_PSCORE=OFF." + ) + set(WITH_PSCORE + OFF + CACHE + BOOL + "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496" + FORCE) + endif() +endif() + +include(third_party +)# download, build, install third_party, Contains about 20+ dependencies + +include(flags) # set paddle compile flags if(WITH_PROFILER) - find_package(Gperftools REQUIRED) - include_directories(${GPERFTOOLS_INCLUDE_DIR}) - add_definitions(-DWITH_GPERFTOOLS) + find_package(Gperftools REQUIRED) + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + add_definitions(-DWITH_GPERFTOOLS) endif() -include(util) # set unittest and link libs -include(version) # set PADDLE_VERSION -include(coveralls) # set code coverage -include(configure) # add paddle env configuration +include(util) # set unittest and link libs +include(version) # set PADDLE_VERSION +include(coveralls) # set code coverage +include(configure) # add paddle env configuration include_directories("${PADDLE_SOURCE_DIR}") if(WITH_NV_JETSON) - set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE) + set(WITH_ARM + ON + CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE) endif() if(WITH_ARM) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE) - set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) - set(WITH_AVX OFF CACHE STRING "Disable AVX when compiling WITH_AVX=OFF." FORCE) - add_definitions(-DPADDLE_WITH_ARM) -endif() - -if (WITH_SW) - # mieee flag solves floating-point exceptions under sw and ALPHA architectures - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee") - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE) - set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_SW=ON." FORCE) - add_definitions(-DPADDLE_WITH_SW) -endif() - -if (WITH_MIPS) - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_MIPS=ON" FORCE) - add_definitions(-DPADDLE_WITH_MIPS) -endif() - -if (WITH_ONEMKL) - add_definitions(-DPADDLE_WITH_ONEMKL) -endif() - -if (WITH_HETERPS) - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") - endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE) + set(WITH_MKL + OFF + CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) + set(WITH_AVX + OFF + CACHE STRING "Disable AVX when compiling WITH_AVX=OFF." 
FORCE) + add_definitions(-DPADDLE_WITH_ARM) +endif() + +if(WITH_SW) + # mieee flag solves floating-point exceptions under sw and ALPHA architectures + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee") + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE) + set(WITH_MKL + OFF + CACHE STRING "Disable MKL when compiling WITH_SW=ON." FORCE) + add_definitions(-DPADDLE_WITH_SW) +endif() + +if(WITH_MIPS) + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK when compiling WITH_MIPS=ON" FORCE) + add_definitions(-DPADDLE_WITH_MIPS) +endif() + +if(WITH_ONEMKL) + add_definitions(-DPADDLE_WITH_ONEMKL) +endif() + +if(WITH_HETERPS) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() endif() set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") @@ -465,25 +565,32 @@ set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") add_definitions(-DPADDLE_DLL_EXPORT) if(ON_INFER) - # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF - message(STATUS "On inference mode, will take place some specific optimization.") - include(inference_lib) - add_definitions(-DPADDLE_ON_INFERENCE) + # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF + message( + STATUS "On inference mode, will take place some specific optimization.") + include(inference_lib) + add_definitions(-DPADDLE_ON_INFERENCE) else() - #TODO(luotao), combine this warning with `make inference_lib_dist` command. - message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") + #TODO(luotao), combine this warning with `make inference_lib_dist` command. + message( + WARNING + "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only." + ) endif() if(WITH_STRIP) - find_program(STRIP_PATH strip) - if(NOT STRIP_PATH OR NOT LINUX) - set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE) - endif() + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP + OFF + CACHE STRING "Command strip is only used on Linux when it exists." 
+ FORCE) + endif() endif() add_subdirectory(paddle) if(WITH_PYTHON) - add_subdirectory(python) + add_subdirectory(python) endif() get_directory_property(all_inc_dirs INCLUDE_DIRECTORIES) diff --git a/tools/codestyle/.cmakelintrc b/tools/codestyle/.cmakelintrc new file mode 100644 index 00000000000000..6c5fe30276fc66 --- /dev/null +++ b/tools/codestyle/.cmakelintrc @@ -0,0 +1 @@ +filter=-readability/wonkycase,-syntax,-convention/filename,-package/stdargs,-whitespace/indent From 264de612eb2d2d4742cd74f63a0686d9a287c461 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 7 Jun 2022 10:22:19 +0800 Subject: [PATCH 06/22] update docker (#43136) --- paddle/scripts/paddle_build.sh | 1 - tools/dockerfile/ci_dockerfile.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index b3862ea6b3232c..ad081d8128162e 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3300,7 +3300,6 @@ function check_coverage_build() { set -x } - function main() { local CMD=$1 local parallel_number=$2 diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index 485bfd7968f05c..1195e4c4594c67 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -20,7 +20,7 @@ function make_ubuntu_dockerfile(){ sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ - tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext zstd \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ tar -xvf git-2.17.1.tar.gz \&\& \ cd git-2.17.1 \&\& \ From d9f8636c3d44a70f114d910fa31c15a25846e344 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 7 Jun 2022 10:30:03 +0800 Subject: [PATCH 07/22] Supoort more dimensions in forward fast layer_norm kernel (#43226) --- .../operators/fused/fused_layernorm_residual_dropout_bias.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index f72f73438c0a22..fc044e0bafa310 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -481,10 +481,12 @@ void LaunchLayernormResidualDropoutBias( LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1536); \ LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1792); \ LAUNCH_FUSED_FAST_LN_KERNEL_BASE(2048); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(3072); \ LAUNCH_FUSED_FAST_LN_KERNEL_BASE(4096) bool can_call_fast_ln_kernel = false; - if (((cols >= 768 && cols <= 2048 && cols % 256 == 0) || cols == 4096) && + if (((cols >= 768 && cols <= 2048 && cols % 256 == 0) || cols == 3072 || + cols == 4096) && scale != nullptr && layernorm_bias != nullptr) { can_call_fast_ln_kernel = true; } From a2020d0cc369d7d2cf5c4d7eae41f007afb8ab89 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 7 Jun 2022 10:36:03 +0800 Subject: [PATCH 08/22] fix dropout (#43234) --- 
paddle/fluid/operators/dropout_impl.cu.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index c40f6c0bbaea02..6db3efa3cdd60b 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -198,11 +198,13 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, size_t main_offset = size / (block_size * kVecSize) * (block_size * kVecSize); +#define PD_DROPOUT_KERNEL_NAME VectorizedRandomGenerator PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL( - !is_fix_seed, (VectorizedRandomGenerator), grid_size, - block_size, 0, stream, offset, KERNEL_PARAMS.As(1), - KERNEL_PARAMS.As(7), size, seed_data, dropout_prob, x_data, - mask_data, y_data, upscale_in_train, increment, main_offset); + !is_fix_seed, PD_DROPOUT_KERNEL_NAME, grid_size, block_size, 0, stream, + offset, KERNEL_PARAMS.As(1), KERNEL_PARAMS.As(7), + size, seed_data, dropout_prob, x_data, mask_data, y_data, + upscale_in_train, increment, main_offset); +#undef PD_DROPOUT_KERNEL_NAME } else { if (upscale_in_train) { // todo: can y share with data with x directly? From aec49361ee75a44c453ecfbfd996ad7373686864 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 7 Jun 2022 10:38:21 +0800 Subject: [PATCH 09/22] [XPU KP]Add xpu register, any, amax, amin op test (#43204) --- .../{reduce_amax_op.cu => reduce_amax_op.kps} | 14 ++- .../{reduce_amin_op.cu => reduce_amin_op.kps} | 14 ++- paddle/phi/kernels/funcs/reduce_function.h | 15 +-- .../kernels/{gpu => kps}/reduce_any_kernel.cu | 6 +- paddle/phi/kernels/kps/reduce_max_kernel.cu | 1 - .../{gpu => kps}/reduce_prod_kernel.cu | 7 +- .../primitive/compute_primitives_xpu2.h | 15 +-- paddle/phi/kernels/reduce_all_kernel.cc | 4 + paddle/phi/kernels/reduce_any_kernel.cc | 4 + paddle/phi/kernels/reduce_max_kernel.cc | 4 + paddle/phi/kernels/reduce_mean_kernel.cc | 4 + paddle/phi/kernels/reduce_min_kernel.cc | 4 + paddle/phi/kernels/reduce_prod_kernel.cc | 4 + paddle/phi/kernels/reduce_sum_kernel.cc | 6 ++ .../unittests/xpu/test_reduce_amax_op_xpu.py | 67 +++++++++++++ .../unittests/xpu/test_reduce_amin_op_xpu.py | 67 +++++++++++++ .../unittests/xpu/test_reduce_any_op_xpu.py | 99 +++++++++++++++++++ 17 files changed, 315 insertions(+), 20 deletions(-) rename paddle/fluid/operators/reduce_ops/{reduce_amax_op.cu => reduce_amax_op.kps} (77%) rename paddle/fluid/operators/reduce_ops/{reduce_amin_op.cu => reduce_amin_op.kps} (77%) rename paddle/phi/kernels/{gpu => kps}/reduce_any_kernel.cu (87%) rename paddle/phi/kernels/{gpu => kps}/reduce_prod_kernel.cu (91%) create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.kps similarity index 77% rename from paddle/fluid/operators/reduce_ops/reduce_amax_op.cu rename to paddle/fluid/operators/reduce_ops/reduce_amax_op.kps index b33859153419c3..09987279184694 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.kps @@ -12,13 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifndef PADDLE_WITH_XPU_KP #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#endif + #include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; -// reduce_max +#ifdef PADDLE_WITH_XPU_KP +REGISTER_OP_KERNEL( + reduce_amax, KP, plat::XPUPlace, + ops::ReduceCudaKernel); +#else REGISTER_OP_CUDA_KERNEL( reduce_amax, ops::ReduceCudaKernel, ops::ReduceCudaKernel, ops::ReduceCudaKernel, ops::ReduceCudaKernel); +#endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.kps similarity index 77% rename from paddle/fluid/operators/reduce_ops/reduce_amin_op.cu rename to paddle/fluid/operators/reduce_ops/reduce_amin_op.kps index 037dab396c757c..5e1139396d90cb 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.kps @@ -12,13 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef PADDLE_WITH_XPU_KP #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#endif + #include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; -// reduce_min +#ifdef PADDLE_WITH_XPU_KP +REGISTER_OP_KERNEL( + reduce_amin, KP, plat::XPUPlace, + ops::ReduceCudaKernel); +#else REGISTER_OP_CUDA_KERNEL( reduce_amin, ops::ReduceCudaKernel, ops::ReduceCudaKernel, ops::ReduceCudaKernel, ops::ReduceCudaKernel); +#endif diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 5c74751b348c0e..4d903e01a49824 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -236,8 +236,9 @@ struct IndexCalculator { template struct ReduceIndexMapping { const kps::DimConfig dim; - HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) - : dim(dims) {} + int loop_size; + HOSTDEVICE ReduceIndexMapping(const kps::DimConfig& dims, int max_loop = 1) + : dim(dims), loop_size(max_loop) {} #ifdef PADDLE_WITH_XPU_KP __device__ __forceinline__ int BlockIdX() { @@ -277,10 +278,10 @@ struct ReduceIndexMapping { } __device__ __forceinline__ int GetLoopSize() { - if (ReduceLastDim) { - return dim.deal_size_y; - } else { + if ((!ReduceLastDim) && (loop_size == 1)) { return dim.deal_size_x; + } else { + return loop_size; } } #else @@ -670,7 +671,7 @@ __global__ void ReduceAnyKernel(const Tx* x, int store_offset = 0; int stride_left = 0; if (reduce_last_dim) { - auto block = ReduceIndexMapping(dim); + auto block = ReduceIndexMapping(dim, left_num); input_idx = block.BlockIdY() * block.BlockDimX(); left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y; stride = block.GridDimY() * block.BlockDimX(); @@ -681,7 +682,7 @@ __global__ void ReduceAnyKernel(const Tx* x, stride_left = 1; tid = THREAD_ID_X; } else { - auto block = ReduceIndexMapping(dim); + auto block = ReduceIndexMapping(dim, left_num); input_idx = block.BlockIdY() * block.BlockDimY(); left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X; stride = block.GridDimY() * block.BlockDimY(); diff --git a/paddle/phi/kernels/gpu/reduce_any_kernel.cu b/paddle/phi/kernels/kps/reduce_any_kernel.cu similarity index 87% rename from paddle/phi/kernels/gpu/reduce_any_kernel.cu rename to paddle/phi/kernels/kps/reduce_any_kernel.cu index 
25f73c64a5417c..480268936f49f1 100644 --- a/paddle/phi/kernels/gpu/reduce_any_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_any_kernel.cu @@ -32,4 +32,8 @@ void AnyRawKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(any_raw, KPS, ALL_LAYOUT, phi::AnyRawKernel, bool) {} +#else +PD_REGISTER_KERNEL(any_raw, KPS, ALL_LAYOUT, phi::AnyRawKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/kps/reduce_max_kernel.cu b/paddle/phi/kernels/kps/reduce_max_kernel.cu index bc997c6c4e3b66..52644849ad8bf3 100644 --- a/paddle/phi/kernels/kps/reduce_max_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_max_kernel.cu @@ -37,5 +37,4 @@ PD_REGISTER_KERNEL(max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float) {} #else PD_REGISTER_KERNEL( max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} - #endif diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/kps/reduce_prod_kernel.cu similarity index 91% rename from paddle/phi/kernels/gpu/reduce_prod_kernel.cu rename to paddle/phi/kernels/kps/reduce_prod_kernel.cu index 4ae1dcfeba0a19..13d8e29b60b127 100644 --- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_prod_kernel.cu @@ -31,12 +31,15 @@ void ProdRawKernel(const Context& dev_ctx, } } // namespace phi - +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(prod_raw, KPS, ALL_LAYOUT, phi::ProdRawKernel, float) {} +#else PD_REGISTER_KERNEL(prod_raw, - GPU, + KPS, ALL_LAYOUT, phi::ProdRawKernel, float, double, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h index 6ec05ee5054437..38a8d40aee6287 100644 --- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h @@ -48,7 +48,7 @@ static inline __device__ void sync_all() { #define ncores 64 template -__device__ void BlockXReduce(T* data, OpFunc reducer) { +__device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) { __shared__ T sum_array[ncores * VecSize]; int core_idx = core_id() * VecSize; mfence(); @@ -57,21 +57,22 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) { #pragma unroll for (int i = 0; i < VecSize; i++) { mfence(); - sum_array[core_idx + i] = data[i]; + sum_array[i * ncores + core_idx] = data[i]; mfence(); - data[i] = 0; } sync_all(); #pragma unroll for (int i = 0; i < VecSize; i++) { + T start = data[i * ncores]; #pragma unroll - for (int j = 0; j < ncores; j++) { + for (int j = 1; j < ncores; j++) { mfence(); - T tmp = sum_array[j * VecSize + i]; + T tmp = sum_array[i * ncores + j]; mfence(); - data[i] = reducer(data[i], tmp); + start = reducer(start, tmp); mfence(); } + out[i] = start; } sync_all(); } @@ -346,7 +347,7 @@ __device__ __forceinline__ void Reduce(T* out, if (reduce_last_dim) { #pragma unroll for (int i = 0; i < NY * NX; i++) { // reduce along blockDim.x - details::BlockXReduce(&out[i], reducer); + details::BlockXReduce(&out[i], &in[i], reducer); } } } else { // else kLocalMode diff --git a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc index 5525f0dbfa7ed6..9b4515ee2909f7 100644 --- a/paddle/phi/kernels/reduce_all_kernel.cc +++ b/paddle/phi/kernels/reduce_all_kernel.cc @@ -36,3 +36,7 @@ PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(all, GPU, 
ALL_LAYOUT, phi::AllKernel, bool) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(all, KPS, ALL_LAYOUT, phi::AllKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc index 01cbcd4029c777..642b80c3d86f02 100644 --- a/paddle/phi/kernels/reduce_any_kernel.cc +++ b/paddle/phi/kernels/reduce_any_kernel.cc @@ -36,3 +36,7 @@ PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(any, KPS, ALL_LAYOUT, phi::AnyKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc index a7458a3e0ac132..26b8bc196ccd4e 100644 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ b/paddle/phi/kernels/reduce_max_kernel.cc @@ -38,3 +38,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(max, KPS, ALL_LAYOUT, phi::MaxKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc index 812cf8702e15cd..599b7eca32110c 100644 --- a/paddle/phi/kernels/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -46,3 +46,7 @@ PD_REGISTER_KERNEL(mean, int64_t, phi::dtype::float16) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(mean, KPS, ALL_LAYOUT, phi::MeanKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index 620b5167566f2b..75d906aa4bd75e 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -38,3 +38,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(min, KPS, ALL_LAYOUT, phi::MinKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_prod_kernel.cc b/paddle/phi/kernels/reduce_prod_kernel.cc index 5bd410709c6ba1..3bb1c7552b11f2 100644 --- a/paddle/phi/kernels/reduce_prod_kernel.cc +++ b/paddle/phi/kernels/reduce_prod_kernel.cc @@ -38,3 +38,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(prod, KPS, ALL_LAYOUT, phi::ProdKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_sum_kernel.cc b/paddle/phi/kernels/reduce_sum_kernel.cc index e2b13333d7f81a..0d79fa34bc2748 100644 --- a/paddle/phi/kernels/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/reduce_sum_kernel.cc @@ -69,3 +69,9 @@ PD_REGISTER_KERNEL(sum, kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(sum, KPS, ALL_LAYOUT, phi::SumKernel, float) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} +#endif diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py new file mode 100644 index 00000000000000..a6a0c7b5920a86 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py @@ -0,0 +1,67 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReduceAmaxOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'reduce_amax' + + class XPUTestReduceAmaxBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.set_case() + + def set_case(self): + self.op_type = 'reduce_amax' + self.shape = (20, 10) + self.attrs = {'use_xpu': True, 'keep_dim': False, 'dim': (1, )} + + self.inputs = { + 'X': np.random.randint(0, 100, self.shape).astype("float32") + } + + expect_intput = self.inputs['X'] + self.outputs = { + 'Out': + np.amax(expect_intput, + axis=self.attrs['dim'], + keepdims=self.attrs['keep_dim']) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + +support_types = get_xpu_op_support_types('reduce_amax') +for stype in support_types: + create_test_class(globals(), XPUTestReduceAmaxOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py new file mode 100644 index 00000000000000..def6c0821f5a39 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py @@ -0,0 +1,67 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReduceAmaxOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'reduce_amin' + + class XPUTestReduceAmaxBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.set_case() + + def set_case(self): + self.op_type = 'reduce_amin' + self.shape = (20, 10) + self.attrs = {'use_xpu': True, 'keep_dim': False, 'dim': (1, )} + + self.inputs = { + 'X': np.random.randint(0, 100, self.shape).astype("float32") + } + + expect_intput = self.inputs['X'] + self.outputs = { + 'Out': + np.amin(expect_intput, + axis=self.attrs['dim'], + keepdims=self.attrs['keep_dim']) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + +support_types = get_xpu_op_support_types('reduce_amin') +for stype in support_types: + create_test_class(globals(), XPUTestReduceAmaxOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py new file mode 100644 index 00000000000000..5118c3787e663f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReduceAnyOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'reduce_any' + + class XPUTestReduceAnyBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.set_case() + + def set_case(self): + self.op_type = 'reduce_any' + self.attrs = { + 'use_xpu': True, + 'reduce_all': True, + 'keep_dim': True, + 'dim': (3, 5, 4) + } + self.inputs = { + 'X': + np.random.randint(0, 2, (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestReduceAnyCase1(XPUTestReduceAnyBase): + + def set_case(self): + self.op_type = 'reduce_any' + self.attrs = { + 'use_xpu': True, + 'dim': [1] + # 'reduce_all': True, + # 'keep_dim': True, + } + self.inputs = { + 'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool") + } + self.outputs = {'Out': self.inputs['X'].any(axis=1)} + + class XPUTestReduceAnyCase2(XPUTestReduceAnyBase): + + def set_case(self): + self.op_type = 'reduce_any' + self.attrs = { + 'use_xpu': True, + 'reduce_all': True, + 'keep_dim': False, + 'dim': (3, 6) + } + self.inputs = { + 'X': + np.random.randint(0, 2, (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} + + +support_types = get_xpu_op_support_types('reduce_any') +for stype in support_types: + create_test_class(globals(), XPUTestReduceAnyOp, stype) + +if __name__ == '__main__': + unittest.main() From 552808378be6d609301aa596a3f76b38e9c25467 Mon Sep 17 00:00:00 2001 From: wangguanzhong Date: Tue, 7 Jun 2022 11:23:01 +0800 Subject: [PATCH 10/22] fix conv3d doc, test=document_fix (#43253) --- python/paddle/nn/functional/conv.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 26f07c2f9a11c7..f1d66a9e3a1b57 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -1244,10 +1244,10 @@ def conv3d(x, where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. bias (Tensor, optional): The bias, a Tensor of shape [M, ]. - stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a + stride (int|list|tuple, optional): The stride size. It means the stride in convolution. If stride is a list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding (string|int|list|tuple): The padding size. It means the number of zero-paddings + padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. 
If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or @@ -1257,20 +1257,20 @@ def conv3d(x, when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. + dilation (int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. - groups (int): The groups number of the Conv3D Layer. According to grouped + groups (int, optional): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - name(str|None): For detailed information, please refer + will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. 
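The docstring changes above also imply the basic call pattern for `paddle.nn.functional.conv3d` with the default `NCDHW` layout. A minimal usage sketch, for reference only (the shapes and values here are illustrative and assume the API exactly as documented in this patch):

import paddle
import paddle.nn.functional as F

# Input in NCDHW layout: [batch, in_channels, depth, height, width].
x = paddle.randn([2, 3, 8, 32, 32], dtype='float32')
# Filter: [out_channels, in_channels / groups, kD, kH, kW].
w = paddle.randn([6, 3, 3, 3, 3], dtype='float32')

# stride, padding, dilation and groups are optional (defaults 1, 0, 1, 1);
# data_format defaults to "NCDHW", as the corrected docstring states.
y = F.conv3d(x, w, bias=None, stride=1, padding=1, data_format='NCDHW')
print(y.shape)  # [2, 6, 8, 32, 32] for kernel 3, stride 1, padding 1
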
From e74f287bb32fe899587b0881785ce29fe8d3ee1d Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Tue, 7 Jun 2022 11:25:33 +0800 Subject: [PATCH 11/22] fix the unittest bug of none grad of margin_cross_entropy when FLAGS_retain_grad_for_all_tensor change default setting (#43241) --- .../fluid/tests/unittests/parallel_margin_cross_entropy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py index b77a04d8eea9c2..26e9e05b82ab81 100644 --- a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py +++ b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py @@ -39,6 +39,7 @@ class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() fleet.init(is_collective=True, strategy=strategy) + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) def test_parallel_margin_softmax_cross_entropy(self): margin1s = [1.0, 1.0, 1.35] From 71a63f0a9be78d371a648a7cc97456857cadf718 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Tue, 7 Jun 2022 11:28:49 +0800 Subject: [PATCH 12/22] Transpose optimization with assitant of Chengdu Supercomputing Center and auto_tune operation (#42704) --- paddle/fluid/operators/transpose_op.cu.h | 432 +++++++++++++++++- paddle/fluid/operators/transpose_op.h | 178 ++++++++ paddle/fluid/platform/fast_divmod.h | 2 +- paddle/phi/kernels/autotune/auto_tune_base.h | 114 +++-- paddle/phi/kernels/autotune/auto_tune_test.cu | 22 +- paddle/phi/kernels/autotune/cache.h | 5 +- 6 files changed, 709 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index 40a967b11f7a92..f9d91fec4c3f6a 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -17,8 +17,12 @@ limitations under the License. */ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/autotune/auto_tune_base.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/copy_kernel.h" namespace paddle { namespace operators { @@ -656,13 +660,437 @@ struct TransposeSimple { } }; +template +class IdxHelper { + public: + IdxHelper() {} + explicit IdxHelper(const T* dims) { + for (int i = N - 1; i >= 0; --i) { + stride_[i] = i < (N - 1) ? dims[i + 1] * stride_[i + 1] : 1; + } + } + + __device__ inline T GetStride(int idx) const { return stride_[idx]; } + + __device__ inline void GetIndexFromOffset(T offset, T* index) const { + T remaining = offset; +#pragma unroll + for (int i = 0; i < N - 1; ++i) { + const T idx = remaining / stride_[i]; + remaining -= idx * stride_[i]; + index[i] = idx; + } + index[N - 1] = remaining; + } + + private: + T stride_[N]; +}; + +template +class IdxHelper { + public: + IdxHelper() {} + explicit IdxHelper(const uint32_t* dims) { + for (int i = N - 1; i >= 0; --i) { + uint32_t value = i < (N - 1) ? 
dims[i + 1] * stride_[i + 1] : 1; + divmoder_[i] = paddle::platform::FastDivMod(value); + stride_[i] = value; + } + } + + __device__ inline uint32_t GetStride(int idx) const { return stride_[idx]; } + + __device__ inline void GetIndexFromOffset(uint32_t offset, + uint32_t* index) const { + uint32_t remaining = offset; +#pragma unroll + for (int i = 0; i < N - 1; ++i) { + uint32_t idx = divmoder_[i].Div(remaining); + index[i] = idx; + remaining -= idx * stride_[i]; + } + index[N - 1] = remaining; + } + + private: + uint32_t stride_[N]; + paddle::platform::FastDivMod divmoder_[N]; +}; + +// Transform index between memory offset and shape coodinate. +template +class IdxAndOffsetHelper { + public: + IdxAndOffsetHelper() {} + ~IdxAndOffsetHelper() = default; + + explicit IdxAndOffsetHelper(const T* dims) { + index_helper = IdxHelper(dims); + } + + template + explicit IdxAndOffsetHelper(const U* dims) { + T temp_dims[N]; + for (int i = 0; i < N; ++i) { + temp_dims[i] = static_cast(dims[i]); + } + index_helper = IdxHelper(temp_dims); + } + + __device__ inline T IndexToOffset(const T* index) const { + T offset = 0; +#pragma unroll + for (int i = 0; i < N - 1; ++i) { + offset += index[i] * index_helper.GetStride(i); + } + offset += index[N - 1]; + return offset; + } + + __device__ inline void OffsetToIndex(T offset, T* index) const { + index_helper.GetIndexFromOffset(offset, index); + } + + private: + IdxHelper index_helper; +}; + +template +struct PermuteParams { + public: + IdxAndOffsetHelper src_index_helper; + IdxAndOffsetHelper dst_index_helper; + int perm[Rank]{}; + + explicit PermuteParams(const std::vector& dims, + const std::vector& perm_) { + size_t dst_dims[Rank]; + for (size_t i = 0; i < Rank; ++i) { + dst_dims[i] = dims[perm_[i]]; + perm[i] = perm_[i]; + } + dst_index_helper = IdxAndOffsetHelper(dst_dims); + src_index_helper = IdxAndOffsetHelper(dims.data()); + } +}; + +// A special kernel for target case, both vectorized read and write supported. +template +__global__ void VectorizedPermuteKernel(PermuteParams params, + const size_t count, + const T* __restrict__ src_data, + T* dst_data) { + using VecT = phi::AlignedVector; + IndexT src_index[Rank]; + IndexT dst_index[Rank]; + + const VecT* __restrict__ src = + reinterpret_cast(src_data); + VecT* dst = reinterpret_cast(dst_data); + + IndexT tid = blockIdx.x * blockDim.x + threadIdx.x; + for (IndexT i = tid; i < count; i += blockDim.x * gridDim.x) { + params.dst_index_helper.OffsetToIndex(i, dst_index); + +#pragma unroll + for (int j = 0; j < Rank; ++j) { + src_index[params.perm[j]] = dst_index[j]; + } + IndexT src_offset = params.src_index_helper.IndexToOffset(src_index); + dst[i] = src[src_offset]; + } +} + +// A general kernel for normal case, only support vectorized write. +template +__global__ void GeneralPermuteKernel(PermuteParams params, + const T* __restrict__ src, T* dst, + const size_t main_cnt, + const size_t tail_cnt, + const size_t offset) { + using VecT = phi::AlignedVector; + VecT* vec_dst = reinterpret_cast(dst); + + IndexT src_index[VecSize][Rank]; + IndexT dst_index[VecSize][Rank]; + + // Avoid read perm data both in 2 load process. + __shared__ int perm[Rank]; + if (threadIdx.x < Rank) { + perm[threadIdx.x] = params.perm[threadIdx.x]; + } + __syncthreads(); + + // Vectorized load data. 
+ IndexT tid = blockIdx.x * blockDim.x + threadIdx.x; + for (IndexT idx = tid; idx < main_cnt; idx += blockDim.x * gridDim.x) { + VecT vec_data; + IndexT vec_idx = idx * VecSize; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + params.dst_index_helper.OffsetToIndex(vec_idx + i, dst_index[i]); + +#pragma unroll + for (int j = 0; j < Rank; ++j) { + src_index[i][perm[j]] = dst_index[i][j]; + } + IndexT src_offset = params.src_index_helper.IndexToOffset(src_index[i]); + vec_data[i] = src[src_offset]; + } + vec_dst[idx] = vec_data; + } + + // Singularized load data. + if (tid < tail_cnt) { + IndexT idx = tid + offset; + params.dst_index_helper.OffsetToIndex(idx, dst_index[0]); + +#pragma unroll + for (int j = 0; j < Rank; ++j) { + src_index[0][perm[j]] = dst_index[0][j]; + } + IndexT src_offset = params.src_index_helper.IndexToOffset(src_index[0]); + dst[idx] = src[src_offset]; + } +} + +// A Gerneral permute method that drectly find the dst data +// coordinate in the source data. +template +inline void LaunchPermuteKernel(const phi::GPUContext& ctx, const IndexT count, + const PermuteType perm_type, + const std::vector& dims, + const std::vector& perm, const T* src, + T* dst) { + size_t main_count = count / VecSize; + auto params = PermuteParams(dims, perm); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, main_count); + + if (perm_type == PermuteType::kNormalPermute) { + size_t tail_count = count - main_count * VecSize; + size_t offset = count - tail_count; + GeneralPermuteKernel< + T, IndexT, VecSize, + Rank><<>>( + params, src, dst, main_count, tail_count, offset); + } else { + VectorizedPermuteKernel< + T, IndexT, VecSize, + Rank><<>>( + params, main_count, src, dst); + } +} + +template +inline void LaunchPermuteRankDispatch(const phi::GPUContext& ctx, + const IndexT count, + const PermuteType perm_type, + const std::vector& dims, + const std::vector& perm, + const T* src, T* dst) { +#define CALL_DISPATCH_RANK(rank) \ + case rank: { \ + LaunchPermuteKernel(ctx, count, perm_type, dims, \ + perm, src, dst); \ + break; \ + } + + switch (dims.size()) { + CALL_DISPATCH_RANK(1); + CALL_DISPATCH_RANK(2); + CALL_DISPATCH_RANK(3); + CALL_DISPATCH_RANK(4); + CALL_DISPATCH_RANK(5); + CALL_DISPATCH_RANK(6); + CALL_DISPATCH_RANK(7); + CALL_DISPATCH_RANK(8); + CALL_DISPATCH_RANK(9); + } +#undef CALL_DISPATCH_RANK +} + +// Aim at transposing the last 2 dimensions. Refer from +// https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/ +template +__global__ void BatchTransposeKernel(const T* __restrict__ src_data, + T* dst_data, IndexT rows, IndexT cols) { + using VecT = phi::AlignedVector; + + __shared__ VecT tile[kTileSize][kShareCol]; + T* single_tile = reinterpret_cast(tile); + + IndexT col_in_matrix = blockIdx.x * kTileSize + threadIdx.x; + IndexT offset = blockIdx.z * rows * cols; + + // Vectorized load data from src into shared memory. [rows, cols] + const VecT* __restrict__ src = + reinterpret_cast(src_data); + + for (IndexT tile_y = threadIdx.y; tile_y < kTileSize; tile_y += kBlockRows) { + IndexT row_in_matrix = tile_y + blockIdx.y * kTileSize; + + if (col_in_matrix < cols && row_in_matrix < rows) { + tile[tile_y][threadIdx.x] = + src[offset + row_in_matrix * cols + col_in_matrix]; + } + } + + // Singularized load data from shared memory into dst. 
+ // and dst_cols = rows, dst_rows = cols, [cols * Vecsize, rows] + col_in_matrix = blockIdx.y * kTileSize + threadIdx.x; + offset = offset * VecSize + col_in_matrix; + IndexT tile_x_idx = threadIdx.x * (kShareCol * VecSize); + + __syncthreads(); + + for (IndexT tile_y = threadIdx.y; tile_y < kTileSize; tile_y += kBlockRows) { + IndexT row_in_matrix = tile_y + blockIdx.x * kTileSize; + IndexT dst_idx = offset + row_in_matrix * VecSize * rows; + IndexT tile_idx = tile_x_idx + tile_y * VecSize; + if (col_in_matrix < /*dst_cols=*/rows && + row_in_matrix < /*dst_rows=*/cols) { +#pragma unroll + for (auto i = 0; i < VecSize; ++i) { + dst_data[dst_idx + i * rows] = single_tile[tile_idx + i]; + } + } + } +} + +// With the byte limitation of shared_memory, the VecSize shall be restricted +// for the type whose byte-size is less than 8. +template 8 ? 1 : Size)> +inline void LaunchTransposeKernel(const phi::GPUContext& ctx, + const std::vector& dims, const T* src, + T* dst) { + auto rank = dims.size(); + IndexT num_batches = (rank == 2) ? 1 : dims[0]; + IndexT rows = dims[rank - 2]; + IndexT cols = dims[rank - 1]; + IndexT num_tile_rows = (rows + kTileSize - 1) / kTileSize; + IndexT num_tile_cols = (cols + kTileSize - 1) / kTileSize; + + dim3 blocks(num_tile_cols, num_tile_rows, num_batches); + dim3 threads(kTileSize, kBlockRows, 1); + + BatchTransposeKernel<<>>( + src, dst, rows, cols); +} + +template +inline void LaunchWithDispatchVecSize(const phi::GPUContext& ctx, + const int vec_size, + const PermuteType perm_type, + const std::vector& dims, + const std::vector& perm, + const T* src, T* dst, IndexT count) { +#define CALL_DISPATCH_VEC_SIZE(vec_size) \ + case vec_size: { \ + if (perm_type == PermuteType::kTranspose) { \ + LaunchTransposeKernel(ctx, dims, src, dst); \ + } else { \ + LaunchPermuteRankDispatch(ctx, count, perm_type, \ + dims, perm, src, dst); \ + } \ + break; \ + } + + switch (vec_size) { + CALL_DISPATCH_VEC_SIZE(1); + CALL_DISPATCH_VEC_SIZE(2); + CALL_DISPATCH_VEC_SIZE(4); + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +#undef CALL_DISPATCH_VEC_SIZE +} + +template +inline void LaunchWithDispatchIndex(const phi::GPUContext& ctx, + const size_t count, const int vec_size, + const PermuteType perm_type, + const std::vector& dims, + const std::vector& perm, const T* src, + T* dst) { + if (count < std::numeric_limits::max()) { + LaunchWithDispatchVecSize(ctx, vec_size, perm_type, dims, perm, + src, dst, + static_cast(count)); + } else { + int64_t cnt = static_cast(count); + LaunchWithDispatchVecSize(ctx, vec_size, perm_type, dims, perm, + src, dst, + static_cast(count)); + } +} + +template +inline void SimplifyThenLaunch(const int rank, const DeviceContext& ctx, + const Tensor& in, Tensor* out, + const std::vector& perm) { + int sm_count = ctx.GetSMCount(); + auto src_dims = phi::vectorize(in.dims()); + auto simplifier = DimsSimplifier(sm_count, rank, perm, src_dims, + in.data(), out->data()); + + if (simplifier.GetPermType() == PermuteType::kCopy) { + // If perm is [0,1,2,3], then just operate a DtoD copy. 
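
// A CPU reference (not part of this patch) for the destination-driven gather
// that both permute kernels above implement: every output offset is
// decomposed into a dst index, scattered through perm into a src index, and
// the element is read from there. For perm = {2, 0, 1} this reproduces
// numpy.transpose(src, (2, 0, 1)); the shapes below are arbitrary examples.
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int> dims = {2, 3, 4};  // src shape
  const std::vector<int> perm = {2, 0, 1};  // dst shape = {4, 2, 3}
  std::vector<int> src(2 * 3 * 4);
  for (int i = 0; i < static_cast<int>(src.size()); ++i) src[i] = i;

  const std::vector<int> dst_dims = {dims[perm[0]], dims[perm[1]],
                                     dims[perm[2]]};
  std::vector<int> dst(src.size());
  for (int i = 0; i < static_cast<int>(dst.size()); ++i) {
    // OffsetToIndex on the dst side.
    int dst_index[3] = {i / (dst_dims[1] * dst_dims[2]),
                        (i / dst_dims[2]) % dst_dims[1], i % dst_dims[2]};
    // src_index[perm[j]] = dst_index[j], exactly as in the kernels.
    int src_index[3];
    for (int j = 0; j < 3; ++j) src_index[perm[j]] = dst_index[j];
    // IndexToOffset on the src side.
    dst[i] = src[(src_index[0] * dims[1] + src_index[1]) * dims[2] +
                 src_index[2]];
  }
  std::printf("dst[0] = %d, dst[1] = %d\n", dst[0], dst[1]);  // 0 and 4
  return 0;
}
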
+ phi::Copy(ctx, in, ctx.GetPlace(), false, out); + } else { + LaunchWithDispatchIndex( + ctx, simplifier.GetCount(), simplifier.GetVecSize(), + simplifier.GetPermType(), simplifier.GetDims(), simplifier.GetPerm(), + in.data(), out->data()); + } +} + +template +size_t GetTransposeKey(const int rank, const Tensor& in, + const std::vector& perm) { + auto in_shape = phi::vectorize(in.dims()); + return phi::autotune::GetKey( + in_shape, perm, rank, paddle::experimental::CppTypeToDataType::Type()); +} + template -void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int ndims, +void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int rank, const Tensor& in, const std::vector& perm, Tensor* out) { + PADDLE_ENFORCE_LT( + rank, phi::DDim::kMaxRank, + platform::errors::OutOfRange( + "The maximum dimension rank of " + "tensor is expected to be less than %d, but here is %d.", + phi::DDim::kMaxRank, rank)); + auto ret = TransposeSimple::run(dev_ctx, in, perm, out); if (!ret) { - TransCompute(ndims, dev_ctx, in, out, perm); + auto* tuner = phi::autotune::MakeTransposeTuner( + SimplifyThenLaunch); + if (!tuner->IsInit()) { + tuner->AddCallBack( + phi::autotune::MakeCallback(TransCompute)); + tuner->Finalize(); + } + + auto key = GetTransposeKey(rank, in, perm); + auto& cache = phi::autotune::AutoTuneCache::Instance().GetTranspose(); + if (cache.Find(key)) { + auto index = cache.Get(key); + tuner->RunBestKernel(index, rank, dev_ctx, in, out, perm); + } else { + // All avaliable kernels have ran while picking the best kernel, so + // there may be no need for another RunBestKernel. + auto index = tuner->PickBestKernel(dev_ctx, rank, dev_ctx, in, out, perm); + cache.Set(key, index); + } } } diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index 891aa312f69ffa..ca57687ea5fe4b 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -60,5 +61,182 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, } } +enum PermuteType { + kCopy = 1, + kTranspose = 2, + kVecPermute = 3, + kNormalPermute = 4 +}; + +constexpr int kBlockRows = 16; +constexpr int kTileSize = 32; +// To avoid bank conflict. +constexpr int kShareCol = kTileSize + 1; + +// Simplify the input dims and permute dims if possible. 
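
To make the simplification concrete before the class itself: with the worked example from the comments of SimplifyPermAndDims below, perm = [2, 3, 0, 1] over dims = [4, 8, 2, 5], the perm-consecutive axis pairs merge and the 4-D permute collapses into a 2-D transpose of a 32 x 10 matrix. A standalone sketch of that merging rule follows (it mirrors the class below; the dims/perm values and expected results come from its comments, everything else is illustrative):

#include <cstdio>
#include <vector>

// Fold dims that stay consecutive under perm into the first of them, drop the
// resulting size-1 dims, and remap perm onto the surviving axes.
void Simplify(std::vector<int>* dims, std::vector<int>* perm) {
  const int rank = static_cast<int>(dims->size());
  std::vector<int> combined = *dims;
  int start = 0;
  while (start < rank) {
    const int start_dim = (*perm)[start];
    int end = start + 1;
    while (end < rank && (*perm)[end] == (*perm)[end - 1] + 1) {
      combined[start_dim] *= (*dims)[(*perm)[end]];
      combined[(*perm)[end]] = 1;
      ++end;
    }
    start = end;
  }
  std::vector<int> valid_map(rank, -1), new_dims, new_perm;
  for (int i = 0; i < rank; ++i) {
    if (combined[i] != 1) {
      valid_map[i] = static_cast<int>(new_dims.size());
      new_dims.push_back(combined[i]);
    }
  }
  for (int i = 0; i < rank; ++i) {
    if (valid_map[(*perm)[i]] >= 0) new_perm.push_back(valid_map[(*perm)[i]]);
  }
  *dims = new_dims;
  *perm = new_perm;
}

int main() {
  std::vector<int> dims = {4, 8, 2, 5};
  std::vector<int> perm = {2, 3, 0, 1};
  Simplify(&dims, &perm);
  // Prints dims = {32, 10}, perm = {1, 0}: a plain 2-D transpose, which the
  // driver can hand to the tiled BatchTransposeKernel instead of the general
  // permute path.
  std::printf("dims = {%d, %d}, perm = {%d, %d}\n", dims[0], dims[1], perm[0],
              perm[1]);
  return 0;
}
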
+template +class DimsSimplifier { + public: + explicit DimsSimplifier(const int sm_count, const int rank, + const std::vector& perm, + const std::vector& dims, const T* src, T* dst) + : perm_(rank), dims_(rank) { + SimplifyPermAndDims(rank, dims, perm); + count_ = std::accumulate(dims.begin(), dims.end(), size_t{1}, + std::multiplies()); + if (rank_ > 1) { + vec_size_ = GetPermVecSize(sm_count, src, dst); + perm_.resize(rank_); + dims_.resize(rank_); + } + } + + size_t GetCount() const { return count_; } + int GetVecSize() const { return vec_size_; } + PermuteType GetPermType() const { return type_; } + + std::vector GetPerm() const { return perm_; } + std::vector GetDims() const { return dims_; } + + private: + size_t rank_{1}; + size_t count_{0}; + int vec_size_{1}; + std::vector perm_; + std::vector dims_; + PermuteType type_{kCopy}; + + void SimplifyPermAndDims(const size_t rank, + const std::vector& in_dims, + const std::vector& perm) { + size_t combined_dims[phi::DDim::kMaxRank]; + int valid_map[phi::DDim::kMaxRank]; + + // Merge consecutive dims to the fist one of this these dims, + // and leave the origin dim value to be 1. Example below : + // perm: [2, 3, 0, 1], origin_dims : [4, 8, 2, 5] + // new_dims: [4, 8, 2, 5] -> [32, 1, 10, 1] + size_t start_perm_idx = 0; + while (start_perm_idx < rank) { + const size_t start_dim_idx = perm[start_perm_idx]; + combined_dims[start_dim_idx] = in_dims[start_dim_idx]; + size_t end_perm_idx = start_perm_idx + 1; + + while (end_perm_idx < rank && + perm[end_perm_idx] == perm[end_perm_idx - 1] + 1) { + const size_t end_dim_idx = perm[end_perm_idx]; + combined_dims[start_dim_idx] *= in_dims[end_dim_idx]; + combined_dims[end_dim_idx] = 1; + end_perm_idx += 1; + } + start_perm_idx = end_perm_idx; + } + + // Reorder combined dims and marked useless dim as -1. + // for example, if combined dims is [32, 1, 10, 1], + // valid_map is [0, -1, 1, -1] and generate simplified + // dims as [32, 10] + size_t valid_dim_idx = 0; + bool sequential_flag = false; + for (size_t i = 0; i < rank; ++i) { + const int src_dim = combined_dims[i]; + if (src_dim == 1) { + valid_map[i] = -1; + } else { + sequential_flag = true; + valid_map[i] = valid_dim_idx; + dims_[valid_dim_idx] = src_dim; + valid_dim_idx += 1; + } + } + + if (valid_dim_idx == 0) { + dims_[0] = 1; + perm_[0] = 0; + return; + } else if (valid_dim_idx == 1) { + type_ = PermuteType::kCopy; + } + + // Acquire simplified perm with help of combined dims + // and original perm, finally simplified perm is [1, 0] + size_t perm_idx = 0; + for (size_t i = 0; i < rank; ++i) { + const int mapped = valid_map[perm[i]]; + if (mapped >= 0) { + perm_[perm_idx] = mapped; + perm_idx += 1; + } + } + rank_ = valid_dim_idx; + } + + int GetPermVecSize(const int sm_count, const T* src, T* dst) { + // For gerneal_permute kernel, there is good chance for + // vectorized write. + int vec_size = phi::GetVectorizedSize(dst); + type_ = PermuteType::kNormalPermute; + + // While the last dim is fixed, there is good chance for + // both vectorized read and write. + if (perm_[rank_ - 1] == rank_ - 1) { + int tmp_size = std::min(vec_size, phi::GetVectorizedSize(src)); + tmp_size = GetDimVesSize(tmp_size, dims_[rank_ - 1]); + if (tmp_size > 1) { + type_ = kVecPermute; + vec_size = tmp_size; + + // For stride calculation of src_data index. + dims_[rank_ - 1] /= vec_size; + } + } + + // Once only transpose at the last 2 dims, there is good + // chance for vectorized read. 
+ if ((rank_ == 2 && perm_[1] == 0 && perm_[0] == 1) || + (rank_ == 3 && perm_[2] == 1 && perm_[1] == 2)) { + type_ = PermuteType::kTranspose; + + // Compared with vectorized load or read, set config to let more + // sm work simultaneously affect more according to performance. + constexpr int threads = kTileSize * kTileSize; + int blocks = count_ / threads; + if (blocks < sm_count) { + vec_size = 1; + } else { + int tmp_vec = std::min(vec_size, phi::GetVectorizedSize(src)); + // With bytes limitation of shared_memory, the VecSize shall be + // restricted for the type whose byte-size is less than 8 (double). + int type_vec = + sizeof(T) > 8 ? 1 : GetDimVesSize(tmp_vec, dims_[rank_ - 1]); + for (int i = type_vec; i > 0; i /= 2) { + if (blocks / i >= sm_count) { + break; + } + // When blocks is smaller than sm_count, a test shown that decrease + // vec_size to make blocks close to sm_count would gain performance. + vec_size = i; + } + } + + dims_[rank_ - 1] /= vec_size; + count_ /= vec_size; + } + return vec_size; + } + + // To find if highest common divisor and make it as vec_size. + int GetDimVesSize(const int vec_size, const size_t target_dim) { + int dim_vec_size = 1; + for (auto size = vec_size; size > 0; size /= 2) { + if (target_dim % size == 0) { + dim_vec_size = size; + break; + } + } + return dim_vec_size; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h index f2a150c301216b..892c5b29aae332 100644 --- a/paddle/fluid/platform/fast_divmod.h +++ b/paddle/fluid/platform/fast_divmod.h @@ -59,8 +59,8 @@ struct FastDivMod { return result; } - int32_t divisor; int32_t shift_val; + uint32_t divisor; uint32_t multiplier; }; diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index e18b854cf34b39..95afa7f697b49e 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "glog/logging.h" @@ -23,7 +24,7 @@ namespace phi { namespace autotune { -template +template class KernelCallback { public: using ReturnT = RetureType; @@ -33,71 +34,126 @@ class KernelCallback { explicit KernelCallback(FuncType func_) : func(func_) {} virtual ~KernelCallback() {} - RetureType Call(Args... args) { return func(args...); } + RetureType Run(Args... args) { return func(args...); } private: FuncType func; }; -template -static KernelCallback MakeCallback( +template +static KernelCallback MakeCallback( RetureType (*cb)(Args...)) { - return KernelCallback(cb); + return KernelCallback(cb); } -template +template class AutoTuneBase { public: AutoTuneBase() {} virtual ~AutoTuneBase() {} - explicit AutoTuneBase(KernelType kernel) : default_kernel_(kernel) { + explicit AutoTuneBase(KernelType kernel) { kernels_.push_back(kernel); } + + template + void AddCallBack(Type kernel) { + static_assert(std::is_same::value, + "Type must be the same"); kernels_.push_back(kernel); } - template - void AddCallBack(T kernel) { - static_assert(std::is_same::value, "Type must be the same"); - kernels_.push_back(kernel); + template + void RunBestKernel(const int idx, Args&&... args) { + kernels_[idx].Run(args...); + } + + template + void RunDefaultKernel(Args&&... args) { + kernels_[0].Run(args...); } template - KernelType PickBestKernel(const Context& ctx, Args&&... args) { + int PickBestKernel(const Context& ctx, Args&&... 
args) { PADDLE_ENFORCE_GT( kernels_.size(), 0, paddle::platform::errors::InvalidArgument( "kernel num must be greater than 0, now is %d", kernels_.size())); - int idx = 0; - phi::GpuTimer timer; + int best_idx = 0; float min_time = std::numeric_limits::max(); + // Time cost test estabulished in default stream. for (int i = 0; i < kernels_.size(); ++i) { - ctx.Wait(); - timer.Start(0); - kernels_[i].Call(args...); - timer.Stop(0); - auto time = timer.ElapsedTime(); - VLOG(3) << "kernel[" << i << "]: time cost is " << time; - + auto time = RunAndMeasureKernel(ctx, i, args...); if (time < min_time) { min_time = time; - idx = i; + best_idx = i; } } - VLOG(3) << "best kernel idx is " << idx; - return kernels_[idx]; + VLOG(3) << "best kernel idx is " << best_idx; + return best_idx; } + bool IsInit() { return is_init_; } + void Finalize() { is_init_ = true; } + private: - KernelType default_kernel_; + bool is_init_{false}; std::vector kernels_; + + template + float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) { + phi::GpuTimer timer; + float time_cost = 0; + const auto& stream = ctx.stream(); + + // Treat 1st run as warm up. Judge the result with + // the sum of 2nd and 3rd run. + constexpr int repeats = 3; + + ctx.Wait(); + for (int i = 0; i < repeats; ++i) { + timer.Start(stream); + kernels_[idx].Run(args...); + timer.Stop(stream); + auto time = timer.ElapsedTime(); + if (i > 0) { + time_cost += time; + } + VLOG(3) << "kernel[" << idx << "][" << i << "th time cost is " << time; + } + return time_cost; + } }; -template -static AutoTuneBase> MakeAutoTuner( +template +static AutoTuneBase> MakeAutoTuner( RetureType (*func)(Args...)) { - auto obj = MakeCallback(func); - return AutoTuneBase(obj); + auto obj = MakeCallback(func); + return AutoTuneBase(obj); +} + +template +class TransposeAutoTuner : public AutoTuneBase { + public: + static AutoTuneBase* Instance(KernelType kernel) { + static std::unique_ptr> instance_; + std::call_once(init_flag_, [&] { + instance_.reset(new AutoTuneBase(kernel)); + }); + return instance_.get(); + } + + private: + static std::once_flag init_flag_; +}; + +template +std::once_flag TransposeAutoTuner::init_flag_; + +template +static AutoTuneBase>* + MakeTransposeTuner(RetureType (*func)(Args...)) { + auto obj = MakeCallback(func); + return TransposeAutoTuner::Instance(obj); } } // namespace autotune diff --git a/paddle/phi/kernels/autotune/auto_tune_test.cu b/paddle/phi/kernels/autotune/auto_tune_test.cu index c3918b8ebe59d8..8701a0572fcd82 100644 --- a/paddle/phi/kernels/autotune/auto_tune_test.cu +++ b/paddle/phi/kernels/autotune/auto_tune_test.cu @@ -74,7 +74,7 @@ float Algo(const phi::GPUContext& ctx, } TEST(AutoTune, sum) { - int64_t N = 1 << 22; + int64_t N = 1 << 20; size_t blocks = 512; size_t threads = 256; size_t size = sizeof(float) * N; @@ -119,35 +119,35 @@ TEST(AutoTune, sum) { // 1. Test call_back. 
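
// A CPU analogue (not part of this patch) of the selection loop in
// AutoTuneBase::PickBestKernel / RunAndMeasureKernel above: run each
// candidate three times, treat the first run as warm-up, sum the other two,
// and keep the index with the smallest cost. std::chrono stands in for
// phi::GpuTimer, and the two dummy workloads are made up.
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <limits>
#include <vector>

int PickBest(const std::vector<std::function<void()>>& kernels) {
  int best_idx = 0;
  double min_time = std::numeric_limits<double>::max();
  for (std::size_t i = 0; i < kernels.size(); ++i) {
    double cost = 0.0;
    for (int run = 0; run < 3; ++run) {  // 1 warm-up + 2 measured runs
      auto t0 = std::chrono::steady_clock::now();
      kernels[i]();
      auto t1 = std::chrono::steady_clock::now();
      if (run > 0) cost += std::chrono::duration<double>(t1 - t0).count();
    }
    if (cost < min_time) {
      min_time = cost;
      best_idx = static_cast<int>(i);
    }
  }
  return best_idx;
}

int main() {
  std::vector<std::function<void()>> kernels = {
      [] { volatile long s = 0; for (long i = 0; i < 2000000; ++i) s = s + i; },
      [] { volatile long s = 0; for (long i = 0; i < 200000; ++i) s = s + i; }};
  std::printf("best kernel index: %d\n", PickBest(kernels));  // expected: 1
  return 0;
}
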
VLOG(3) << ">>> [CallBack]: Test case."; - auto callback1 = tune::MakeCallback(Algo<4>); - auto callback2 = tune::MakeCallback(Algo<2>); - auto callback3 = tune::MakeCallback(Algo<1>); + auto callback1 = tune::MakeCallback(Algo<4>); + auto callback2 = tune::MakeCallback(Algo<2>); + auto callback3 = tune::MakeCallback(Algo<1>); std::vector callbacks{callback1, callback2, callback3}; for (int i = 0; i < callbacks.size(); ++i) { dev_ctx->Wait(); phi::GpuTimer timer; timer.Start(0); - callbacks[i].Call(*dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); + callbacks[i].Run(*dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); timer.Stop(0); VLOG(3) << "kernel[" << i << "]: time cost is " << timer.ElapsedTime(); } // 2. Test call_back tune. VLOG(3) << ">>> [AutoTune]: Test case."; - auto tuner = tune::MakeAutoTuner(Algo<4>); - tuner.AddCallBack(tune::MakeCallback(Algo<2>)); - tuner.AddCallBack(tune::MakeCallback(Algo<1>)); + auto tuner = tune::MakeAutoTuner(Algo<4>); + tuner.AddCallBack(tune::MakeCallback(Algo<2>)); + tuner.AddCallBack(tune::MakeCallback(Algo<1>)); /* The 1st ctx works for ctx.Wait(), the 2nd is just the param of call_back. */ - auto best_call_back = tuner.PickBestKernel( + auto best_index = tuner.PickBestKernel( *dev_ctx, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); - best_call_back.Call(*dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); dev_ctx->Wait(); phi::GpuTimer timer; timer.Start(0); - best_call_back.Call(*dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); + tuner.RunBestKernel( + best_index, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); timer.Stop(0); VLOG(3) << "Best CallBackKernel time cost is " << timer.ElapsedTime(); #endif diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index 9d7f57e96e3735..8de0695ede40c3 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -134,7 +134,8 @@ enum class AlgorithmType { kConvForward = 1, kConvBackwardData = 2, kConvBackwardFilter = 3, - kAlgorithmCount = 4 + kTranspose = 4, + kAlgorithmCount = 5 }; // AlgorithmsConfigKey -> AlgorithmsID @@ -165,6 +166,8 @@ class AutoTuneCache { return Get(AlgorithmType::kConvBackwardFilter); } + AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); } + void Clean() { for (auto& v : auto_tune_map_) { v.second.Clean(); From 9bb39d489972ad85eec43d6418619e9e5a2a22f0 Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Tue, 7 Jun 2022 12:00:17 +0800 Subject: [PATCH 13/22] support prune (#43250) --- .../auto_code_generator/eager_generator.cc | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 817a0de6e0ca95..73baf210158332 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1206,22 +1206,37 @@ static std::string GenerateGradNodeCreationContent( if (!input.duplicable()) { compute_require_grad_args += ", " + input_autograd_name; size_t input_position = fwd_inputs_name_pos_map.at(input_name); - - const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_GRAD_OUT_META_TEMPLATE, - LegalizeVarName(input_name), input_position); - + bool found_target_name = false; + for (const auto& iter : op_base_infos) { + const auto& 
grad_outs_slot_map = iter.GetGradOutsSlotnameMap(); + for (auto iter : grad_outs_slot_map) { + if ((!found_target_name) && (input_name == iter.second)) { + const char* SET_GRAD_OUT_META_TEMPLATE = + " grad_node->SetGradOutMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_OUT_META_TEMPLATE, LegalizeVarName(input_name), + input_position); + found_target_name = true; + } + } + } } else { compute_require_grad_args += ", &" + input_autograd_name; size_t input_position = fwd_inputs_name_pos_map.at(input_name); - - const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_GRAD_OUT_META_TEMPLATE, - LegalizeVarName(input_name), input_position); + bool found_target_name = false; + for (const auto& iter : op_base_infos) { + const auto& grad_outs_slot_map = iter.GetGradOutsSlotnameMap(); + for (auto iter : grad_outs_slot_map) { + if ((!found_target_name) && (input_name == iter.second)) { + const char* SET_GRAD_OUT_META_TEMPLATE = + " grad_node->SetGradOutMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_OUT_META_TEMPLATE, LegalizeVarName(input_name), + input_position); + found_target_name = true; + } + } + } } } From 8c3777dfcba3e4d20045087db2957ea34f076aec Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 7 Jun 2022 12:46:06 +0800 Subject: [PATCH 14/22] [multi-stream] Fix split and concat problem. (#43039) --- .../fluid/inference/api/analysis_predictor.cc | 6 --- .../inference/tests/infer_ut/CMakeLists.txt | 2 + .../inference/tests/infer_ut/test_LeViT.cc | 2 +- paddle/fluid/memory/memcpy.cc | 2 +- paddle/fluid/platform/device_context.cc | 4 +- paddle/phi/backends/gpu/gpu_context.cc | 3 +- .../kernels/funcs/concat_and_split_functor.cu | 39 ++++++++++--------- 7 files changed, 29 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5f9051ff2fdb9e..18229c302db395 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1090,12 +1090,6 @@ CreatePaddlePredictor( process_level_allocator_enabled = true; } - // TODO(Jingzhuangzhuang): Fix trt error when allocator_strategy is - // auto_growth - if (config.tensorrt_engine_enabled()) { - gflags.push_back("--allocator_strategy=naive_best_fit"); - } - if (framework::InitGflags(gflags)) { VLOG(3) << "The following gpu analysis configurations only take effect " "for the first predictor: "; diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 5aef30bf335c3d..0aee989367e4b3 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -87,9 +87,11 @@ endif() if(WITH_GPU) if(NOT WIN32) + add_definitions("-DPADDLE_WITH_GPU") set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + include_directories("${CUDA_LIB}/../include") else() set(CUDA_LIB "" diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index b74d1189b804be..b069feaec1ae79 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -157,7 +157,7 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { for (int i = 0; i < thread_num; ++i) { threads.emplace_back(paddle::test::SingleThreadPrediction, pred_pool.Retrive(i), 
&my_input_data_map, - &infer_output_data, 2); + &infer_output_data, 10); } // thread join & check outputs diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 3198b4f8d935e3..c45180f600e3e8 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -648,7 +648,7 @@ void Copy( platform::SetDeviceId(dst_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 0bd606257f5415..fd61b813f0aa26 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -54,7 +54,9 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { auto& desired_dev_ctx = static_cast(dev_ctx); if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { - return Alloc(place, size); + return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), size, + phi::Stream(reinterpret_cast( + desired_dev_ctx.stream()))); } else { return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc( desired_dev_ctx, size); diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index f51f287ee4a084..f68e4510390923 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -504,8 +504,7 @@ struct GPUContext::Impl { void AddStreamCallback(const std::function& callback) const { // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may - // launch too - // many threads and result in thread oversubscription. + // launch too many threads and result in thread oversubscription. auto* callback_func = new std::function(std::move(callback)); auto* func = new std::function([this, callback_func] { std::lock_guard lock(stream_call_back_mtx_); diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 5abaf6c2ffa87c..1c9fbffa2ac195 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -276,10 +276,7 @@ struct ConcatFunctor { int64_t out_row = in_row, out_col = 0; int inputs_col_num = in_num + 1; - std::vector inputs_data_vec(in_num); - std::vector inputs_col_vec(inputs_col_num); - const T** inputs_data = inputs_data_vec.data(); - int64_t* inputs_col = inputs_col_vec.data(); + paddle::memory::AllocationPtr data_alloc, col_alloc; // There are some differences between hip runtime and NV runtime. // In NV, when the pageable memory data less than 64K is transferred from @@ -289,16 +286,22 @@ struct ConcatFunctor { // 3.2.6.1. 
Concurrent Execution between Host and Device // Memory copies from host to device of a memory block of 64 KB or less #ifdef PADDLE_WITH_HIP - paddle::memory::AllocationPtr data_alloc, col_alloc; // TODO(chentianyu03): try to find a method to remove the Alloc function data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*)); - inputs_data = reinterpret_cast(data_alloc->ptr()); // TODO(chentianyu03): try to find a method to remove the Alloc function col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(int)); - inputs_col = reinterpret_cast(col_alloc->ptr()); +#else + // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu + // allocator. + data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(), + in_num * sizeof(T*)); + col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(), + (inputs_col_num) * sizeof(int64_t)); #endif + const T** inputs_data = reinterpret_cast(data_alloc->ptr()); + int64_t* inputs_col = reinterpret_cast(col_alloc->ptr()); inputs_col[0] = 0; bool has_same_shape = true; @@ -387,7 +390,6 @@ struct ConcatFunctor { output->data()); } -#ifdef PADDLE_WITH_HIP // Prevent the pinned memory value from being covered and release the memory // after the launch kernel of the stream is executed (reapply pinned memory // next time) @@ -401,7 +403,6 @@ struct ConcatFunctor { paddle::memory::allocation::Allocator::AllocationDeleter( col_alloc_released); }); -#endif } }; @@ -432,10 +433,7 @@ class SplitFunctor { bool has_same_shape = true; int outputs_cols_num = o_num + 1; - std::vector outputs_data_vec(o_num); - std::vector outputs_cols_vec(outputs_cols_num); - T** outputs_data = outputs_data_vec.data(); - int64_t* outputs_cols = outputs_cols_vec.data(); + paddle::memory::AllocationPtr data_alloc, cols_alloc; // There are some differences between hip runtime and NV runtime. // In NV, when the pageable memory data less than 64K is transferred from @@ -445,16 +443,22 @@ class SplitFunctor { // 3.2.6.1. Concurrent Execution between Host and Device // Memory copies from host to device of a memory block of 64 KB or less #ifdef PADDLE_WITH_HIP - paddle::memory::AllocationPtr data_alloc, cols_alloc; // TODO(chentianyu03): try to find a method to remove the Alloc function data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), o_num * sizeof(T*)); - outputs_data = reinterpret_cast(data_alloc->ptr()); // TODO(chentianyu03): try to find a method to remove the Alloc function cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), (outputs_cols_num) * sizeof(int64_t)); - outputs_cols = reinterpret_cast(cols_alloc->ptr()); +#else + // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu + // allocator. 
+ data_alloc = + paddle::memory::Alloc(paddle::platform::CPUPlace(), o_num * sizeof(T*)); + cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(), + (outputs_cols_num) * sizeof(int64_t)); #endif + T** outputs_data = reinterpret_cast(data_alloc->ptr()); + int64_t* outputs_cols = reinterpret_cast(cols_alloc->ptr()); outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { @@ -547,7 +551,7 @@ class SplitFunctor { static_cast(outputs_cols_num), dev_out_gpu_data); } -#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory // after the launch kernel of the stream is executed (reapply pinned memory // next time) @@ -559,7 +563,6 @@ class SplitFunctor { paddle::memory::allocation::Allocator::AllocationDeleter( cols_alloc_released); }); -#endif } }; From d95293f3de7113eb58f427ed7990f759f797909f Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Tue, 7 Jun 2022 14:05:39 +0800 Subject: [PATCH 15/22] [Eager] fix 2 fused op test and add retain_grad flag under eager (#43258) --- ...sed_bias_dropout_residual_layer_norm_op.py | 4 +- .../unittests/test_fused_gate_attention_op.py | 4 +- .../test_tensor_fill_diagonal_tensor.py | 54 ++++++++++--------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py index 92c815a246f734..f31cc78986e565 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py @@ -26,7 +26,9 @@ from paddle.fluid import layers import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph + +_enable_legacy_dygraph() default_main_program().random_seed = 42 diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py index 2d6243955478c3..edfb46f5813b6d 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -21,9 +21,11 @@ from op_test import OpTest, convert_float_to_uint16 from test_sparse_attention_op import get_cuda_version from paddle import _C_ops -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph from paddle.fluid import core +_enable_legacy_dygraph() + @unittest.skipIf(not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py index 4765b540c7e60d..e71cc3b7239f1c 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py @@ -30,10 +30,10 @@ def setUp(self): def test_dim2(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - expected_np = np.array( - [[1, 2, 2], [2, 1, 2], [2, 2, 1], [2, 2, 2]]).astype('float32') - expected_grad = np.array( - [[0, 1, 1], [1, 0, 1], [1, 1, 0], [1, 1, 1]]).astype('float32') + expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2, 1], + [2, 2, 2]]).astype('float32') + expected_grad = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 
0], + [1, 1, 1]]).astype('float32') for idx, p in enumerate(self.places): if idx == 0: @@ -59,10 +59,10 @@ def test_dim2(self): def test_dim2_offset_1(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - expected_np = np.array( - [[2, 2, 2], [1, 2, 2], [2, 1, 2], [2, 2, 1]]).astype('float32') - expected_grad = np.array( - [[1, 1, 1], [0, 1, 1], [1, 0, 1], [1, 1, 0]]).astype('float32') + expected_np = np.array([[2, 2, 2], [1, 2, 2], [2, 1, 2], + [2, 2, 1]]).astype('float32') + expected_grad = np.array([[1, 1, 1], [0, 1, 1], [1, 0, 1], + [1, 1, 0]]).astype('float32') for idx, p in enumerate(self.places): if idx == 0: @@ -88,10 +88,10 @@ def test_dim2_offset_1(self): def test_dim2_offset1(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - expected_np = np.array( - [[2, 1, 2], [2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype('float32') - expected_grad = np.array( - [[1, 0, 1], [1, 1, 0], [1, 1, 1], [1, 1, 1]]).astype('float32') + expected_np = np.array([[2, 1, 2], [2, 2, 1], [2, 2, 2], + [2, 2, 2]]).astype('float32') + expected_grad = np.array([[1, 0, 1], [1, 1, 0], [1, 1, 1], + [1, 1, 1]]).astype('float32') for idx, p in enumerate(self.places): if idx == 0: @@ -117,18 +117,22 @@ def test_dim2_offset1(self): def test_dim4(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - expected_np = np.array( - [[[[0, 3], [2, 2], [2, 2]], [[2, 2], [1, 4], [2, 2]], - [[2, 2], [2, 2], [2, 5]], [[2, 2], [2, 2], [2, 2]]], - [[[6, 9], [2, 2], [2, 2]], [[2, 2], [7, 10], [2, 2]], - [[2, 2], [2, 2], [8, 11]], - [[2, 2], [2, 2], [2, 2]]]]).astype('float32') - expected_grad = np.array( - [[[[0, 0], [1, 1], [1, 1]], [[1, 1], [0, 0], [1, 1]], - [[1, 1], [1, 1], [0, 0]], [[1, 1], [1, 1], [1, 1]]], - [[[0, 0], [1, 1], [1, 1]], [[1, 1], [0, 0], [1, 1]], - [[1, 1], [1, 1], [0, 0]], - [[1, 1], [1, 1], [1, 1]]]]).astype('float32') + expected_np = np.array([[[[0, 3], [2, 2], [2, 2]], + [[2, 2], [1, 4], [2, 2]], + [[2, 2], [2, 2], [2, 5]], + [[2, 2], [2, 2], [2, 2]]], + [[[6, 9], [2, 2], [2, 2]], + [[2, 2], [7, 10], [2, 2]], + [[2, 2], [2, 2], [8, 11]], + [[2, 2], [2, 2], [2, 2]]]]).astype('float32') + expected_grad = np.array([[[[0, 0], [1, 1], [1, 1]], + [[1, 1], [0, 0], [1, 1]], + [[1, 1], [1, 1], [0, 0]], + [[1, 1], [1, 1], [1, 1]]], + [[[0, 0], [1, 1], [1, 1]], + [[1, 1], [0, 0], [1, 1]], + [[1, 1], [1, 1], [0, 0]], + [[1, 1], [1, 1], [1, 1]]]]).astype('float32') for idx, p in enumerate(self.places): if idx == 0: @@ -154,6 +158,7 @@ def test_dim4(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_largedim(self): + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) if len(self.places) > 1: bsdim = 1024 fsdim = 128 @@ -175,6 +180,7 @@ def test_largedim(self): self.assertEqual((ny == expected_pred).all(), True) self.assertEqual((y.grad == expected_grad).all(), True) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if __name__ == '__main__': From eac125f9124a3bc04a89acec4fc01ec8a3de9677 Mon Sep 17 00:00:00 2001 From: BrilliantYuKaimin <91609464+BrilliantYuKaimin@users.noreply.github.com> Date: Tue, 7 Jun 2022 14:09:24 +0800 Subject: [PATCH 16/22] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20paddle.assign=20?= =?UTF-8?q?=E7=AD=89=20API=20=E7=9A=84=E6=96=87=E6=A1=A3=20=20(#42942)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update creation.py * Update search.py * Update search.py * Update xavier.py * Update xavier.py * Update pooling.py * Update pooling.py * Update pooling.py * Update search.py --- 
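
Of the docstrings this patch revises, adaptive_avg_pool1d is the one that states actual arithmetic: output bin i is the mean of Input[lstart:lend] with lstart = floor(i * L_in / L_out) and lend = ceil((i + 1) * L_in / L_out). A quick standalone check of those bounds (the input values and sizes here are made up, not taken from the diff):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<double> x = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // L_in = 9
  const int L_in = static_cast<int>(x.size()), L_out = 4;
  for (int i = 0; i < L_out; ++i) {
    const int lstart = static_cast<int>(std::floor(1.0 * i * L_in / L_out));
    const int lend = static_cast<int>(std::ceil(1.0 * (i + 1) * L_in / L_out));
    double sum = 0.0;
    for (int j = lstart; j < lend; ++j) sum += x[j];
    // Bins overlap when L_in is not a multiple of L_out: [0,3) [2,5) [4,7) [6,9)
    std::printf("out[%d] = mean(x[%d:%d]) = %g\n", i, lstart, lend,
                sum / (lend - lstart));
  }
  return 0;
}
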
python/paddle/nn/functional/pooling.py | 22 +++++------ python/paddle/nn/initializer/xavier.py | 34 ++++++++--------- python/paddle/nn/layer/pooling.py | 36 +++++++---------- python/paddle/tensor/creation.py | 15 ++++---- python/paddle/tensor/search.py | 53 ++++++++++++-------------- 5 files changed, 70 insertions(+), 90 deletions(-) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index f79a43fbc03a62..4bb53e1737bf80 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -1273,24 +1273,20 @@ def max_pool3d(x, def adaptive_avg_pool1d(x, output_size, name=None): """ - This API implements adaptive average pooling 1d operation. - See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . + Adaptive average pooling 1d operation on :attr:`x` according to :attr:`output_size`. + + Notes: + See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . Args: - x (Tensor): The input tensor of pooling operator, which is a 3-D tensor - with shape [N, C, L]. The format of input tensor is NCL, - where N is batch size, C is the number of channels, L is the - length of the feature. The data type is float32 or float64. - output_size (int): The target output size. It must be an integer. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + x (Tensor): The input Tensor of pooling, which is a 3-D tensor with shape :math:`[N, C, L]`, where :math:`N` is batch size, :math:`C` is the number of channels and :math:`L` is the length of the feature. The data type is float32 or float64. + output_size (int): The target output size. Its data type must be int. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - Tensor: The output tensor of adaptive average pooling result. The data type is same - as input tensor. + Tensor: The result of 1D adaptive average pooling. Its data type is same as input. Examples: .. code-block:: python - :name: code-example1 + :name: adaptive_avg_pool1d-example # average adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index e11790df7dfbcd..d6570f9db2fe59 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -22,28 +22,26 @@ class XavierNormal(XavierInitializer): This class implements the Xavier weight initializer from the paper `Understanding the difficulty of training deep feedforward neural networks `_ - by Xavier Glorot and Yoshua Bengio, using a normal distribution. - - The mean is 0 and the standard deviation is + by Xavier Glorot and Yoshua Bengio, using a normal distribution whose mean is :math:`0` and standard deviation is .. math:: - \sqrt{\frac{2.0}{fan\_in + fan\_out}} + \sqrt{\frac{2.0}{fan\_in + fan\_out}}. Args: - fan_in (float, optional): fan_in for Xavier initialization, It is - inferred from the tensor. The default value is None. - fan_out (float, optional): fan_out for Xavier initialization, it is - inferred from the tensor. The default value is None. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + fan_in (float, optional): fan_in for Xavier initialization, which is + inferred from the Tensor. 
The default value is None. + fan_out (float, optional): fan_out for Xavier initialization, which is + inferred from the Tensor. The default value is None. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A parameter initialized by Xavier weight, using a normal distribution. Examples: .. code-block:: python + :name: initializer_XavierNormal-example import paddle @@ -81,25 +79,25 @@ class XavierUniform(XavierInitializer): This initializer is designed to keep the scale of the gradients approximately same in all the layers. In case of Uniform distribution, - the range is [-x, x], where + the range is :math:`[-x,x]`, where .. math:: - x = \sqrt{\frac{6.0}{fan\_in + fan\_out}} + x = \sqrt{\frac{6.0}{fan\_in + fan\_out}}. Args: - fan_in (float, optional): fan_in for Xavier initialization, it is - inferred from the tensor. The default value is None. - fan_out (float, optional): fan_out for Xavier initialization, it is - inferred from the tensor. The default value is None. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + fan_in (float, optional): fan_in for Xavier initialization, which is + inferred from the Tensor. The default value is None. + fan_out (float, optional): fan_out for Xavier initialization, which is + inferred from the Tensor. The default value is None. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A parameter initialized by Xavier weight, using a uniform distribution. Examples: .. code-block:: python + :name: initializer_XavierUniform-example import paddle diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 990d0b61078641..e7b6fc24afad8b 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -619,42 +619,32 @@ def extra_repr(self): class AdaptiveAvgPool1D(Layer): r""" - This operation applies a 1D adaptive average pooling over an input signal composed - of several input planes, based on the input, output_size, return_mask parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. + A 1D adaptive average pooling over an input signal composed + of several input planes, based on :attr:`output_size`. + Input and output are in NCL format, where N is batch + size, C is the number of channels and L is the length of the feature. + The shape of output will be :math:`[N, C, output\_size]`. - For average adaptive pool1d: + The formulation for average adaptive pool1d is .. math:: - lstart &= floor(i * L_{in} / L_{out}) + lstart &= \lfloor i * L_{in} / L_{out}\rfloor, - lend &= ceil((i + 1) * L_{in} / L_{out}) + lend &= \lceil(i + 1) * L_{in} / L_{out}\rceil, - Output(i) &= \frac{ \sum Input[lstart:lend]}{lend - lstart} + Output(i) &= \frac{\sum Input[lstart:lend]}{lend - lstart}. Parameters: - output_size(int): The target output size. It must be an integer. - name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. - Usually name is no need to set and None by default. + output_size(int): The target output size. Its data type must be int. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. 
Returns: - A callable object of AdaptiveAvgPool1D. - - Raises: - ValueError: 'output_size' should be an integer. - - Shape: - - x(Tensor): 3-D tensor. The input tensor of adaptive avg pool1d operator, which is a 3-D tensor. - The data type can be float32, float64. - - output(Tensor): 3-D tensor. The output tensor of adaptive avg pool1d operator, which is a 3-D tensor. - The data type is same as input x. + A callable object for computing 1D adaptive average pooling. Examples: .. code-block:: python - + :name: AdaptiveAvgPool1D-example # average adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], # output shape is [N, C, m], adaptive pool divide L dimension diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 67547212bb196f..521839af902b56 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1479,22 +1479,21 @@ def empty_like(x, dtype=None, name=None): def assign(x, output=None): """ - The OP copies the :attr:`x` to the :attr:`output`. + Copy value of the :attr:`x` to the :attr:`output`. Parameters: - x (Tensor|np.ndarray|list|tuple|scalar): A tensor, numpy ndarray, tuple/list of scalar, - or scalar. Its data type supports float16, float32, float64, int32, int64, and bool. - Note: the float64 data will be converted to float32 because of current platform protobuf + x (Tensor|np.ndarray|list|tuple|scalar): A Tensor, numpy ndarray, tuple/list of scalar, + or scalar. Its data type can be float16, float32, float64, int32, int64 or bool. Note: the float64 data will be converted to float32 because of current platform protobuf data limitation. - output (Tensor, optional): A tensor. If :attr:`output` is None, a new tensor will - be created as :attr:`output`. Default: None. + output (Tensor, optional): A Tensor. If :attr:`output` is None, a new Tensor will be created as :attr:`output`. Default: None. Returns: - Tensor: A tensor with the same shape, data type and value as :attr:`x`. + Tensor: A Tensor with the same shape, data type and value as :attr:`x`. Examples: .. code-block:: python - + :name: assign-example + import paddle import numpy as np data = paddle.full(shape=[3, 2], fill_value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 42087ac7dafa39..94a05294aaa63c 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -572,49 +572,46 @@ def mode(x, axis=-1, keepdim=False, name=None): def where(condition, x=None, y=None, name=None): r""" - Return a tensor of elements selected from either $x$ or $y$, depending on $condition$. - - **Note**: - ``paddle.where(condition)`` is identical to ``paddle.nonzero(condition, as_tuple=True)``. + Return a Tensor of elements selected from either :attr:`x` or :attr:`y` according to corresponding elements of :attr:`condition`. Concretely, .. math:: - out_i = - \begin{cases} - x_i, \quad \text{if} \ condition_i \ is \ True \\ - y_i, \quad \text{if} \ condition_i \ is \ False \\ - \end{cases} + out_i = + \begin{cases} + x_i, & \text{if} \ condition_i \ \text{is} \ True \\ + y_i, & \text{if} \ condition_i \ \text{is} \ False \\ + \end{cases}. + Notes: + ``numpy.where(condition)`` is identical to ``paddle.nonzero(condition, as_tuple=True)``, please refer to :ref:`api_tensor_search_nonzero`. Args: - condition(Tensor): The condition to choose x or y. When True(nonzero), yield x, otherwise yield y. 
- x(Tensor or Scalar, optional): x is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. - y(Tensor or Scalar, optional): y is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. - - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + condition (Tensor): The condition to choose x or y. When True (nonzero), yield x, otherwise yield y. + x (Tensor|scalar, optional): A Tensor or scalar to choose when the condition is True with data type of float32, float64, int32 or int64. Either both or neither of x and y should be given. + y (Tensor|scalar, optional): A Tensor or scalar to choose when the condition is False with data type of float32, float64, int32 or int64. Either both or neither of x and y should be given. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - Tensor: A Tensor with the same data dype as x. + Tensor: A Tensor with the same shape as :attr:`condition` and same data type as :attr:`x` and :attr:`y`. Examples: .. code-block:: python + :name:where-example - import paddle + import paddle - x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) - y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) - out = paddle.where(x>1, x, y) + x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) + y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) + out = paddle.where(x>1, x, y) - print(out) - #out: [1.0, 1.0, 3.2, 1.2] + print(out) + #out: [1.0, 1.0, 3.2, 1.2] - out = paddle.where(x>1) - print(out) - #out: (Tensor(shape=[2, 1], dtype=int64, place=CPUPlace, stop_gradient=True, - # [[2], - # [3]]),) + out = paddle.where(x>1) + print(out) + #out: (Tensor(shape=[2, 1], dtype=int64, place=CPUPlace, stop_gradient=True, + # [[2], + # [3]]),) """ if np.isscalar(x): x = paddle.full([1], x, np.array([x]).dtype.name) From 0fdb3ced4574487c3fbed7f325aa7b89f71af28b Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Tue, 7 Jun 2022 14:23:18 +0800 Subject: [PATCH 17/22] add bf16 dtype for flatten kernel (#43264) --- paddle/phi/kernels/flatten_grad_kernel.cc | 2 ++ paddle/phi/kernels/flatten_kernel.cc | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 54279fca6e429e..73d963f606e3f1 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -38,6 +38,7 @@ PD_REGISTER_KERNEL(flatten_grad, CPU, ALL_LAYOUT, phi::FlattenGradKernel, + phi::dtype::bfloat16, float, double, uint8_t, @@ -52,6 +53,7 @@ PD_REGISTER_KERNEL(flatten_grad, phi::FlattenGradKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double, uint8_t, int8_t, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index dd000896073c70..006d3438288c1e 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -54,6 +54,7 @@ PD_REGISTER_KERNEL(flatten, ALL_LAYOUT, phi::FlattenKernel, float, + phi::dtype::bfloat16, double, uint8_t, int8_t, @@ -66,6 +67,7 @@ PD_REGISTER_KERNEL(flatten_with_xshape, ALL_LAYOUT, phi::FlattenWithXShape, float, + phi::dtype::bfloat16, double, uint8_t, int8_t, @@ -80,6 +82,7 @@ PD_REGISTER_KERNEL(flatten, phi::FlattenKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double, uint8_t, int8_t, @@ -93,6 +96,7 
@@ PD_REGISTER_KERNEL(flatten_with_xshape, phi::FlattenWithXShape, float, phi::dtype::float16, + phi::dtype::bfloat16, double, uint8_t, int8_t, From 42dd0f1b7a46d13a59e4d901dcd8499e7643bbce Mon Sep 17 00:00:00 2001 From: qipengh Date: Tue, 7 Jun 2022 14:29:49 +0800 Subject: [PATCH 18/22] [MLU]support cast double type (#43058) * [MLU]support cast double type * [MLU]fix cast test --- paddle/fluid/operators/cast_op_mlu.cc | 32 +------------------ paddle/fluid/operators/mlu/mlu_baseop.h | 3 ++ .../tests/unittests/mlu/test_cast_op_mlu.py | 19 +++++++++++ 3 files changed, 23 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/cast_op_mlu.cc b/paddle/fluid/operators/cast_op_mlu.cc index f28889e7acf877..f0df271a8d07e7 100644 --- a/paddle/fluid/operators/cast_op_mlu.cc +++ b/paddle/fluid/operators/cast_op_mlu.cc @@ -44,37 +44,7 @@ class CastMLUKernel : public framework::OpKernel { framework::DataTypeToString(src_type), framework::DataTypeToString(dst_type))); - switch (dst_type) { - case VT::FP32: - output->mutable_data(place); - break; - case VT::FP16: - output->mutable_data(place); - break; - case VT::INT32: - output->mutable_data(place); - break; - case VT::INT16: - output->mutable_data(place); - break; - case VT::INT8: - output->mutable_data(place); - break; - case VT::UINT8: - output->mutable_data(place); - break; - case VT::BOOL: - output->mutable_data(place); - break; - case VT::INT64: - output->mutable_data(place); - break; - default: - PADDLE_THROW(platform::errors::Unavailable( - "Not supported cast %d -> %d", - framework::DataTypeToString(src_type), - framework::DataTypeToString(dst_type))); - } + output->mutable_data(place, framework::TransToPhiDataType(dst_type)); MLUCnnlTensorDesc input_desc(*input); MLUCnnlTensorDesc output_desc(*output); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index f048ac7c5c3be0..c97ee3efd3f566 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -75,6 +75,9 @@ inline cnnlDataType_t ToCnnlDataType( case DataType::FLOAT32: type = CNNL_DTYPE_FLOAT; break; + case DataType::FLOAT64: + type = CNNL_DTYPE_DOUBLE; + break; case DataType::INT8: type = CNNL_DTYPE_INT8; break; diff --git a/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py index 6ba62b11499f46..88b46af8df2a36 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py @@ -61,6 +61,25 @@ def setUp(self): self.op_type = 'cast' self.place = paddle.device.MLUPlace(0) self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestCastOpFp32ToFp64(OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]) + self.inputs = {'X': ipt.astype('float32')} + self.outputs = {'Out': ipt.astype('float64')} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.FP64) + } + self.op_type = 'cast' + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True def test_check_output(self): self.check_output_with_place(self.place, atol=1e-3) From 2922985a3d0d2a77a4e3fe4b5650755166be5768 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Tue, 7 Jun 2022 15:31:03 +0800 Subject: [PATCH 19/22] [Dygraph] 
Fix bugs of EagerReducer for complex control flows (#43252) * fix bugs of reducer * update * update --- paddle/fluid/distributed/collective/reducer.cc | 7 +++++++ .../distributed/fleet/utils/hybrid_parallel_util.py | 11 ++++++----- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- .../unittests/test_parallel_dygraph_dataparallel.py | 4 +++- .../test_parallel_dygraph_no_sync_gradient_check.py | 4 ++++ 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 9c04b95a732e8c..f3ac17cc46cd21 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -775,6 +775,13 @@ void EagerReducer::ProcessUnusedDenseVars() { continue; } + // NOTE(haohongxiang): Calling SetFakeEmpty here is to make sure that + // gradient accumulation can continue normally after clear_gradients() + // especiall in cases including complex control flow. + std::static_pointer_cast( + GetGradNodeFromTensor(&tensors_[var_index])) + ->SetFakeEmpty(false); + Tensor grad_value(std::make_shared(src_tensor)); auto dest_var_base = tensors_[var_index]; diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index e2f7af769d39e9..161f4d3262ab17 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -43,12 +43,11 @@ def _apply_collective_grads(parameters, comm_group): coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024) + nranks = paddle.distributed.get_world_size( + ) if comm_group is None else comm_group.nranks for coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - nranks = paddle.distributed.get_world_size( - ) if comm_group is None else comm_group.nranks div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype) - paddle.distributed.all_reduce(coalesced_grad, group=comm_group) paddle.fluid.framework._dygraph_tracer().trace_op( type="elementwise_div", inputs={ @@ -57,6 +56,7 @@ def _apply_collective_grads(parameters, comm_group): }, outputs={'Out': coalesced_grad}, attrs={'axis': -1}) + paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) @@ -76,10 +76,11 @@ def _apply_collective_grads_eager(parameters, comm_group): coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024) - div_factor = 1.0 / comm_group.nranks + nranks = paddle.distributed.get_world_size( + ) if comm_group is None else comm_group.nranks for coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - coalesced_grad.scale_(div_factor) + coalesced_grad.scale_(1.0 / nranks) paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 214c68c250ea98..6710ddb97dc24f 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1507,7 +1507,7 @@ if(WITH_DISTRIBUTE 350) set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check - PROPERTIES TIMEOUT 30) + PROPERTIES TIMEOUT 60) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 500) 
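
One detail worth spelling out from the hybrid_parallel_util.py hunk above: the legacy-dygraph path now divides each coalesced gradient by nranks before the all_reduce rather than after it. The two orderings are mathematically identical, since allreduce_sum(g / n) equals (sum of g) / n; only the point at which the division happens relative to the collective changes. A toy single-process check (the per-rank gradient values are invented):

#include <cstdio>
#include <vector>

int main() {
  const int nranks = 4;
  const std::vector<double> grads = {0.5, 1.5, -2.0, 4.0};  // one per rank

  double sum_then_div = 0.0, div_then_sum = 0.0;
  for (double g : grads) {
    sum_then_div += g;           // divide after the all-reduce
    div_then_sum += g / nranks;  // scale by 1/nranks before the all-reduce
  }
  sum_then_div /= nranks;
  std::printf("%f == %f\n", sum_then_div, div_then_sum);  // both 1.0
  return 0;
}
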
set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 930bf5345fcae3..1e8aae7226a7e8 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -200,7 +200,8 @@ def run_mnist_2cpu(self, target_file_name): class TestDataParallelGradientCheck(TestMultipleGpus): def test_multiple_gpus_dynamic(self): - self.run_mnist_2gpu('parallel_dygraph_gradient_check.py') + self.run_mnist_2gpu('parallel_dygraph_gradient_check.py', + eager_mode=False) class TestDataParallelWithPyLayer(TestMultipleGpus): @@ -218,4 +219,5 @@ def test_multiple_gpus_dynamic(self): if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py index fad9e902cc91ea..d6a48b504a2dc6 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -24,7 +25,10 @@ class TestDataParallelLayer(TestMultipleGpus): def test_parallel_dygraph_dataparallel_no_sync(self): self.run_mnist_2gpu('parallel_dygraph_no_sync_gradient_check.py') + self.run_mnist_2gpu('parallel_dygraph_no_sync_gradient_check.py', + eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() From b4a3dab727e5d5c50b040326ab9e52ba82b957f7 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 7 Jun 2022 16:20:37 +0800 Subject: [PATCH 20/22] [cuda graph] Add cuda graph attr to op desc (#43228) --- python/paddle/device/cuda/graphs.py | 20 +++ python/paddle/fluid/backward.py | 137 ++++++++++++++---- python/paddle/fluid/framework.py | 35 +++++ .../test_cuda_graph_partial_graph_static.py | 71 +++++++++ 4 files changed, 237 insertions(+), 26 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static.py diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py index c6554d78fb86ab..dca32fb6bb85bc 100644 --- a/python/paddle/device/cuda/graphs.py +++ b/python/paddle/device/cuda/graphs.py @@ -13,6 +13,7 @@ # limitations under the License. 
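
This patch threads a "mode;memory_pool_id;graph_id" attribute through the static-mode program so a marked block can later be captured and replayed as a CUDA Graph; thread_local is the default capture mode, matching wrap_cuda_graph above. For readers who have not used the underlying runtime API, a minimal capture-and-replay sketch looks roughly like this (CUDA 11-era signatures, error checking omitted; the kernel and sizes are made up):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void AddOne(float* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] += 1.0f;
}

int main() {
  const int n = 1 << 20;
  float* x = nullptr;
  cudaMalloc(&x, n * sizeof(float));
  cudaMemset(x, 0, n * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Record the launches into a graph instead of executing them immediately.
  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  AddOne<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
  AddOne<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);

  // Replay the captured work three times: each replay adds 2.0f per element.
  for (int step = 0; step < 3; ++step) cudaGraphLaunch(graph_exec, stream);
  cudaStreamSynchronize(stream);

  float host = 0.0f;
  cudaMemcpy(&host, x, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("x[0] = %.1f (expect 6.0)\n", host);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(x);
  return 0;
}
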
import os +import paddle from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace if is_compiled_with_cuda() and not is_compiled_with_rocm(): @@ -28,6 +29,7 @@ def is_cuda_graph_supported(): ALL_MODES = ["global", "thread_local", "relaxed"] +cuda_graph_id = 0 class CUDAGraph: @@ -68,6 +70,24 @@ def print_to_dot_files(self, dirname, flags=None): def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"): assert mode in ALL_MODES + if not paddle.in_dynamic_mode(): + # static mode + from paddle.fluid.framework import _cuda_graph_guard + global cuda_graph_id + graph_id = str(cuda_graph_id) + cuda_graph_id += 1 + if memory_pool == 'default': + memory_pool_id = 0 + elif memory_pool == 'new': + memory_pool_id = CoreCUDAGraph.gen_new_memory_pool_id() + else: + raise ValueError( + "memory_pool should be one of default or new under static mode, but got", + memory_pool) + return _cuda_graph_guard( + mode + ';' + str(memory_pool_id) + ';' + + graph_id)(lambda *args, **kwargs: function(*args, **kwargs)) + from paddle.jit import to_static from paddle.nn import Layer new_function = to_static(function) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 0ca69b5f94de49..c37ac87da71b80 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -236,7 +236,11 @@ def _pretty_op_desc_(op_desc, prefix): return out_s -def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): +def _add_needed_descs_to_block(descs, + block, + main_block, + in_memory_vars, + grad_op_id_to_fwd_op=None): if len(descs) == 0: return [] result_descs = [] @@ -244,8 +248,11 @@ def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): core.op_proto_and_checker_maker.kOpRoleAttrName() backward = core.op_proto_and_checker_maker.OpRole.Backward for desc in descs: + origin_desc = desc + origin_is_operator = False if isinstance(desc, framework.Operator): desc = desc.desc + origin_is_operator = True if isinstance(desc, tuple): desc = desc[0] is_needed = False @@ -255,6 +262,8 @@ def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): if name not in in_memory_vars: is_needed = True if is_needed: + if origin_is_operator and grad_op_id_to_fwd_op is not None: + grad_op_id_to_fwd_op[desc.original_id()] = origin_desc new_op_desc = block.desc.append_op() new_op_desc.copy_from(desc) new_op_desc._set_attr(op_role_attr_name, backward) @@ -264,7 +273,7 @@ def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): return result_descs -def _add_descs_to_block(descs, block): +def _add_descs_to_block(descs, block, grad_op_id_to_fwd_op=None): if len(descs) == 0: return [] result_descs = [] @@ -273,6 +282,9 @@ def _add_descs_to_block(descs, block): backward = core.op_proto_and_checker_maker.OpRole.Backward for desc in descs: if isinstance(desc, framework.Operator): + # for recompute, should record recompute ops + if grad_op_id_to_fwd_op is not None: + grad_op_id_to_fwd_op[desc.desc.original_id()] = desc desc = desc.desc if isinstance(desc, tuple): desc = desc[0] @@ -489,7 +501,10 @@ def _accumulate_gradients_by_add_ops_(var_name, renamed_vars[var_name] = [var_name] -def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): +def _addup_repetitive_outputs_(op_descs, + block_idx, + grad_var_to_var=None, + grad_op_id_to_fwd_op=None): """ In backward part, an variable may be the output of more than one ops. And one op may yield its multiple outputs to the same variable. 
@@ -500,6 +515,7 @@ def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): grad_var_to_var(dict): used to build the mapping between grad var name and forward var name. Only for auto parallel. """ + _MAX_ADD_NUM_ = framework._global_flags()['FLAGS_max_inplace_grad_add'] #pending_sum_ops = [] pending_sum_ops = collections.OrderedDict() @@ -604,6 +620,7 @@ def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): len(op_descs), var_device[var_name]) + op_descs_len = len(op_descs) # sum_op descs are sorted according to their insert position for key, value in collections.OrderedDict( reversed(list(pending_sum_ops.items()))).items(): @@ -614,12 +631,18 @@ def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): # If not reverse, we first insert 'a' at idx 1, it becomes [0, 1, 'a', 2], and then insert 'b' at idx 2, it becomes [0, 1, 'a', 'b', 2]. idx = key for i, op in enumerate(value): + # update the mapping between fwd and bwd + target_idx = idx - 1 if idx == op_descs_len else idx + i + if grad_op_id_to_fwd_op is not None and grad_op_id_to_fwd_op.get( + op_descs[target_idx].original_id(), None) is not None: + grad_op_id_to_fwd_op[op.original_id()] = grad_op_id_to_fwd_op[ + op_descs[target_idx].original_id()] op_descs.insert(idx + i, op) return op_descs -def _remove_no_grad_branch_(op_descs, no_grad_set): +def _remove_no_grad_branch_(op_descs, no_grad_set, grad_op_id_to_fwd_op=None): """ Remove unnecessary grad ops A grad op can be removed in two cases: @@ -653,9 +676,14 @@ def _op_can_be_removed_(op_desc, no_grad_set): x_in = _strip_grad_suffix_(arg) # the reason should be: arg can be input of another grad op # and the op is a not-to-remove op - to_insert.append( - (_create_op_desc_("fill_zeros_like", {"X": [x_in]}, - {"Out": [arg]}, {}), idx)) + new_op_desc = _create_op_desc_("fill_zeros_like", {"X": [x_in]}, + {"Out": [arg]}, {}) + # update the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None and grad_op_id_to_fwd_op.get( + op_desc.original_id(), None) is not None: + grad_op_id_to_fwd_op[new_op_desc.original_id( + )] = grad_op_id_to_fwd_op[op_desc.original_id()] + to_insert.append((new_op_desc, idx)) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) @@ -794,9 +822,13 @@ def serialize_op_decs(op_desc): return proto.__str__() -def _append_backward_ops_with_checkpoints_(block, ops, target_block, - no_grad_dict, grad_to_var, - checkpoints): +def _append_backward_ops_with_checkpoints_(block, + ops, + target_block, + no_grad_dict, + grad_to_var, + checkpoints, + grad_op_id_to_fwd_op=None): """ Create grad ops with forward ops, and insert them into given block @@ -926,12 +958,19 @@ def _append_backward_ops_with_checkpoints_(block, ops, target_block, _pretty_op_desc_(op.desc, "with_sub_block")) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), []) + + # record the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None: + for op_desc in grad_op_desc: + grad_op_id_to_fwd_op[op_desc.original_id()] = op + # Set device for grad_op according to forward Op if op.desc.has_attr(device_attr_name): op_device = op.desc.attr(device_attr_name) for op_desc in grad_op_desc: op_desc._set_attr(device_attr_name, op_device) - added_descs = _add_descs_to_block(grad_op_desc, local_block) + added_descs = _add_descs_to_block(grad_op_desc, local_block, + grad_op_id_to_fwd_op) grad_op_descs.extend(added_descs) grad_to_var.update(op_grad_to_var) @@ -945,12 +984,19 @@ def 
_append_backward_ops_with_checkpoints_(block, ops, target_block, _pretty_op_desc_(op.desc, "with_sub_block")) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), []) + + # record the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None: + for op_desc in grad_op_desc: + grad_op_id_to_fwd_op[op_desc.original_id()] = op + # Set device for grad_op according to forward Op if op.desc.has_attr(device_attr_name): op_device = op.desc.attr(device_attr_name) for op_desc in grad_op_desc: op_desc._set_attr(device_attr_name, op_device) - added_descs = _add_descs_to_block(grad_op_desc, local_block) + added_descs = _add_descs_to_block(grad_op_desc, local_block, + grad_op_id_to_fwd_op) grad_op_descs.extend(added_descs) grad_to_var.update(op_grad_to_var) @@ -984,8 +1030,10 @@ def _append_backward_ops_with_checkpoints_(block, ops, target_block, # 3.a. add ops in current recompute_segment as forward recomputation ops buffer_descs = _add_needed_descs_to_block(ff_ops, buffer_block, block, - vars_in_memory) - added_descs = _add_descs_to_block(ff_ops, local_block) + vars_in_memory, + grad_op_id_to_fwd_op) + added_descs = _add_descs_to_block(ff_ops, local_block, + grad_op_id_to_fwd_op) # 3.b. rename all non-checkpoint variables in recomputation ops for key in var_name_dict: @@ -999,6 +1047,12 @@ def _append_backward_ops_with_checkpoints_(block, ops, target_block, grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op_desc, cpt.to_text(no_grad_dict[block.idx]), []) + # record the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None: + for g_op_desc in grad_op_desc: + grad_op_id_to_fwd_op[g_op_desc.original_id( + )] = grad_op_id_to_fwd_op[op_desc.original_id()] + # Set device for grad_op according to forward Op if op_desc.has_attr(device_attr_name): op_device = op_desc.attr(device_attr_name) @@ -1011,11 +1065,14 @@ def _append_backward_ops_with_checkpoints_(block, ops, target_block, grad_to_var.update(op_grad_to_var) # 3.d. 
add sum op for repetitive_outputs - grad_op_descs = _addup_repetitive_outputs_(grad_op_descs, block.idx) + grad_op_descs = _addup_repetitive_outputs_( + grad_op_descs, block.idx, grad_op_id_to_fwd_op=grad_op_id_to_fwd_op) # 4) remove no grad branch as it is in _remove_no_grad_branch_ grad_op_descs = _remove_no_grad_branch_(grad_op_descs, - no_grad_dict[block.idx]) - added_descs = _add_descs_to_block(grad_op_descs, target_block) + no_grad_dict[block.idx], + grad_op_id_to_fwd_op) + added_descs = _add_descs_to_block(grad_op_descs, target_block, + grad_op_id_to_fwd_op) return program_stat, checkpoints_name, vars_should_be_hold, recompute_segments @@ -1090,7 +1147,8 @@ def _append_backward_ops_(block, input_grad_names_set=None, op_path_dict=None, distop_context=None, - rename_var_map=None): + rename_var_map=None, + grad_op_id_to_fwd_op=None): """ Create all grad ops, and insert them into given block @@ -1152,9 +1210,15 @@ def update_distop_context(distop_context, op_grad_to_var, pre_input_grad_names_set = copy.copy(input_grad_names_set) input_grad_names_set = None sub_block_path = op_path_dict[op._block_attr_id("sub_block")] - _append_backward_ops_(sub_block, sub_block_path, grad_sub_block, - no_grad_dict, grad_to_var, callbacks, - input_grad_names_set, op_path_dict) + _append_backward_ops_(sub_block, + sub_block_path, + grad_sub_block, + no_grad_dict, + grad_to_var, + callbacks, + input_grad_names_set, + op_path_dict, + grad_op_id_to_fwd_op=grad_op_id_to_fwd_op) input_grad_names_set = pre_input_grad_names_set program._rollback() @@ -1164,6 +1228,11 @@ def update_distop_context(distop_context, op_grad_to_var, grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list) + # record the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None: + for op_desc in grad_op_desc: + grad_op_id_to_fwd_op[op_desc.original_id()] = op + # Build the mapping between the forward op and backward op (Only for auto parallel) if distop_context is not None: update_distop_context(distop_context, op_grad_to_var, @@ -1251,13 +1320,17 @@ def update_distop_context(distop_context, op_grad_to_var, grad_var_to_var = distop_context.grad_var_to_var[ program._appending_grad_times] # sum parameter's gradients' var given multiple var gradient - grad_op_descs = _addup_repetitive_outputs_(grad_op_descs, block.idx, - grad_var_to_var) + grad_op_descs = _addup_repetitive_outputs_( + grad_op_descs, + block.idx, + grad_var_to_var, + grad_op_id_to_fwd_op=grad_op_id_to_fwd_op) # if all outputs of the grad op are in no_grad_set, then just remove and fill zero # if all inputs of the grad op are in no_grad_set, just remove this op grad_op_descs = _remove_no_grad_branch_(grad_op_descs, - no_grad_dict[block.idx]) + no_grad_dict[block.idx], + grad_op_id_to_fwd_op) # remove some backward ops not_need_ops = _find_not_need_ops(grad_op_descs, ops, input_grad_names_set) @@ -1585,6 +1658,9 @@ def append_backward(loss, p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights)) """ + grad_op_id_to_fwd_op = { + } # for cuda graph usage, recording the mapping between grad op original id to fwd op + check_type(loss, 'loss', framework.Variable, 'paddle.static.append_backward') @@ -1644,7 +1720,9 @@ def append_backward(loss, grad_to_var = dict() + # pass the cuda_graph_attr to the fill_constant which generates the loss_grad op_desc = _create_loss_op_desc_(loss) + grad_op_id_to_fwd_op[op_desc.original_id()] = loss.op 
target_grad_block.desc.append_op().copy_from(op_desc) for block_idx in son_parent_block_idx_dict: @@ -1690,7 +1768,8 @@ def append_backward(loss, root_block, no_grad_dict, grad_to_var, - checkpoints) + checkpoints, + grad_op_id_to_fwd_op) else: _append_backward_ops_( block, # the block where forward ops are in @@ -1702,7 +1781,7 @@ def append_backward(loss, input_grad_names_set=input_grad_names_set, op_path_dict=op_path_dict, distop_context=distop_context, - ) + grad_op_id_to_fwd_op=grad_op_id_to_fwd_op) grad_info_map = dict() @@ -1722,6 +1801,12 @@ def append_backward(loss, program.current_block_idx = current_block_idx program._sync_with_cpp() + # for cuda graph, copy the cuda graph attr from forward op to backward op + for op in target_grad_block.ops: + if grad_op_id_to_fwd_op.get(op.desc.original_id(), None) is not None: + fwd_op = grad_op_id_to_fwd_op[op.desc.original_id()] + op._cuda_graph_attr = fwd_op._cuda_graph_attr + if parameter_list is not None: check_type(parameter_list, 'parameter_list', (list, tuple, set), 'fluid.backward.append_backward') diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index e0b4f8d19e8610..fdd5c0b47b4dc4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -81,6 +81,7 @@ _current_pipeline_stage = None _already_patch_eager_tensor = False _already_patch_varbase = False +_current_cuda_graph_mode = None _global_flags_ = core.globals() # Some explanation of our execution system 2022.03 @@ -2622,6 +2623,9 @@ def __init__(self, op_attrs = dict() del attrs + # attr for static mode cuda graph + self._cuda_graph_attr = _current_cuda_graph_mode + op_maker = core.op_proto_and_checker_maker if op_maker.kOpRoleAttrName() not in op_attrs: @@ -7017,6 +7021,37 @@ def device_guard(device=None): switch_device(pre_device) +def _switch_cuda_graph_mode(cuda_graph_attr): + global _current_cuda_graph_mode + pre_mode = _current_cuda_graph_mode + _current_cuda_graph_mode = cuda_graph_attr + return pre_mode + + +@signature_safe_contextmanager +def _cuda_graph_guard(cuda_graph_attr=None): + """ + + Note: + The API only supports static mode. + + A context manager that specifies the cuda_graph_mode which indicating the cuda graph capture under static mode. + + Args: + cuda_graph_attr(str|None): The cuda graph attr with the format of: + cuda_graph_capture_mode;memory_pool_id;cuda_graph_id + """ + assert not _non_static_mode( + ), "cuda_graph_guard only works under static mode" + assert core.is_compiled_with_cuda( + ), "cuda_graph_guard context can be only used when Paddle is compiled with cuda" + pre_mode = _switch_cuda_graph_mode(cuda_graph_attr) + try: + yield + finally: + _switch_cuda_graph_mode(pre_mode) + + def set_flags(flags): """ This function sets the GFlags value in Paddle. diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static.py b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static.py new file mode 100644 index 00000000000000..b70be74ea92a53 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import unittest +import numpy as np +from paddle.device.cuda.graphs import wrap_cuda_graph, is_cuda_graph_supported + +paddle.enable_static() + + +class SimpleModel(nn.Layer): + + def __init__(self, in_size, out_size): + super(SimpleModel, self).__init__() + self.linear = nn.Linear(in_size, out_size) + self.dropout_1 = paddle.nn.Dropout(0.1) + self.relu = nn.ReLU() + self.dropout_2 = paddle.nn.Dropout(0.5) + self.gelu = nn.GELU() + + def forward(self, x): + x = self.linear(x) + x = self.dropout_1(x) + x = self.relu(x) + x = self.dropout_2(x) + x = self.gelu(x) + return x + + +class TestCudaGraphAttrAll(unittest.TestCase): + + def test_all_program(self): + if not is_cuda_graph_supported(): + return + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + model = SimpleModel(10, 20) + cuda_graph_model = wrap_cuda_graph(model) + x = paddle.static.data(shape=[3, 10], dtype='float32', name='x') + y = cuda_graph_model(x) + loss = paddle.mean(y) + opt = paddle.optimizer.SGD() + opt.minimize(loss) + block = main_prog.global_block() + for op in block.ops: + if op._cuda_graph_attr is None: + # the loss and opt are not wrapped + assert op.type in [ + 'sgd', 'reduce_mean', 'fill_constant', + 'reduce_mean_grad' + ] + else: + assert op._cuda_graph_attr == 'thread_local;0;0' + + +if __name__ == "__main__": + unittest.main() From c0ed75a8babb86a1fec345601f9aa39cd1756ee5 Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Tue, 7 Jun 2022 16:22:47 +0800 Subject: [PATCH 21/22] Update profiler (#42998) * Update Profiler * make HostEventRecorder templated --- paddle/fluid/platform/profiler.cc | 30 +++++++++++-------- .../platform/profiler/host_event_recorder.h | 23 +++++++------- paddle/fluid/platform/profiler/host_tracer.cc | 8 ++--- 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index c573650f1791fd..47141bd73a5555 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -192,15 +192,15 @@ void RecordEvent::End() { if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) { uint64_t end_ns = PosixInNsec(); if (LIKELY(shallow_copy_name_ != nullptr)) { - HostEventRecorder::GetInstance().RecordEvent( + HostEventRecorder::GetInstance().RecordEvent( shallow_copy_name_, start_ns_, end_ns, role_, type_); } else if (name_ != nullptr) { if (attr_ == nullptr) { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_); } else { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_, *attr_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_, *attr_); delete attr_; } delete name_; @@ -232,8 +232,8 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, return; } auto start_end_ns = PosixInNsec(); - 
HostEventRecorder::GetInstance().RecordEvent(name, start_end_ns, start_end_ns, - EventRole::kOrdinary, type); + HostEventRecorder::GetInstance().RecordEvent( + name, start_end_ns, start_end_ns, EventRole::kOrdinary, type); } void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, @@ -327,7 +327,7 @@ void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void Mark(const std::string &name) { if (FLAGS_enable_host_event_recorder_hook) { - HostEventRecorder::GetInstance().RecordEvent( + HostEventRecorder::GetInstance().RecordEvent( name, 0, 0, EventRole::kOrdinary, TracerEventType::UserDefined); return; } @@ -522,7 +522,8 @@ void DisableHostEventRecorder() { std::string PrintHostEvents() { std::ostringstream oss; - auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + auto host_evt_sec = + HostEventRecorder::GetInstance().GatherEvents(); for (const auto &thr_evt_sec : host_evt_sec.thr_sections) { oss << thr_evt_sec.thread_id << std::endl; for (const auto &evt : thr_evt_sec.events) { @@ -534,8 +535,9 @@ std::string PrintHostEvents() { return oss.str(); } -static void EmulateEventPushAndPop(const HostEventSection &host_sec, - std::map *out) { +static void EmulateEventPushAndPop( + const HostEventSection &host_sec, + std::map *out) { for (const auto &thr_sec : host_sec.thr_sections) { uint64_t tid = thr_sec.thread_id; auto cur_thr_list = std::make_shared>(); @@ -582,7 +584,8 @@ static void EmulateEventPushAndPop(const HostEventSection &host_sec, } } -static void EmulateCPURecordsAdd(const HostEventSection &host_sec) { +static void EmulateCPURecordsAdd( + const HostEventSection &host_sec) { DeviceTracer *tracer = GetDeviceTracer(); if (tracer == nullptr) { return; @@ -610,7 +613,8 @@ static std::map DockHostEventRecorderHostPart() { if (FLAGS_enable_host_event_recorder_hook == false) { return thr_events; } - auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + auto host_evt_sec = + HostEventRecorder::GetInstance().GatherEvents(); EmulateEventPushAndPop(host_evt_sec, &thr_events); EmulateCPURecordsAdd(host_evt_sec); return thr_events; diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index 1359c3b85a0964..d5b495e8b25b69 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -21,7 +21,6 @@ #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/os_info.h" -#include "paddle/fluid/platform/profiler/common_event.h" namespace paddle { namespace platform { @@ -182,12 +181,14 @@ char *EventContainer::GetStringStorage(size_t sz) { return storage; } +template struct ThreadEventSection { std::string thread_name; uint64_t thread_id; - std::vector events; + std::vector events; }; +template class ThreadEventRecorder { public: ThreadEventRecorder() { @@ -204,8 +205,8 @@ class ThreadEventRecorder { base_evt_cntr_.Record(std::forward(args)...); } - ThreadEventSection GatherEvents() { - ThreadEventSection thr_sec; + ThreadEventSection GatherEvents() { + ThreadEventSection thr_sec; thr_sec.thread_name = thread_name_; thr_sec.thread_id = thread_id_; thr_sec.events = std::move(base_evt_cntr_.Reduce()); @@ -215,15 +216,17 @@ class ThreadEventRecorder { private: uint64_t thread_id_; std::string thread_name_; - EventContainer base_evt_cntr_; + EventContainer base_evt_cntr_; }; +template struct HostEventSection { 
std::string process_name; uint64_t process_id; - std::vector thr_sections; + std::vector> thr_sections; }; +template class HostEventRecorder { public: // singleton @@ -244,10 +247,10 @@ class HostEventRecorder { // thread-unsafe, make sure make sure there is no running tracing. // Poor performance, call it at the ending - HostEventSection GatherEvents() { + HostEventSection GatherEvents() { auto thr_recorders = ThreadEventRecorderRegistry::GetInstance().GetAllThreadDataByRef(); - HostEventSection host_sec; + HostEventSection host_sec; host_sec.process_id = GetProcessId(); host_sec.thr_sections.reserve(thr_recorders.size()); for (auto &kv : thr_recorders) { @@ -260,12 +263,12 @@ class HostEventRecorder { private: using ThreadEventRecorderRegistry = - framework::ThreadDataRegistry; + framework::ThreadDataRegistry>; HostEventRecorder() = default; DISABLE_COPY_AND_ASSIGN(HostEventRecorder); - ThreadEventRecorder *GetThreadLocalRecorder() { + ThreadEventRecorder *GetThreadLocalRecorder() { return ThreadEventRecorderRegistry::GetInstance() .GetMutableCurrentThreadData(); } diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index 8a36a3a8bab44e..bde1395c1253c8 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -30,7 +30,7 @@ namespace platform { namespace { -void ProcessHostEvents(const HostEventSection& host_events, +void ProcessHostEvents(const HostEventSection& host_events, TraceEventCollector* collector) { for (const auto& thr_sec : host_events.thr_sections) { uint64_t tid = thr_sec.thread_id; @@ -62,7 +62,7 @@ void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( state_ == TracerState::READY || state_ == TracerState::STOPED, true, platform::errors::PreconditionNotMet("TracerState must be READY")); - HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance().GatherEvents(); HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } @@ -79,8 +79,8 @@ void HostTracer::CollectTraceData(TraceEventCollector* collector) { PADDLE_ENFORCE_EQ( state_, TracerState::STOPED, platform::errors::PreconditionNotMet("TracerState must be STOPED")); - HostEventSection host_events = - HostEventRecorder::GetInstance().GatherEvents(); + HostEventSection host_events = + HostEventRecorder::GetInstance().GatherEvents(); ProcessHostEvents(host_events, collector); } From f3d43fa9903a6c89afc0f238d0c08fb38fa58ef5 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 7 Jun 2022 16:50:43 +0800 Subject: [PATCH 22/22] patch pr (#43270) --- .../fluid/inference/api/resource_manager.cc | 165 +++++++++++++----- paddle/fluid/inference/api/resource_manager.h | 66 ++++++- 2 files changed, 174 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index d88f282ce7a62b..4cd84995a2e2f2 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/inference/api/resource_manager.h" +#include +#include #include #include "paddle/fluid/memory/allocation/allocator_facade.h" @@ -106,31 +108,26 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #endif } // namespace internal -ResourceManager::ResourceManager(const phi::Place& place, void* stream) - : place_(place) { - InitCPUResource(); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - InitGPUResource(stream); -#endif -} - 
-ResourceManager::~ResourceManager() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - DestroyGPUResource(); -#endif +Eigen::DefaultDevice* CPUContextResource::GetCPUEigenDevice() const { + return cpu_eigen_device_.get(); } -void ResourceManager::InitCPUResource() { +void CPUContextResource::InitCPUResource() { cpu_eigen_device_.reset(new Eigen::DefaultDevice()); } -Eigen::DefaultDevice* ResourceManager::GetCpuEigenDevice() { - return cpu_eigen_device_.get(); -} +CPUContextResource::CPUContextResource() { InitCPUResource(); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -void ResourceManager::InitGPUResource(void* stream) { +GPUContextResource::GPUContextResource(const phi::Place& place, void* stream) + : place_(place) { + InitGPUResource(stream); +} + +GPUContextResource::~GPUContextResource() { DestroyGPUResource(); } + +void GPUContextResource::InitGPUResource(void* stream) { + phi::backends::gpu::GPUDeviceGuard guard(place_.device); if (stream == nullptr) { owned_stream_ = true; phi::InitStream(&stream_); @@ -148,7 +145,7 @@ void ResourceManager::InitGPUResource(void* stream) { InitSparseHandle(); } -void ResourceManager::DestroyGPUResource() { +void GPUContextResource::DestroyGPUResource() { if (owned_stream_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); @@ -165,15 +162,14 @@ void ResourceManager::DestroyGPUResource() { DestroySparseHandle(); } -void ResourceManager::InitGpuProperties() { - phi::backends::gpu::GPUDeviceGuard guard(place_.device); +void GPUContextResource::InitGpuProperties() { phi::InitGpuProperties(place_, &compute_capability_, &runtime_version_, &driver_version_, &multi_process_, &max_threads_per_mp_, &max_threads_per_block_, &max_grid_dim_size_); } -void ResourceManager::InitGpuEigenDevice() { +void GPUContextResource::InitGpuEigenDevice() { auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place_) .get(); @@ -182,13 +178,15 @@ void ResourceManager::InitGpuEigenDevice() { gpu_eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); } -void ResourceManager::InitDnnHanlde() { +void GPUContextResource::InitDnnHanlde() { phi::InitDnnHandle(&dnn_handle_, stream_, place_); } -void ResourceManager::DestroyDnnHandle() { phi::DestroyDnnHandle(dnn_handle_); } +void GPUContextResource::DestroyDnnHandle() { + phi::DestroyDnnHandle(dnn_handle_); +} -void ResourceManager::InitBlasHandle() { +void GPUContextResource::InitBlasHandle() { phi::InitBlasHandle(&blas_handle_, stream_); #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 9000 @@ -204,87 +202,158 @@ void ResourceManager::InitBlasHandle() { #endif } -void ResourceManager::DestroyBlasHandle() { +void GPUContextResource::DestroyBlasHandle() { phi::DestroyBlasHandle(blas_handle_); phi::DestroyBlasHandle(blas_tensor_core_handle_); phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); } -void ResourceManager::InitBlasLtHandle() { +void GPUContextResource::InitBlasLtHandle() { phi::InitBlasLtHandle(&blaslt_handle_); } -void ResourceManager::DestroyBlasLtHandle() { +void GPUContextResource::DestroyBlasLtHandle() { phi::DestroyBlasLtHandle(blaslt_handle_); } -void ResourceManager::InitSolverHandle() { +void GPUContextResource::InitSolverHandle() { phi::InitSolverHandle(&solver_handle_, stream_); } -void ResourceManager::DestroySolverHandle() { +void GPUContextResource::DestroySolverHandle() { phi::DestroySolverHandle(solver_handle_); } -void ResourceManager::InitSparseHandle() { +void GPUContextResource::InitSparseHandle() { 
phi::InitSparseHandle(&sparse_handle_, stream_); } -void ResourceManager::DestroySparseHandle() { +void GPUContextResource::DestroySparseHandle() { phi::DestroySparseHandle(sparse_handle_); } -gpuStream_t ResourceManager::GetStream() const { return stream_; } +gpuStream_t GPUContextResource::GetStream() const { return stream_; } -dnnHandle_t ResourceManager::GetDnnHandle() const { return dnn_handle_; } +dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; } -blasHandle_t ResourceManager::GetBlasHandle() const { return blas_handle_; } +blasHandle_t GPUContextResource::GetBlasHandle() const { return blas_handle_; } -blasHandle_t ResourceManager::GetBlasTensorCoreHandle() const { +blasHandle_t GPUContextResource::GetBlasTensorCoreHandle() const { return blas_tensor_core_handle_; } -blasHandle_t ResourceManager::GetBlasTF32Handle() const { +blasHandle_t GPUContextResource::GetBlasTF32Handle() const { return blas_tf32_tensor_core_handle_; } -blasLtHandle_t ResourceManager::GetBlasLtHandle() const { +blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { return blaslt_handle_; } -phi::solverHandle_t ResourceManager::GetSolverDnHandle() const { +phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const { return solver_handle_; } -phi::sparseHandle_t ResourceManager::GetSparseHandle() const { +phi::sparseHandle_t GPUContextResource::GetSparseHandle() const { return sparse_handle_; } -Eigen::GpuDevice* ResourceManager::GetGpuEigenDevice() const { +Eigen::GpuDevice* GPUContextResource::GetGpuEigenDevice() const { return gpu_eigen_device_.get(); } -int ResourceManager::GetGpuComputeCapability() const { +int GPUContextResource::GetGpuComputeCapability() const { return compute_capability_; } -int ResourceManager::GetGpuRuntimeVersion() const { return runtime_version_; } +int GPUContextResource::GetGpuRuntimeVersion() const { + return runtime_version_; +} -int ResourceManager::GetGpuDriverVersion() const { return driver_version_; } +int GPUContextResource::GetGpuDriverVersion() const { return driver_version_; } -int ResourceManager::GetGPUMultiProcessors() const { return multi_process_; } +int GPUContextResource::GetGPUMultiProcessors() const { return multi_process_; } -int ResourceManager::GetGpuMaxThreadsPerMp() const { +int GPUContextResource::GetGpuMaxThreadsPerMp() const { return max_threads_per_mp_; } -int ResourceManager::GetGpuMaxThreadsPerBlock() const { +int GPUContextResource::GetGpuMaxThreadsPerBlock() const { return max_threads_per_block_; } -std::array ResourceManager::GetGpuMaxGridDimSize() const { +std::array GPUContextResource::GetGpuMaxGridDimSize() const { return max_grid_dim_size_; } #endif + +void ResourceManager::InitCPUResource() { + std::lock_guard lock_gurad(cpu_mutex_); + if (cpu_resource_ == nullptr) { + cpu_resource_.reset(new CPUContextResource()); + } +} + +CPUContextResource* ResourceManager::GetCPUResource() const { + PADDLE_ENFORCE_NOT_NULL( + cpu_resource_.get(), + platform::errors::PreconditionNotMet("cpu_resource should be not null!")); + return cpu_resource_.get(); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { + std::lock_guard lock_gurad(gpu_mutex_); + if (gpu_resources_.count(stream)) { + Increase(stream); + return stream; + } else { + std::unique_ptr resource{ + new GPUContextResource(place, stream)}; + gpuStream_t s = resource->GetStream(); + ref_count_[s] = 1; + gpu_resources_.emplace(s, std::move(resource)); + return s; + } +} + 
+void ResourceManager::DestroyGPUResource(void* stream) { + PADDLE_ENFORCE_EQ(gpu_resources_.count(stream), true, + platform::errors::InvalidArgument( + "The stream[%p] not found in gpu_resources.", stream)); + Decrease(stream); +} + +void ResourceManager::Decrease(void* stream) { + PADDLE_ENFORCE_EQ(ref_count_.count(stream), true, + platform::errors::InvalidArgument( + "The stream[%p] not found in ref_count.", stream)); + --ref_count_[stream]; + if (ref_count_[stream] == 0) { + ref_count_.erase(stream); + gpu_resources_.erase(stream); + } +} + +void ResourceManager::Increase(void* stream) { + PADDLE_ENFORCE_EQ(ref_count_.count(stream), true, + platform::errors::InvalidArgument( + "The stream[%p] not found in ref_count.", stream)); + ++ref_count_[stream]; +} + +GPUContextResource* ResourceManager::GetGPUResource(void* stream) const { + PADDLE_ENFORCE_EQ(gpu_resources_.count(stream), true, + platform::errors::InvalidArgument( + "The stream[%p] not found in gpu_resources.", stream)); + return gpu_resources_.at(stream).get(); +} + +int ResourceManager::RefCount(void* stream) const { + if (ref_count_.count(stream) == 0) return 0; + return ref_count_.at(stream); +} +#endif + } // namespace paddle diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index 24e76598e400b6..03345403159d58 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -13,9 +13,13 @@ // limitations under the License. #pragma once +#include #include +#include #include +#include +#include "paddle/fluid/platform/macros.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/cpu/forwards.h" @@ -31,24 +35,24 @@ namespace internal { class EigenGpuStreamDevice; } // namespace internal -class ResourceManager { - public: - explicit ResourceManager(const phi::Place& place, void* stream); - ~ResourceManager(); - +class CPUContextResource { public: - Eigen::DefaultDevice* GetCpuEigenDevice(); + CPUContextResource(); + Eigen::DefaultDevice* GetCPUEigenDevice() const; private: void InitCPUResource(); private: - phi::Place place_; std::unique_ptr cpu_eigen_device_; +}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - +class GPUContextResource { public: + explicit GPUContextResource(const phi::Place& place, void* stream); + ~GPUContextResource(); + gpuStream_t GetStream() const; dnnHandle_t GetDnnHandle() const; blasHandle_t GetBlasHandle() const; @@ -83,6 +87,8 @@ class ResourceManager { void DestroySparseHandle(); private: + phi::Place place_; + int compute_capability_; int runtime_version_; int driver_version_; @@ -103,8 +109,50 @@ class ResourceManager { dnnHandle_t dnn_handle_{nullptr}; phi::solverHandle_t solver_handle_{nullptr}; phi::sparseHandle_t sparse_handle_{nullptr}; -// DnnWorkspaceHandle + // DnnWorkspaceHandle +}; #endif + +class ResourceManager { + public: + ResourceManager() = default; + static ResourceManager& Instance() { + static ResourceManager* resource_manager = new ResourceManager; + return *resource_manager; + } + + // CPU Resource + public: + void InitCPUResource(); + CPUContextResource* GetCPUResource() const; + + private: + std::mutex cpu_mutex_; + std::unique_ptr cpu_resource_{nullptr}; + +// GPU Resource +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + public: + void* InitGPUResource(const phi::Place& place, void* stream); + void DestroyGPUResource(void* stream); + GPUContextResource* GetGPUResource(void* stream) const; + int RefCount(void* stream) 
const; + + private: + void Decrease(void* stream); + void Increase(void* stream); + + private: + std::mutex gpu_mutex_; + // a stream corresponding to a series of resource. + std::map> ref_count_; + std::map> + gpu_resources_; +#endif + + private: + DISABLE_COPY_AND_ASSIGN(ResourceManager); }; } // namespace paddle
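
A minimal usage sketch of the per-stream resource sharing added in PATCH 22/22 (#43270). This is not part of the patch: the call site, the place, and the externally created stream are assumptions, and it presumes a CUDA build where the GPU branch of ResourceManager is compiled in.

// Sketch only: how the ref-counted ResourceManager/GPUContextResource pair
// introduced in resource_manager.{h,cc} is intended to be shared between
// users of the same stream. Names below other than the ResourceManager API
// (ShareGpuResourcesAcrossPredictors, gpu_place, external_stream) are
// hypothetical.
#include "paddle/fluid/inference/api/resource_manager.h"

void ShareGpuResourcesAcrossPredictors(const phi::Place& gpu_place,
                                       void* external_stream) {
  auto& mgr = paddle::ResourceManager::Instance();
  mgr.InitCPUResource();  // idempotent; lazily creates the CPU Eigen device

  // First call with this stream builds a GPUContextResource (stream, dnn/blas/
  // solver/sparse handles, GPU Eigen device, device properties) and sets its
  // reference count to 1.
  void* stream = mgr.InitGPUResource(gpu_place, external_stream);

  // A second user passing the same stream only bumps the count to 2; no new
  // handles are created.
  mgr.InitGPUResource(gpu_place, stream);

  paddle::GPUContextResource* res = mgr.GetGPUResource(stream);
  gpuStream_t raw_stream = res->GetStream();
  (void)raw_stream;

  // Each release decrements the count; the handles are destroyed only when
  // the last user calls DestroyGPUResource.
  mgr.DestroyGPUResource(stream);  // count: 2 -> 1
  mgr.DestroyGPUResource(stream);  // count: 1 -> 0, resource freed
}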