From c0b947ff4acc18b40ca9be6b20a347d2767381fc Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 22 Mar 2021 05:13:32 +0000 Subject: [PATCH 01/16] refactor and simplify hook design --- paddle/fluid/imperative/basic_engine.cc | 15 +- .../fluid/imperative/gradient_accumulator.h | 75 ++++--- paddle/fluid/imperative/hooks.h | 193 +++--------------- paddle/fluid/imperative/layer.h | 11 + paddle/fluid/imperative/op_base.h | 2 - paddle/fluid/imperative/reducer.cc | 7 +- paddle/fluid/imperative/tests/test_hooks.cc | 20 +- paddle/fluid/imperative/variable_wrapper.h | 111 +++------- 8 files changed, 124 insertions(+), 310 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 29ba54986801f..bbed0a4951fd3 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -141,17 +141,6 @@ void BasicEngine::PrepareGradAccumulators( << var.get() << ") that don't have grad node with reference count " << accumulator->RefCnt(); - - if (var->HasLeafHooks()) { - VLOG(3) << "Grad variable wrapper (" << var->Name() - << ") has leaf grad hooks."; - PADDLE_ENFORCE_NE( - var->HasGradNode(), true, - platform::errors::PermissionDenied( - "Only leaf Tensor's gradient can append hook to " - "Gradientaccumulator.")); - accumulator->SetPostHooks(var->GetLeafHooks()); - } } else { // Because Inplace op overwrites the grad_node of the input grad_var. So // only the information of grad_pending_node can be used to find the @@ -434,9 +423,7 @@ void BasicEngine::Execute() { accumulator->AccumulateGrad(); // 3. Call backward Hooks for **var_** - if (accumulator->HasPostHooks()) { - accumulator->CallBackwardPostHooks(); - } + accumulator->CallReduceHooks(); } need_accu_var_list_.clear(); diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index e2dabc06a7dae..0f63d991ec2f2 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -93,40 +93,52 @@ class GradientAccumulator { inline bool HasInnerVar() const { return inner_var_ != nullptr; } - /* Hook related methods */ - inline bool HasPostHooks() const { return !post_hooks_.expired(); } - - void SetPostHooks(const std::shared_ptr& hooks) { - PADDLE_ENFORCE_NOT_NULL( - hooks, platform::errors::InvalidArgument( - "The hook set to GradientAccumulator is nullptr.")); - - auto shared_hooks = post_hooks_.lock(); - if (shared_hooks != hooks) { - PADDLE_ENFORCE_EQ( - shared_hooks, nullptr, - platform::errors::PermissionDenied( - "Cannot set post hooks twice to GradientAccumulator.")); - post_hooks_ = hooks; - } - } - // void CallHooks(){} - // ** inner_var_ ** - // function that Sum Gradient with Previous Graph void AccumulateGrad(); - // call backward post hooks, such as reduce hook - void CallBackwardPostHooks() { - PADDLE_ENFORCE_NE( - post_hooks_.expired(), true, - platform::errors::NotFound( - "The post hooks of GradientAccumulator for Tensor `%s` expired.", - var_->Name())); - auto shared_hooks = post_hooks_.lock(); - for (const auto& hook : shared_hooks->backward_hooks()) { - VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); + /** [ Hook related methods ] + * + * [Why need two types of VariableWrapperHook? ] + * + * There are two types of gradient accumulation: + * 1. Gradient accumulation in same batch + * 2. 
Gradient accumulation across batchs + * The order of execution between Hooks and gradient accumulation: + + * [ Gradient accumulation in same batch] + * | + * [ leaf GradVarBase hooks ] + * | + * [ Gradient accumulation across batchs ] + * | + * [ Gradient reduce / allreduce] + + * Because we currently intend to accumulate these two gradient + * accumulation in one GradientAccumulator, We must distinguish between + * two types of hooks. + + * And the InplaceVariableWrapperHook does not allow users to register + * directly, and is currently only used to support the reduce strategy of + * parallel multi-card training. + */ + + // VariableWrapper CallHooks() { + // if (var_->HasHook()) { + // VariableWrapper tmp = *var_; + // for (const auto& hook : var_->GetHooks()) { + // VLOG(3) << "call gradient accumulator backward hooks."; + // tmp = (*hook)(tmp); + // } + // *var_ = tmp; + // } + // } + + void CallReduceHooks() { + if (var_->HasReduceHook()) { + for (const auto& hook : var_->GetReduceHooks()) { + VLOG(3) << "call gradient accumulator backward hooks."; + (*hook)(var_); + } } } @@ -137,7 +149,6 @@ class GradientAccumulator { std::shared_ptr inner_var_; size_t ref_cnt_{0}; size_t cur_cnt_{0}; - std::weak_ptr post_hooks_; }; class EagerGradientAccumulator : public GradientAccumulator { diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 1211ec6ae6c7b..5e31a2227c188 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -18,99 +18,65 @@ #include #include #include - -#include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/platform/macros.h" - namespace paddle { namespace imperative { class VariableWrapper; -/** [ Basic hook classes ] - * s - * @brief OpBasePreHook is executed before the grad OpBase is executed, +/** [ Const VariableWrapper Hook: Pre hook functor of OpBase ] + * + * @brief This hook functor is executed before the grad OpBase is executed, * taking the input of the current grad OpBase as input, and * executing python hooks (user-defined) or C++ hooks (developer-defined) * to achieve the purpose of custom operations on the interior VarBase * gradient. * - * @note OpBasePreHook will not change the input gradient VarBase. + * @note This hook functor will not change the input gradient VarBase. * * @note [Why need to be OpBase `PreHook`, why not `PostHook`?] * - * If set OpBase post hook, when the op executed end, the op's output - * gradient may not be the final state, because it may need other op's - * gradient output to accumulated to it. But before op can be executed, - * the gradient output must have been accumulated to final value. + * 1. We expect If set OpBase post hook, when the op executed end, the + * op's output gradient may not be the final state, because it may need + * other op's gradient output to accumulated to it. But before op can + * be executed, the gradient output must have been accumulated to final + * value. + * 2. We don’t want the hook to change its input Tensor value, so now + * we can't call all hooks in GradAccumulator. * * @note [Why only can be used for interior VarBase?] * * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf * GradVarBase has no next OpBase to executed, so if need to deal with - * the leaf GradVarBase, cannot use OpBasePreHook. For this case, we - * deal with by GradAccumulatorPostHook. + * the leaf GradVarBase, cannot use this hook functor. For this case, we + * deal with by other inplace hook method. 
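 *
 * Cross-reference (later patches in this series): this const hook is what
 * the Python-level Tensor.register_hook wraps via PyVariableWrapperHook,
 * while the inplace hook declared below is not exposed to users and is
 * currently only used by the Reducer to trigger gradient reduce / allreduce
 * after leaf gradient accumulation finishes.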
*/ -class OpBasePreHook { +class VariableWrapperHook { public: - virtual ~OpBasePreHook() = default; - virtual VariableWrapperList operator()( - const VariableWrapperList& grad_inputs) = 0; + virtual ~VariableWrapperHook() = default; + virtual VariableWrapper operator()(const VariableWrapper& var) = 0; }; -/** - * @brief GradAccumulatorPostHook is the Hook that operates on the current +/** [ Inplace VariableWrapper Hook: Post hook functor of GradAccumulator ] + * + * @brief This hook functor is the Hook that operates on the current * gradientafter the GradientAccumulator has accumulated the gradient. * Leaf GradVarBase has no next OpBase, if we want to register hook * for it, we also need to wait until the leaf GradVarBase accumulation * is completed, so we can add post hook to GradientAccumulator. * - * @note GradAccumulatorPostHook will change the grad VarBase value. + * @note This hook functor will change the grad VarBase value. * - * @note Only allow leaf VarBase hold GradientAccumulatorPostHook. + * @note Only allow leaf VarBase hold call this hook functor. */ -class GradAccumulatorPostHook { +class InplaceVariableWrapperHook { public: - virtual ~GradAccumulatorPostHook() = default; + virtual ~InplaceVariableWrapperHook() = default; virtual void operator()(VariableWrapper* var) = 0; }; -/** [ Hook for cpp functions ] - * - * Here we design three C++ hooks; - * 1. CppOpBasePreHook (Implement later): - * - used for developer-defined C++ interior VarBase hooks - * 2. CppGradAccumulatorPostHook (Implement later): - * - used for developer-defined C++ leaf VarBase hooks - * 3. LambdaGradAccumulatorPostHook: - * - used for VarBase reduce in parallel training - * - * @note [Why need two types of GradAccumulatorPostHook? ] - * - * There are two types of gradient accumulation: - * 1. Gradient accumulation in same batch - * 2. Gradient accumulation across batchs - * The order of execution between Hooks and gradient accumulation: - * - * [ Gradient accumulation in same batch] - * | - * [ leaf GradVarBase hooks ] - * | - * [ Gradient accumulation across batchs ] - * | - * [ Gradient reduce / allreduce] - * - * Because we currently intend to accumulate these two gradient - * accumulation in one GradientAccumulator, We must distinguish between - * two types of hooks. - * - * And the LambdaGradAccumulatorPostHook does not allow users to register - * directly, and is currently only used to support the reduce strategy of - * parallel multi-card training. - */ -class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { +class LambdaInplaceVariableWrapperHook : public InplaceVariableWrapperHook { public: - explicit LambdaGradAccumulatorPostHook( + explicit LambdaInplaceVariableWrapperHook( std::function fn) : fn_(std::move(fn)) {} @@ -120,114 +86,5 @@ class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { std::function fn_; }; -/* Hooks for python function: in pybind/imperative.cc */ - -/** Add Python Hooks later: - * - PyOpBasePreHook (Implement later): used for user-defined interior python - * VarBase hooks - * - PyGradAccumulatorPostHook (Implement later): used for user-defined leaf - * python VarBase hooks - */ - -/** [ Hook Pipeline classes ] - * - * @note [Why need hook pipeline classes?] - * - * There are 2 purposes for adding Hook pipeline here: - * - * 1. Make the code implementation cleaner. 
- * - * If there are no Hook pipeline, we need to add 3 hook vector into - * VariableWrapper, 1 hook vector into OpBase, 2 hook vector into - * GradientAccumulator, like: - * - * - VariableWrapper: - * std::vector> - * interior_var_hooks_; - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * - OpBase: - * std::vector> - * interior_var_hooks_; - * - * - GradientAccumulator: - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * This seems more complicated, and std::vector> - * is not easy to destruct. - * - * 2. Make the code easier to understand. - * - * From these two packages, we can clearly understand that we - * have two types of Hooks, respectively for the interior - * gradient var and leaf gradient var inside the backward - * calculation graph. - */ - -class InteriorVarHookPipeline { - public: - InteriorVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { return hooks_; } - - private: - std::vector> hooks_; - - DISABLE_COPY_AND_ASSIGN(InteriorVarHookPipeline); -}; - -class LeafVarHookPipeline { - public: - LeafVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { - return hooks_; - } - - void add_backward_hook(std::unique_ptr&& hook) { - backward_hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& backward_hooks() - const { - return backward_hooks_; - } - - std::vector>& backward_hooks() { - return backward_hooks_; - } - - private: - std::vector> hooks_; - // NOTE: the `backward` here means the `whole backward process`, - // the `backward_hooks_` need to be executed after the `whole backward - // process`. 
- std::vector> backward_hooks_; - - DISABLE_COPY_AND_ASSIGN(LeafVarHookPipeline); -}; - } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ff5a780a5f9db..deb56ed4cd96d 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/flags.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" @@ -220,6 +221,16 @@ class VarBase { void BumpInplaceVersion(); + /* Hook related method: only can be call by GradVarBase */ + void AddHook(std::shared_ptr&& hook) { + var_->AddHook(std::forward>(hook)); + } + + void AddReduceHook(std::shared_ptr&& hook) { + var_->AddReduceHook( + std::forward>(hook)); + } + private: /** * NOTE(zengjinle): never remove the const qualifier of `var_` if you are diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 2b7642ae7cfd9..0164ff9313cdf 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -177,8 +177,6 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; - - std::weak_ptr pre_hooks_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e8b531d35cabf..e74cbed547be4 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -310,11 +310,8 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->SharedVar()->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper *grad) { - this->AddDistHook(global_var_index); - }))); + var->AddReduceHook(std::make_shared( + [=](VariableWrapper *grad) { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } } diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 7bf5f876681ba..a196cd263c120 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -74,16 +74,15 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { mul_attr_map["use_mkldnn"] = false; // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { + x->GradVarBase()->AddReduceHook( + std::make_shared( + [=](VariableWrapper* grad) { auto* grad_tensor = grad->MutableVar()->GetMutable(); for (int i = 0; i < grad_tensor->numel(); ++i) { grad_tensor->mutable_data(place)[i] *= 2.0; } - }))); + })); // 2. 
forward tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); @@ -151,17 +150,16 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { memory::Copy(place, mutable_z, place, src_data.data(), sizeof(float) * src_data.size()); - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { + // add ReduceBackwardHook + x->GradVarBase()->AddReduceHook( + std::make_shared( + [=](VariableWrapper* grad) { auto* grad_tensor = grad->MutableVar()->GetMutable(); for (int i = 0; i < grad_tensor->numel(); ++i) { grad_tensor->mutable_data(place)[i] *= 2.0; } - }))); + })); // 2. forward var_pair x_pair = var_pair("X", vb_vector(1, x)); diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index b42f25dcc8800..c9f361607c283 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -27,8 +27,8 @@ namespace paddle { namespace imperative { -class InteriorVarHookPipeline; -class LeafVarHookPipeline; +class VariableWrapperHook; +class InplaceVariableWrapperHook; class VarBase; class GradOpNode; @@ -193,42 +193,6 @@ class VariableWrapper { } } - /* Hook related method: only can be call by GradVarBase */ - - bool HasInteriorHooks() const { return interior_hooks_ != nullptr; } - - bool HasLeafHooks() const { return leaf_hooks_ != nullptr; } - - void AddGradVarInteriorHook(std::unique_ptr&& hook) { - auto interior_hooks = GetGradVarInteriorHooksSafely(); - interior_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafHook(std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafBackwardHook( - std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_backward_hook(std::move(hook)); - } - - const std::shared_ptr& GetInteriorHooks() const { - return interior_hooks_; - } - - std::shared_ptr& GetInteriorHooks() { - return interior_hooks_; - } - - const std::shared_ptr& GetLeafHooks() const { - return leaf_hooks_; - } - - std::shared_ptr& GetLeafHooks() { return leaf_hooks_; } - uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; } void ResetInplaceVersion() { @@ -255,6 +219,34 @@ class VariableWrapper { return; } + /* Hook related methods */ + bool HasHook() const { return !hooks_.empty(); } + + bool HasReduceHook() const { return !reduce_hooks_.empty(); } + + void AddHook(std::shared_ptr&& hook) { + // PADDLE_ENFORCE_NOT_NULL(hook, + // platform::errors::InvalidArgument( + // "The added backward hook for Tensor is nullptr.")); + hooks_.emplace_back(std::move(hook)); + } + + const std::vector>& GetHooks() const { + return hooks_; + } + + void AddReduceHook(std::shared_ptr&& hook) { + // PADDLE_ENFORCE_NOT_NULL(hook, + // platform::errors::InvalidArgument( + // "The added backward hook for Tensor is nullptr.")); + reduce_hooks_.emplace_back(std::move(hook)); + } + + const std::vector>& + GetReduceHooks() const { + return reduce_hooks_; + } + private: void SetGradVar(const std::shared_ptr& var) { auto shared_var = grad_var_.lock(); @@ -289,41 +281,6 @@ class VariableWrapper { } } - /* Hook related private methods */ - std::shared_ptr GetGradVarSafely() const { - auto shared_grad_var = grad_var_.lock(); - PADDLE_ENFORCE_NOT_NULL( - shared_grad_var, - platform::errors::PermissionDenied( - "Cannot add gradient hook on Tensor 
without gradient.")); - return shared_grad_var; - } - - std::shared_ptr& GetGradVarInteriorHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ(HasGradNode(), true, - platform::errors::PermissionDenied( - "Only interior Tensor in backward can register " - "interior gradient hook.")); - if (shared_grad_var->interior_hooks_ == nullptr) { - shared_grad_var->interior_hooks_ = - std::make_shared(); - } - return shared_grad_var->interior_hooks_; - } - - std::shared_ptr& GetGradVarLeafHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ( - HasGradNode(), false, - platform::errors::PermissionDenied( - "Only leaf Tensor in backward can register leaf gradient hook.")); - if (shared_grad_var->leaf_hooks_ == nullptr) { - shared_grad_var->leaf_hooks_ = std::make_shared(); - } - return shared_grad_var->leaf_hooks_; - } - private: framework::Variable var_; std::string name_; @@ -358,11 +315,9 @@ class VariableWrapper { // isn't need bool is_empty_{false}; - // NOTE: only grad var can hold hooks now - // only interior var can hold interior hooks - std::shared_ptr interior_hooks_; - // only leaf var can hold leaf hooks - std::shared_ptr leaf_hooks_; + // NOTE(chenweihang): only grad var can hold hooks now + std::vector> hooks_; + std::vector> reduce_hooks_; }; } // namespace imperative From b4b3e9f944b3f367f82a84eaacd6180eea7f1957 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 23 Mar 2021 02:20:01 +0000 Subject: [PATCH 02/16] fix reducer add hook error --- paddle/fluid/imperative/reducer.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e74cbed547be4..775e7008fc700 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -310,8 +310,9 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->AddReduceHook(std::make_shared( - [=](VariableWrapper *grad) { this->AddDistHook(global_var_index); })); + var->GradVarBase()->AddReduceHook( + std::make_shared([=]( + VariableWrapper *grad) { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } } From 16b3dcd2ccee1e6f181a38ed86e51b8bea40607b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 23 Mar 2021 05:52:32 +0000 Subject: [PATCH 03/16] add Tensor.register_hook basic impl --- paddle/fluid/imperative/hooks.h | 3 +- paddle/fluid/imperative/layer.h | 7 +- paddle/fluid/imperative/variable_wrapper.h | 29 +++-- paddle/fluid/pybind/imperative.cc | 77 ++++++++++-- .../fluid/dygraph/varbase_patch_methods.py | 63 +++++++++- .../unittests/test_tensor_register_hook.py | 115 ++++++++++++++++++ 6 files changed, 270 insertions(+), 24 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_register_hook.py diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 5e31a2227c188..dd2845edb80d1 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -53,7 +53,8 @@ class VariableWrapper; class VariableWrapperHook { public: virtual ~VariableWrapperHook() = default; - virtual VariableWrapper operator()(const VariableWrapper& var) = 0; + virtual std::shared_ptr operator()( + const std::shared_ptr& var) = 0; }; /** [ Inplace VariableWrapper Hook: Post hook functor of GradAccumulator ] diff --git 
a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index deb56ed4cd96d..f26605fb2c098 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -222,10 +222,13 @@ class VarBase { void BumpInplaceVersion(); /* Hook related method: only can be call by GradVarBase */ - void AddHook(std::shared_ptr&& hook) { - var_->AddHook(std::forward>(hook)); + int64_t AddHook(std::shared_ptr&& hook) { + return var_->AddHook( + std::forward>(hook)); } + bool RemoveHook(const int64_t& hook_id) { return var_->RemoveHook(hook_id); } + void AddReduceHook(std::shared_ptr&& hook) { var_->AddReduceHook( std::forward>(hook)); diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index c9f361607c283..77d097678acda 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -224,21 +224,25 @@ class VariableWrapper { bool HasReduceHook() const { return !reduce_hooks_.empty(); } - void AddHook(std::shared_ptr&& hook) { - // PADDLE_ENFORCE_NOT_NULL(hook, - // platform::errors::InvalidArgument( - // "The added backward hook for Tensor is nullptr.")); - hooks_.emplace_back(std::move(hook)); + int64_t AddHook(std::shared_ptr&& hook) { + hooks_.emplace(next_hook_id_, std::move(hook)); + return next_hook_id_++; } - const std::vector>& GetHooks() const { + bool RemoveHook(const int64_t& hook_id) { + auto remove_cnt = hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } + + const std::map>& GetHooks() + const { return hooks_; } void AddReduceHook(std::shared_ptr&& hook) { - // PADDLE_ENFORCE_NOT_NULL(hook, - // platform::errors::InvalidArgument( - // "The added backward hook for Tensor is nullptr.")); reduce_hooks_.emplace_back(std::move(hook)); } @@ -316,7 +320,12 @@ class VariableWrapper { bool is_empty_{false}; // NOTE(chenweihang): only grad var can hold hooks now - std::vector> hooks_; + int64_t next_hook_id_{0}; + // Hooks used to register hook for grad var, support adding and removing, + // key is the accumulated int64_t value + std::map> hooks_; + // Hooks executed after the execution of the entire backward process is over, + // currently only supported for reducing in distributed training std::vector> reduce_hooks_; }; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 58ef177863093..720afa44b5ecd 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -34,6 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/bkcl_context.h" #include "paddle/fluid/imperative/data_loader.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/imperative/partial_grad_engine.h" @@ -63,6 +64,53 @@ class Layer : public imperative::Layer { } }; +template +static T PyObjectCast(PyObject *obj) { + try { + return py::cast(py::handle(obj)); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Python object is not type of %s", typeid(T).name())); + } +} + +class PyVariableWrapperHook : public imperative::VariableWrapperHook { + public: + explicit PyVariableWrapperHook(PyObject *func) : py_func_(func) { + VLOG(0) << "Construct PyVariableWrapperHook based func " << func; + Py_INCREF(func); + } + + ~PyVariableWrapperHook() { + VLOG(0) << "Destruct PyVariableWrapperHook based func " << py_func_; + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + std::shared_ptr operator()( + const std::shared_ptr &var) override { + py::gil_scoped_acquire gil; + + // 1. unpack temp VarBase from VariableWrapper + std::shared_ptr tmp_varbase = + std::make_shared(var); + + // 2. call hook and return + PyObject *res = PyObject_CallFunctionObjArgs( + py_func_, py::cast(tmp_varbase).ptr(), nullptr); + PADDLE_ENFORCE_NOT_NULL(res, + platform::errors::Unavailable( + "The gradient Tensor hook return nullptr.")); + if (res == Py_None) { + return var; + } + return PyObjectCast>(res)->SharedVar(); + } + + private: + PyObject *py_func_; +}; + static const platform::Place PyObjectToPlace(const py::object &place_obj) { if (py::isinstance(place_obj)) { return place_obj.cast(); @@ -213,16 +261,6 @@ static std::string GetTypeName(const imperative::VarBase &var) { using PyNameVarBaseMap = std::unordered_map; -template -static T PyObjectCast(PyObject *obj) { - try { - return py::cast(py::handle(obj)); - } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Python object is not type of %s", typeid(T).name())); - } -} - // NOTE(zjl): py::handle is a very light wrapper of PyObject *. // Unlike py::object, py::handle does not change reference count of PyObject *. static std::vector> @@ -988,6 +1026,25 @@ void BindImperative(py::module *m_ptr) { } }, py::call_guard()) + .def("_register_grad_hook", + [](imperative::VarBase &self, const py::handle &hook) { + PADDLE_ENFORCE_EQ( + self.HasGradVar(), true, + platform::errors::InvalidArgument( + "Cannot register hook on a tensor without gradient.")); + return self.GradVarBase()->AddHook( + std::make_shared(hook.ptr())); + }) + .def("_remove_grad_hook", + [](imperative::VarBase &self, int64_t hook_id) { + PADDLE_ENFORCE_EQ( + self.HasGradVar(), true, + platform::errors::InvalidArgument( + "Cannot remove hook on a tensor without gradient.")); + return self.GradVarBase()->RemoveHook(hook_id); + }) + .def("_register_grad_reduce_hook", + [](imperative::VarBase &self, const py::handle &hook) { return; }) .def("cpu", [](const std::shared_ptr &self) { if (platform::is_cpu_place(self->Place())) { diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index ac0944c571890..8ab8d3010b0df 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -14,6 +14,7 @@ import inspect import numpy as np +import weakref import paddle from .. 
import framework @@ -26,6 +27,21 @@ from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE +class TensorHookRemoveHelper(object): + """ + A helper class that for removing Tensor gradient's hook. + """ + + def __init__(self, tensor, hook_id): + self._tensor_ref = weakref.ref(tensor) + self._hook_id = hook_id + + def remove(self): + tensor = self._tensor_ref() + if tensor is not None: + tensor._remove_grad_hook(self.hook_id) + + def monkey_patch_varbase(): @switch_to_static_graph def _to_static_var(self, to_parameter=False, **kwargs): @@ -211,6 +227,50 @@ def gradient(self): else: return np.array(new_ivar.value().get_tensor()) + @framework.dygraph_only + def register_hook(self, hook): + """ + Registers a backward hook for current Tensor. + + The hook will be called every time the gradient Tensor of Current Tensor is computed. + + The hook should not modify the input gradient Tensor, but it can optionally return + a new gradient Tensor which will be used in place of current Tensor's gradient. + + The hook should have the following signature: + + hook(grad) -> Variable or None + + Args: + hook(function): A backward hook to be registered for Tensor.grad + + Returns: + TensorHookRemoveHelper: A helper object that can be used to remove the registered hook by calling `remove()` method. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + def hook_fn(g): + g = 2 * g + print g + + t = paddle.randn([2, 3]) + t.stop_gradient = False + h = t.register_hook(hook_fn) + t.backward() + h.remove() + """ + if self.stop_gradient is True: + raise RuntimeError( + "Cannot register hook on a tensor that stop gradient.") + + hook_id = self._register_grad_hook(hook) + helper = TensorHookRemoveHelper(self, hook_id) + return helper + @property def grad(self): """ @@ -316,7 +376,8 @@ def __bool__(self): ("_to_static_var", _to_static_var), ("set_value", set_value), ("block", block), ("backward", backward), ("clear_grad", clear_grad), ("inplace_version", inplace_version), ("grad", grad), - ("gradient", gradient), ("__str__", __str__), ("__repr__", __str__), + ("gradient", gradient), ("register_hook", register_hook), + ("__str__", __str__), ("__repr__", __str__), ("__deepcopy__", __deepcopy__), ("__module__", "paddle"), ("__name__", "Tensor")): setattr(core.VarBase, method_name, method) diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py new file mode 100644 index 0000000000000..e97f4d99abf8c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
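For orientation before the test cases: the API under test is Tensor.register_hook, added earlier in this series. A hook takes the gradient Tensor and may return a new Tensor to be used in its place (returning None keeps the gradient unchanged), and register_hook returns a helper whose remove() detaches the hook. A condensed sketch of the interior-variable case exercised below (gradient values are illustrative, not asserted here):

import paddle

def double_hook(grad):
    return grad * 2                # the returned Tensor replaces the gradient

x = paddle.to_tensor([0., 1., 2., 3.])
y = paddle.to_tensor([4., 5., 6., 7.])
x.stop_gradient = False
y.stop_gradient = False

w = x + y                          # interior var of the backward graph
w.stop_gradient = False
helper = w.register_hook(double_hook)

z = paddle.to_tensor([1., 2., 3., 4.])
z.stop_gradient = False
o = z.matmul(w)
o.backward()

# x.grad and y.grad are doubled by the hook; w.grad and z.grad are not,
# since the hook rewrites the gradient passed on to w's inputs.
helper.remove()                    # detach the hook again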
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.nn as nn + + +class SimpleNet(nn.Layer): + def __init__(self, in_size, out_size): + super(SimpleNet, self).__init__() + self.linear1 = nn.Linear(in_size, in_size) + self.linear2 = nn.Linear(in_size, out_size) + + def forward(self, x, hook=None, register=False, remove=False): + ret1 = self.linear1(x) + if hook is not None: + if register: + h = ret.register_hook(hook) + if remove: + h.remove() + ret2 = self.linear2(ret) + out = paddle.mean(ret, axis=-1) + return ret1, out + + +class TestTensorRegisterHook(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.in_size = 10 + self.out_size = 10 + self.batch_size = 4 + self.data = np.random.uniform( + size=[self.batch_size, self.in_size]).astype('float32') + self.label = np.random.uniform( + size=[self.batch_size, 1]).astype('float32') + self.devices = ["cpu"] + if paddle.is_compiled_with_cuda(): + self.devices.append("gpu") + + paddle.seed(self.seed) + + def test_hook_for_interior_var(self): + def hook_fn(grad): + grad = grad * 2 + print(grad) + return grad + + for device in self.devices: + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + w.register_hook(hook_fn) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + print('=====Start backprop=====') + o.backward() + print('=====End backprop=====') + print('x.grad:', x.grad) + print('y.grad:', y.grad) + print('w.grad:', w.grad) + print('z.grad:', z.grad) + + def test_hook_for_leaf_var(self): + pass + + def test_hook_for_accumulated_grad(self): + pass + + def test_lambda_hook(self): + pass + + def test_hook_in_model(self): + def register_and_remove_hook(hook=None, register=False, remove=False): + for device in self.devices: + net = SimpleNet(self.in_size, self.out_size) + loss_fn = nn.MSELoss() + + data = paddle.to_tensor(self.data) + label = paddle.to_tensor(self.label) + + ret1, out = net(data) + loss = loss_fn(out, label) + loss.backward() + + return ret1.grad, net.linear1.weight.grad, net.linear1.bias.grad + + pass + + +if __name__ == '__main__': + unittest.main() From 2553179cb441deed002b4e95b59601a70c3f7429 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 23 Mar 2021 06:50:05 +0000 Subject: [PATCH 04/16] refine prepare data impl --- paddle/fluid/imperative/layer.cc | 29 +++++++++++++------ paddle/fluid/imperative/prepared_operator.h | 23 +++++---------- .../fluid/imperative/tests/test_prepare_op.cc | 8 ++--- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 062f04c6b7052..73662183e8883 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -350,10 +350,14 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, VLOG(5) << LayerDebugString(op.Type(), ins, outs); + // Prepare Op only construct Op + auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs); + /** * [ Why need temporary inputs here? ] * - * PrepareData should not change original input tensor inplace. + * 1. PrepareData should not change original input tensor inplace. + * * Suppose the user defines a tensor(int), enters an op to execute, * and then this op rewrites GetExpectedKernelForVar, and converts * this tensor to float type during execution. 
After the dynamic @@ -367,15 +371,22 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, * transform is stored in the temporary scope, and then discarded * after the execution of op, but the original input is directly * overwritten in the previous dynamic graph implemention. + * + * 2. Hook execution should not change original input tensor. + * + * User can register hook for Tensor's gradient, It is expected + * that the hook only affects the gradient of the backward + * propagation, and does not affect the gradient value input + * as the hook. */ - auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs); - auto tmp_ins_ptr = - PrepareData(*op_kernel, ins, prepared_op.kernel_type()); - if (tmp_ins_ptr == nullptr) { - prepared_op.Run(ins, outs, attrs); - } else { - prepared_op.Run(*tmp_ins_ptr, outs, attrs); - } + NameVarMap tmp_ins(ins); + + // 1. prepare data + PrepareData(*op_kernel, prepared_op.kernel_type(), &tmp_ins); + + // 2. call hooks + + prepared_op.Run(tmp_ins, outs, attrs); VLOG(4) << LayerDebugString(op.Type(), ins, outs); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 1f6be5483be30..d7f9dc53a53b0 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -70,11 +70,10 @@ extern const std::shared_ptr& GetVariableWrapper( const std::shared_ptr& var); template -std::shared_ptr> PrepareData( - const framework::OperatorWithKernel& op, const NameVarMap& ins, - const framework::OpKernelType& expected_kernel_key) { - std::shared_ptr> tmp_ins_ptr = nullptr; - for (const auto& name_pair : ins) { +void PrepareData(const framework::OperatorWithKernel& op, + const framework::OpKernelType& expected_kernel_key, + NameVarMap* ins) { + for (auto& name_pair : *ins) { for (size_t i = 0; i < name_pair.second.size(); ++i) { auto& var_base = name_pair.second[i]; SetForwardDataTypeOfGradVar(var_base); @@ -94,16 +93,13 @@ std::shared_ptr> PrepareData( std::shared_ptr cache_var = GetVariableWrapper(var_base)->getCacheValue( expected_kernel_key); - if (tmp_ins_ptr == nullptr) { - tmp_ins_ptr = std::make_shared>(ins); - } const auto* tensor = GetTensorFromVar(cache_var->Var()); auto tmp_var = std::make_shared(var_base->Name()); tmp_var->SetType(var_base->Type()); SetTensorToVariable(cache_var->Var(), *tensor, tmp_var->MutableVar()); - (*tmp_ins_ptr)[name_pair.first][i] = tmp_var; + (*ins)[name_pair.first][i] = tmp_var; } else { framework::Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor, @@ -113,13 +109,10 @@ std::shared_ptr> PrepareData( // To avoid NameVarMap copy construction overhead in general // scenarios, if inplace transformed, return original input // directly - if (tmp_ins_ptr == nullptr) { - tmp_ins_ptr = std::make_shared>(ins); - } auto tmp_var = std::make_shared(var_base->Name()); tmp_var->SetType(var_base->Type()); SetTensorToVariable(var_base->Var(), out, tmp_var->MutableVar()); - (*tmp_ins_ptr)[name_pair.first][i] = tmp_var; + (*ins)[name_pair.first][i] = tmp_var; GetVariableWrapper(var_base)->setCacheValue( expected_kernel_key, GetVariableWrapper(tmp_var)); @@ -127,8 +120,7 @@ std::shared_ptr> PrepareData( << expected_kernel_key; } else { // if dtype is same, transform inplace will not change the - // original - // value, transform inplace to avoid multiple copy + // original value, transform inplace to avoid multiple copy SetTensorToVariable(var_base->Var(), out, var_base->MutableVar()); } } @@ -136,7 +128,6 @@ 
std::shared_ptr> PrepareData( } } } - return tmp_ins_ptr; } class PreparedOp { diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 7d6882a4ee7d0..397ed01993b6a 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -146,8 +146,8 @@ TEST(test_prepare_op, test_prepare_data) { ins, outs, dynamic_cast(*op), gpu_place, attr_map); PrepareData( - dynamic_cast(*op), ins, - prepared_op.kernel_type()); + dynamic_cast(*op), + prepared_op.kernel_type(), &ins); for (const auto& name_pair : ins) { for (const auto& vb : name_pair.second) { ASSERT_TRUE(platform::is_same_place( @@ -195,8 +195,8 @@ void TestPrepareDataSamePlace(framework::AttributeMap attr_map) { ins, outs, dynamic_cast(*op), cpu_place, attr_map); PrepareData( - dynamic_cast(*op), ins, - prepared_op.kernel_type()); + dynamic_cast(*op), + prepared_op.kernel_type(), &ins); for (const auto& name_pair : ins) { for (const auto& vb : name_pair.second) { ASSERT_TRUE(platform::is_same_place( From 2fac74f686de4c1721f8e106775ac5663fa298b1 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 24 Mar 2021 06:26:54 +0000 Subject: [PATCH 05/16] revert prepare data change --- paddle/fluid/imperative/layer.cc | 29 ++++++------------- paddle/fluid/imperative/prepared_operator.h | 23 ++++++++++----- .../fluid/imperative/tests/test_prepare_op.cc | 8 ++--- 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 73662183e8883..062f04c6b7052 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -350,14 +350,10 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, VLOG(5) << LayerDebugString(op.Type(), ins, outs); - // Prepare Op only construct Op - auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs); - /** * [ Why need temporary inputs here? ] * - * 1. PrepareData should not change original input tensor inplace. - * + * PrepareData should not change original input tensor inplace. * Suppose the user defines a tensor(int), enters an op to execute, * and then this op rewrites GetExpectedKernelForVar, and converts * this tensor to float type during execution. After the dynamic @@ -371,22 +367,15 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, * transform is stored in the temporary scope, and then discarded * after the execution of op, but the original input is directly * overwritten in the previous dynamic graph implemention. - * - * 2. Hook execution should not change original input tensor. - * - * User can register hook for Tensor's gradient, It is expected - * that the hook only affects the gradient of the backward - * propagation, and does not affect the gradient value input - * as the hook. */ - NameVarMap tmp_ins(ins); - - // 1. prepare data - PrepareData(*op_kernel, prepared_op.kernel_type(), &tmp_ins); - - // 2. 
call hooks - - prepared_op.Run(tmp_ins, outs, attrs); + auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs); + auto tmp_ins_ptr = + PrepareData(*op_kernel, ins, prepared_op.kernel_type()); + if (tmp_ins_ptr == nullptr) { + prepared_op.Run(ins, outs, attrs); + } else { + prepared_op.Run(*tmp_ins_ptr, outs, attrs); + } VLOG(4) << LayerDebugString(op.Type(), ins, outs); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d7f9dc53a53b0..1f6be5483be30 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -70,10 +70,11 @@ extern const std::shared_ptr& GetVariableWrapper( const std::shared_ptr& var); template -void PrepareData(const framework::OperatorWithKernel& op, - const framework::OpKernelType& expected_kernel_key, - NameVarMap* ins) { - for (auto& name_pair : *ins) { +std::shared_ptr> PrepareData( + const framework::OperatorWithKernel& op, const NameVarMap& ins, + const framework::OpKernelType& expected_kernel_key) { + std::shared_ptr> tmp_ins_ptr = nullptr; + for (const auto& name_pair : ins) { for (size_t i = 0; i < name_pair.second.size(); ++i) { auto& var_base = name_pair.second[i]; SetForwardDataTypeOfGradVar(var_base); @@ -93,13 +94,16 @@ void PrepareData(const framework::OperatorWithKernel& op, std::shared_ptr cache_var = GetVariableWrapper(var_base)->getCacheValue( expected_kernel_key); + if (tmp_ins_ptr == nullptr) { + tmp_ins_ptr = std::make_shared>(ins); + } const auto* tensor = GetTensorFromVar(cache_var->Var()); auto tmp_var = std::make_shared(var_base->Name()); tmp_var->SetType(var_base->Type()); SetTensorToVariable(cache_var->Var(), *tensor, tmp_var->MutableVar()); - (*ins)[name_pair.first][i] = tmp_var; + (*tmp_ins_ptr)[name_pair.first][i] = tmp_var; } else { framework::Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor, @@ -109,10 +113,13 @@ void PrepareData(const framework::OperatorWithKernel& op, // To avoid NameVarMap copy construction overhead in general // scenarios, if inplace transformed, return original input // directly + if (tmp_ins_ptr == nullptr) { + tmp_ins_ptr = std::make_shared>(ins); + } auto tmp_var = std::make_shared(var_base->Name()); tmp_var->SetType(var_base->Type()); SetTensorToVariable(var_base->Var(), out, tmp_var->MutableVar()); - (*ins)[name_pair.first][i] = tmp_var; + (*tmp_ins_ptr)[name_pair.first][i] = tmp_var; GetVariableWrapper(var_base)->setCacheValue( expected_kernel_key, GetVariableWrapper(tmp_var)); @@ -120,7 +127,8 @@ void PrepareData(const framework::OperatorWithKernel& op, << expected_kernel_key; } else { // if dtype is same, transform inplace will not change the - // original value, transform inplace to avoid multiple copy + // original + // value, transform inplace to avoid multiple copy SetTensorToVariable(var_base->Var(), out, var_base->MutableVar()); } } @@ -128,6 +136,7 @@ void PrepareData(const framework::OperatorWithKernel& op, } } } + return tmp_ins_ptr; } class PreparedOp { diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 397ed01993b6a..7d6882a4ee7d0 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -146,8 +146,8 @@ TEST(test_prepare_op, test_prepare_data) { ins, outs, dynamic_cast(*op), gpu_place, attr_map); PrepareData( - dynamic_cast(*op), - prepared_op.kernel_type(), &ins); + dynamic_cast(*op), ins, + 
prepared_op.kernel_type()); for (const auto& name_pair : ins) { for (const auto& vb : name_pair.second) { ASSERT_TRUE(platform::is_same_place( @@ -195,8 +195,8 @@ void TestPrepareDataSamePlace(framework::AttributeMap attr_map) { ins, outs, dynamic_cast(*op), cpu_place, attr_map); PrepareData( - dynamic_cast(*op), - prepared_op.kernel_type(), &ins); + dynamic_cast(*op), ins, + prepared_op.kernel_type()); for (const auto& name_pair : ins) { for (const auto& vb : name_pair.second) { ASSERT_TRUE(platform::is_same_place( From de8b2dfe33f56f866e4275c2f51006880b832b6d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 25 Mar 2021 03:05:17 +0000 Subject: [PATCH 06/16] support register_hook for Tensor --- paddle/fluid/imperative/basic_engine.cc | 44 +++++- .../fluid/imperative/gradient_accumulator.cc | 37 ++++- .../fluid/imperative/gradient_accumulator.h | 28 +--- paddle/fluid/imperative/hooks.h | 2 +- paddle/fluid/imperative/layer.h | 9 +- paddle/fluid/pybind/imperative.cc | 27 +++- .../fluid/dygraph/varbase_patch_methods.py | 2 +- .../unittests/test_tensor_register_hook.py | 129 ++++++++++++++++-- 8 files changed, 228 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index bbed0a4951fd3..29eac618e9427 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -281,10 +281,25 @@ void BasicEngine::Execute() { auto& bwd_ins = cur_op.GetInsMap(); auto& bwd_outs = cur_op.GetOutsMap(); + /** + * [ Why need temporary inputs and outputs here? ] + * + * 1. For inputs + * - Hook execution should not change original input tensor. + * User can register hook for Tensor's gradient, It is expected + * that the hook only affects the gradient of the backward + * propagation, and does not affect the gradient value input + * as the hook. + * + * 2. For outputs + * + * - construct the temp output map, avoid to disrupt graph + * - replace the element in the map by temp var, because a + * var may be coresponding to several grad var in one op + */ + NameVarMap tmp_ins(bwd_ins); NameVarMap tmp_outs(bwd_outs); - // 1. construct the temp output map, avoid to disrupt graph - // 2. replace the element in the map by temp var, because a - // var may be coresponding to several grad var in one op + for (auto& pair : tmp_outs) { if (!pair.second.IsGrad()) { continue; @@ -355,7 +370,7 @@ void BasicEngine::Execute() { // If a tmp var has been created, there is no need to create it // again. 
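// NOTE: tmp_ins is still a plain copy of bwd_ins at this point; the hooks
// registered on these grad vars are applied to tmp_ins further below, right
// before the grad op runs, so the original input tensors stay untouched.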
for (auto& in_var : - bwd_ins.at(inplace_grad_name_map.at(pair.first))) { + tmp_ins.at(inplace_grad_name_map.at(pair.first))) { if (in_var == var) { auto tmp_var = std::make_shared(var->Name()); tmp_var->SetType(var->Type()); @@ -374,7 +389,7 @@ void BasicEngine::Execute() { VLOG(4) << "Check whether there is any inplace operation affecting " "gradient calculation."; - for (auto& pair : bwd_ins) { + for (auto& pair : tmp_ins) { for (auto& var_wrapper : pair.second) { auto wrapper_version_snapshot = var_wrapper->InplaceVersionSnapshot(); auto tensor_version = @@ -397,9 +412,25 @@ void BasicEngine::Execute() { } } + for (auto& pair : tmp_ins) { + for (size_t i = 0; i < pair.second.size(); ++i) { + auto& var = pair.second[i]; + if (var->HasHook()) { + VLOG(3) << "Call " << var->GetHooks().size() << " hooks of " + << cur_op.Type() << "'s input `" << pair.first + << "`'s var `" << var->Name() << "`."; + auto tmp_var = var; + for (const auto& hook_pair : var->GetHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + tmp_ins[pair.first][i] = tmp_var; + } + } + } + { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + OpBase::Run(cur_op.InnerOp(), tmp_ins, tmp_outs, cur_op.Attrs(), cur_op.place()); } @@ -418,6 +449,7 @@ void BasicEngine::Execute() { continue; } // 1. Call Hooks for **inner_var_** + accumulator->CallHooks(); // 2. Sum Gradient with Previous Graph accumulator->AccumulateGrad(); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index deb504a1b657e..6779ab71e30b4 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -397,7 +397,7 @@ void GradientAccumulator::AccumulateGrad() { "this auto-grad")); PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, platform::errors::InvalidArgument( - "Interior var of Leaf tensor should be initialized.")); + "Interior var of Leaf tensor should be initialized.")); auto* src = inner_var_->MutableVar(); auto* dst = var_->MutableVar(); if (!var_->IsEmpty()) { @@ -432,6 +432,41 @@ void GradientAccumulator::AccumulateGrad() { inner_var_.reset(); } +void GradientAccumulator::CallHooks() { + if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { + return; + } + PADDLE_ENFORCE_EQ( + HasInnerVar(), true, + platform::errors::InvalidArgument( + "Leaf Tensor's inner var is nullptr when call gradient hook.")); + PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, + platform::errors::InvalidArgument("Leaf Tensor's inner var " + "is not initialized when " + "call gradient hook.")); + if (var_->HasHook()) { + VLOG(3) << "Call " << var_->GetHooks().size() + << " hooks of leaf gradient accumulator's inner var `" + << var_->Name() << "`."; + auto tmp_var = inner_var_; + VLOG(3) << "Input var " << var_->Name() << "'s hook size - " + << var_->GetHooks().size(); + for (const auto& hook_pair : var_->GetHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + inner_var_ = tmp_var; + } +} + +void GradientAccumulator::CallReduceHooks() { + if (var_->HasReduceHook()) { + for (const auto& hook : var_->GetReduceHooks()) { + VLOG(3) << "call gradient accumulator backward hooks."; + (*hook)(var_); + } + } +} + void EagerGradientAccumulator::SumGrad(std::shared_ptr var, size_t trace_id, bool unchange_input) { /** diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 0f63d991ec2f2..08a60ee03059e 
100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -40,8 +40,8 @@ class GradientAccumulator { } // inner_var_ record the grad of this auto-grad. - // Only need to generate inner var for non-empty leaf-tensor. - if (var->IsLeafGrad() && !var->IsEmpty()) { + // Only need to generate inner var for leaf-tensor. + if (var->IsLeafGrad()) { inner_var_ = std::make_shared(var->Name()); inner_var_->SetType(var->Type()); inner_var_->SetDataType(var->DataType()); @@ -111,7 +111,7 @@ class GradientAccumulator { * | * [ Gradient accumulation across batchs ] * | - * [ Gradient reduce / allreduce] + * [ Gradient reduce / allreduce hooks ] * Because we currently intend to accumulate these two gradient * accumulation in one GradientAccumulator, We must distinguish between @@ -122,25 +122,9 @@ class GradientAccumulator { * parallel multi-card training. */ - // VariableWrapper CallHooks() { - // if (var_->HasHook()) { - // VariableWrapper tmp = *var_; - // for (const auto& hook : var_->GetHooks()) { - // VLOG(3) << "call gradient accumulator backward hooks."; - // tmp = (*hook)(tmp); - // } - // *var_ = tmp; - // } - // } - - void CallReduceHooks() { - if (var_->HasReduceHook()) { - for (const auto& hook : var_->GetReduceHooks()) { - VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); - } - } - } + void CallHooks(); + + void CallReduceHooks(); protected: VariableWrapper* var_; diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index dd2845edb80d1..4d59298aed51f 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -78,7 +78,7 @@ class InplaceVariableWrapperHook { class LambdaInplaceVariableWrapperHook : public InplaceVariableWrapperHook { public: explicit LambdaInplaceVariableWrapperHook( - std::function fn) + std::function&& fn) : fn_(std::move(fn)) {} void operator()(VariableWrapper* var) override { fn_(var); } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index f26605fb2c098..625af0c1fc3da 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -221,7 +221,9 @@ class VarBase { void BumpInplaceVersion(); - /* Hook related method: only can be call by GradVarBase */ + /* Hook related method: now only used for GradVarBase */ + bool HasHook() const { return var_->HasHook(); } + int64_t AddHook(std::shared_ptr&& hook) { return var_->AddHook( std::forward>(hook)); @@ -229,6 +231,11 @@ class VarBase { bool RemoveHook(const int64_t& hook_id) { return var_->RemoveHook(hook_id); } + const std::map>& GetHooks() + const { + return var_->GetHooks(); + } + void AddReduceHook(std::shared_ptr&& hook) { var_->AddReduceHook( std::forward>(hook)); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 720afa44b5ecd..b7ae709960479 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -77,12 +77,10 @@ static T PyObjectCast(PyObject *obj) { class PyVariableWrapperHook : public imperative::VariableWrapperHook { public: explicit PyVariableWrapperHook(PyObject *func) : py_func_(func) { - VLOG(0) << "Construct PyVariableWrapperHook based func " << func; - Py_INCREF(func); + Py_INCREF(py_func_); } ~PyVariableWrapperHook() { - VLOG(0) << "Destruct PyVariableWrapperHook based func " << py_func_; py::gil_scoped_acquire gil; Py_DECREF(py_func_); } @@ -90,20 +88,37 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook { std::shared_ptr operator()( 
const std::shared_ptr &var) override { py::gil_scoped_acquire gil; + VLOG(3) << "Call PyVariableWrapperHook for var " << var->Name(); // 1. unpack temp VarBase from VariableWrapper std::shared_ptr tmp_varbase = std::make_shared(var); // 2. call hook and return - PyObject *res = PyObject_CallFunctionObjArgs( - py_func_, py::cast(tmp_varbase).ptr(), nullptr); + PyObject *res = nullptr; + try { + res = PyObject_CallFunctionObjArgs(py_func_, py::cast(tmp_varbase).ptr(), + nullptr); + } catch (platform::EnforceNotMet &e) { + throw std::move(e); + } catch (std::exception &e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) { + // NOTE: this branch represents a very serious bug with + // low probability of occurrence, and we can't get its + // exception content here. + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + PADDLE_ENFORCE_NOT_NULL(res, platform::errors::Unavailable( - "The gradient Tensor hook return nullptr.")); + "Hook function of Tensor return a nullptr.")); if (res == Py_None) { return var; } + return PyObjectCast>(res)->SharedVar(); } diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 8ab8d3010b0df..7ec88d0fcd4ce 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -39,7 +39,7 @@ def __init__(self, tensor, hook_id): def remove(self): tensor = self._tensor_ref() if tensor is not None: - tensor._remove_grad_hook(self.hook_id) + tensor._remove_grad_hook(self._hook_id) def monkey_patch_varbase(): diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index e97f4d99abf8c..3395bd95b7b18 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -55,13 +55,87 @@ def setUp(self): paddle.seed(self.seed) - def test_hook_for_interior_var(self): - def hook_fn(grad): - grad = grad * 2 - print(grad) - return grad + def run_hook_for_interior_var(self, hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + helper = w.register_hook(hook) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + print('x.grad:', x.grad) + print('y.grad:', y.grad) + print('w.grad:', w.grad) + print('z.grad:', z.grad) + + # z.grad is not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + # w.grad is not changed by hook + self.assertTrue(np.array_equal(w.grad, z.numpy())) + # x.grad and y.grad are changed if run hook + self.assertTrue( + np.array_equal(x.grad, + z.numpy() * 2 if not removed else z.numpy())) + self.assertTrue( + np.array_equal(y.grad, + z.numpy() * 2 if not removed else z.numpy())) + + def run_hook_for_leaf_var(self, hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + helper = y.register_hook(hook) + + w = x + y + w.stop_gradient = False + + z = 
paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + print('x.grad:', x.grad) + print('y.grad:', y.grad) + print('w.grad:', w.grad) + print('z.grad:', z.grad) + # z.grad, w.grad, x.grad is not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + self.assertTrue(np.array_equal(w.grad, z.numpy())) + self.assertTrue(np.array_equal(x.grad, z.numpy())) + # y.grad are changed if run hook + self.assertTrue( + np.array_equal(y.grad, + z.numpy() * 2 if not removed else z.numpy())) + + def run_hook_for_accumulated_grad(self, hook, removed=False): for device in self.devices: + paddle.set_device(device) + x = paddle.to_tensor([0., 1., 2., 3.]) y = paddle.to_tensor([4., 5., 6., 7.]) x.stop_gradient = False @@ -69,30 +143,61 @@ def hook_fn(grad): w = x + y w.stop_gradient = False - w.register_hook(hook_fn) + helper = w.register_hook(hook) z = paddle.to_tensor([1., 2., 3., 4.]) z.stop_gradient = False o = z.matmul(w) - print('=====Start backprop=====') + # remove hook before backward + if removed: + helper.remove() + o.backward() - print('=====End backprop=====') print('x.grad:', x.grad) print('y.grad:', y.grad) print('w.grad:', w.grad) print('z.grad:', z.grad) + # z.grad is not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + # w.grad is not changed by hook + self.assertTrue(np.array_equal(w.grad, z.numpy())) + # x.grad and y.grad are changed if run hook + self.assertTrue( + np.array_equal(x.grad, + z.numpy() * 2 if not removed else z.numpy())) + self.assertTrue( + np.array_equal(y.grad, + z.numpy() * 2 if not removed else z.numpy())) + + def test_func_hook_for_interior_var(self): + def hook_fn(grad): + grad = grad * 2 + print(grad) + return grad + + # register hook + self.run_hook_for_interior_var(hook_fn) + # register hook and removed + self.run_hook_for_interior_var(hook_fn, removed=True) + + def test_lambda_hook_for_interior_var(self): + # register hook + self.run_hook_for_interior_var(lambda grad: grad * 2) + # register hook and removed + self.run_hook_for_interior_var(lambda grad: grad * 2, removed=True) + def test_hook_for_leaf_var(self): - pass + # register hook + self.run_hook_for_leaf_var(lambda grad: grad * 2) + # register hook and removed + self.run_hook_for_leaf_var(lambda grad: grad * 2, removed=True) def test_hook_for_accumulated_grad(self): pass - def test_lambda_hook(self): - pass - def test_hook_in_model(self): def register_and_remove_hook(hook=None, register=False, remove=False): for device in self.devices: From 665b15be4faa479f909d23f8c023059b66e705e2 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 25 Mar 2021 08:30:24 +0000 Subject: [PATCH 07/16] add hook test in model --- .../unittests/test_tensor_register_hook.py | 144 +++++++++++------- 1 file changed, 85 insertions(+), 59 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 3395bd95b7b18..f1d17fada3aaf 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -31,11 +31,11 @@ def forward(self, x, hook=None, register=False, remove=False): ret1 = self.linear1(x) if hook is not None: if register: - h = ret.register_hook(hook) + h = ret1.register_hook(hook) if remove: h.remove() - ret2 = self.linear2(ret) - out = paddle.mean(ret, axis=-1) + ret2 = self.linear2(ret1) + out = 
paddle.mean(ret2, axis=-1) return ret1, out @@ -45,17 +45,11 @@ def setUp(self): self.in_size = 10 self.out_size = 10 self.batch_size = 4 - self.data = np.random.uniform( - size=[self.batch_size, self.in_size]).astype('float32') - self.label = np.random.uniform( - size=[self.batch_size, 1]).astype('float32') self.devices = ["cpu"] if paddle.is_compiled_with_cuda(): self.devices.append("gpu") - paddle.seed(self.seed) - - def run_hook_for_interior_var(self, hook, removed=False): + def run_hook_for_interior_var(self, double_hook, removed=False): for device in self.devices: paddle.set_device(device) @@ -66,7 +60,7 @@ def run_hook_for_interior_var(self, hook, removed=False): w = x + y w.stop_gradient = False - helper = w.register_hook(hook) + helper = w.register_hook(double_hook) z = paddle.to_tensor([1., 2., 3., 4.]) z.stop_gradient = False @@ -78,10 +72,6 @@ def run_hook_for_interior_var(self, hook, removed=False): helper.remove() o.backward() - print('x.grad:', x.grad) - print('y.grad:', y.grad) - print('w.grad:', w.grad) - print('z.grad:', z.grad) # z.grad is not affected self.assertTrue(np.array_equal(z.grad, w.numpy())) @@ -95,7 +85,7 @@ def run_hook_for_interior_var(self, hook, removed=False): np.array_equal(y.grad, z.numpy() * 2 if not removed else z.numpy())) - def run_hook_for_leaf_var(self, hook, removed=False): + def run_hook_for_leaf_var(self, double_hook, removed=False): for device in self.devices: paddle.set_device(device) @@ -103,7 +93,7 @@ def run_hook_for_leaf_var(self, hook, removed=False): y = paddle.to_tensor([4., 5., 6., 7.]) x.stop_gradient = False y.stop_gradient = False - helper = y.register_hook(hook) + helper = y.register_hook(double_hook) w = x + y w.stop_gradient = False @@ -118,10 +108,6 @@ def run_hook_for_leaf_var(self, hook, removed=False): helper.remove() o.backward() - print('x.grad:', x.grad) - print('y.grad:', y.grad) - print('w.grad:', w.grad) - print('z.grad:', z.grad) # z.grad, w.grad, x.grad is not affected self.assertTrue(np.array_equal(z.grad, w.numpy())) @@ -132,45 +118,74 @@ def run_hook_for_leaf_var(self, hook, removed=False): np.array_equal(y.grad, z.numpy() * 2 if not removed else z.numpy())) - def run_hook_for_accumulated_grad(self, hook, removed=False): + def run_hook_for_accumulated_grad(self, double_hook, removed=False): for device in self.devices: paddle.set_device(device) - x = paddle.to_tensor([0., 1., 2., 3.]) - y = paddle.to_tensor([4., 5., 6., 7.]) + a = paddle.to_tensor([0., 1., 1., 2.]) + b = paddle.to_tensor([0., 0., 1., 2.]) + a.stop_gradient = False + b.stop_gradient = False + + helper1 = a.register_hook(double_hook) + + x = a + b x.stop_gradient = False - y.stop_gradient = False - w = x + y - w.stop_gradient = False - helper = w.register_hook(hook) + helper2 = x.register_hook(double_hook) + y = paddle.to_tensor([4., 5., 6., 7.]) z = paddle.to_tensor([1., 2., 3., 4.]) + y.stop_gradient = False z.stop_gradient = False - o = z.matmul(w) + o1 = x + y + o2 = x + z + o1.stop_gradient = False + o2.stop_gradient = False + + o = o1.matmul(o2) # remove hook before backward if removed: - helper.remove() + helper1.remove() + helper2.remove() o.backward() - print('x.grad:', x.grad) - print('y.grad:', y.grad) - print('w.grad:', w.grad) - print('z.grad:', z.grad) - # z.grad is not affected - self.assertTrue(np.array_equal(z.grad, w.numpy())) - # w.grad is not changed by hook - self.assertTrue(np.array_equal(w.grad, z.numpy())) - # x.grad and y.grad are changed if run hook + base_grad = np.array([5., 9., 13., 19.]) + # x.grad is not changed + 
self.assertTrue(np.array_equal(x.grad, base_grad)) + # b.grad is changed by x.hook self.assertTrue( - np.array_equal(x.grad, - z.numpy() * 2 if not removed else z.numpy())) + np.array_equal(b.grad, base_grad * 2 + if not removed else base_grad)) + # a.grad is changed by x.hook and a.hook self.assertTrue( - np.array_equal(y.grad, - z.numpy() * 2 if not removed else z.numpy())) + np.array_equal(a.grad, base_grad * 4 + if not removed else base_grad)) + + def run_hook_in_model(self, + data, + label, + hook=None, + register=False, + remove=False): + for device in self.devices: + paddle.seed(self.seed) + paddle.set_device(device) + + net = SimpleNet(self.in_size, self.out_size) + loss_fn = nn.MSELoss() + + data = paddle.to_tensor(data) + label = paddle.to_tensor(label) + + ret1, out = net(data, hook, register, remove) + loss = loss_fn(out, label) + loss.backward() + + return ret1.grad, net.linear1.weight.grad, net.linear1.bias.grad def test_func_hook_for_interior_var(self): def hook_fn(grad): @@ -196,24 +211,35 @@ def test_hook_for_leaf_var(self): self.run_hook_for_leaf_var(lambda grad: grad * 2, removed=True) def test_hook_for_accumulated_grad(self): - pass + # register hook + self.run_hook_for_accumulated_grad(lambda grad: grad * 2) + # register hook and removed + self.run_hook_for_accumulated_grad(lambda grad: grad * 2, removed=True) def test_hook_in_model(self): - def register_and_remove_hook(hook=None, register=False, remove=False): - for device in self.devices: - net = SimpleNet(self.in_size, self.out_size) - loss_fn = nn.MSELoss() - - data = paddle.to_tensor(self.data) - label = paddle.to_tensor(self.label) - - ret1, out = net(data) - loss = loss_fn(out, label) - loss.backward() - - return ret1.grad, net.linear1.weight.grad, net.linear1.bias.grad - - pass + data = np.random.uniform( + size=[self.batch_size, self.in_size]).astype('float32') + label = np.random.uniform(size=[self.batch_size, 1]).astype('float32') + + # get original value + ret1_grad, linear1_w_grad, linear1_b_grad = self.run_hook_in_model( + data, label) + # get value changed by hook + ret1_grad_hook, linear1_w_grad_hook, linear1_b_grad_hook = self.run_hook_in_model( + data, label, lambda grad: grad * 2, True) + # get value after removing hook + ret1_grad_rm, linear1_w_grad_rm, linear1_b_grad_rm = self.run_hook_in_model( + data, label, lambda grad: grad * 2, True, True) + + # compare original value and with hook + self.assertTrue(np.array_equal(ret1_grad, ret1_grad_hook)) + self.assertTrue(np.array_equal(linear1_w_grad * 2, linear1_w_grad_hook)) + self.assertTrue(np.array_equal(linear1_b_grad * 2, linear1_b_grad_hook)) + + # compare original value and remove hook + self.assertTrue(np.array_equal(ret1_grad, ret1_grad_rm)) + self.assertTrue(np.array_equal(linear1_w_grad, linear1_w_grad_rm)) + self.assertTrue(np.array_equal(linear1_b_grad, linear1_b_grad_rm)) if __name__ == '__main__': From 118cc07f3e8e178fdfbe79ce56dda908223bc424 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 25 Mar 2021 12:00:44 +0000 Subject: [PATCH 08/16] polish tests and doc example --- .../fluid/dygraph/varbase_patch_methods.py | 58 ++- .../unittests/test_tensor_register_hook.py | 406 ++++++++++++------ 2 files changed, 312 insertions(+), 152 deletions(-) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 7ec88d0fcd4ce..c5ac15ee84987 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -14,6 +14,7 
@@ import inspect import numpy as np +import warnings import weakref import paddle @@ -29,7 +30,7 @@ class TensorHookRemoveHelper(object): """ - A helper class that for removing Tensor gradient's hook. + A helper class that for removing Tensor gradient's hook. """ def __init__(self, tensor, hook_id): @@ -37,9 +38,22 @@ def __init__(self, tensor, hook_id): self._hook_id = hook_id def remove(self): + """ + Remove reference Tensor's hook. + + Returns: + bool: Return True if removed successfully + """ tensor = self._tensor_ref() if tensor is not None: - tensor._remove_grad_hook(self._hook_id) + res = tensor._remove_grad_hook(self._hook_id) + if res is True: + return True + else: + warnings.warn( + "The backward hook (ID: %d) of Tensor `%s` you want to remove does not exist or has been removed." + % (self._hook_id, tensor.name), RuntimeWarning) + return False def monkey_patch_varbase(): @@ -232,14 +246,14 @@ def register_hook(self, hook): """ Registers a backward hook for current Tensor. - The hook will be called every time the gradient Tensor of Current Tensor is computed. + The hook will be called every time the gradient Tensor of current Tensor is computed. The hook should not modify the input gradient Tensor, but it can optionally return a new gradient Tensor which will be used in place of current Tensor's gradient. The hook should have the following signature: - hook(grad) -> Variable or None + hook(grad) -> Tensor or None Args: hook(function): A backward hook to be registered for Tensor.grad @@ -251,16 +265,36 @@ def register_hook(self, hook): .. code-block:: python import paddle - import numpy as np - def hook_fn(g): - g = 2 * g - print g + # hook function return None + def print_hook_fn(grad): + print(grad) + + # hook function return Tensor + def double_hook_fn(grad): + grad = grad * 2 + return grad + + x = paddle.to_tensor([0., 1., 2., 3.], stop_gradient=False) + y = paddle.to_tensor([4., 5., 6., 7.], stop_gradient=False) + z = paddle.to_tensor([1., 2., 3., 4.]) + + # one Tensor can register multiple hooks + h = x.register_hook(print_hook_fn) + x.register_hook(double_hook_fn) + + w = x + y + # register hook by lambda function + w.register_hook(lambda grad: grad * 2) + + o = z.matmul(w) + o.backward() + + # ('w.grad: ', array([1., 2., 3., 4.], dtype=float32)) + # ('x.grad: ', array([ 4., 8., 12., 16.], dtype=float32)) + # ('y.grad: ', array([2., 4., 6., 8.], dtype=float32)) - t = paddle.randn([2, 3]) - t.stop_gradient = False - h = t.register_hook(hook_fn) - t.backward() + # remove hook h.remove() """ if self.stop_gradient is True: diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index f1d17fada3aaf..14a04c9ee9f7b 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -49,186 +49,223 @@ def setUp(self): if paddle.is_compiled_with_cuda(): self.devices.append("gpu") - def run_hook_for_interior_var(self, double_hook, removed=False): - for device in self.devices: - paddle.set_device(device) - - x = paddle.to_tensor([0., 1., 2., 3.]) - y = paddle.to_tensor([4., 5., 6., 7.]) - x.stop_gradient = False - y.stop_gradient = False - - w = x + y - w.stop_gradient = False - helper = w.register_hook(double_hook) - - z = paddle.to_tensor([1., 2., 3., 4.]) - z.stop_gradient = False - - o = z.matmul(w) - - # remove hook before backward - if removed: - helper.remove() - - o.backward() + def 
test_hook_for_interior_var(self): + def run_double_hook_for_interior_var(double_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + helper = w.register_hook(double_hook) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + + # z.grad is not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + # w.grad is not changed by hook + self.assertTrue(np.array_equal(w.grad, z.numpy())) + # x.grad and y.grad are changed if run hook + self.assertTrue( + np.array_equal(x.grad, + z.numpy() * 2 if not removed else z.numpy())) + self.assertTrue( + np.array_equal(y.grad, + z.numpy() * 2 if not removed else z.numpy())) + + def run_print_hook_for_interior_var(print_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + helper = w.register_hook(print_hook) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + + # all grads are not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + self.assertTrue(np.array_equal(w.grad, z.numpy())) + self.assertTrue(np.array_equal(x.grad, z.numpy())) + self.assertTrue(np.array_equal(y.grad, z.numpy())) + + def double_hook(grad): + grad = grad * 2 + print(grad) + return grad - # z.grad is not affected - self.assertTrue(np.array_equal(z.grad, w.numpy())) - # w.grad is not changed by hook - self.assertTrue(np.array_equal(w.grad, z.numpy())) - # x.grad and y.grad are changed if run hook - self.assertTrue( - np.array_equal(x.grad, - z.numpy() * 2 if not removed else z.numpy())) - self.assertTrue( - np.array_equal(y.grad, - z.numpy() * 2 if not removed else z.numpy())) - - def run_hook_for_leaf_var(self, double_hook, removed=False): - for device in self.devices: - paddle.set_device(device) + def print_hook(grad): + print(grad) - x = paddle.to_tensor([0., 1., 2., 3.]) - y = paddle.to_tensor([4., 5., 6., 7.]) - x.stop_gradient = False - y.stop_gradient = False - helper = y.register_hook(double_hook) + # register hook + run_double_hook_for_interior_var(double_hook) + # register hook and removed + run_double_hook_for_interior_var(double_hook, removed=True) - w = x + y - w.stop_gradient = False + # register hook + run_double_hook_for_interior_var(lambda grad: grad * 2) + # register hook and removed + run_double_hook_for_interior_var(lambda grad: grad * 2, removed=True) - z = paddle.to_tensor([1., 2., 3., 4.]) - z.stop_gradient = False + # register hook + run_print_hook_for_interior_var(print_hook) + # register hook and removed + run_print_hook_for_interior_var(print_hook, removed=True) - o = z.matmul(w) + def test_hook_for_leaf_var(self): + def run_double_hook_for_leaf_var(double_hook, removed=False): + for device in self.devices: + paddle.set_device(device) - # remove hook before backward - if removed: - helper.remove() + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + helper = y.register_hook(double_hook) - o.backward() + w = x + y + 
w.stop_gradient = False - # z.grad, w.grad, x.grad is not affected - self.assertTrue(np.array_equal(z.grad, w.numpy())) - self.assertTrue(np.array_equal(w.grad, z.numpy())) - self.assertTrue(np.array_equal(x.grad, z.numpy())) - # y.grad are changed if run hook - self.assertTrue( - np.array_equal(y.grad, - z.numpy() * 2 if not removed else z.numpy())) + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False - def run_hook_for_accumulated_grad(self, double_hook, removed=False): - for device in self.devices: - paddle.set_device(device) + o = z.matmul(w) - a = paddle.to_tensor([0., 1., 1., 2.]) - b = paddle.to_tensor([0., 0., 1., 2.]) - a.stop_gradient = False - b.stop_gradient = False + # remove hook before backward + if removed: + helper.remove() - helper1 = a.register_hook(double_hook) + o.backward() - x = a + b - x.stop_gradient = False + # z.grad, w.grad, x.grad is not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + self.assertTrue(np.array_equal(w.grad, z.numpy())) + self.assertTrue(np.array_equal(x.grad, z.numpy())) + # y.grad are changed if run hook + self.assertTrue( + np.array_equal(y.grad, + z.numpy() * 2 if not removed else z.numpy())) - helper2 = x.register_hook(double_hook) + # register hook + run_double_hook_for_leaf_var(lambda grad: grad * 2) + # register hook and removed + run_double_hook_for_leaf_var(lambda grad: grad * 2, removed=True) - y = paddle.to_tensor([4., 5., 6., 7.]) - z = paddle.to_tensor([1., 2., 3., 4.]) - y.stop_gradient = False - z.stop_gradient = False + def test_hook_for_accumulated_grad(self): + def run_double_hook_for_accumulated_grad(double_hook, removed=False): + for device in self.devices: + paddle.set_device(device) - o1 = x + y - o2 = x + z - o1.stop_gradient = False - o2.stop_gradient = False + a = paddle.to_tensor([0., 1., 1., 2.]) + b = paddle.to_tensor([0., 0., 1., 2.]) + a.stop_gradient = False + b.stop_gradient = False - o = o1.matmul(o2) + helper1 = a.register_hook(double_hook) - # remove hook before backward - if removed: - helper1.remove() - helper2.remove() + x = a + b + x.stop_gradient = False - o.backward() + helper2 = x.register_hook(double_hook) - base_grad = np.array([5., 9., 13., 19.]) - # x.grad is not changed - self.assertTrue(np.array_equal(x.grad, base_grad)) - # b.grad is changed by x.hook - self.assertTrue( - np.array_equal(b.grad, base_grad * 2 - if not removed else base_grad)) - # a.grad is changed by x.hook and a.hook - self.assertTrue( - np.array_equal(a.grad, base_grad * 4 - if not removed else base_grad)) - - def run_hook_in_model(self, - data, - label, - hook=None, - register=False, - remove=False): - for device in self.devices: - paddle.seed(self.seed) - paddle.set_device(device) + y = paddle.to_tensor([4., 5., 6., 7.]) + z = paddle.to_tensor([1., 2., 3., 4.]) + y.stop_gradient = False + z.stop_gradient = False - net = SimpleNet(self.in_size, self.out_size) - loss_fn = nn.MSELoss() + o1 = x + y + o2 = x + z + o1.stop_gradient = False + o2.stop_gradient = False - data = paddle.to_tensor(data) - label = paddle.to_tensor(label) + o = o1.matmul(o2) - ret1, out = net(data, hook, register, remove) - loss = loss_fn(out, label) - loss.backward() + # remove hook before backward + if removed: + helper1.remove() + helper2.remove() - return ret1.grad, net.linear1.weight.grad, net.linear1.bias.grad + o.backward() - def test_func_hook_for_interior_var(self): - def hook_fn(grad): - grad = grad * 2 - print(grad) - return grad + base_grad = np.array([5., 9., 13., 19.]) + # x.grad is not changed + 
self.assertTrue(np.array_equal(x.grad, base_grad)) + # b.grad is changed by x.hook + self.assertTrue( + np.array_equal(b.grad, base_grad * 2 + if not removed else base_grad)) + # a.grad is changed by x.hook and a.hook + self.assertTrue( + np.array_equal(a.grad, base_grad * 4 + if not removed else base_grad)) # register hook - self.run_hook_for_interior_var(hook_fn) + run_double_hook_for_accumulated_grad(lambda grad: grad * 2) # register hook and removed - self.run_hook_for_interior_var(hook_fn, removed=True) + run_double_hook_for_accumulated_grad( + lambda grad: grad * 2, removed=True) - def test_lambda_hook_for_interior_var(self): - # register hook - self.run_hook_for_interior_var(lambda grad: grad * 2) - # register hook and removed - self.run_hook_for_interior_var(lambda grad: grad * 2, removed=True) + def test_hook_in_model(self): + def run_double_hook_in_model(data, + label, + hook=None, + register=False, + remove=False): + for device in self.devices: + paddle.seed(self.seed) + paddle.set_device(device) - def test_hook_for_leaf_var(self): - # register hook - self.run_hook_for_leaf_var(lambda grad: grad * 2) - # register hook and removed - self.run_hook_for_leaf_var(lambda grad: grad * 2, removed=True) + net = SimpleNet(self.in_size, self.out_size) + loss_fn = nn.MSELoss() - def test_hook_for_accumulated_grad(self): - # register hook - self.run_hook_for_accumulated_grad(lambda grad: grad * 2) - # register hook and removed - self.run_hook_for_accumulated_grad(lambda grad: grad * 2, removed=True) + data = paddle.to_tensor(data) + label = paddle.to_tensor(label) + + ret1, out = net(data, hook, register, remove) + loss = loss_fn(out, label) + loss.backward() + + return ret1.grad, net.linear1.weight.grad, net.linear1.bias.grad - def test_hook_in_model(self): data = np.random.uniform( size=[self.batch_size, self.in_size]).astype('float32') label = np.random.uniform(size=[self.batch_size, 1]).astype('float32') # get original value - ret1_grad, linear1_w_grad, linear1_b_grad = self.run_hook_in_model( + ret1_grad, linear1_w_grad, linear1_b_grad = run_double_hook_in_model( data, label) # get value changed by hook - ret1_grad_hook, linear1_w_grad_hook, linear1_b_grad_hook = self.run_hook_in_model( + ret1_grad_hook, linear1_w_grad_hook, linear1_b_grad_hook = run_double_hook_in_model( data, label, lambda grad: grad * 2, True) # get value after removing hook - ret1_grad_rm, linear1_w_grad_rm, linear1_b_grad_rm = self.run_hook_in_model( + ret1_grad_rm, linear1_w_grad_rm, linear1_b_grad_rm = run_double_hook_in_model( data, label, lambda grad: grad * 2, True, True) # compare original value and with hook @@ -241,6 +278,95 @@ def test_hook_in_model(self): self.assertTrue(np.array_equal(linear1_w_grad, linear1_w_grad_rm)) self.assertTrue(np.array_equal(linear1_b_grad, linear1_b_grad_rm)) + def test_multiple_hooks_for_interior_var(self): + def run_multiple_hooks_for_interior_var(device, + hooks, + remove1=False, + remove2=False, + remove3=False): + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + + helpers = [] + for hook in hooks: + helper = w.register_hook(hook) + helpers.append(helper) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + if remove1: + helpers[0].remove() + if remove2: + helpers[1].remove() + if remove3: + helpers[2].remove() + + o.backward() + + return z.numpy(), w.grad, x.grad, y.grad + + def 
double_hook(grad): + return grad * 2 + + hooks = [double_hook, double_hook, double_hook] + + for device in self.devices: + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 8)) + self.assertTrue(np.array_equal(y_grad, z * 8)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove1=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 4)) + self.assertTrue(np.array_equal(y_grad, z * 4)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove2=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 4)) + self.assertTrue(np.array_equal(y_grad, z * 4)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove3=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 4)) + self.assertTrue(np.array_equal(y_grad, z * 4)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove1=True, remove2=True, remove3=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z)) + self.assertTrue(np.array_equal(y_grad, z)) + + def test_remove_one_hook_multiple_times(self): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([1., 2., 3., 4.]) + x.stop_gradient = False + + h = x.register_hook(lambda grad: grad * 2) + self.assertTrue(h.remove()) + self.assertFalse(h.remove()) + if __name__ == '__main__': unittest.main() From aa6857854eb8a2a219ebdb9786b22d634cc3f844 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 26 Mar 2021 09:40:23 +0000 Subject: [PATCH 09/16] fix double grad test failed --- paddle/fluid/imperative/basic_engine.cc | 6 ++-- .../fluid/imperative/gradient_accumulator.cc | 4 +-- .../fluid/imperative/partial_grad_engine.cc | 4 +++ .../fluid/dygraph/varbase_patch_methods.py | 9 ++++-- .../unittests/test_tensor_register_hook.py | 32 +++++++++++++++++++ 5 files changed, 47 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 29eac618e9427..d10d895e3d032 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -448,13 +448,13 @@ void BasicEngine::Execute() { if (!accumulator->SumGradCompleted()) { continue; } - // 1. Call Hooks for **inner_var_** + // 1. Call Hooks for `inner_var_` accumulator->CallHooks(); - // 2. Sum Gradient with Previous Graph + // 2. Sum Gradient `inner_var_` to `var_` of Current or Previous Graph accumulator->AccumulateGrad(); - // 3. Call backward Hooks for **var_** + // 3. Call backward Hooks for `var_` accumulator->CallReduceHooks(); } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 6779ab71e30b4..ce78157282c8d 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -385,8 +385,8 @@ static platform::Place GetPlaceOfVar( void GradientAccumulator::AccumulateGrad() { /** - * If the gradient has been calculated by previous graph, - * it should be added to the previous graph result. + * If the leaf gradient has been calculated done, the inner_var_ + * should be added to the var_. 
*/ if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { return; diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 8dd8cafc835ab..3da3a05ed1071 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -369,6 +369,10 @@ class GradientAccumulationInfo { *is_finished = (cur_ref_cnt_ == total_ref_cnt_); accumulator_->SumGrad(grad_var_partial, trace_id, unchange_input); + if (*is_finished && accumulator_->HasInnerVar()) { + accumulator_->AccumulateGrad(); + } + if (create_graph_) { VLOG(10) << "Store partial grad grad for double grad " << mapped_grad_var_->Name(); diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index c5ac15ee84987..22d605a45a8d1 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -289,10 +289,13 @@ def double_hook_fn(grad): o = z.matmul(w) o.backward() + # print_hook_fn print content in backward + # Tensor(shape=[4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [2., 4., 6., 8.]) - # ('w.grad: ', array([1., 2., 3., 4.], dtype=float32)) - # ('x.grad: ', array([ 4., 8., 12., 16.], dtype=float32)) - # ('y.grad: ', array([2., 4., 6., 8.], dtype=float32)) + print("w.grad:", w.grad) # w.grad: [1. 2. 3. 4.] + print("x.grad:", x.grad) # x.grad: [ 4. 8. 12. 16.] + print("y.grad:", y.grad) # y.grad: [2. 4. 6. 8.] # remove hook h.remove() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 14a04c9ee9f7b..f384d6fe75268 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -367,6 +367,38 @@ def test_remove_one_hook_multiple_times(self): self.assertTrue(h.remove()) self.assertFalse(h.remove()) + def test_hook_in_double_grad(self): + def double_print_hook(grad): + grad = grad * 2 + print(grad) + return grad + + x = paddle.ones(shape=[1], dtype='float32') + x.stop_gradient = False + + # hook only works in backward + # for forward var x, the x.grad generated in + # paddle.grad will not deal with by hook + x.register_hook(double_print_hook) + + y = x * x + + # Since y = x * x, dx = 2 * x + dx = paddle.grad( + outputs=[y], inputs=[x], create_graph=True, retain_graph=True)[0] + + z = y + dx + self.assertTrue(x.grad is None) + + # If create_graph = True, the gradient of dx + # would be backpropagated. Therefore, + # z = x * x + dx = x * x + 2 * x, and + # x.gradient() = 2 * x + 2 = 4.0 + # after changed by hook: 8.0 + + z.backward() + self.assertTrue(np.array_equal(x.grad, np.array([8.]))) + if __name__ == '__main__': unittest.main() From e8f799aa502ad02b66ee156735af7331259fe101 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 26 Mar 2021 11:05:44 +0000 Subject: [PATCH 10/16] remove reduce hook func --- paddle/fluid/pybind/imperative.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index b7ae709960479..8bb6c31117e92 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -105,9 +105,6 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook { PADDLE_THROW(platform::errors::Unavailable( "Hook function of Tensor raises an exception: %s.", e.what())); } catch (...) 
{ - // NOTE: this branch represents a very serious bug with - // low probability of occurrence, and we can't get its - // exception content here. PADDLE_THROW(platform::errors::Fatal( "Hook function of Tensor raises an unknown exception.")); } @@ -1058,8 +1055,6 @@ void BindImperative(py::module *m_ptr) { "Cannot remove hook on a tensor without gradient.")); return self.GradVarBase()->RemoveHook(hook_id); }) - .def("_register_grad_reduce_hook", - [](imperative::VarBase &self, const py::handle &hook) { return; }) .def("cpu", [](const std::shared_ptr &self) { if (platform::is_cpu_place(self->Place())) { From 21eceecd47c4af4e1211428e0c4c4604f91cc9be Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 29 Mar 2021 02:59:25 +0000 Subject: [PATCH 11/16] fix set empty error --- paddle/fluid/imperative/gradient_accumulator.cc | 1 + paddle/fluid/imperative/gradient_accumulator.h | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index ce78157282c8d..4bc71db2a083a 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -428,6 +428,7 @@ void GradientAccumulator::AccumulateGrad() { *(dst) = std::move(*src); var_->SetType(inner_var_->Type()); var_->SetDataType(inner_var_->DataType()); + var_->SetIsEmpty(false); } inner_var_.reset(); } diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 08a60ee03059e..0f500f7da31ef 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -52,9 +52,6 @@ class GradientAccumulator { << ") to store result of this Graph"; } - // TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag - var->SetIsEmpty(false); - // var_ is the final grad, processed by hooks and grad accumulation var_ = var; } From d5468e5a76d686d8af2066d89410af9178d6ef9d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 29 Mar 2021 09:25:24 +0000 Subject: [PATCH 12/16] polish code by comments --- paddle/fluid/imperative/basic_engine.cc | 77 +++++++++++-------- .../fluid/imperative/gradient_accumulator.cc | 37 ++++++--- .../fluid/imperative/gradient_accumulator.h | 2 +- .../fluid/dygraph/varbase_patch_methods.py | 2 +- .../unittests/test_tensor_register_hook.py | 31 +++++--- 5 files changed, 97 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index d10d895e3d032..cff4a23d4698c 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -251,6 +251,30 @@ void BasicEngine::PrepareDeps() { } } +static std::shared_ptr> CallGradientHooks( + const NameVarMap& bwd_ins, const std::string& op_type) { + std::shared_ptr> tmp_ins_ptr = nullptr; + for (const auto& pair : bwd_ins) { + for (size_t i = 0; i < pair.second.size(); ++i) { + auto& var = pair.second[i]; + if (var->HasHook()) { + if (tmp_ins_ptr == nullptr) { + tmp_ins_ptr = std::make_shared>(bwd_ins); + } + VLOG(3) << "Call " << var->GetHooks().size() << " hooks of " << op_type + << "'s input `" << pair.first << "`'s var `" << var->Name() + << "`."; + auto tmp_var = var; + for (const auto& hook_pair : var->GetHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + (*tmp_ins_ptr)[pair.first][i] = tmp_var; + } + } + } + return tmp_ins_ptr; +} + void BasicEngine::Execute() { if (init_node_ == nullptr) { return; @@ -282,16 +306,7 @@ 
void BasicEngine::Execute() { auto& bwd_outs = cur_op.GetOutsMap(); /** - * [ Why need temporary inputs and outputs here? ] - * - * 1. For inputs - * - Hook execution should not change original input tensor. - * User can register hook for Tensor's gradient, It is expected - * that the hook only affects the gradient of the backward - * propagation, and does not affect the gradient value input - * as the hook. - * - * 2. For outputs + * [ Why need temporary outputs here? ] * * - construct the temp output map, avoid to disrupt graph * - replace the element in the map by temp var, because a @@ -370,7 +385,7 @@ void BasicEngine::Execute() { // If a tmp var has been created, there is no need to create it // again. for (auto& in_var : - tmp_ins.at(inplace_grad_name_map.at(pair.first))) { + bwd_ins.at(inplace_grad_name_map.at(pair.first))) { if (in_var == var) { auto tmp_var = std::make_shared(var->Name()); tmp_var->SetType(var->Type()); @@ -389,7 +404,7 @@ void BasicEngine::Execute() { VLOG(4) << "Check whether there is any inplace operation affecting " "gradient calculation."; - for (auto& pair : tmp_ins) { + for (auto& pair : bwd_ins) { for (auto& var_wrapper : pair.second) { auto wrapper_version_snapshot = var_wrapper->InplaceVersionSnapshot(); auto tensor_version = @@ -412,26 +427,28 @@ void BasicEngine::Execute() { } } - for (auto& pair : tmp_ins) { - for (size_t i = 0; i < pair.second.size(); ++i) { - auto& var = pair.second[i]; - if (var->HasHook()) { - VLOG(3) << "Call " << var->GetHooks().size() << " hooks of " - << cur_op.Type() << "'s input `" << pair.first - << "`'s var `" << var->Name() << "`."; - auto tmp_var = var; - for (const auto& hook_pair : var->GetHooks()) { - tmp_var = (*hook_pair.second)(tmp_var); - } - tmp_ins[pair.first][i] = tmp_var; - } - } - } + /** + * [ Why need temporary inputs here? ] + * + * - Hook execution should not change original input tensor. + * User can register hook for Tensor's gradient, It is expected + * that the hook only affects the gradient of the backward + * propagation, and does not affect the gradient value input + * as the hook. + * - use `tmp_ins_ptr`, only copy bwd_ins when the var in bwd_ins + * hold hooks + */ + auto tmp_ins_ptr = CallGradientHooks(bwd_ins, cur_op.Type()); { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - OpBase::Run(cur_op.InnerOp(), tmp_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } } for (auto& pair : inplace_output_grad_var_list_) { @@ -449,7 +466,7 @@ void BasicEngine::Execute() { continue; } // 1. Call Hooks for `inner_var_` - accumulator->CallHooks(); + accumulator->CallGradientHooks(); // 2. 
Sum Gradient `inner_var_` to `var_` of Current or Previous Graph accumulator->AccumulateGrad(); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 4bc71db2a083a..a8de588569924 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -433,18 +433,24 @@ void GradientAccumulator::AccumulateGrad() { inner_var_.reset(); } -void GradientAccumulator::CallHooks() { - if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { - return; - } +void GradientAccumulator::CallGradientHooks() { + PADDLE_ENFORCE_EQ(var_->IsLeafGrad(), true, + platform::errors::Unavailable( + "Only leaf gradient Tensor can deal with by gradient " + "hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ( + SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call gradient hooks after sum gradient completed.")); PADDLE_ENFORCE_EQ( HasInnerVar(), true, - platform::errors::InvalidArgument( + platform::errors::PreconditionNotMet( "Leaf Tensor's inner var is nullptr when call gradient hook.")); - PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, - platform::errors::InvalidArgument("Leaf Tensor's inner var " - "is not initialized when " - "call gradient hook.")); + PADDLE_ENFORCE_EQ( + inner_var_->Var().IsInitialized(), true, + platform::errors::PreconditionNotMet("Leaf Tensor's inner var " + "is not initialized when " + "call gradient hook.")); if (var_->HasHook()) { VLOG(3) << "Call " << var_->GetHooks().size() << " hooks of leaf gradient accumulator's inner var `" @@ -460,6 +466,19 @@ void GradientAccumulator::CallHooks() { } void GradientAccumulator::CallReduceHooks() { + PADDLE_ENFORCE_EQ( + var_->IsLeafGrad(), true, + platform::errors::Unavailable("Only leaf gradient Tensor can deal with " + "by reduce hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ(SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the gradient " + "summation is completed in current batch.")); + PADDLE_ENFORCE_EQ(HasInnerVar(), false, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the " + "gradient accumulation is completed in " + "current batch or across batchs.")); if (var_->HasReduceHook()) { for (const auto& hook : var_->GetReduceHooks()) { VLOG(3) << "call gradient accumulator backward hooks."; diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 0f500f7da31ef..6411dce4405c1 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -119,7 +119,7 @@ class GradientAccumulator { * parallel multi-card training. */ - void CallHooks(); + void CallGradientHooks(); void CallReduceHooks(); diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 22d605a45a8d1..e565552632f87 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -262,7 +262,7 @@ def register_hook(self, hook): TensorHookRemoveHelper: A helper object that can be used to remove the registered hook by calling `remove()` method. Examples: - .. code-block:: python + .. 
code-block:: python import paddle diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index f384d6fe75268..a390dd9d80756 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -356,17 +356,6 @@ def double_hook(grad): self.assertTrue(np.array_equal(x_grad, z)) self.assertTrue(np.array_equal(y_grad, z)) - def test_remove_one_hook_multiple_times(self): - for device in self.devices: - paddle.set_device(device) - - x = paddle.to_tensor([1., 2., 3., 4.]) - x.stop_gradient = False - - h = x.register_hook(lambda grad: grad * 2) - self.assertTrue(h.remove()) - self.assertFalse(h.remove()) - def test_hook_in_double_grad(self): def double_print_hook(grad): grad = grad * 2 @@ -399,6 +388,26 @@ def double_print_hook(grad): z.backward() self.assertTrue(np.array_equal(x.grad, np.array([8.]))) + def test_remove_one_hook_multiple_times(self): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([1., 2., 3., 4.]) + x.stop_gradient = False + + h = x.register_hook(lambda grad: grad * 2) + self.assertTrue(h.remove()) + self.assertFalse(h.remove()) + + def test_register_hook_for_stop_gradient_var(self): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([1., 2., 3., 4.]) + + with self.assertRaises(RuntimeError): + x.register_hook(lambda grad: grad * 2) + if __name__ == '__main__': unittest.main() From c0838dcc5885b962152f8539cb0c79468e12bbce Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 29 Mar 2021 10:02:02 +0000 Subject: [PATCH 13/16] change reduce_hook to mutable_hook --- paddle/fluid/imperative/gradient_accumulator.cc | 4 ++-- paddle/fluid/imperative/layer.h | 4 ++-- paddle/fluid/imperative/reducer.cc | 2 +- paddle/fluid/imperative/tests/test_hooks.cc | 4 ++-- paddle/fluid/imperative/variable_wrapper.h | 12 ++++++------ 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index a8de588569924..7c2f818675c66 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -479,8 +479,8 @@ void GradientAccumulator::CallReduceHooks() { "Only can call reduce hooks after the " "gradient accumulation is completed in " "current batch or across batchs.")); - if (var_->HasReduceHook()) { - for (const auto& hook : var_->GetReduceHooks()) { + if (var_->HasMutableHook()) { + for (const auto& hook : var_->GetMutableHooks()) { VLOG(3) << "call gradient accumulator backward hooks."; (*hook)(var_); } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 625af0c1fc3da..f87db415768a1 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -236,8 +236,8 @@ class VarBase { return var_->GetHooks(); } - void AddReduceHook(std::shared_ptr&& hook) { - var_->AddReduceHook( + void AddMutableHook(std::shared_ptr&& hook) { + var_->AddMutableHook( std::forward>(hook)); } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 775e7008fc700..4b18886821b8e 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -310,7 +310,7 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - 
var->GradVarBase()->AddReduceHook( + var->GradVarBase()->AddMutableHook( std::make_shared([=]( VariableWrapper *grad) { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index a196cd263c120..9b75fac0ca5c4 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -74,7 +74,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { mul_attr_map["use_mkldnn"] = false; // add GradAccumulatorPostHook - x->GradVarBase()->AddReduceHook( + x->GradVarBase()->AddMutableHook( std::make_shared( [=](VariableWrapper* grad) { auto* grad_tensor = @@ -151,7 +151,7 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { sizeof(float) * src_data.size()); // add ReduceBackwardHook - x->GradVarBase()->AddReduceHook( + x->GradVarBase()->AddMutableHook( std::make_shared( [=](VariableWrapper* grad) { auto* grad_tensor = diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 77d097678acda..7d287c9829104 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -222,7 +222,7 @@ class VariableWrapper { /* Hook related methods */ bool HasHook() const { return !hooks_.empty(); } - bool HasReduceHook() const { return !reduce_hooks_.empty(); } + bool HasMutableHook() const { return !mutable_hooks_.empty(); } int64_t AddHook(std::shared_ptr&& hook) { hooks_.emplace(next_hook_id_, std::move(hook)); @@ -242,13 +242,13 @@ class VariableWrapper { return hooks_; } - void AddReduceHook(std::shared_ptr&& hook) { - reduce_hooks_.emplace_back(std::move(hook)); + void AddMutableHook(std::shared_ptr&& hook) { + mutable_hooks_.emplace_back(std::move(hook)); } const std::vector>& - GetReduceHooks() const { - return reduce_hooks_; + GetMutableHooks() const { + return mutable_hooks_; } private: @@ -326,7 +326,7 @@ class VariableWrapper { std::map> hooks_; // Hooks executed after the execution of the entire backward process is over, // currently only supported for reducing in distributed training - std::vector> reduce_hooks_; + std::vector> mutable_hooks_; }; } // namespace imperative From dbd3c34f71565845b349ff4726ef05e71a8c1822 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 29 Mar 2021 12:15:57 +0000 Subject: [PATCH 14/16] remove useless tmp_ins --- paddle/fluid/imperative/basic_engine.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index cff4a23d4698c..9e46af9cb72f8 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -312,7 +312,6 @@ void BasicEngine::Execute() { * - replace the element in the map by temp var, because a * var may be coresponding to several grad var in one op */ - NameVarMap tmp_ins(bwd_ins); NameVarMap tmp_outs(bwd_outs); for (auto& pair : tmp_outs) { From 7c9fd7089c8f6765734b993b1ac51cdf9a73ef1d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 31 Mar 2021 11:33:56 +0000 Subject: [PATCH 15/16] fix shape code format error --- paddle/fluid/pybind/imperative.cc | 38 ++++++++++++++++++------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 8bb6c31117e92..d6054c1889f4b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1234,22 
+1234,28 @@ void BindImperative(py::module *m_ptr) { &imperative::VarBase::SetOverridedStopGradient) .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property_readonly( - "shape", - [](imperative::VarBase &self) { - if (self.Var().IsType()) { - return framework::vectorize( - self.Var().Get().dims()); - } else if (self.Var().IsType()) { - return framework::vectorize( - self.Var().Get().value().dims()); - } else { - VLOG(2) << "It is meaningless to get shape of " - "variable type " - << GetTypeName(self); - return std::vector(); - } - }) + .def_property_readonly("shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return framework::vectorize( + self.Var() + .Get() + .dims()); + } else if (self.Var() + .IsType< + framework::SelectedRows>()) { + return framework::vectorize( + self.Var() + .Get() + .value() + .dims()); + } else { + VLOG(2) << "It is meaningless to get shape of " + "variable type " + << GetTypeName(self); + return std::vector(); + } + }) .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. From ef087a5d7d9bf67b6fe6c77406a484e84fe03e22 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 31 Mar 2021 11:51:03 +0000 Subject: [PATCH 16/16] fix shape code format error --- paddle/fluid/pybind/imperative.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 2dbceed21cfe2..38ba1dc029303 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1303,8 +1303,8 @@ void BindImperative(py::module *m_ptr) { if (self.Var().IsType()) { return framework::vectorize( self.Var() - .Get() - .dims()); + .Get() + .dims()); } else if (self.Var() .IsType< framework::SelectedRows>()) {
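
A minimal usage sketch of the Tensor gradient hook API introduced by this series (illustrative only, not part of the patches; it assumes a Paddle build that includes these changes and mirrors the cases asserted in test_tensor_register_hook.py):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([0., 1., 2., 3.], stop_gradient=False)
    y = paddle.to_tensor([4., 5., 6., 7.], stop_gradient=False)
    z = paddle.to_tensor([1., 2., 3., 4.])

    w = x + y
    w.stop_gradient = False

    # The hook doubles the gradient flowing back through w,
    # but does not change w.grad itself.
    h = w.register_hook(lambda grad: grad * 2)

    o = z.matmul(w)
    o.backward()

    # w.grad == z.numpy(); x.grad == y.grad == z.numpy() * 2
    assert h.remove()        # first removal succeeds and returns True
    assert not h.remove()    # removing again warns and returns False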