From 61b472ccebd94299878ef49de9819610a55679d4 Mon Sep 17 00:00:00 2001
From: ybc <bichengying@gmail.com>
Date: Sun, 7 Feb 2021 21:06:29 -0800
Subject: [PATCH 01/36] Fixed the self_weight under emtpy receiving case

---
 bluefog/torch/mpi_ops.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc
index f50e272d..124d41cd 100644
--- a/bluefog/torch/mpi_ops.cc
+++ b/bluefog/torch/mpi_ops.cc
@@ -439,6 +439,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
             }
             output.resize_(shape_vector);
             MaybeCopyBufferBack(output, output_buffer);
+          } else {  // recv_size == 0 
+            output.mul_(self_weight);
           }
         }));
 
@@ -529,8 +531,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
             }
             output.resize_(shape_vector);
             MaybeCopyBufferBack(output, output_buffer);
-          } else {
-            output.set_(tensor);
+          } else { // recv_size == 0
+            output.set_(tensor.mul(self_weight));
           }
         }));
     ThrowIfError(enqueue_result);

From 7e8ea66a1d047b362b1fb9a1e1bf91d9bd45d725 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 11 Feb 2021 00:52:30 -0800
Subject: [PATCH 02/36] Enable empty send neighbors and fix HalfTensor for
 recv_size==0

---
 bluefog/torch/mpi_ops.cc | 9 +++++++--
 bluefog/torch/mpi_ops.py | 3 ---
 test/torch_ops_test.py   | 2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc
index 124d41cd..4e517ab3 100644
--- a/bluefog/torch/mpi_ops.cc
+++ b/bluefog/torch/mpi_ops.cc
@@ -440,7 +440,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
             output.resize_(shape_vector);
             MaybeCopyBufferBack(output, output_buffer);
           } else {  // recv_size == 0 
-            output.mul_(self_weight);
+            ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
+            output_buffer.mul_(self_weight);
+            MaybeCopyBufferBack(output, output_buffer);
           }
         }));
 
@@ -532,7 +534,10 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
             output.resize_(shape_vector);
             MaybeCopyBufferBack(output, output_buffer);
           } else { // recv_size == 0
-            output.set_(tensor.mul(self_weight));
+            output.set_(tensor);
+            ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
+            output_buffer.mul_(self_weight);
+            MaybeCopyBufferBack(output, output_buffer);
           }
         }));
     ThrowIfError(enqueue_result);
diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py
index fd1d103b..fa52dcdd 100644
--- a/bluefog/torch/mpi_ops.py
+++ b/bluefog/torch/mpi_ops.py
@@ -441,9 +441,6 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weight
     elif self_weight is None or neighbor_weights is None:
         raise ValueError("Arguments self_weight and neighbor_weights should be presented if "
                          "enabling dynamic topology.")
-    elif not send_neighbors:
-        raise ValueError("Argument send_neighbors cannot be empty list but we plan to support "
-                         "it in future.")
     else:
         dynamic_neighbors_enabled = True
     if self_weight is None and neighbor_weights is None:
diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py
index 7dda2708..4a555563 100644
--- a/test/torch_ops_test.py
+++ b/test/torch_ops_test.py
@@ -521,7 +521,7 @@ def test_neighbor_allreduce_dynamic_topo_move(self):
                 (reduced_tensor.data - (rank-1) % size).abs().max() < eps
             ), "bf.neighbor_allreduce (move) produces incorrect reduced tensor"
 
-    @unittest.skip("Haven't fully clear on the usage due to sync issues. Temporarily disabled")
+    #@unittest.skip("Haven't fully clear on the usage due to sync issues. Temporarily disabled")
     def test_neighbor_allreduce_dynamic_topo_with_empty_send_neighbors(self):
         """Test that the neighbor all reduce (avg) 1D, 2D, 3D tensors correctly with empty
            send_neighbors."""

From 1d43096ed136dd6ffac1eaa5e34a4bb4a42362c3 Mon Sep 17 00:00:00 2001
From: ybc <bichengying@gmail.com>
Date: Sun, 7 Feb 2021 21:06:29 -0800
Subject: [PATCH 03/36] Fixed the self_weight under emtpy receiving case

---
 bluefog/torch/mpi_ops.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc
index f50e272d..124d41cd 100644
--- a/bluefog/torch/mpi_ops.cc
+++ b/bluefog/torch/mpi_ops.cc
@@ -439,6 +439,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
             }
             output.resize_(shape_vector);
             MaybeCopyBufferBack(output, output_buffer);
+          } else {  // recv_size == 0 
+            output.mul_(self_weight);
           }
         }));
 
@@ -529,8 +531,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
             }
             output.resize_(shape_vector);
             MaybeCopyBufferBack(output, output_buffer);
-          } else {
-            output.set_(tensor);
+          } else { // recv_size == 0
+            output.set_(tensor.mul(self_weight));
           }
         }));
     ThrowIfError(enqueue_result);

From c082ce93c8e6949e3d9804b3a6a10680678d1fa0 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 11 Feb 2021 00:52:30 -0800
Subject: [PATCH 04/36] Enable empty send neighbors and fix HalfTensor for
 recv_size==0

---
 bluefog/torch/mpi_ops.cc | 9 +++++++--
 bluefog/torch/mpi_ops.py | 3 ---
 test/torch_ops_test.py   | 2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc
index 124d41cd..4e517ab3 100644
--- a/bluefog/torch/mpi_ops.cc
+++ b/bluefog/torch/mpi_ops.cc
@@ -440,7 +440,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
             output.resize_(shape_vector);
             MaybeCopyBufferBack(output, output_buffer);
           } else {  // recv_size == 0 
-            output.mul_(self_weight);
+            ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
+            output_buffer.mul_(self_weight);
+            MaybeCopyBufferBack(output, output_buffer);
           }
         }));
 
@@ -532,7 +534,10 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
             output.resize_(shape_vector);
             MaybeCopyBufferBack(output, output_buffer);
           } else { // recv_size == 0
-            output.set_(tensor.mul(self_weight));
+            output.set_(tensor);
+            ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
+            output_buffer.mul_(self_weight);
+            MaybeCopyBufferBack(output, output_buffer);
           }
         }));
     ThrowIfError(enqueue_result);
diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py
index fd1d103b..fa52dcdd 100644
--- a/bluefog/torch/mpi_ops.py
+++ b/bluefog/torch/mpi_ops.py
@@ -441,9 +441,6 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weight
     elif self_weight is None or neighbor_weights is None:
         raise ValueError("Arguments self_weight and neighbor_weights should be presented if "
                          "enabling dynamic topology.")
-    elif not send_neighbors:
-        raise ValueError("Argument send_neighbors cannot be empty list but we plan to support "
-                         "it in future.")
     else:
         dynamic_neighbors_enabled = True
     if self_weight is None and neighbor_weights is None:
diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py
index 7dda2708..4a555563 100644
--- a/test/torch_ops_test.py
+++ b/test/torch_ops_test.py
@@ -521,7 +521,7 @@ def test_neighbor_allreduce_dynamic_topo_move(self):
                 (reduced_tensor.data - (rank-1) % size).abs().max() < eps
             ), "bf.neighbor_allreduce (move) produces incorrect reduced tensor"
 
-    @unittest.skip("Haven't fully clear on the usage due to sync issues. Temporarily disabled")
+    #@unittest.skip("Haven't fully clear on the usage due to sync issues. Temporarily disabled")
     def test_neighbor_allreduce_dynamic_topo_with_empty_send_neighbors(self):
         """Test that the neighbor all reduce (avg) 1D, 2D, 3D tensors correctly with empty
            send_neighbors."""

From a3365b30f095255c8f775cbd88a5f0f19e4ad831 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 18 Feb 2021 23:12:47 -0800
Subject: [PATCH 05/36] Rename neighbor_weights to src_weights, and
 send_neighbors to dst_weights for neighbor_allreduce

---
 bluefog/torch/mpi_ops.py              | 98 ++++++++++++++-------------
 bluefog/torch/optimizers.py           | 16 ++---
 examples/pytorch_average_consensus.py |  4 +-
 examples/pytorch_benchmark.py         |  4 +-
 examples/pytorch_mnist.py             |  4 +-
 examples/pytorch_optimization.py      | 17 +++--
 examples/pytorch_resnet.py            |  4 +-
 test/torch_ops_test.py                | 18 ++---
 test/torch_optimizer_test.py          |  4 +-
 9 files changed, 88 insertions(+), 81 deletions(-)

diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py
index fa52dcdd..82c55e07 100644
--- a/bluefog/torch/mpi_ops.py
+++ b/bluefog/torch/mpi_ops.py
@@ -15,7 +15,7 @@
 # ==============================================================================
 
 from contextlib import contextmanager
-from typing import List, Dict, Optional
+from typing import List, Dict, Union, Optional
 
 import torch
 
@@ -430,46 +430,46 @@ def _neighbor_allreduce_function_factory(tensor):
     return 'bluefog_torch_neighbor_allreduce_nonblocking_' + tensor.type().replace('.', '_')
 
 
-def _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weights,
-                                    send_neighbors, enable_topo_check, name):
+def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
+                                    dst_weights, enable_topo_check, name):
     function = _check_function(_neighbor_allreduce_function_factory, tensor)
-    if send_neighbors is None:
-        send_neighbors = []
+    if dst_weights is None:
+        dst_weights = []
         dynamic_neighbors_enabled = False
-    elif len(set(send_neighbors)) != len(send_neighbors):
-        raise ValueError("Argument send_neighbors should only contain the unique ranks.")
-    elif self_weight is None or neighbor_weights is None:
-        raise ValueError("Arguments self_weight and neighbor_weights should be presented if "
+    elif len(set(dst_weights)) != len(dst_weights):
+        raise ValueError("Argument dst_weights should only contain the unique ranks.")
+    elif self_weight is None or src_weights is None:
+        raise ValueError("Arguments self_weight and src_weights should be presented if "
                          "enabling dynamic topology.")
     else:
         dynamic_neighbors_enabled = True
-    if self_weight is None and neighbor_weights is None:
+    if self_weight is None and src_weights is None:
         # Implying this is static graph.
         if is_topo_weighted():
             topology = load_topology()
-            self_weight, neighbor_weights = GetRecvWeights(topology, rank())
+            self_weight, src_weights = GetRecvWeights(topology, rank())
             weighted_average_computation = True
         else:
             weight = 1.0/(len(in_neighbor_ranks())+1)
             self_weight = weight
-            neighbor_weights = {r: weight for r in in_neighbor_ranks()}
+            src_weights = {r: weight for r in in_neighbor_ranks()}
             weighted_average_computation = False
-    elif self_weight is not None and neighbor_weights is not None:
-        if not isinstance(neighbor_weights, dict):
+    elif self_weight is not None and src_weights is not None:
+        if not isinstance(src_weights, dict):
             raise ValueError("Argument neighbor_weights has to be a dictionary map from the "
                              "(in-)neighbor rank to the weights.")
         if not isinstance(self_weight, float):
             raise ValueError(
                 "Argument self_weight has to be a float for self rank.")
         if not dynamic_neighbors_enabled and \
-           not set(neighbor_weights.keys()).issubset(set(in_neighbor_ranks())):
+           not set(src_weights.keys()).issubset(set(in_neighbor_ranks())):
             raise ValueError("The key of weights should only contain the ranks that belong to "
                              " in-neighbors and self rank.")
-        uniform_weights = 1.0/(len(neighbor_weights)+1)
+        uniform_weights = 1.0/(len(src_weights)+1)
         weighted_average_computation = False
         if abs(self_weight - uniform_weights) > 1e-6:
             weighted_average_computation = True
-        for n_weights in neighbor_weights.values():
+        for n_weights in src_weights.values():
             if abs(n_weights - uniform_weights) > 1e-6:
                 weighted_average_computation = True
                 break
@@ -477,18 +477,18 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weight
         raise ValueError("Arguments self_weight and neighbor_weights have to be presented at "
                          "the same time")
     is_hierarchical = False
-    handle = getattr(mpi_lib, function)(tensor, output, self_weight, neighbor_weights,
-                                        send_neighbors, dynamic_neighbors_enabled,
+    handle = getattr(mpi_lib, function)(tensor, output, self_weight, src_weights,
+                                        dst_weights, dynamic_neighbors_enabled,
                                         enable_topo_check, weighted_average_computation,
                                         is_hierarchical, name.encode() if name is not None else "")
     _handle_map[handle] = (tensor, output)
     return handle
 
 
-def neighbor_allreduce(tensor: torch.Tensor,
+def neighbor_allreduce(tensor: torch.Tensor, *,
                        self_weight: Optional[float] = None,
-                       neighbor_weights: Optional[Dict[int, float]] = None,
-                       send_neighbors: Optional[List[int]] = None,
+                       src_weights: Optional[Dict[int, float]] = None,
+                       dst_weights: Optional[Union[Dict[int, float], List[int]]] = None,
                        enable_topo_check: bool = True,
                        name: Optional[str] = None) -> torch.Tensor:
     """
@@ -505,17 +505,17 @@ def neighbor_allreduce(tensor: torch.Tensor,
     Arguments:
         tensor: A tensor to execute weighted average with neighbors.
         self_weight: The weight for self node, used with neighbor_weights.
-        neighbor_weights: The weights for in-neighbor nodes, used with self weight.
+        src_weights: The weights for in-neighbor nodes, used with self weight.
             If neighbor_weights is presented, the return tensor will return the weighted average
             defined by these weights and the self_weight. If not, the return tensor will return
             the weighted average defined by the topology weights is provided or uniformly average.
             The data structure of weights should be {rank : weight} and rank has to belong to the
             (in-)neighbors.
-        send_neighbors: The list of neighbor nodes to be sent to. If set to be None, assume the
+        dst_weights: The weights for out-neighbor nodes. If set to be None, assume the
             the current node sends to all of its (out-)neighbors. If having values, assume only
-            part of (out-)neighbors will be sent to. In this mode, this node sends its value to
-            partial neighbors listed in this variable in a dynamic graph, and `self_weight` and
-            `neighbor_weights` must be present.
+            part of (out-)neighbors will be sent to. If set to be a list, assume all the weights
+            are one. In this mode, this node sends its value to partial neighbors listed in this
+            variable in a dynamic graph, and `self_weight` and `src_weights` must be present.
         enable_topo_check: When send_neighbors is present, enabling this option checks if the
             sending and recieving neighbors match with each other. Disabling this check can boost
             the performance.
@@ -526,19 +526,23 @@ def neighbor_allreduce(tensor: torch.Tensor,
 
     Note: self_weight and neighbor_weights must be presented at the same time.
     """
-    if (self_weight is None and neighbor_weights is not None) or \
-       (self_weight is not None and neighbor_weights is None):
-        raise ValueError("Arguments self_weight and neighbor_weights have to be presented at "
+    if (self_weight is None and src_weights is not None) or \
+       (self_weight is not None and src_weights is None):
+        raise ValueError("Arguments self_weight and src_weights have to be presented at "
                          "the same time")
-    handle = neighbor_allreduce_nonblocking(tensor, self_weight, neighbor_weights,
-                                            send_neighbors, enable_topo_check, name)
+    handle = neighbor_allreduce_nonblocking(tensor,
+                                            self_weight=self_weight,
+                                            src_weights=src_weights,
+                                            dst_weights=dst_weights,
+                                            enable_topo_check=enable_topo_check,
+                                            name=name)
     return synchronize(handle)
 
 
-def neighbor_allreduce_nonblocking(tensor: torch.Tensor,
+def neighbor_allreduce_nonblocking(tensor: torch.Tensor, *,
                                    self_weight: Optional[float] = None,
-                                   neighbor_weights: Optional[Dict[int, float]] = None,
-                                   send_neighbors: Optional[List[int]] = None,
+                                   src_weights: Optional[Dict[int, float]] = None,
+                                   dst_weights: Optional[Union[Dict[int, float], List[int]]] = None,
                                    enable_topo_check: bool = True,
                                    name: Optional[str] = None) -> int:
     """
@@ -555,17 +559,17 @@ def neighbor_allreduce_nonblocking(tensor: torch.Tensor,
     Arguments:
         tensor: A tensor to execute weighted average with neighbors.
         self_weight: The weight for self node, used with neighbor_weights.
-        neighbor_weights: The weights for in-neighbor nodes, used with self weight.
+        src_weights: The weights for in-neighbor nodes, used with self weight.
             If neighbor_weights is presented, the return tensor will return the weighted average
             defined by these weights and the self_weight. If not, the return tensor will return
             the weighted average defined by the topology weights is provided or uniformly average.
             The data structure of weights should be {rank : weight} and rank has to belong to the
             (in-)neighbors.
-        send_neighbors: The list of neighbor nodes to be sent to. If set to be None, assume the
+        dst_weights: The weights for out-neighbor nodes. If set to be None, assume the
             the current node sends to all of its (out-)neighbors. If having values, assume only
-            part of (out-)neighbors will be sent to. In this mode, this node sends its value to
-            partial neighbors listed in this variable in a dynamic graph, and `self_weight` and
-            `neighbor_weights` must be present.
+            part of (out-)neighbors will be sent to. If set to be a list, assume all the weights
+            are one. In this mode, this node sends its value to partial neighbors listed in this
+            variable in a dynamic graph, and `self_weight` and `src_weights` must be present.
         enable_topo_check: When send_neighbors is present, enabling this option checks if the
             sending and recieving neighbors match with each other. Disabling this check can boost
             the performance.
@@ -577,18 +581,18 @@ def neighbor_allreduce_nonblocking(tensor: torch.Tensor,
 
     Note: self_weight and neighbor_weights must be presented at the same time.
     """
-    if (self_weight is None and neighbor_weights is not None) or \
-       (self_weight is not None and neighbor_weights is None):
-        raise ValueError("Arguments self_weight and neighbor_weights have to be presented at "
+    if (self_weight is None and src_weights is not None) or \
+       (self_weight is not None and src_weights is None):
+        raise ValueError("Arguments self_weight and src_weights have to be presented at "
                          "the same time")
-    if send_neighbors is None:
+    if dst_weights is None:
         first_dim = tensor.shape[0] * len(in_neighbor_ranks())
     else:
-        first_dim = tensor.shape[0] * len(neighbor_weights)
+        first_dim = tensor.shape[0] * len(src_weights)
     new_shape = torch.Size([first_dim] + list(tensor.shape[1:]))
     output = tensor.new(new_shape)  # Pre-allocate the memory for the output.
-    return _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weights,
-                                           send_neighbors, enable_topo_check, name=name)
+    return _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
+                                           dst_weights, enable_topo_check, name=name)
 
 
 def hierarchical_neighbor_allreduce(tensor: torch.Tensor,
diff --git a/bluefog/torch/optimizers.py b/bluefog/torch/optimizers.py
index ceeb5a94..82fc677c 100644
--- a/bluefog/torch/optimizers.py
+++ b/bluefog/torch/optimizers.py
@@ -324,8 +324,8 @@ def __init__(self, params, model, communication_type, num_steps_per_communicatio
         named_parameters, models = _check_named_parameters(self, model)
         # knobs for neighbor communication behavior
         self.self_weight = None
-        self.neighbor_weights = None
-        self.send_neighbors = None
+        self.src_weights = None
+        self.dst_weights = None
         self.neighbor_machine_weights = None
         self.send_neighbor_machines = None
         self.enable_topo_check = False
@@ -394,8 +394,8 @@ def hook(model, *unused):
     def _neighbor_allreduce_data_async(self, p):
         name = self._parameter_names.get(p)
         handle = bf.neighbor_allreduce_nonblocking(p.data, name=name, self_weight=self.self_weight,
-                                                   neighbor_weights=self.neighbor_weights,
-                                                   send_neighbors=self.send_neighbors,
+                                                   src_weights=self.src_weights,
+                                                   dst_weights=self.dst_weights,
                                                    enable_topo_check=self.enable_topo_check)
         return handle
 
@@ -489,8 +489,8 @@ def __init__(self, params, model, communication_type, backward_passes_per_step=1
         named_parameters, models = _check_named_parameters(self, model)
         # knobs for neighbor communication behavior
         self.self_weight = None
-        self.neighbor_weights = None
-        self.send_neighbors = None
+        self.src_weights = None
+        self.dst_weights = None
         self.neighbor_machine_weights = None
         self.send_neighbor_machines = None
         self.enable_topo_check = False
@@ -762,8 +762,8 @@ def _adadelta_step(self, p, grad, param_group):
     def _neighbor_allreduce_data_async(self, p):
         name = self._parameter_names.get(p)
         handle = bf.neighbor_allreduce_nonblocking(p.data, name=name, self_weight=self.self_weight,
-                                                   neighbor_weights=self.neighbor_weights,
-                                                   send_neighbors=self.send_neighbors,
+                                                   src_weights=self.src_weights,
+                                                   dst_weights=self.dst_weights,
                                                    enable_topo_check=self.enable_topo_check)
         return handle
 
diff --git a/examples/pytorch_average_consensus.py b/examples/pytorch_average_consensus.py
index 0b1b2f1e..95711596 100644
--- a/examples/pytorch_average_consensus.py
+++ b/examples/pytorch_average_consensus.py
@@ -99,8 +99,8 @@
             self_weight = 1 / (len(recv_neighbors) + 1)
 
         x = bf.neighbor_allreduce(x, name='x', self_weight=self_weight,
-                                  neighbor_weights=neighbor_weights,
-                                  send_neighbors=send_neighbors, enable_topo_check=False)
+                                  src_weights=neighbor_weights,
+                                  dst_weights=send_neighbors, enable_topo_check=False)
         mse.append(torch.norm(x-x_bar, p=2) / torch.norm(x_bar, p=2))
 else:
     outdegree = len(bf.out_neighbor_ranks())
diff --git a/examples/pytorch_benchmark.py b/examples/pytorch_benchmark.py
index 958e6571..64cf1eb2 100644
--- a/examples/pytorch_benchmark.py
+++ b/examples/pytorch_benchmark.py
@@ -186,8 +186,8 @@ def dynamic_topology_update(batch_idx):
         optimizer.dst_weights = {sent_neighbor: 1.0}
     elif args.dist_optimizer == 'neighbor_allreduce':
         send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen)
-        optimizer.send_neighbors = send_neighbors
-        optimizer.neighbor_weights = {
+        optimizer.dst_weights = send_neighbors
+        optimizer.src_weights = {
             r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors}
         optimizer.self_weight = 1 / (len(recv_neighbors) + 1)
         optimizer.enable_topo_check = False
diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py
index 8315dffe..7643369c 100644
--- a/examples/pytorch_mnist.py
+++ b/examples/pytorch_mnist.py
@@ -215,8 +215,8 @@ def dynamic_topology_update(epoch, batch_idx):
         optimizer.dst_weights = {sent_neighbor: 1.0}
     elif args.dist_optimizer == 'neighbor_allreduce':
         send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen)
-        optimizer.send_neighbors = send_neighbors
-        optimizer.neighbor_weights = {r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors}
+        optimizer.dst_weights = send_neighbors
+        optimizer.src_weights = {r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors}
         optimizer.self_weight = 1 / (len(recv_neighbors) + 1)
         optimizer.enable_topo_check = False
     elif args.dist_optimizer == 'hierarchical_neighbor_allreduce':
diff --git a/examples/pytorch_optimization.py b/examples/pytorch_optimization.py
index 058a1ca2..162f9040 100644
--- a/examples/pytorch_optimization.py
+++ b/examples/pytorch_optimization.py
@@ -199,11 +199,13 @@ def diffusion(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):
         loss_step(X, y, w,
                   tensor_name='neighbor.allreduce.local_variable', loss=loss, rho=rho)
 
+        # diffusion
         with torch.no_grad():
-            # diffusion
             phi = w - alpha * w.grad.data
-            w.data = bf.neighbor_allreduce(
-                phi, self_weight, neighbor_weights, name='local variable')
+            w.data = bf.neighbor_allreduce(phi,
+                                           self_weight=self_weight,
+                                           src_weights=neighbor_weights,
+                                           name='local variable')
             w.grad.data.zero_()
 
             # record convergence
@@ -271,8 +273,10 @@ def exact_diffusion(X, y, w_opt, loss, maxite=2000, alpha=1e-1, use_Abar=True, *
         with torch.no_grad():
             psi = w - alpha * w.grad.data
             phi = psi + w.data - psi_prev
-            w.data = bf.neighbor_allreduce(
-                phi, self_weight, neighbor_weights, name='local variable')
+            w.data = bf.neighbor_allreduce(phi,
+                                           self_weight=self_weight,
+                                           src_weights=neighbor_weights,
+                                           name='local variable')
             psi_prev = psi.clone()
             w.grad.data.zero_()
 
@@ -330,8 +334,7 @@ def gradient_tracking(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):
         # q^{k+1} = neighbor_allreduce(q^k) + grad(w^{k+1}) - grad(w^k)
 
         # Notice the communication of neighbor_allreduce can overlap with gradient computation.
-        w_handle = bf.neighbor_allreduce_nonblocking(
-            w.data, name='Grad.Tracking.w')
+        w_handle = bf.neighbor_allreduce_nonblocking(w.data, name='Grad.Tracking.w')
         q_handle = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q')
         w.data = bf.synchronize(w_handle) - alpha * q
         # calculate local gradient
diff --git a/examples/pytorch_resnet.py b/examples/pytorch_resnet.py
index b0ab4ce4..62f3eb01 100644
--- a/examples/pytorch_resnet.py
+++ b/examples/pytorch_resnet.py
@@ -361,8 +361,8 @@ def dynamic_topology_update(epoch, batch_idx):
         optimizer.dst_weights = {sent_neighbor: 1.0}
     elif args.dist_optimizer == 'neighbor_allreduce':
         send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen)
-        optimizer.send_neighbors = send_neighbors
-        optimizer.neighbor_weights = {r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors}
+        optimizer.dst_weights = send_neighbors
+        optimizer.src_weights = {r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors}
         optimizer.self_weight = 1 / (len(recv_neighbors) + 1)
         optimizer.enable_topo_check = False
     elif args.dist_optimizer == 'hierarchical_neighbor_allreduce':
diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py
index 4a555563..f3ec3240 100644
--- a/test/torch_ops_test.py
+++ b/test/torch_ops_test.py
@@ -387,7 +387,7 @@ def test_neighbor_allreduce_sum_precision(self):
             name = "neighbor_allreduce_{}_{}".format(dim, dtype)
             nw = {i: 1.0 for i in neighbor_ranks}
             reduced_tensor = bf.neighbor_allreduce(tensor, self_weight=1.0,
-                                                   neighbor_weights=nw, name=name)
+                                                   src_weights=nw, name=name)
             assert (
                 list(reduced_tensor.shape) == [23] * dim
             ), "bf.neighbor_allreduce (avg) produces incorrect reduced shape"
@@ -450,7 +450,7 @@ def test_neighbor_allreduce_dynamic_topo_check(self):
             name = "neighbor_allreduce_{}_{}".format(dim, dtype)
             with pytest.raises(ValueError):
                 bf.neighbor_allreduce(tensor, name=name, self_weight=self_weight,
-                                      neighbor_weights=neighbor_weights, send_neighbors=send_ranks)
+                                      src_weights=neighbor_weights, dst_weights=send_ranks)
 
     def test_neighbor_allreduce_dynamic_topo_outside_static_topo_move(self):
         """Test that the neighbor all reduce (move) 1D, 2D, 3D tensors correctly
@@ -477,7 +477,7 @@ def test_neighbor_allreduce_dynamic_topo_outside_static_topo_move(self):
             name = "neighbor_allreduce_{}_{}".format(dim, dtype)
             reduced_tensor = bf.neighbor_allreduce(
                 tensor, name=name, self_weight=self_weight,
-                neighbor_weights=neighbor_weights, send_neighbors=send_ranks)
+                src_weights=neighbor_weights, dst_weights=send_ranks)
             eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON
             tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor)
             assert (
@@ -511,7 +511,7 @@ def test_neighbor_allreduce_dynamic_topo_move(self):
             name = "neighbor_allreduce_{}_{}".format(dim, dtype)
             reduced_tensor = bf.neighbor_allreduce(
                 tensor, name=name, self_weight=self_weight,
-                neighbor_weights=neighbor_weights, send_neighbors=send_ranks)
+                src_weights=neighbor_weights, dst_weights=send_ranks)
             eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON
             tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor)
             assert (
@@ -553,7 +553,7 @@ def test_neighbor_allreduce_dynamic_topo_with_empty_send_neighbors(self):
             name = "neighbor_allreduce_{}_{}".format(dim, dtype)
             reduced_tensor = bf.neighbor_allreduce(
                 tensor, name=name, self_weight=self_weight,
-                neighbor_weights=neighbor_weights, send_neighbors=send_ranks)
+                src_weights=neighbor_weights, dst_weights=send_ranks)
             eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON
             tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor)
             assert (
@@ -591,7 +591,7 @@ def test_neighbor_allreduce_dynamic_topo_avg(self):
             name = "neighbor_allreduce_{}_{}".format(dim, dtype)
             reduced_tensor = bf.neighbor_allreduce(
                 tensor, name=name, self_weight=self_weight,
-                neighbor_weights=neighbor_weights, send_neighbors=send_ranks)
+                src_weights=neighbor_weights, dst_weights=send_ranks)
             eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON
             tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor)
             assert (
@@ -818,7 +818,7 @@ def test_neighbor_allreduce_sum(self):
             tensor = self.cast_and_place(tensor, dtype)
             nw = {i: 1.0 for i in neighbor_ranks}
             reduced_tensor = bf.neighbor_allreduce(tensor, self_weight=1.0,
-                                                   neighbor_weights=nw)
+                                                   src_weights=nw)
             tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor)
             assert (
                 list(reduced_tensor.shape) == [23] * dim
@@ -994,10 +994,10 @@ def test_neighbor_allreduce_dynamic_topo_fusion(self):
 
             handle_1 = bf.neighbor_allreduce_nonblocking(
                 tensor_1, name=name_1, self_weight=self_weight,
-                neighbor_weights=neighbor_weights, send_neighbors=send_ranks)
+                src_weights=neighbor_weights, dst_weights=send_ranks)
             handle_2 = bf.neighbor_allreduce_nonblocking(
                 tensor_2, name=name_2, self_weight=self_weight,
-                neighbor_weights=neighbor_weights, send_neighbors=send_ranks)
+                src_weights=neighbor_weights, dst_weights=send_ranks)
 
             output_1 = bf.synchronize(handle_1)
             output_2 = bf.synchronize(handle_2)
diff --git a/test/torch_optimizer_test.py b/test/torch_optimizer_test.py
index 8da25053..7508a4b9 100644
--- a/test/torch_optimizer_test.py
+++ b/test/torch_optimizer_test.py
@@ -226,8 +226,8 @@ def dynamic_neighbor_allreduce_train(model, optimizer, dataloader, isCUDA, dynam
     model.train()
     for data, target in dataloader:
         send_neighbors, recv_neighbors = next(dynamic_topo_gen)
-        optimizer.send_neighbors = send_neighbors
-        optimizer.neighbor_weights = {
+        optimizer.dst_weights = send_neighbors
+        optimizer.src_weights = {
             r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors}
         optimizer.self_weight = 1 / (len(recv_neighbors) + 1)
 

From cf178757de19adae43509f32d8e18bd6f6524d96 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 18 Feb 2021 23:13:19 -0800
Subject: [PATCH 06/36] A script to test existing examples

---
 examples/test_all_example.sh | 92 ++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100755 examples/test_all_example.sh

diff --git a/examples/test_all_example.sh b/examples/test_all_example.sh
new file mode 100755
index 00000000..af9aa491
--- /dev/null
+++ b/examples/test_all_example.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+NUM_PROC=4
+BFRUN="bfrun -np ${NUM_PROC}"
+RUN_DIR="$( pwd )"
+EXAMPLE_DIR="$( cd "$( dirname "$0" )" && pwd )"
+
+die() { echo >&2 -e "\nERROR: $@\n"; exit 1; }
+check() {
+    "$@" >>/dev/null 2>&1;
+    exit_code=$?;
+    [ $exit_code -eq 0 ] \
+        && echo "Command [$*] succeed" \
+        || die "Command [$*] failed with error code $exit_code";
+}
+
+# PyTorch Average Concensus Cases
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --enable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --asynchronous-mode
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --enable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --asynchronous-mode
+
+# PyTorch Optimization Cases
+[ -f "${RUN_DIR}/plot.png" ] && EXIST_PLOT_PNG=1 || EXIST_PLOT_PNG=0
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_optimization.py --method=diffusion
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_optimization.py --method=exact_diffusion
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_optimization.py --method=gradient_tracking
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_optimization.py --method=push_diging
+[ "${EXIST_PLOT_PNG}" == 0 ] && rm -f ${RUN_DIR}/plot.png
+
+# PyTorch MNIST Cases
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=gradient_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=win_put
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=win_put --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology
+
+# PyTorch Benchmark Cases
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=gradient_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=win_put
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=win_put --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=gradient_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology
+
+# PyTorch ResNet Cases
+[ -d "${RUN_DIR}/logs" ] && EXIST_LOGS_DIR=1 || EXIST_LOGS_DIR=0
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=gradient_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=win_put
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=win_put --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=win_put
+check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology
+[ "${EXIST_LOGS_DIR}" == 0 ] && rm -rf ${RUN_DIR}/logs
\ No newline at end of file

From 923df09c72f09f2e4964f62a852bd3c257743942 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Fri, 19 Feb 2021 01:30:20 -0800
Subject: [PATCH 07/36] Accept dst_weights as Dict, and reorganize
 DoNeighborAllreduce

---
 bluefog/torch/mpi_ops.cc | 317 ++++++++++++++++-----------------------
 bluefog/torch/mpi_ops.h  |  14 +-
 bluefog/torch/mpi_ops.py |   4 +-
 3 files changed, 140 insertions(+), 195 deletions(-)

diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc
index 4e517ab3..55fd3e34 100644
--- a/bluefog/torch/mpi_ops.cc
+++ b/bluefog/torch/mpi_ops.cc
@@ -82,8 +82,6 @@ void MaybeCopyBufferBack(::torch::Tensor tensor, ::torch::Tensor buffer) {
   if (IsCPUHalfTensor(tensor)) tensor.copy_(buffer.to(::torch::kFloat16));
 }
 
-}  // namespace
-
 std::function<std::function<void(const Status&)>(std::function<void()>)>
     GetCallbackWrapper(int handle, Timeline* timeline_ptr, const std::string& op_name,
     std::thread::id tid) {
@@ -98,6 +96,99 @@ std::function<std::function<void(const Status&)>(std::function<void()>)>
     };
 }
 
+void PerformNeighborAllreduceCallback(::torch::Tensor tensor, ::torch::Tensor output,
+                                      double self_weight,
+                                      const std::unordered_map<int, double>& src_weights,
+                                      const std::vector<int>& src_neighbors,
+                                      bool avg_computation,
+                                      bool dynamic_neighbors_enabled,
+                                      bool is_hierarchical) {
+  int src_size = bluefog_neighbor_size();
+  if (dynamic_neighbors_enabled) src_size = src_neighbors.size();
+  if (src_size > 0) {
+    ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
+    ::torch::Tensor tensor_buffer = MaybeCopyToTensorBuffer(tensor);
+
+    int first_dim = output_buffer.size(0) / src_size;
+    std::vector<int64_t> shape_vector;
+    shape_vector.push_back(first_dim);
+    for (int idx = 1; idx < tensor_buffer.dim(); ++idx) {
+      shape_vector.push_back(tensor_buffer.size(idx));
+    }
+
+    // if avg_computation is set to be False, sum computation will be taken place.
+    if (avg_computation) {
+      // 1) For a distributed graph topology, created with
+      // MPI_Dist_graph_create, the sequence of neighbors in the send and
+      // receive buffers at each process is defined as the sequence returned
+      // by MPI_Dist_graph_neighbors for destinations and sources,
+      // respectively. 2) MPI_Dist_graph_neighbors: If the communicator was
+      // created with MPI_Dist_graph_create_adjacent then the order of the
+      // values in sources and destinations is identical to the input that
+      // was used by the process with the same rank in comm_old in the
+      // creation call.
+      int* sources_ptr = nullptr;
+      if (!dynamic_neighbors_enabled) {
+        int indgree = 0;
+        int outdegree = 0;
+        int* destinations_ptr = nullptr;
+        bluefog_load_topology(&indgree, sources_ptr, &outdegree,
+                              destinations_ptr);
+      }
+      auto output_reduced = output_buffer.slice(0, 0, first_dim);
+      for (int i = 0; i < src_size; i++) {
+        double weight = 0.0;
+        int src_rank;
+        if (!dynamic_neighbors_enabled) src_rank = *(sources_ptr+i);
+        else src_rank = src_neighbors[i];
+        auto it = src_weights.find(src_rank);
+        if (it != src_weights.end()) {
+          weight = it->second;
+        }
+
+        if (i == 0) {
+          output_reduced.mul_(weight);
+        } else {
+          output_reduced.add_(
+              output_buffer.slice(0, i * first_dim, (i + 1) * first_dim), weight);
+        }
+      }
+      output_buffer.resize_(shape_vector);
+      output_buffer.add_(tensor_buffer, self_weight);
+      if (is_hierarchical){
+        // Because there is ncclAllreduce just take sum.
+        output_buffer.div_(bluefog_local_size());
+      }
+    } else { // avg_computation is False, using sum operation
+      if (src_size > 1) {
+        auto output_reduced = output_buffer.slice(0, 0, first_dim);
+        for (int i = 1; i < src_size; i++) {
+          output_reduced.add_(
+              output_buffer.slice(0, i * first_dim, (i + 1) * first_dim));
+        }
+        output_buffer.resize_(shape_vector);
+      }
+      // Include self data as well.
+      output_buffer.add_(tensor_buffer);
+      if (is_hierarchical){
+        // Because there is ncclAllreduce just take sum.
+        output_buffer.div_(bluefog_local_size() * (src_size + 1));
+      } else {
+        output_buffer.div_(src_size + 1);
+      }
+    }
+    output.resize_(shape_vector);
+    MaybeCopyBufferBack(output, output_buffer);
+  } else {  // recv_size == 0
+    output.set_(tensor);
+    ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
+    output_buffer.mul_(self_weight);
+    MaybeCopyBufferBack(output, output_buffer);
+  }
+}
+
+}  // namespace
+
 int DoAllreduce(::torch::Tensor tensor, ::torch::Tensor output, int average,
                 bool is_hierarchical_local, const std::string& name) {
   ThrowIfError(common::CheckInitialized());
@@ -315,10 +406,11 @@ int DoNeighborAllgather(::torch::Tensor tensor, ::torch::Tensor output,
 }
 
 int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
-                        double self_weight, const std::unordered_map<int, double>& neighbor_weights,
-                        const std::vector<int>& send_neighbors, bool dynamic_neighbors_enabled,
-                        bool enable_topo_check, bool avg_computation, bool is_hierarchical,
-                        const std::string& name) {
+                        double self_weight,
+                        const std::unordered_map<int, double>& src_weights,
+                        const std::unordered_map<int, double>& dst_weights,
+                        bool dynamic_neighbors_enabled, bool enable_topo_check,
+                        bool avg_computation, bool is_hierarchical, const std::string& name) {
   ThrowIfError(common::CheckInitialized());
 
   auto handle = handle_manager.AllocateHandle();
@@ -333,117 +425,43 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
 
   auto callback_wrapper = GetCallbackWrapper(handle, timeline_ptr, op_name, tid);
 
-  std::vector<int> recv_neighbors;
-  for (auto kv : neighbor_weights)
-      recv_neighbors.push_back(kv.first);
-  std::sort(recv_neighbors.begin(), recv_neighbors.end());
+  std::vector<int> src_neighbors;
+  for (auto kv : src_weights)
+      src_neighbors.push_back(kv.first);
+  std::sort(src_neighbors.begin(), src_neighbors.end());
+
+  std::vector<int> dst_neighbors;
+  for (auto kv : dst_weights)
+      dst_neighbors.push_back(kv.first);
+  std::sort(dst_neighbors.begin(), dst_neighbors.end());
 
+  auto bf_src_neighbors = std::make_shared<std::vector<int>>(src_neighbors);
+  auto bf_dst_neighbors = std::make_shared<std::vector<int>>(dst_neighbors);
+  auto ready_event = RecordReadyEvent(device);
   if (OPS_ON_CPU && tensor.device().is_cuda()) {
     ::torch::Tensor cpu_buffer =
         tensor.to(::torch::Device(::torch::kCPU), /*non_blocking=*/false);
     ::torch::Tensor cpu_output =
         output.to(::torch::Device(::torch::kCPU), /*non_blocking=*/false);
     auto bf_tensor = std::make_shared<TorchTensor>(cpu_buffer);
-    auto bf_context =
-        std::make_shared<TorchOpContext>(CPU_DEVICE_ID, cpu_output);
+    auto bf_context = std::make_shared<TorchOpContext>(CPU_DEVICE_ID, cpu_output);
     auto bf_output = std::make_shared<TorchTensor>(cpu_output);
-    auto bf_recv_neighbors = std::make_shared<std::vector<int>>(recv_neighbors);
-    auto bf_send_neighbors = std::make_shared<std::vector<int>>(send_neighbors);
-    auto ready_event = RecordReadyEvent(device);
+
     auto enqueue_result = EnqueueTensorNeighborAllreduce(
-        bf_tensor, bf_output, bf_context, ready_event, bf_recv_neighbors,
-        bf_send_neighbors, dynamic_neighbors_enabled, is_hierarchical,
+        bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors,
+        bf_dst_neighbors, dynamic_neighbors_enabled, is_hierarchical,
         enable_topo_check, op_name, CPU_DEVICE_ID,
-        callback_wrapper([self_weight, neighbor_weights, avg_computation,
-                          cpu_output, tensor, recv_neighbors, send_neighbors,
+        callback_wrapper([self_weight, src_weights, avg_computation,
+                          cpu_output, tensor, src_neighbors,
                           dynamic_neighbors_enabled, is_hierarchical, output,
                           device]() mutable {
           with_device device_guard(device);
           output.copy_(cpu_output);
-          int recv_size = bluefog_neighbor_size();
-          if (dynamic_neighbors_enabled) recv_size = recv_neighbors.size();
-          if (recv_size > 0) {
-            ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
-            ::torch::Tensor tensor_buffer = MaybeCopyToTensorBuffer(tensor);
-            
-            int first_dim = output_buffer.size(0) / recv_size;
-            std::vector<int64_t> shape_vector;
-            shape_vector.push_back(first_dim);
-            for (int idx = 1; idx < tensor_buffer.dim(); ++idx) {
-              shape_vector.push_back(tensor_buffer.size(idx));
-            }
-
-            // if avg_computation is set to be False, sum computation will be taken place.
-            if (avg_computation) {
-              // 1) For a distributed graph topology, created with
-              // MPI_Dist_graph_create, the sequence of neighbors in the send and
-              // receive buffers at each process is defined as the sequence returned
-              // by MPI_Dist_graph_neighbors for destinations and sources,
-              // respectively. 2) MPI_Dist_graph_neighbors: If the communicator was
-              // created with MPI_Dist_graph_create_adjacent then the order of the
-              // values in sources and destinations is identical to the input that
-              // was used by the process with the same rank in comm_old in the
-              // creation call.
-              int indgree = 0;
-              int outdegree = 0;
-              int* sources_ptr = nullptr;
-              int* destinations_ptr = nullptr;
-              bluefog_load_topology(&indgree, sources_ptr, &outdegree,
-                                    destinations_ptr);
-
-              auto output_reduced = output_buffer.slice(0, 0, first_dim);
-              if (dynamic_neighbors_enabled) indgree = recv_neighbors.size();
-              for (int i = 0; i < indgree; i++) {
-                double weight = 0.0;
-                int recv_rank;
-                if (!dynamic_neighbors_enabled) recv_rank = *(sources_ptr+i);
-                else recv_rank = recv_neighbors[i];
-                auto it = neighbor_weights.find(recv_rank);
-                if (it != neighbor_weights.end()) {
-                  weight = it->second;
-                }
-
-                if (i == 0) {
-                  output_reduced.mul_(weight);
-                } else {
-                  output_reduced.add_(
-                      output_buffer.slice(0, i * first_dim, (i + 1) * first_dim), weight);
-                }
-              }
-              output_buffer.resize_(shape_vector);
-              output_buffer.add_(tensor_buffer, self_weight);
-              if (is_hierarchical){
-                // Because there is ncclAllreduce just take sum.
-                output_buffer.div_(bluefog_local_size());
-              }
-            } else {
-              int neighbor_size = !dynamic_neighbors_enabled
-                                  ? bluefog_neighbor_size()
-                                  : recv_neighbors.size();
-              if (neighbor_size > 1) {
-                auto output_reduced = output_buffer.slice(0, 0, first_dim);
-                for (int i = 1; i < neighbor_size; i++) {
-                  output_reduced.add_(
-                      output_buffer.slice(0, i * first_dim, (i + 1) * first_dim));
-                }
-                output_buffer.resize_(shape_vector);
-              }
-              // Include self data as well.
-              output_buffer.add_(tensor_buffer);
-              if (is_hierarchical){
-                // Because there is ncclAllreduce just take sum.
-                output_buffer.div_(bluefog_local_size() * (neighbor_size + 1));
-              } else {
-                output_buffer.div_(neighbor_size + 1);
-              }
-            }
-            output.resize_(shape_vector);
-            MaybeCopyBufferBack(output, output_buffer);
-          } else {  // recv_size == 0 
-            ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
-            output_buffer.mul_(self_weight);
-            MaybeCopyBufferBack(output, output_buffer);
-          }
+          PerformNeighborAllreduceCallback(tensor, output, self_weight,
+                                           src_weights, src_neighbors,
+                                           avg_computation,
+                                           dynamic_neighbors_enabled,
+                                           is_hierarchical);
         }));
 
     ThrowIfError(enqueue_result);
@@ -451,94 +469,19 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
     auto bf_tensor = std::make_shared<TorchTensor>(tensor);
     auto bf_context = std::make_shared<TorchOpContext>(device, output);
     auto bf_output = std::make_shared<TorchTensor>(output);
-    auto bf_recv_neighbors = std::make_shared<std::vector<int>>(recv_neighbors);
-    auto bf_send_neighbors = std::make_shared<std::vector<int>>(send_neighbors);
-    auto ready_event = RecordReadyEvent(device);
 
     auto enqueue_result = EnqueueTensorNeighborAllreduce(
-        bf_tensor, bf_output, bf_context, ready_event, bf_recv_neighbors,
-        bf_send_neighbors, dynamic_neighbors_enabled, is_hierarchical,
+        bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors,
+        bf_dst_neighbors, dynamic_neighbors_enabled, is_hierarchical,
         enable_topo_check, op_name, device,
-        callback_wrapper([self_weight, neighbor_weights, avg_computation,
-                          recv_neighbors, send_neighbors, dynamic_neighbors_enabled,
+        callback_wrapper([self_weight, src_weights, avg_computation,
+                          src_neighbors, dynamic_neighbors_enabled,
                           is_hierarchical, tensor, output]() mutable {
-          int recv_size = bluefog_neighbor_size();
-          if (dynamic_neighbors_enabled) recv_size = recv_neighbors.size();
-          if (recv_size > 0) {
-            ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
-            ::torch::Tensor tensor_buffer = MaybeCopyToTensorBuffer(tensor);
-
-            int first_dim = output_buffer.size(0) / recv_size;
-            std::vector<int64_t> shape_vector;
-            shape_vector.push_back(first_dim);
-            for (int idx = 1; idx < tensor_buffer.dim(); ++idx) {
-              shape_vector.push_back(tensor_buffer.size(idx));
-            }
-            // if avg_computation is set to be True, average computation will be taken place.
-            if (avg_computation) {
-              int indgree = 0;
-              int outdegree = 0;
-              int* sources_ptr = nullptr;
-              int* destinations_ptr = nullptr;
-              auto output_reduced = output_buffer.slice(0, 0, first_dim);
-              if (!dynamic_neighbors_enabled) {
-                bluefog_load_topology(&indgree, sources_ptr, &outdegree,
-                                      destinations_ptr);
-              } else {
-                indgree = recv_neighbors.size();
-              }
-              for (int i = 0; i < indgree; i++) {
-                double weight = 0.0;
-                int recv_rank;
-                if (!dynamic_neighbors_enabled) recv_rank = *(sources_ptr+i);
-                else recv_rank = recv_neighbors[i];
-                auto it = neighbor_weights.find(recv_rank);
-                if (it != neighbor_weights.end()) {
-                  weight = it->second;
-                }
-
-                if (i == 0) {
-                  output_reduced.mul_(weight);
-                } else {
-                  output_reduced.add_(
-                      output_buffer.slice(0, i * first_dim, (i + 1) * first_dim), weight);
-                }
-              }
-              output_buffer.resize_(shape_vector);
-              output_buffer.add_(tensor_buffer, self_weight);
-              if (is_hierarchical){
-                // Because there is ncclAllreduce just take sum.
-                output_buffer.div_(bluefog_local_size());
-              }
-            } else {
-              int neighbor_size = !dynamic_neighbors_enabled
-                                  ? bluefog_neighbor_size()
-                                  : recv_neighbors.size();
-              if (neighbor_size > 1) {
-                auto output_reduced = output_buffer.slice(0, 0, first_dim);
-                for (int i = 1; i < neighbor_size; i++) {
-                  output_reduced.add_(
-                      output.slice(0, i * first_dim, (i + 1) * first_dim));
-                }
-              }
-              output_buffer.resize_(shape_vector);
-              // Include self data as well.
-              output_buffer.add_(tensor_buffer);
-              if (is_hierarchical){
-                // Because there is ncclAllreduce just take sum.
-                output_buffer.div_(bluefog_local_size() * (neighbor_size + 1));
-              } else {
-                output_buffer.div_(neighbor_size + 1);
-              }
-            }
-            output.resize_(shape_vector);
-            MaybeCopyBufferBack(output, output_buffer);
-          } else { // recv_size == 0
-            output.set_(tensor);
-            ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
-            output_buffer.mul_(self_weight);
-            MaybeCopyBufferBack(output, output_buffer);
-          }
+          PerformNeighborAllreduceCallback(tensor, output, self_weight,
+                                           src_weights, src_neighbors,
+                                           avg_computation,
+                                           dynamic_neighbors_enabled,
+                                           is_hierarchical);
         }));
     ThrowIfError(enqueue_result);
   }
diff --git a/bluefog/torch/mpi_ops.h b/bluefog/torch/mpi_ops.h
index 5dc2cd7b..66ecb38c 100644
--- a/bluefog/torch/mpi_ops.h
+++ b/bluefog/torch/mpi_ops.h
@@ -111,13 +111,13 @@ NEIGHBOR_ALLGATHER_H(torch_cuda_FloatTensor, THCudaTensor)
 NEIGHBOR_ALLGATHER_H(torch_cuda_DoubleTensor, THCudaDoubleTensor)
 #endif
 
-#define NEIGHBOR_ALLREDUCE_H(torch_Tensor, THTensor)                             \
-  extern "C" int bluefog_torch_neighbor_allreduce_nonblocking_##torch_Tensor(    \
-      THTensor* tensor, THTensor* output, double self_weight,                    \
-      const std::unordered_map<int, double>& neighbor_weights,                   \
-      const std::vector<int>& send_neighbors, bool dynamic_neighbors_enabled,    \
-      bool enable_topo_check, bool avg_computation, bool is_hierarchical,        \
-      char* name);
+#define NEIGHBOR_ALLREDUCE_H(torch_Tensor, THTensor)                           \
+  extern "C" int bluefog_torch_neighbor_allreduce_nonblocking_##torch_Tensor(  \
+      THTensor* tensor, THTensor* output, double self_weight,                  \
+      const std::unordered_map<int, double>& src_weights,                      \
+      const std::unordered_map<int, double>& dst_weights,                      \
+      bool dynamic_neighbors_enabled, bool enable_topo_check,                  \
+      bool avg_computation, bool is_hierarchical, char* name);
 
 NEIGHBOR_ALLREDUCE_H(torch_HalfTensor, THHalfTensor)
 NEIGHBOR_ALLREDUCE_H(torch_FloatTensor, THFloatTensor)
diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py
index 82c55e07..3720dd6a 100644
--- a/bluefog/torch/mpi_ops.py
+++ b/bluefog/torch/mpi_ops.py
@@ -434,7 +434,7 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
                                     dst_weights, enable_topo_check, name):
     function = _check_function(_neighbor_allreduce_function_factory, tensor)
     if dst_weights is None:
-        dst_weights = []
+        dst_weights = {}
         dynamic_neighbors_enabled = False
     elif len(set(dst_weights)) != len(dst_weights):
         raise ValueError("Argument dst_weights should only contain the unique ranks.")
@@ -443,6 +443,8 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
                          "enabling dynamic topology.")
     else:
         dynamic_neighbors_enabled = True
+        if isinstance(dst_weights, list):
+            dst_weights = {dst:1.0 for dst in dst_weights}
     if self_weight is None and src_weights is None:
         # Implying this is static graph.
         if is_topo_weighted():

From 42d0af797216ea234dec8521a44a70c0f4839cb6 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Fri, 19 Feb 2021 02:51:14 -0800
Subject: [PATCH 08/36] Reorganize CheckNeighborSendRecvPattern

---
 bluefog/common/mpi_controller.cc  | 109 ++++++++++++++++--------------
 bluefog/common/mpi_controller.h   |   7 +-
 bluefog/common/nccl_controller.cc |  28 ++++++--
 3 files changed, 86 insertions(+), 58 deletions(-)

diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index b52451b9..26093260 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -293,54 +293,47 @@ void MPIController::NeighborAllgather(TensorTableEntry& entry) {
 }
 
 // Function to check if the sending and receiving neighbors match in the topology.
-bool CheckNeighborSendRecvPattern(int size, const TensorTableEntry& entry,
-                                  Timeline* timeline_ptr, const MPI_Comm& comm) {
+bool CheckNeighborSendRecvPattern(
+  std::shared_ptr<std::vector<int>> send_neighbors,
+  std::shared_ptr<std::vector<int>> recv_neighbors,
+  const std::string& tensor_name,
+  int size, Timeline* timeline_ptr, const MPI_Comm& comm) {
   bool res = false;
-  // enabled the check if enable_topo_check is true and partial
-  // neighbor_allreduce is activated.
-  if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
-    if (entry.is_hierarchical) {
-      // TODO: support check.
-      BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
-                  << "allreduce ops but it is not supported yet.";
-      return res;
-    }
-    timeline_ptr->ActivityStart(entry.tensor_name, "NEGOTIATION");
-    // Put all the send and recv neighbors in a single vector, and obtain a send
-    // matrix and a recv matrix through MPI_Allgather.
-    bool* send_check_buf = new bool[2 * size];
-    std::fill_n(send_check_buf, 2 * size, false);
-    bool* recv_check_buf = new bool[2 * size * size];
-    for (int send_rank : *(entry.send_neighbors))
-      send_check_buf[send_rank] = true;
-    for (int recv_rank : *(entry.recv_neighbors))
-      send_check_buf[size + recv_rank] = true;
-    int ret_code = MPI_Allgather(send_check_buf, size * 2, MPI_C_BOOL,
-                                 recv_check_buf, size * 2, MPI_C_BOOL, comm);
-    if (ret_code != MPI_SUCCESS) {
-      throw std::runtime_error(
-          "MPI_Allgather (for dynamic neighbor_allreduce negotiation) failed, "
-          "see MPI output for details.");
-    }
-    // This checks that send matrix and transposed recv matrix should be the
-    // same. If same, the topology is good to go. If not, there is mismatch edge
-    // to be fixed.
-    auto GetSendIndex = [size](int i, int j) -> int { return 2*size*i+j; };
-    auto GetRecvIndex = [size](int i, int j) -> int { return 2*size*i+j+size; };
-    for (int i = 0; i < size; ++i) {
-      if (res) break;
-      for (int j = 0; j < size; ++j) {
-        if (recv_check_buf[GetSendIndex(i, j)] !=
-            recv_check_buf[GetRecvIndex(j, i)]) {
-          res = true;
-          break;
-        }
+  timeline_ptr->ActivityStart(tensor_name, "NEGOTIATION");
+  // Put all the send and recv neighbors in a single vector, and obtain a send
+  // matrix and a recv matrix through MPI_Allgather.
+  bool* send_check_buf = new bool[2 * size];
+  std::fill_n(send_check_buf, 2 * size, false);
+  bool* recv_check_buf = new bool[2 * size * size];
+  for (int send_rank : *(send_neighbors))
+    send_check_buf[send_rank] = true;
+  for (int recv_rank : *(recv_neighbors))
+    send_check_buf[size + recv_rank] = true;
+  int ret_code = MPI_Allgather(send_check_buf, size * 2, MPI_C_BOOL,
+                               recv_check_buf, size * 2, MPI_C_BOOL, comm);
+  if (ret_code != MPI_SUCCESS) {
+    throw std::runtime_error(
+        "MPI_Allgather (for dynamic neighbor_allreduce negotiation) failed, "
+        "see MPI output for details.");
+  }
+  // This checks that send matrix and transposed recv matrix should be the
+  // same. If same, the topology is good to go. If not, there is mismatch edge
+  // to be fixed.
+  auto GetSendIndex = [size](int i, int j) -> int { return 2*size*i+j; };
+  auto GetRecvIndex = [size](int i, int j) -> int { return 2*size*i+j+size; };
+  for (int i = 0; i < size; ++i) {
+    if (res) break;
+    for (int j = 0; j < size; ++j) {
+      if (recv_check_buf[GetSendIndex(i, j)] !=
+          recv_check_buf[GetRecvIndex(j, i)]) {
+        res = true;
+        break;
       }
     }
-    delete [] send_check_buf;
-    delete [] recv_check_buf;
-    timeline_ptr->ActivityEnd(entry.tensor_name);
   }
+  delete [] send_check_buf;
+  delete [] recv_check_buf;
+  timeline_ptr->ActivityEnd(tensor_name);
   return res;
 }
 
@@ -385,9 +378,17 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
 
   // If only partial sending is enabled, the following code block checks whether the sending
   // and recieving neighbors match each other when enable_topo_check is set to be True.
-  bool is_topo_check_fail = CheckNeighborSendRecvPattern(
-      mpi_ctx_.size_, entry, timeline_ptr,
-      mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+  bool is_topo_check_fail = false;
+  if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
+    if (entry.is_hierarchical) {
+      // TODO: support check.
+      BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
+                  << "allreduce ops but it is not supported yet.";
+    }
+    is_topo_check_fail = CheckNeighborSendRecvPattern(
+        entry.send_neighbors, entry.recv_neighbors, entry.tensor_name,
+        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+  }
 
   if (is_topo_check_fail) {
     entry.callback(Status::InvalidArgument(
@@ -583,9 +584,17 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   // If only partial sending is enabled, the following code block checks whether
   // the sending and recieving neighbors match each other when enable_topo_check
   // is set to be True.
-  bool is_topo_check_fail = CheckNeighborSendRecvPattern(
-      mpi_ctx_.size_, first_entry, timeline_ptr,
-      mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+  bool is_topo_check_fail = false;
+  if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) {
+    if (first_entry.is_hierarchical) {
+      // TODO: support check.
+      BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
+                  << "allreduce ops but it is not supported yet.";
+    }
+    is_topo_check_fail = CheckNeighborSendRecvPattern(
+        first_entry.send_neighbors, first_entry.recv_neighbors, first_entry.tensor_name,
+        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+  }
 
   if (is_topo_check_fail) {
     for (auto& entry : entries) {
diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h
index f42fc182..8aacb071 100644
--- a/bluefog/common/mpi_controller.h
+++ b/bluefog/common/mpi_controller.h
@@ -27,8 +27,11 @@ namespace common {
 
 // Function to check if the sending and receiving neighbors match in the
 // topology.
-bool CheckNeighborSendRecvPattern(int size, const TensorTableEntry& entry,
-                                  Timeline* timeline_ptr, const MPI_Comm& comm);
+bool CheckNeighborSendRecvPattern(
+  std::shared_ptr<std::vector<int>> send_neighbors,
+  std::shared_ptr<std::vector<int>> recv_neighbors,
+  const std::string& tensor_name,
+  int size, Timeline* timeline_ptr, const MPI_Comm& comm);
 
 class MPIController {
  public:
diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index 5389f341..cbc63bb7 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -661,9 +661,17 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
   // If only partial sending is enabled, the following code block checks whether
   // the sending and recieving neighbors match each other when enable_topo_check
   // is set to be True.
-  bool is_topo_check_fail = CheckNeighborSendRecvPattern(
-      mpi_ctx_.size_, entry, timeline_ptr_,
-      mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+  bool is_topo_check_fail = false;
+  if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
+    if (entry.is_hierarchical) {
+      // TODO: support check.
+      BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
+                  << "allreduce ops but it is not supported yet.";
+    }
+    is_topo_check_fail = CheckNeighborSendRecvPattern(
+        entry.send_neighbors, entry.recv_neighbors, entry.tensor_name,
+        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+  }
   if (is_topo_check_fail) {
     entry.callback(Status::InvalidArgument(
         "Send and recv neighbors dont' match in neighbor "
@@ -907,9 +915,17 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   // If only partial sending is enabled, the following code block checks whether
   // the sending and recieving neighbors match each other when enable_topo_check
   // is set to be True.
-  bool is_topo_check_fail = CheckNeighborSendRecvPattern(
-      mpi_ctx_.size_, first_entry, timeline_ptr_,
-      mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+  bool is_topo_check_fail = false;
+  if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
+    if (entry.is_hierarchical) {
+      // TODO: support check.
+      BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
+                  << "allreduce ops but it is not supported yet.";
+    }
+    is_topo_check_fail = CheckNeighborSendRecvPattern(
+        entry.send_neighbors, entry.recv_neighbors, entry.tensor_name,
+        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+  }
   if (is_topo_check_fail) {
     for (auto& entry : entries) {
       entry.callback(Status::InvalidArgument(

From 3184e17054d430014dcba7a18df8b2dc9d1086b8 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Fri, 19 Feb 2021 03:01:04 -0800
Subject: [PATCH 09/36] Fix timeline_ptr for NCCL

---
 bluefog/common/nccl_controller.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index cbc63bb7..27b5e9c8 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -670,7 +670,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
     }
     is_topo_check_fail = CheckNeighborSendRecvPattern(
         entry.send_neighbors, entry.recv_neighbors, entry.tensor_name,
-        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+        mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
   }
   if (is_topo_check_fail) {
     entry.callback(Status::InvalidArgument(
@@ -924,7 +924,7 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
     }
     is_topo_check_fail = CheckNeighborSendRecvPattern(
         entry.send_neighbors, entry.recv_neighbors, entry.tensor_name,
-        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+        mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
   }
   if (is_topo_check_fail) {
     for (auto& entry : entries) {

From c85cbd50ff509c0c56816baa91a2c756ce394e24 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Fri, 19 Feb 2021 03:01:04 -0800
Subject: [PATCH 10/36] Fix timeline_ptr for NCCL

---
 bluefog/common/nccl_controller.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index cbc63bb7..e47f4857 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -670,7 +670,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
     }
     is_topo_check_fail = CheckNeighborSendRecvPattern(
         entry.send_neighbors, entry.recv_neighbors, entry.tensor_name,
-        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+        mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
   }
   if (is_topo_check_fail) {
     entry.callback(Status::InvalidArgument(
@@ -916,15 +916,15 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   // the sending and recieving neighbors match each other when enable_topo_check
   // is set to be True.
   bool is_topo_check_fail = false;
-  if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
-    if (entry.is_hierarchical) {
+  if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) {
+    if (first_entry.is_hierarchical) {
       // TODO: support check.
       BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
                   << "allreduce ops but it is not supported yet.";
     }
     is_topo_check_fail = CheckNeighborSendRecvPattern(
-        entry.send_neighbors, entry.recv_neighbors, entry.tensor_name,
-        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
+        first_entry.send_neighbors, first_entry.recv_neighbors, first_entry.tensor_name,
+        mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
   }
   if (is_topo_check_fail) {
     for (auto& entry : entries) {

From 7d60585054e7af06938a851e394be77d9c854384 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Fri, 19 Feb 2021 11:29:10 -0800
Subject: [PATCH 11/36] Put dst_weights information into TensorTableEntry

---
 bluefog/common/common.h      |  1 +
 bluefog/common/operations.cc | 37 ++++++++++++++++++------------------
 bluefog/common/operations.h  |  1 +
 bluefog/torch/mpi_ops.cc     | 20 +++++++++++--------
 4 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/bluefog/common/common.h b/bluefog/common/common.h
index 00d2a204..a27dde68 100644
--- a/bluefog/common/common.h
+++ b/bluefog/common/common.h
@@ -278,6 +278,7 @@ struct TensorTableEntry {
   // Neighbors for dynamic neighbor_allreduce.
   std::shared_ptr<std::vector<int>> send_neighbors;
   std::shared_ptr<std::vector<int>> recv_neighbors;
+  std::shared_ptr<std::vector<float>> send_weights;
 
   // Boolean value if dynamic neighbor is enabled.
   bool dynamic_neighbors_enabled = false;
diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc
index c9daa4ea..312dc8d6 100644
--- a/bluefog/common/operations.cc
+++ b/bluefog/common/operations.cc
@@ -432,6 +432,20 @@ void CheckForStalledTensors(BluefogGlobalState& state) {
   }
 }
 
+template<typename T>
+bool IsSameList (std::shared_ptr<std::vector<T>> n1, std::shared_ptr<std::vector<T>> n2) {
+  if (n1 == nullptr && n2 == nullptr) return true;
+  if (n1 == nullptr || n2 == nullptr) return false;
+  if (n1->size() != n2->size()) return false;
+  // The order matters as well.
+  for (int i = 0; i < n1->size(); i++) {
+    if (n1->at(i) != n2->at(i)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace
 
 bool RunLoopOnce(BluefogGlobalState& state);
@@ -948,20 +962,6 @@ void NegotiateOfRequestOfMaster(BluefogGlobalState& state,
       // Attempt to add more responses to this fused response.
       const TensorTableEntry& entry =
           state.tensor_queue.GetTensorEntry(response.tensor_names()[0]);
-      auto IsSameNeighborList =
-          [](std::shared_ptr<std::vector<int>> n1,
-             std::shared_ptr<std::vector<int>> n2) -> bool {
-        if (n1 == nullptr && n2 == nullptr) return true;
-        if (n1 == nullptr || n2 == nullptr) return false;
-        if (n1->size() != n2->size()) return false;
-        // The order matters as well.
-        for (int i = 0; i < n1->size(); i++) {
-          if (n1->at(i) != n2->at(i)) {
-            return false;
-          }
-        }
-        return true;
-      };
       // Recall that send_neighbors is empty or not determines we use partial
       // neighbor allreduce or not.
       int num_recv_neighbors = !entry.dynamic_neighbors_enabled
@@ -985,10 +985,9 @@ void NegotiateOfRequestOfMaster(BluefogGlobalState& state,
             entry.tensor->dtype() == new_entry.tensor->dtype() &&
             entry.dynamic_neighbors_enabled == new_entry.dynamic_neighbors_enabled &&
             entry.is_hierarchical == new_entry.is_hierarchical &&
-            IsSameNeighborList(entry.send_neighbors,
-                               new_entry.send_neighbors) &&
-            IsSameNeighborList(entry.recv_neighbors,
-                               new_entry.recv_neighbors) &&
+            IsSameList(entry.send_neighbors, new_entry.send_neighbors) &&
+            IsSameList(entry.send_weights, new_entry.send_weights) &&
+            IsSameList(entry.recv_neighbors, new_entry.recv_neighbors) &&
             tensor_size + new_tensor_size <= state.tensor_fusion_threshold) {
           // These tensors will fuse together well.
           tensor_size += new_tensor_size;
@@ -1545,6 +1544,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr<Tensor> tensor,
                                       std::shared_ptr<ReadyEvent> ready_event,
                                       std::shared_ptr<std::vector<int>> recv_neighbors,
                                       std::shared_ptr<std::vector<int>> send_neighbors,
+                                      std::shared_ptr<std::vector<float>> send_weights,
                                       bool dynamic_neighbors_enabled,
                                       bool is_hierarchical,
                                       bool enable_topo_check,
@@ -1569,6 +1569,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr<Tensor> tensor,
   e.ready_event = ready_event;
   e.recv_neighbors = recv_neighbors;
   e.send_neighbors = send_neighbors;
+  e.send_weights = send_weights;
   e.dynamic_neighbors_enabled = dynamic_neighbors_enabled;
   e.is_hierarchical = is_hierarchical;
   e.enable_topo_check = enable_topo_check;
diff --git a/bluefog/common/operations.h b/bluefog/common/operations.h
index 66c06031..1d2a53d2 100644
--- a/bluefog/common/operations.h
+++ b/bluefog/common/operations.h
@@ -144,6 +144,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr<Tensor> tensor,
                                       std::shared_ptr<ReadyEvent> ready_event,
                                       std::shared_ptr<std::vector<int>> recv_neighbors,
                                       std::shared_ptr<std::vector<int>> send_neighbors,
+                                      std::shared_ptr<std::vector<float>> send_weights,
                                       bool dynamic_neighbors_enabled,
                                       bool is_hierarchical,
                                       bool enable_topo_check,
diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc
index 55fd3e34..a5527d57 100644
--- a/bluefog/torch/mpi_ops.cc
+++ b/bluefog/torch/mpi_ops.cc
@@ -427,16 +427,20 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
 
   std::vector<int> src_neighbors;
   for (auto kv : src_weights)
-      src_neighbors.push_back(kv.first);
+    src_neighbors.push_back(kv.first);
   std::sort(src_neighbors.begin(), src_neighbors.end());
 
   std::vector<int> dst_neighbors;
   for (auto kv : dst_weights)
-      dst_neighbors.push_back(kv.first);
+    dst_neighbors.push_back(kv.first);
   std::sort(dst_neighbors.begin(), dst_neighbors.end());
+  std::vector<float> dst_weights_vec;
+  for (int rank : dst_neighbors)
+    dst_weights_vec.push_back(dst_weights.at(rank));
 
   auto bf_src_neighbors = std::make_shared<std::vector<int>>(src_neighbors);
   auto bf_dst_neighbors = std::make_shared<std::vector<int>>(dst_neighbors);
+  auto bf_dst_weights_vec = std::make_shared<std::vector<float>>(dst_weights_vec);
   auto ready_event = RecordReadyEvent(device);
   if (OPS_ON_CPU && tensor.device().is_cuda()) {
     ::torch::Tensor cpu_buffer =
@@ -448,9 +452,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
     auto bf_output = std::make_shared<TorchTensor>(cpu_output);
 
     auto enqueue_result = EnqueueTensorNeighborAllreduce(
-        bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors,
-        bf_dst_neighbors, dynamic_neighbors_enabled, is_hierarchical,
-        enable_topo_check, op_name, CPU_DEVICE_ID,
+        bf_tensor, bf_output, bf_context, ready_event,
+        bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec,
+        dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, CPU_DEVICE_ID,
         callback_wrapper([self_weight, src_weights, avg_computation,
                           cpu_output, tensor, src_neighbors,
                           dynamic_neighbors_enabled, is_hierarchical, output,
@@ -471,9 +475,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
     auto bf_output = std::make_shared<TorchTensor>(output);
 
     auto enqueue_result = EnqueueTensorNeighborAllreduce(
-        bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors,
-        bf_dst_neighbors, dynamic_neighbors_enabled, is_hierarchical,
-        enable_topo_check, op_name, device,
+        bf_tensor, bf_output, bf_context, ready_event,
+        bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec,
+        dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, device,
         callback_wrapper([self_weight, src_weights, avg_computation,
                           src_neighbors, dynamic_neighbors_enabled,
                           is_hierarchical, tensor, output]() mutable {

From 79c57b36310ae36c5f5ee030521fb46ef7a628b5 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Fri, 26 Feb 2021 18:38:09 -0800
Subject: [PATCH 12/36] First Version of neighbor_allreduce dst_weight,
 existing problem: Fusion Not Implemented, CUDA data_weight problem

---
 bluefog/common/common.h           |   7 ++-
 bluefog/common/mpi_controller.cc  | 101 ++++++++++++++++++++++--------
 bluefog/common/mpi_controller.h   |   6 +-
 bluefog/common/nccl_controller.cc |  58 +++++++++++++----
 bluefog/common/operations.cc      |   5 +-
 bluefog/common/operations.h       |   3 +-
 bluefog/tensorflow/adapter.cc     |   2 +-
 bluefog/tensorflow/adapter.h      |   2 +-
 bluefog/torch/adapter.cc          |   6 +-
 bluefog/torch/adapter.h           |   2 +-
 bluefog/torch/mpi_ops.cc          |  15 +++--
 bluefog/torch/mpi_ops.h           |   5 +-
 bluefog/torch/mpi_ops.py          |   7 ++-
 test/torch_ops_test.py            |  93 +++++++++++++++++++++++++++
 14 files changed, 251 insertions(+), 61 deletions(-)

diff --git a/bluefog/common/common.h b/bluefog/common/common.h
index a27dde68..f9e1413f 100644
--- a/bluefog/common/common.h
+++ b/bluefog/common/common.h
@@ -211,7 +211,7 @@ class Tensor {
   virtual const DataType dtype() const = 0;
   virtual const TensorShape shape() const = 0;
   virtual const void* data() const = 0;
-  virtual std::shared_ptr<common::Tensor> data_weight(float weight) = 0;
+  virtual std::unique_ptr<common::Tensor> data_weight(float weight) = 0;
   virtual int64_t size() const = 0;
   virtual ~Tensor() = default;
 };
@@ -278,11 +278,14 @@ struct TensorTableEntry {
   // Neighbors for dynamic neighbor_allreduce.
   std::shared_ptr<std::vector<int>> send_neighbors;
   std::shared_ptr<std::vector<int>> recv_neighbors;
-  std::shared_ptr<std::vector<float>> send_weights;
+  std::shared_ptr<std::vector<double>> send_weights;
 
   // Boolean value if dynamic neighbor is enabled.
   bool dynamic_neighbors_enabled = false;
 
+  // Boolean value for enabling destination(send) weighting operation or not.
+  bool dst_weighting_enabled = false;
+
   // Boolean value for enabling topology check.
   bool enable_topo_check = false;
 
diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index 26093260..788ecdd7 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -294,10 +294,8 @@ void MPIController::NeighborAllgather(TensorTableEntry& entry) {
 
 // Function to check if the sending and receiving neighbors match in the topology.
 bool CheckNeighborSendRecvPattern(
-  std::shared_ptr<std::vector<int>> send_neighbors,
-  std::shared_ptr<std::vector<int>> recv_neighbors,
-  const std::string& tensor_name,
-  int size, Timeline* timeline_ptr, const MPI_Comm& comm) {
+  const std::vector<int>* send_neighbors, const std::vector<int>* recv_neighbors,
+  const std::string& tensor_name, int size, Timeline* timeline_ptr, const MPI_Comm& comm) {
   bool res = false;
   timeline_ptr->ActivityStart(tensor_name, "NEGOTIATION");
   // Put all the send and recv neighbors in a single vector, and obtain a send
@@ -386,7 +384,7 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
                   << "allreduce ops but it is not supported yet.";
     }
     is_topo_check_fail = CheckNeighborSendRecvPattern(
-        entry.send_neighbors, entry.recv_neighbors, entry.tensor_name,
+        entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name,
         mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
   }
 
@@ -437,18 +435,41 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
               "output for details.");
         }
       }
-      for (int i = 0; i < nsend; ++i) {
-        int ret_code = MPI_Isend(
-            sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor),
-            entry.send_neighbors->at(i),
-            mpi_ctx_.rank_ + entry.send_neighbors->at(i),
-            mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
-        if (ret_code != MPI_SUCCESS) {
-          throw std::runtime_error(
-              "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI "
-              "output for details.");
+
+      std::vector<std::unique_ptr<common::Tensor>> weighted_tensors;
+      // Ensure the lifecycle of the weighted tensors are alive after communication.
+
+      if (entry.dst_weighting_enabled) {
+        for (int i = 0; i < nsend; ++i) {
+          int ret_code;
+          auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
+          weighted_tensors.push_back(std::move(weighted_tensor_ptr));
+          ret_code = MPI_Isend(
+              weighted_tensors[i].get()->data(), num_elements, mpi_ctx_.GetMPIDataType(entry.tensor),
+              entry.send_neighbors->at(i),
+              mpi_ctx_.rank_ + entry.send_neighbors->at(i),
+              mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
+          if (ret_code != MPI_SUCCESS) {
+            throw std::runtime_error(
+                "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI "
+                "output for details.");
+          }
+        }
+      } else {
+        for (int i = 0; i < nsend; ++i) {
+          int ret_code = MPI_Isend(
+              sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor),
+              entry.send_neighbors->at(i),
+              mpi_ctx_.rank_ + entry.send_neighbors->at(i),
+              mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
+          if (ret_code != MPI_SUCCESS) {
+            throw std::runtime_error(
+                "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI "
+                "output for details.");
+          }
         }
       }
+
       MPI_Waitall(nsend + nrecv, requests.data(), statuses.data());
       error_message =
           GenerateNeighborAllreduceErrorMessage(statuses, nsend, nrecv);
@@ -592,7 +613,7 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
                   << "allreduce ops but it is not supported yet.";
     }
     is_topo_check_fail = CheckNeighborSendRecvPattern(
-        first_entry.send_neighbors, first_entry.recv_neighbors, first_entry.tensor_name,
+        first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name,
         mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
   }
 
@@ -610,6 +631,18 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   timeline_ptr->ActivityEndAll(entries);
   const void* fused_input_data = buffer_data;
 
+  const void* weighted_fused_input_data = nullptr;
+  if (first_entry.dst_weighting_enabled) {
+    // Generate weighted data fusion for sending
+    BFLOG(INFO) << "Should not reach here";
+    throw std::runtime_error("Not Implemented for dst_weight fusion");
+    if (first_entry.device == CPU_DEVICE_ID) {
+
+    } else {
+
+    }
+  }
+
   // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer
   // is like [t_1, t_2 | t_1_n1, t_2_n1, t_1_n2, t_2_n2].
   // Here t_1 and t_2  means self tensor 1 and 2 and _n1 and _n2 means the
@@ -655,16 +688,32 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
               "for details.");
         }
       }
-      for (int i = 0; i < nsend; ++i) {
-        int ret_code = MPI_Isend(
-            fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor),
-            first_entry.send_neighbors->at(i),
-            /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i),
-            mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
-        if (ret_code != MPI_SUCCESS) {
-          throw std::runtime_error(
-              "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output "
-              "for details.");
+      if (!first_entry.dst_weighting_enabled) {
+        for (int i = 0; i < nsend; ++i) {
+          int ret_code = MPI_Isend(
+              fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor),
+              first_entry.send_neighbors->at(i),
+              /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i),
+              mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
+          if (ret_code != MPI_SUCCESS) {
+            throw std::runtime_error(
+                "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output "
+                "for details.");
+          }
+        }
+      } else {
+        for (int i = 0; i < nsend; ++i) {
+          void* sendbuf = 
+              (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size);
+          int ret_code = MPI_Isend(sendbuf, num_elements,
+              mpi_ctx_.GetMPIDataType(first_entry.tensor), first_entry.send_neighbors->at(i),
+              /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i),
+              mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
+          if (ret_code != MPI_SUCCESS) {
+            throw std::runtime_error(
+                "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output "
+                "for details.");
+          }
         }
       }
       MPI_Waitall(nsend + nrecv, requests.data(), statuses.data());
diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h
index 8aacb071..2a850c01 100644
--- a/bluefog/common/mpi_controller.h
+++ b/bluefog/common/mpi_controller.h
@@ -28,10 +28,8 @@ namespace common {
 // Function to check if the sending and receiving neighbors match in the
 // topology.
 bool CheckNeighborSendRecvPattern(
-  std::shared_ptr<std::vector<int>> send_neighbors,
-  std::shared_ptr<std::vector<int>> recv_neighbors,
-  const std::string& tensor_name,
-  int size, Timeline* timeline_ptr, const MPI_Comm& comm);
+  const std::vector<int>* send_neighbors, const std::vector<int>* recv_neighbors,
+  const std::string& tensor_name, int size, Timeline* timeline_ptr, const MPI_Comm& comm);
 
 class MPIController {
  public:
diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index e47f4857..921a8d5b 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -669,7 +669,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
                   << "allreduce ops but it is not supported yet.";
     }
     is_topo_check_fail = CheckNeighborSendRecvPattern(
-        entry.send_neighbors, entry.recv_neighbors, entry.tensor_name,
+        entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name,
         mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
   }
   if (is_topo_check_fail) {
@@ -679,6 +679,9 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
     return; 
   }
 
+  std::vector<std::unique_ptr<common::Tensor>> weighted_tensors;
+  // Ensure the lifecycle of the weighted tensors are alive after communication.
+
 #if NCCL_MINOR > 6
   if (!entry.is_hierarchical) {
     ncclGroupStart();
@@ -702,9 +705,20 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
         NCCLCHECK(ncclRecv(recvbuf, num_elements, GetNCCLDataType(entry.tensor),
                           recv_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
-      for (int send_rank : *entry.send_neighbors) {
-        NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(entry.tensor),
-                          send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream));
+      if(entry.dst_weighting_enabled)
+      {
+        for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
+          auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
+          weighted_tensors.push_back(std::move(weighted_tensor_ptr));
+          NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements,
+                             GetNCCLDataType(entry.tensor), entry.send_neighbors->at(i),
+                             nccl_ctx_.nccl_comm, nccl_ctx_.stream));
+        }
+      } else {
+        for (int send_rank : *entry.send_neighbors) {
+          NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(entry.tensor),
+                             send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream));
+        }
       }
     }
     ncclGroupEnd();
@@ -754,7 +768,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
 
   auto tid = std::this_thread::get_id();
   nccl_ctx_.finalizer_thread_pool.execute(
-      [this, entry, event_queue, tid]() mutable {
+      [this, entry, event_queue, tid, buffer_data, weighted_tensors]() mutable {
         with_device device_guard(entry.device);
         WaitForEvents(event_queue, {entry}, this->timeline_ptr_, tid);
 
@@ -923,7 +937,7 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
                   << "allreduce ops but it is not supported yet.";
     }
     is_topo_check_fail = CheckNeighborSendRecvPattern(
-        first_entry.send_neighbors, first_entry.recv_neighbors, first_entry.tensor_name,
+        first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name,
         mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
   }
   if (is_topo_check_fail) {
@@ -943,6 +957,16 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
     RecordEvent(event_queue, "MEM_CPY_IN");
   }
 
+  const void* weighted_fused_input_data = nullptr;
+  if (first_entry.dst_weighting_enabled) {
+    throw std::runtime_error("Not Implemented for dst_weight fusion");
+    if (first_entry.device == CPU_DEVICE_ID) {
+      throw std::runtime_error("NCCL Neighbor Allreduce failed: perform operation on CPU data.");
+    } else {
+
+    }
+  }
+
   // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer
   // is like [t_1, t_2 | t_1_n1, t_2_n1, t_1_n2, t_2_n2].
   // Here t_1 and t_2  means self tensor 1 and 2 and _n1 and _n2 means the
@@ -975,10 +999,22 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
                            GetNCCLDataType(first_entry.tensor), recv_rank,
                            nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
-      for (int send_rank : *first_entry.send_neighbors) {
-        NCCLCHECK(ncclSend(fused_input_data, num_elements,
-                           GetNCCLDataType(first_entry.tensor), send_rank,
-                           nccl_ctx_.nccl_comm, nccl_ctx_.stream));
+      if (!first_entry.dst_weighting_enabled)
+      {
+        for (int send_rank : *first_entry.send_neighbors) {
+          NCCLCHECK(ncclSend(fused_input_data, num_elements,
+                             GetNCCLDataType(first_entry.tensor), send_rank,
+                             nccl_ctx_.nccl_comm, nccl_ctx_.stream));
+        }
+      } else {
+        size_t i = 0;
+        for (int send_rank : *first_entry.send_neighbors) {
+          void* sendbuf = 
+              (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size);
+          NCCLCHECK(ncclSend(sendbuf, num_elements,
+                             GetNCCLDataType(first_entry.tensor), send_rank,
+                             nccl_ctx_.nccl_comm, nccl_ctx_.stream));
+        }
       }
     }
     ncclGroupEnd();
@@ -1053,7 +1089,7 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
 
   auto tid = std::this_thread::get_id();
   nccl_ctx_.finalizer_thread_pool.execute(
-      [this, entries, event_queue, tid, buffer_data]() mutable {
+      [this, entries, event_queue, tid, buffer_data, weighted_fused_input_data]() mutable {
         auto& first_entry = entries[0];
         with_device device_guard(first_entry.device);
         WaitForEvents(event_queue, entries, this->timeline_ptr_, tid);
diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc
index 312dc8d6..efec9ff8 100644
--- a/bluefog/common/operations.cc
+++ b/bluefog/common/operations.cc
@@ -984,6 +984,7 @@ void NegotiateOfRequestOfMaster(BluefogGlobalState& state,
             response.devices() == new_response.devices() &&
             entry.tensor->dtype() == new_entry.tensor->dtype() &&
             entry.dynamic_neighbors_enabled == new_entry.dynamic_neighbors_enabled &&
+            entry.dst_weighting_enabled == new_entry.dst_weighting_enabled &&
             entry.is_hierarchical == new_entry.is_hierarchical &&
             IsSameList(entry.send_neighbors, new_entry.send_neighbors) &&
             IsSameList(entry.send_weights, new_entry.send_weights) &&
@@ -1544,8 +1545,9 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr<Tensor> tensor,
                                       std::shared_ptr<ReadyEvent> ready_event,
                                       std::shared_ptr<std::vector<int>> recv_neighbors,
                                       std::shared_ptr<std::vector<int>> send_neighbors,
-                                      std::shared_ptr<std::vector<float>> send_weights,
+                                      std::shared_ptr<std::vector<double>> send_weights,
                                       bool dynamic_neighbors_enabled,
+                                      bool dst_weighting_enabled,
                                       bool is_hierarchical,
                                       bool enable_topo_check,
                                       const std::string& name, const int device,
@@ -1571,6 +1573,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr<Tensor> tensor,
   e.send_neighbors = send_neighbors;
   e.send_weights = send_weights;
   e.dynamic_neighbors_enabled = dynamic_neighbors_enabled;
+  e.dst_weighting_enabled = dst_weighting_enabled;
   e.is_hierarchical = is_hierarchical;
   e.enable_topo_check = enable_topo_check;
   e.device = device;
diff --git a/bluefog/common/operations.h b/bluefog/common/operations.h
index 1d2a53d2..d94a4e26 100644
--- a/bluefog/common/operations.h
+++ b/bluefog/common/operations.h
@@ -144,8 +144,9 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr<Tensor> tensor,
                                       std::shared_ptr<ReadyEvent> ready_event,
                                       std::shared_ptr<std::vector<int>> recv_neighbors,
                                       std::shared_ptr<std::vector<int>> send_neighbors,
-                                      std::shared_ptr<std::vector<float>> send_weights,
+                                      std::shared_ptr<std::vector<double>> send_weights,
                                       bool dynamic_neighbors_enabled,
+                                      bool dst_weighting_enabled,
                                       bool is_hierarchical,
                                       bool enable_topo_check,
                                       const std::string& name, const int device,
diff --git a/bluefog/tensorflow/adapter.cc b/bluefog/tensorflow/adapter.cc
index f7864cbc..313a48ed 100644
--- a/bluefog/tensorflow/adapter.cc
+++ b/bluefog/tensorflow/adapter.cc
@@ -128,7 +128,7 @@ const void* TFTensor::data() const {
   return (const void*)tensor_.tensor_data().data();
 }
 
-std::shared_ptr<common::Tensor> TFTensor::data_weight(float weight) {
+std::unique_ptr<common::Tensor> TFTensor::data_weight(float weight) {
   throw std::runtime_error("Tensorflow with weight is not implemented yet.");
 };
 
diff --git a/bluefog/tensorflow/adapter.h b/bluefog/tensorflow/adapter.h
index c61c88ad..02e74ddc 100644
--- a/bluefog/tensorflow/adapter.h
+++ b/bluefog/tensorflow/adapter.h
@@ -50,7 +50,7 @@ class TFTensor : public common::Tensor {
   virtual const common::DataType dtype() const override;
   virtual const common::TensorShape shape() const override;
   virtual const void* data() const override;
-  virtual std::shared_ptr<common::Tensor> data_weight(float weight) override;
+  virtual std::unique_ptr<common::Tensor> data_weight(float weight) override;
   virtual int64_t size() const override;
 
  protected:
diff --git a/bluefog/torch/adapter.cc b/bluefog/torch/adapter.cc
index 3cb38aab..aff49a90 100644
--- a/bluefog/torch/adapter.cc
+++ b/bluefog/torch/adapter.cc
@@ -76,15 +76,15 @@ const common::TensorShape TorchTensor::shape() const {
 
 const void* TorchTensor::data() const { return tensor_.data_ptr(); }
 
-std::shared_ptr<common::Tensor> TorchTensor::data_weight(float weight) {
+std::unique_ptr<common::Tensor> TorchTensor::data_weight(float weight) {
   if (weight == 1.0) {
-    return std::make_shared<TorchTensor>(tensor_);
+    return std::make_unique<TorchTensor>(tensor_);
   } else {
     int device =
         tensor_.device().is_cuda() ? tensor_.device().index() : CPU_DEVICE_ID;
     with_device device_context(device);
     // Note we call mul instead of mul_
-    return std::make_shared<TorchTensor>(tensor_.mul(weight));
+    return std::make_unique<TorchTensor>(tensor_.mul(weight));
   }
 }
 
diff --git a/bluefog/torch/adapter.h b/bluefog/torch/adapter.h
index 4e192ed2..eb6526f2 100644
--- a/bluefog/torch/adapter.h
+++ b/bluefog/torch/adapter.h
@@ -35,7 +35,7 @@ class TorchTensor : public common::Tensor {
   virtual const common::DataType dtype() const override;
   virtual const common::TensorShape shape() const override;
   virtual const void* data() const override;
-  virtual std::shared_ptr<common::Tensor> data_weight(float weight) override;
+  virtual std::unique_ptr<common::Tensor> data_weight(float weight) override;
   virtual int64_t size() const override;
   
   // TODO(ybc) Figure out a better encapsulated way to do it.
diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc
index a5527d57..56168d0f 100644
--- a/bluefog/torch/mpi_ops.cc
+++ b/bluefog/torch/mpi_ops.cc
@@ -409,8 +409,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
                         double self_weight,
                         const std::unordered_map<int, double>& src_weights,
                         const std::unordered_map<int, double>& dst_weights,
-                        bool dynamic_neighbors_enabled, bool enable_topo_check,
-                        bool avg_computation, bool is_hierarchical, const std::string& name) {
+                        bool dynamic_neighbors_enabled, bool dst_weighting_enabled,
+                        bool enable_topo_check, bool avg_computation, bool is_hierarchical,
+                        const std::string& name) {
   ThrowIfError(common::CheckInitialized());
 
   auto handle = handle_manager.AllocateHandle();
@@ -434,13 +435,13 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
   for (auto kv : dst_weights)
     dst_neighbors.push_back(kv.first);
   std::sort(dst_neighbors.begin(), dst_neighbors.end());
-  std::vector<float> dst_weights_vec;
+  std::vector<double> dst_weights_vec;
   for (int rank : dst_neighbors)
     dst_weights_vec.push_back(dst_weights.at(rank));
 
   auto bf_src_neighbors = std::make_shared<std::vector<int>>(src_neighbors);
   auto bf_dst_neighbors = std::make_shared<std::vector<int>>(dst_neighbors);
-  auto bf_dst_weights_vec = std::make_shared<std::vector<float>>(dst_weights_vec);
+  auto bf_dst_weights_vec = std::make_shared<std::vector<double>>(dst_weights_vec);
   auto ready_event = RecordReadyEvent(device);
   if (OPS_ON_CPU && tensor.device().is_cuda()) {
     ::torch::Tensor cpu_buffer =
@@ -454,7 +455,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
     auto enqueue_result = EnqueueTensorNeighborAllreduce(
         bf_tensor, bf_output, bf_context, ready_event,
         bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec,
-        dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, CPU_DEVICE_ID,
+        dynamic_neighbors_enabled, dst_weighting_enabled, is_hierarchical,
+        enable_topo_check, op_name, CPU_DEVICE_ID,
         callback_wrapper([self_weight, src_weights, avg_computation,
                           cpu_output, tensor, src_neighbors,
                           dynamic_neighbors_enabled, is_hierarchical, output,
@@ -477,7 +479,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
     auto enqueue_result = EnqueueTensorNeighborAllreduce(
         bf_tensor, bf_output, bf_context, ready_event,
         bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec,
-        dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, device,
+        dynamic_neighbors_enabled, dst_weighting_enabled,
+        is_hierarchical, enable_topo_check, op_name, device,
         callback_wrapper([self_weight, src_weights, avg_computation,
                           src_neighbors, dynamic_neighbors_enabled,
                           is_hierarchical, tensor, output]() mutable {
diff --git a/bluefog/torch/mpi_ops.h b/bluefog/torch/mpi_ops.h
index 66ecb38c..605afa41 100644
--- a/bluefog/torch/mpi_ops.h
+++ b/bluefog/torch/mpi_ops.h
@@ -116,8 +116,9 @@ NEIGHBOR_ALLGATHER_H(torch_cuda_DoubleTensor, THCudaDoubleTensor)
       THTensor* tensor, THTensor* output, double self_weight,                  \
       const std::unordered_map<int, double>& src_weights,                      \
       const std::unordered_map<int, double>& dst_weights,                      \
-      bool dynamic_neighbors_enabled, bool enable_topo_check,                  \
-      bool avg_computation, bool is_hierarchical, char* name);
+      bool dynamic_neighbors_enabled, bool dst_weighting_enabled,              \
+      bool enable_topo_check, bool avg_computation, bool is_hierarchical,      \
+      char* name);
 
 NEIGHBOR_ALLREDUCE_H(torch_HalfTensor, THHalfTensor)
 NEIGHBOR_ALLREDUCE_H(torch_FloatTensor, THFloatTensor)
diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py
index 3720dd6a..f34270d4 100644
--- a/bluefog/torch/mpi_ops.py
+++ b/bluefog/torch/mpi_ops.py
@@ -436,6 +436,7 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
     if dst_weights is None:
         dst_weights = {}
         dynamic_neighbors_enabled = False
+        dst_weighting_enabled = False
     elif len(set(dst_weights)) != len(dst_weights):
         raise ValueError("Argument dst_weights should only contain the unique ranks.")
     elif self_weight is None or src_weights is None:
@@ -443,8 +444,10 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
                          "enabling dynamic topology.")
     else:
         dynamic_neighbors_enabled = True
+        dst_weighting_enabled = True
         if isinstance(dst_weights, list):
             dst_weights = {dst:1.0 for dst in dst_weights}
+            dst_weighting_enabled = False
     if self_weight is None and src_weights is None:
         # Implying this is static graph.
         if is_topo_weighted():
@@ -479,8 +482,8 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
         raise ValueError("Arguments self_weight and neighbor_weights have to be presented at "
                          "the same time")
     is_hierarchical = False
-    handle = getattr(mpi_lib, function)(tensor, output, self_weight, src_weights,
-                                        dst_weights, dynamic_neighbors_enabled,
+    handle = getattr(mpi_lib, function)(tensor, output, self_weight, src_weights, dst_weights,
+                                        dynamic_neighbors_enabled, dst_weighting_enabled,
                                         enable_topo_check, weighted_average_computation,
                                         is_hierarchical, name.encode() if name is not None else "")
     _handle_map[handle] = (tensor, output)
diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py
index f3ec3240..e76b8770 100644
--- a/test/torch_ops_test.py
+++ b/test/torch_ops_test.py
@@ -830,6 +830,46 @@ def test_neighbor_allreduce_sum(self):
                 reduced_tensor.data.max() == sum_value
             ), "bf.neighbor_allreduce (sum) produces incorrect reduced tensor"
 
+    def test_neighbor_allreduce_dst_weight(self):
+        """Test that the neighbor allreduce with destination weights works correctly."""
+        size = bf.size()
+        rank = bf.rank()
+        if size <= 1:
+            fname = inspect.currentframe().f_code.co_name
+            warnings.warn("Skip {} due to size 1".format(fname))
+            return
+
+        #dtypes = [torch.FloatTensor, torch.DoubleTensor]
+        dtypes = []
+        if TEST_ON_GPU:
+            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
+
+        # By default, we use exponential two ring topology.
+        num_indegree = int(np.ceil(np.log2(size)))
+        neighbor_ranks = [(rank - 2**i) % size for i in range(num_indegree)]
+        dst_weight = 0.5
+        expected_value = (np.sum(neighbor_ranks) + rank)*dst_weight
+
+        self_weight = 0.5
+        src_weights = {i: 1.0 for i in neighbor_ranks}
+        dst_weights = {(rank + 2**i) % size : dst_weight for i in range(num_indegree)}
+
+        dims = [1,2,3]
+        for dtype, dim in itertools.product(dtypes, dims):
+            tensor = torch.FloatTensor(*([23] * dim)).fill_(1).mul_(rank)
+            tensor = self.cast_and_place(tensor, dtype)
+            name = "neighbor_allreduce_{}_{}".format(dim, dtype)
+            reduced_tensor = bf.neighbor_allreduce(
+                tensor, name=name, self_weight=self_weight,
+                src_weights=src_weights, dst_weights=dst_weights)
+            eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON
+            assert (
+                list(reduced_tensor.shape) == [23] * dim
+            ), "bf.neighbor_allreduce (avg) produces incorrect reduced shape"
+            assert (
+                (reduced_tensor - expected_value).abs().max() < eps
+            ), "bf.neighbor_allreduce (avg) produces incorrect reduced tensor"
+
     def test_neighbor_allreduce_weighted_avg(self):
         """Test that the neighbor all reduce (avg) 1D, 2D, 3D tensors correctly."""
         size = bf.size()
@@ -1020,6 +1060,59 @@ def test_neighbor_allreduce_dynamic_topo_fusion(self):
                  sum_value).abs().max() < eps
             ), "bf.neighbor_allreduce_2 (fusion) produces incorrect reduced tensor"
 
+    def test_neighbor_allreduce_dst_weight_fusion(self):
+        """Test neighbor allreduce works with destination weights under tensor fusion."""
+        size = bf.size()
+        rank = bf.rank()
+        K = 50 # number of tensors send in short time
+        if size <= 1:
+            fname = inspect.currentframe().f_code.co_name
+            warnings.warn("Skip {} due to size 1".format(fname))
+            return
+
+        dtypes = [torch.FloatTensor, torch.DoubleTensor]
+        if TEST_ON_GPU:
+            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
+
+        # By default, we use exponential two ring topology.
+        num_indegree = int(np.ceil(np.log2(size)))
+        neighbor_ranks = [(rank - 2**i) % size for i in range(num_indegree)]
+        sum_value = (np.sum(neighbor_ranks)+rank)*K
+
+        dst_weight = 0.5
+        self_weight = dst_weight
+        src_weights = {i: 1.0 for i in neighbor_ranks}
+        dst_weights = {(rank + 2**i) % size: dst_weight for i in range(num_indegree)}
+
+        dims = [1, 2, 3]
+        for dtype, dim in itertools.product(dtypes, dims):
+            tensor_list, handles, names = [], [], []
+            for i in range(K):
+                tensor = torch.FloatTensor(*([23] * dim)).fill_(i + rank*K)
+                tensor = self.cast_and_place(tensor, dtype)
+                tensor_list.append(tensor)
+                names.append("index{}_{}_{}".format(i, dtype, dim))
+
+            for i in range(K):
+                handle = bf.neighbor_allreduce_nonblocking(
+                    tensor_list[i], name=names[i],
+                    self_weight=self_weight, src_weights=src_weights, dst_weights=dst_weights)
+                handles.append(handle)
+
+            outputs = []
+            for i in range(K):
+                output = bf.synchronize(handles[i])
+                outputs.append(output)
+
+            for i in range(K):
+                assert (
+                    list(outputs[i].shape) == [23] * dim
+                ), f"{names[i]} (fusion) produces incorrect reduced shape"
+                output_normalized = outputs[i]/dst_weight - i*(num_indegree+1)
+                assert (
+                    (output_normalized - sum_value).abs().max() < EPSILON
+                ), f"{names[i]} (fusion) produces incorrect reduced tensor"
+
     def test_neighbor_allgather(self):
         """Test that the neighbor all gather 1D, 2D, 3D tensors correctly."""
         size = bf.size()

From 29a239c758396e32eb713ad49df6ee91e6ef649a Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Fri, 26 Feb 2021 18:45:31 -0800
Subject: [PATCH 13/36] Add some delay after data_weight as a temporary
 solution

---
 bluefog/common/mpi_controller.cc | 2 ++
 test/torch_ops_test.py           | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index 788ecdd7..eb2ffe52 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -444,6 +444,8 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
           int ret_code;
           auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
           weighted_tensors.push_back(std::move(weighted_tensor_ptr));
+          //TODO(hhb): Temporary Solution for Ready Event Issue with data_weight
+          std::this_thread::sleep_for(std::chrono::microseconds(100));
           ret_code = MPI_Isend(
               weighted_tensors[i].get()->data(), num_elements, mpi_ctx_.GetMPIDataType(entry.tensor),
               entry.send_neighbors->at(i),
diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py
index e76b8770..c8f3d3ba 100644
--- a/test/torch_ops_test.py
+++ b/test/torch_ops_test.py
@@ -839,8 +839,7 @@ def test_neighbor_allreduce_dst_weight(self):
             warnings.warn("Skip {} due to size 1".format(fname))
             return
 
-        #dtypes = [torch.FloatTensor, torch.DoubleTensor]
-        dtypes = []
+        dtypes = [torch.FloatTensor, torch.DoubleTensor]
         if TEST_ON_GPU:
             dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
 

From 97517b3c0bbbbe383c84a242655ffcd566de0154 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Sat, 27 Feb 2021 00:12:57 -0800
Subject: [PATCH 14/36] CPU Fusion for dst_weighted added

---
 bluefog/common/mpi_controller.cc  | 85 ++++++++++++++++++++++++++++---
 bluefog/common/mpi_controller.h   |  4 ++
 bluefog/common/nccl_controller.cc | 37 +++++++++++---
 bluefog/common/nccl_controller.h  |  4 ++
 bluefog/common/operations.cc      | 12 ++++-
 bluefog/common/tensor_queue.cc    | 29 +++++++++++
 bluefog/common/tensor_queue.h     | 23 +++++++++
 test/torch_ops_test.py            |  6 ++-
 8 files changed, 185 insertions(+), 15 deletions(-)

diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index eb2ffe52..05850e15 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -32,6 +32,33 @@
 namespace bluefog {
 namespace common {
 
+namespace {
+
+template <typename T, typename TS>
+void ScaleBufferCPUImpl(T* buffer, int64_t num_elements, TS scale_factor) {
+  for (int64_t i = 0; i < num_elements; ++i) {
+    buffer[i] = buffer[i] * scale_factor;
+  }
+}
+
+void ScaleCPUBuffer(double scale_factor, void* weighted_fused_input_data,
+                 int64_t num_elements, DataType dtype) {
+  switch (dtype) {
+    // TODO(hhb): FLOAT16 support
+    case DataType::BLUEFOG_FLOAT32:
+      ScaleBufferCPUImpl((float*) weighted_fused_input_data, num_elements, (float) scale_factor);
+      break;
+    case DataType::BLUEFOG_FLOAT64:
+      ScaleBufferCPUImpl((double*) weighted_fused_input_data, num_elements, scale_factor);
+      break;
+    default:
+      throw std::logic_error("Type " + DataType_Name(dtype) +
+                             " not supported by ScaleBufferCPUImpl.");
+  }
+}
+
+}
+
 // It may be because the win_create is called at different
 // threads from the win_put, win_get, etc. After moving win_create into
 // communicaiton thread, it resolved. (works in Openmpi=4.0.2 and MPICH).
@@ -636,13 +663,27 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   const void* weighted_fused_input_data = nullptr;
   if (first_entry.dst_weighting_enabled) {
     // Generate weighted data fusion for sending
-    BFLOG(INFO) << "Should not reach here";
-    throw std::runtime_error("Not Implemented for dst_weight fusion");
-    if (first_entry.device == CPU_DEVICE_ID) {
-
-    } else {
-
+    timeline_ptr->ActivityStartAll(entries, "MEMCPY_IN_WEIGHT_FUSION_BUFFER");
+    void* weight_buffer_data;
+    MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(),
+                               fused_input_data, num_elements, element_size,
+                               first_entry.context, first_entry.device);
+    int64_t offset = 0;
+    for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) {
+      double dst_weight = first_entry.send_weights->at(i);
+      void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset;
+      if (first_entry.device == CPU_DEVICE_ID) {
+        ScaleCPUBuffer(dst_weight, weight_buffer_data_offset, num_elements,
+                       first_entry.tensor->dtype());
+      } else {
+#if HAVE_CUDA
+        // CUDA Weighting
+#endif
+      }
+      offset += num_elements * element_size;
     }
+    weighted_fused_input_data = weight_buffer_data;
+    timeline_ptr->ActivityEndAll(entries);
   }
 
   // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer
@@ -1452,6 +1493,38 @@ Status MPIController::GetWindowVersionValue(const std::string& name,
   return Status::OK();
 }
 
+void MPIController::MemcpyInWeightFusionBuffer(
+  void*& weight_buffer_data, size_t num_dst,
+  const void* buffer_data, int64_t num_elements, int element_size,
+  std::shared_ptr<OpContext> context, int device) {
+  // Access the fusion buffer.
+  FusionBufferManager* buffer_manager;
+  auto fusion_status = GetBluefogFusionBuffer(buffer_manager);
+  if (!fusion_status.ok()){
+    throw std::runtime_error(fusion_status.reason());
+  }
+  std::shared_ptr<PersistentBuffer> buffer =
+      buffer_manager->GetWeightBuffer(device);
+  weight_buffer_data = const_cast<void*>(buffer->AccessData(context));
+  size_t data_size = num_elements * element_size;
+
+  int64_t offset = 0;
+  for (size_t i = 0; i < num_dst; ++i) {
+    void* weight_buffer_data_at_offset = (uint8_t*)weight_buffer_data + offset;
+#if HAVE_CUDA
+    if (device != CPU_DEVICE_ID) {
+      CUDACHECK(cudaMemcpy(weight_buffer_data_at_offset, buffer_data, data_size,
+                           cudaMemcpyDeviceToDevice));
+    } else {
+#endif
+      std::memcpy(weight_buffer_data_at_offset, buffer_data, data_size);
+#if HAVE_CUDA
+    }
+#endif
+    offset += data_size;
+  }
+}
+
 void MPIController::MemcpyInFusionBuffer(
     const std::vector<TensorTableEntry>& entries, void*& buffer_data,
     size_t& buffer_len) {
diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h
index 2a850c01..3f5af419 100644
--- a/bluefog/common/mpi_controller.h
+++ b/bluefog/common/mpi_controller.h
@@ -121,6 +121,10 @@ class MPIController {
                                          const int num_recv_neighbors,
                                          const int64_t fused_data_size);
 
+  void MemcpyInWeightFusionBuffer(void*& weight_buffer_data, size_t num_dst,
+                                  const void* buffer_data, int64_t num_elements, int element_size,
+                                  std::shared_ptr<OpContext> context, int device);
+
   void MemcpyOutFusionBufferForInputs(const void* fused_input_data,
                                       std::vector<TensorTableEntry>& entries);
 
diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index 921a8d5b..2c68c3e9 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -708,6 +708,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
       if(entry.dst_weighting_enabled)
       {
         for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
+          // Add ready event
           auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
           weighted_tensors.push_back(std::move(weighted_tensor_ptr));
           NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements,
@@ -959,12 +960,12 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
 
   const void* weighted_fused_input_data = nullptr;
   if (first_entry.dst_weighting_enabled) {
-    throw std::runtime_error("Not Implemented for dst_weight fusion");
-    if (first_entry.device == CPU_DEVICE_ID) {
-      throw std::runtime_error("NCCL Neighbor Allreduce failed: perform operation on CPU data.");
-    } else {
-
-    }
+    void* weight_buffer_data = nullptr;
+    MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(),
+                               fused_input_data, num_elements, element_size,
+                               first_entry.context, first_entry.device);
+    //TODO(hhb): add weighting implementation.
+    weighted_fused_input_data = weight_buffer_data;
   }
 
   // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer
@@ -1839,6 +1840,30 @@ void NCCLController::MemcpyInFusionBuffer(
   buffer_len = (size_t)offset;
 }
 
+void NCCLController::MemcpyInWeightFusionBuffer(
+  void*& weight_buffer_data, size_t num_dst,
+  const void* buffer_data, int64_t num_elements, int element_size,
+  std::shared_ptr<OpContext> context, int device) {
+  // Access the fusion buffer.
+  FusionBufferManager* buffer_manager;
+  auto fusion_status = GetBluefogFusionBuffer(buffer_manager);
+  if (!fusion_status.ok()){
+    throw std::runtime_error(fusion_status.reason());
+  }
+  std::shared_ptr<PersistentBuffer> buffer =
+      buffer_manager->GetWeightBuffer(device);
+  weight_buffer_data = const_cast<void*>(buffer->AccessData(context));
+  size_t data_size = num_elements * element_size;
+
+  int64_t offset = 0;
+  for (size_t i = 0; i < num_dst; ++i) {
+    void* weight_buffer_data_at_offset = (uint8_t*)weight_buffer_data + offset;
+    CUDACHECK(cudaMemcpy(weight_buffer_data_at_offset, buffer_data, data_size,
+                         cudaMemcpyDeviceToDevice));
+    offset += data_size;
+  }
+}
+
 void NCCLController::MemcpyOutFusionBuffer(
     const void* buffer_data, std::vector<TensorTableEntry>& entries) {
   int64_t offset = 0;
diff --git a/bluefog/common/nccl_controller.h b/bluefog/common/nccl_controller.h
index 3835a3d4..71380b08 100644
--- a/bluefog/common/nccl_controller.h
+++ b/bluefog/common/nccl_controller.h
@@ -188,6 +188,10 @@ class NCCLController {
                                          const int num_recv_neighbors,
                                          const int64_t fused_data_size);
 
+  void MemcpyInWeightFusionBuffer(void*& weight_buffer_data, size_t num_dst,
+                                  const void* buffer_data, int64_t num_elements, int element_size,
+                                  std::shared_ptr<OpContext> context, int device);
+
   void MemcpyOutFusionBufferForInputs(const void* fused_input_data,
                                       std::vector<TensorTableEntry>& entries);
 
diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc
index efec9ff8..a165f32a 100644
--- a/bluefog/common/operations.cc
+++ b/bluefog/common/operations.cc
@@ -779,7 +779,17 @@ void PerformOperationWithFusion(std::vector<TensorTableEntry>& entries) {
       first_entry.context,
       [&]() { timeline.ActivityStartAll(entries, "INIT_FUSION_BUFFER"); },
       [&]() { timeline.ActivityEndAll(entries); });
-  if (!status.ok()) {
+  
+  Status status_dst_weight = Status::OK();
+  if (first_entry.dst_weighting_enabled) {
+    status_dst_weight = bluefog_global.fusion_buffer.InitializeWeightBuffer(
+      bluefog_global.tensor_fusion_threshold, mpi_context.size_,
+      first_entry.device, first_entry.context,
+      [&]() { timeline.ActivityStartAll(entries, "INIT_WEIGHT_FUSION_BUFFER"); },
+      [&]() { timeline.ActivityEndAll(entries); });
+  }
+
+  if (!status.ok() || !status_dst_weight.ok()) {
     for (auto& e : entries) {
       e.callback(status);
     }
diff --git a/bluefog/common/tensor_queue.cc b/bluefog/common/tensor_queue.cc
index 513459f1..b0f67241 100644
--- a/bluefog/common/tensor_queue.cc
+++ b/bluefog/common/tensor_queue.cc
@@ -154,5 +154,34 @@ std::shared_ptr<PersistentBuffer> FusionBufferManager::GetBuffer(int device) {
   return tensor_fusion_buffers_[device].first;
 }
 
+Status FusionBufferManager::InitializeWeightBuffer(
+    int64_t threshold, int mpi_size, int device, std::shared_ptr<OpContext> context,
+    std::function<void()> on_start_init, std::function<void()> on_end_init) {
+  auto& elem = weight_tensor_fusion_buffers_[device];
+  auto& buffer = elem.first;
+  int64_t& size = elem.second;
+  if (size != threshold*mpi_size) {
+    buffer.reset();
+    size = 0;
+  }
+
+  if (buffer == nullptr) {
+    on_start_init();
+    size = threshold*mpi_size;
+
+    // Lazily allocate persistent buffer for Tensor Fusion and keep it
+    // forever per device.
+    Status status = context->AllocatePersistent(size, &buffer);
+    on_end_init();
+
+    return status;
+  }
+
+  return Status::OK();
+}
+
+std::shared_ptr<PersistentBuffer> FusionBufferManager::GetWeightBuffer(int device) {
+  return weight_tensor_fusion_buffers_[device].first;
+}
 }  // namespace common
 }  // namespace bluefog
\ No newline at end of file
diff --git a/bluefog/common/tensor_queue.h b/bluefog/common/tensor_queue.h
index 50740164..2c54c88f 100644
--- a/bluefog/common/tensor_queue.h
+++ b/bluefog/common/tensor_queue.h
@@ -82,13 +82,36 @@ class FusionBufferManager {
                           std::function<void()> on_start_init,
                           std::function<void()> on_end_init);
 
+  // Initializes a buffer of the given threshold size times MPI size if not already cached.
+  //
+  // Args:
+  //  threshold: Size of the buffer in bytes.
+  //  mpi_size: Size of MPI nodes.
+  //  device: Device ID to associate the buffer.
+  //  context: Framework used to create the buffer and associate it.
+  //  on_start_init: Callback on starting buffer initialization.
+  //  on_end_init: Callback on completing buffer initialization.
+  Status InitializeWeightBuffer(int64_t threshold,
+                                int mpi_size,
+                                int device,
+                                std::shared_ptr<OpContext> context,
+                                std::function<void()> on_start_init,
+                                std::function<void()> on_end_init);
+
   // Returns the buffer associated with the given device and framework, or null.
   std::shared_ptr<PersistentBuffer> GetBuffer(int device);
 
+  // Returns the weight buffer associated with the given device and framework, or null.
+  std::shared_ptr<PersistentBuffer> GetWeightBuffer(int device);
+
  private:
   // Memory buffers for Tensor Fusion.  They are keyed by device ID.
   std::unordered_map<int, std::pair<std::shared_ptr<PersistentBuffer>, int64_t>>
       tensor_fusion_buffers_;
+
+  // Memory buffers for Tensor Fusion with dst weight.  They are keyed by device ID.
+  std::unordered_map<int, std::pair<std::shared_ptr<PersistentBuffer>, int64_t>>
+      weight_tensor_fusion_buffers_;
 };
 
 }  // namespace common
diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py
index c8f3d3ba..71d49d63 100644
--- a/test/torch_ops_test.py
+++ b/test/torch_ops_test.py
@@ -841,7 +841,8 @@ def test_neighbor_allreduce_dst_weight(self):
 
         dtypes = [torch.FloatTensor, torch.DoubleTensor]
         if TEST_ON_GPU:
-            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
+            pass
+            #dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
 
         # By default, we use exponential two ring topology.
         num_indegree = int(np.ceil(np.log2(size)))
@@ -1071,7 +1072,8 @@ def test_neighbor_allreduce_dst_weight_fusion(self):
 
         dtypes = [torch.FloatTensor, torch.DoubleTensor]
         if TEST_ON_GPU:
-            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
+            pass
+            #dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
 
         # By default, we use exponential two ring topology.
         num_indegree = int(np.ceil(np.log2(size)))

From cc9faf459c30ab55a1351fe3907c164a40da54eb Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 4 Mar 2021 00:37:07 -0800
Subject: [PATCH 15/36] Add ReadyEvent for dst_weight for single entry
 neighbor_allreduce

---
 bluefog/common/common.h           |  1 +
 bluefog/common/mpi_controller.cc  | 27 ++++++++++++++++++---------
 bluefog/common/nccl_controller.cc | 22 ++++++++++++++++------
 bluefog/torch/adapter.cc          |  4 ++++
 bluefog/torch/adapter.h           |  1 +
 test/torch_ops_test.py            |  3 +--
 6 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/bluefog/common/common.h b/bluefog/common/common.h
index cf12616d..25edf8bd 100644
--- a/bluefog/common/common.h
+++ b/bluefog/common/common.h
@@ -241,6 +241,7 @@ class OpContext {
                                 std::shared_ptr<Tensor>* tensor) = 0;
   virtual Status AllocateZeros(int64_t num_elements, DataType dtype,
                                std::shared_ptr<Tensor>* tensor) = 0;
+  virtual std::shared_ptr<ReadyEvent> RecordReadyEvent(int device) = 0;
   virtual Framework framework() const = 0;
   virtual ~OpContext() = default;
 };
diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index c4efe82e..32b2b7b7 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -468,6 +468,18 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
     } else {
       int nsend = entry.send_neighbors->size();
       int nrecv = entry.recv_neighbors->size();
+
+      // Ensure the lifecycle of the weighted tensors are alive after communication.
+      std::vector<std::unique_ptr<common::Tensor>> weighted_tensors;
+      if (entry.dst_weighting_enabled) {
+        for (int i = 0; i < nsend; ++i) {
+          auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
+          weighted_tensors.push_back(std::move(weighted_tensor_ptr));
+        }
+      }
+      std::shared_ptr<common::ReadyEvent> ready_event =
+          entry.context->RecordReadyEvent(entry.device);
+
       std::vector<MPI_Request> requests(nsend + nrecv);
       std::vector<MPI_Status> statuses(nsend + nrecv);
       int element_size = mpi_ctx_.GetMPITypeSize(entry.output->dtype());
@@ -487,17 +499,14 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
         }
       }
 
-      std::vector<std::unique_ptr<common::Tensor>> weighted_tensors;
-      // Ensure the lifecycle of the weighted tensors are alive after communication.
-
       if (entry.dst_weighting_enabled) {
+        if (ready_event != nullptr) {
+          while (!ready_event->Ready()) {
+            std::this_thread::sleep_for(std::chrono::nanoseconds(100));
+          }
+        }
         for (int i = 0; i < nsend; ++i) {
-          int ret_code;
-          auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
-          weighted_tensors.push_back(std::move(weighted_tensor_ptr));
-          //TODO(hhb): Temporary Solution for Ready Event Issue with data_weight
-          std::this_thread::sleep_for(std::chrono::microseconds(100));
-          ret_code = MPI_Isend(
+          int ret_code = MPI_Isend(
               weighted_tensors[i].get()->data(), num_elements, mpi_ctx_.GetMPIDataType(entry.tensor),
               entry.send_neighbors->at(i),
               mpi_ctx_.rank_ + entry.send_neighbors->at(i),
diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index 30a052d7..ff98a2fa 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -753,7 +753,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
     return; 
   }
 
-  std::vector<std::unique_ptr<common::Tensor>> weighted_tensors;
+  std::shared_ptr<std::vector<std::unique_ptr<common::Tensor>>> weighted_tensors;
   // Ensure the lifecycle of the weighted tensors are alive after communication.
 
 #if NCCL_MINOR > 6
@@ -772,6 +772,14 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
                           send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
     } else {
+      if(entry.dst_weighting_enabled) {
+        for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
+          auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
+          weighted_tensors->push_back(std::move(weighted_tensor_ptr));
+        }
+      }
+      std::shared_ptr<common::ReadyEvent> ready_event =
+          entry.context->RecordReadyEvent(entry.device);
       for (size_t i = 0; i < entry.recv_neighbors->size(); ++i) {
         int recv_rank = entry.recv_neighbors->at(i);
         void* recvbuf = (void*)(static_cast<const char*>(entry.output->data()) +
@@ -781,11 +789,13 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
       }
       if(entry.dst_weighting_enabled)
       {
+        if (ready_event != nullptr) {
+          while (!ready_event->Ready()) {
+            std::this_thread::sleep_for(std::chrono::nanoseconds(100));
+          }
+        }
         for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
-          // Add ready event
-          auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
-          weighted_tensors.push_back(std::move(weighted_tensor_ptr));
-          NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements,
+          NCCLCHECK(ncclSend((*weighted_tensors)[i].get()->data(), num_elements,
                              GetNCCLDataType(entry.tensor), entry.send_neighbors->at(i),
                              nccl_ctx_.nccl_comm, nccl_ctx_.stream));
         }
@@ -843,7 +853,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
 
   auto tid = std::this_thread::get_id();
   nccl_ctx_.finalizer_thread_pool.execute(
-      [this, entry, event_queue, tid, buffer_data, weighted_tensors]() mutable {
+      [this, entry, event_queue, tid, weighted_tensors]() mutable {
         with_device device_guard(entry.device);
         WaitForEvents(event_queue, {entry}, this->timeline_ptr_, tid);
 
diff --git a/bluefog/torch/adapter.cc b/bluefog/torch/adapter.cc
index aff49a90..bdccf556 100644
--- a/bluefog/torch/adapter.cc
+++ b/bluefog/torch/adapter.cc
@@ -178,6 +178,10 @@ Status TorchOpContext::AllocateZeros(int64_t num_elements, DataType dtype,
 
 Framework TorchOpContext::framework() const { return Framework::PYTORCH; }
 
+std::shared_ptr<common::ReadyEvent> TorchOpContext::RecordReadyEvent(int device) {
+  return torch::RecordReadyEvent(device);
+}
+
 #if HAVE_CUDA
 struct ReadyEventRegistry {
   std::unordered_map<int, std::queue<cudaEvent_t>> cuda_events;
diff --git a/bluefog/torch/adapter.h b/bluefog/torch/adapter.h
index eb6526f2..3b7ccf61 100644
--- a/bluefog/torch/adapter.h
+++ b/bluefog/torch/adapter.h
@@ -70,6 +70,7 @@ class TorchOpContext : public common::OpContext {
   virtual common::Status AllocateZeros(
       int64_t num_elements, common::DataType dtype,
       std::shared_ptr<common::Tensor>* tensor) override;
+  virtual std::shared_ptr<common::ReadyEvent> RecordReadyEvent(int device) override;
   virtual common::Framework framework() const override;
 
  private:
diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py
index f690764a..9cf95e76 100644
--- a/test/torch_ops_test.py
+++ b/test/torch_ops_test.py
@@ -841,8 +841,7 @@ def test_neighbor_allreduce_dst_weight(self):
 
         dtypes = [torch.FloatTensor, torch.DoubleTensor]
         if TEST_ON_GPU:
-            pass
-            #dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
+            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
 
         # By default, we use exponential two ring topology.
         num_indegree = int(np.ceil(np.log2(size)))

From 7f47ce4ce33ecfdfaaac767cf25db86fe6d4c4e0 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 4 Mar 2021 01:10:58 -0800
Subject: [PATCH 16/36] Remove const identifier for tensor dtype as it is
 meaningless

---
 bluefog/common/common.h       | 2 +-
 bluefog/tensorflow/adapter.cc | 2 +-
 bluefog/tensorflow/adapter.h  | 2 +-
 bluefog/torch/adapter.cc      | 2 +-
 bluefog/torch/adapter.h       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/bluefog/common/common.h b/bluefog/common/common.h
index 25edf8bd..ab78dac9 100644
--- a/bluefog/common/common.h
+++ b/bluefog/common/common.h
@@ -209,7 +209,7 @@ class TensorShape {
 
 class Tensor {
  public:
-  virtual const DataType dtype() const = 0;
+  virtual DataType dtype() const = 0;
   virtual const TensorShape shape() const = 0;
   virtual const void* data() const = 0;
   virtual std::unique_ptr<common::Tensor> data_weight(float weight) = 0;
diff --git a/bluefog/tensorflow/adapter.cc b/bluefog/tensorflow/adapter.cc
index 313a48ed..278cc426 100644
--- a/bluefog/tensorflow/adapter.cc
+++ b/bluefog/tensorflow/adapter.cc
@@ -89,7 +89,7 @@ const void* TFPersistentBuffer::AccessData(
 
 TFTensor::TFTensor(::tensorflow::Tensor& tensor) : tensor_(tensor) {}
 
-const common::DataType TFTensor::dtype() const {
+common::DataType TFTensor::dtype() const {
   switch (tensor_.dtype()) {
     case ::tensorflow::DT_UINT8:
       return common::DataType::BLUEFOG_UINT8;
diff --git a/bluefog/tensorflow/adapter.h b/bluefog/tensorflow/adapter.h
index 02e74ddc..428a0a3e 100644
--- a/bluefog/tensorflow/adapter.h
+++ b/bluefog/tensorflow/adapter.h
@@ -47,7 +47,7 @@ class TFPersistentBuffer : public common::PersistentBuffer {
 class TFTensor : public common::Tensor {
  public:
   TFTensor(::tensorflow::Tensor& tensor);
-  virtual const common::DataType dtype() const override;
+  virtual common::DataType dtype() const override;
   virtual const common::TensorShape shape() const override;
   virtual const void* data() const override;
   virtual std::unique_ptr<common::Tensor> data_weight(float weight) override;
diff --git a/bluefog/torch/adapter.cc b/bluefog/torch/adapter.cc
index bdccf556..8dd8560c 100644
--- a/bluefog/torch/adapter.cc
+++ b/bluefog/torch/adapter.cc
@@ -43,7 +43,7 @@ using ::bluefog::common::with_device;
 
 TorchTensor::TorchTensor(::torch::Tensor tensor) : tensor_(tensor) {}
 
-const DataType TorchTensor::dtype() const {
+DataType TorchTensor::dtype() const {
   switch (tensor_.scalar_type()) {
     case ::torch::kByte:
       return DataType::BLUEFOG_UINT8;
diff --git a/bluefog/torch/adapter.h b/bluefog/torch/adapter.h
index 3b7ccf61..85f07994 100644
--- a/bluefog/torch/adapter.h
+++ b/bluefog/torch/adapter.h
@@ -32,7 +32,7 @@ namespace torch {
 class TorchTensor : public common::Tensor {
  public:
   TorchTensor(::torch::Tensor tensor);
-  virtual const common::DataType dtype() const override;
+  virtual common::DataType dtype() const override;
   virtual const common::TensorShape shape() const override;
   virtual const void* data() const override;
   virtual std::unique_ptr<common::Tensor> data_weight(float weight) override;

From d84138be373bd57f6292ccb8590fe3036c5d704c Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 4 Mar 2021 01:14:41 -0800
Subject: [PATCH 17/36] Add cuda source for scalebuffer

---
 bluefog/common/cuda/cuda_kernels.cu | 129 ++++++++++++++++++++++++++++
 bluefog/common/cuda/cuda_kernels.h  |  33 +++++++
 2 files changed, 162 insertions(+)
 create mode 100644 bluefog/common/cuda/cuda_kernels.cu
 create mode 100644 bluefog/common/cuda/cuda_kernels.h

diff --git a/bluefog/common/cuda/cuda_kernels.cu b/bluefog/common/cuda/cuda_kernels.cu
new file mode 100644
index 00000000..0435125d
--- /dev/null
+++ b/bluefog/common/cuda/cuda_kernels.cu
@@ -0,0 +1,129 @@
+// Copyright (C) 2020 NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "cuda_kernels.h"
+
+#include <stdexcept>
+#include <cuda_fp16.h>
+
+namespace bluefog {
+namespace common {
+
+template<typename T, typename TS>
+__global__ void scale_buffer_k(const T* input, T* output, int64_t num_elements, const TS scale_factor) {
+
+  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
+
+  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
+    output[i] = scale_factor * input[i];
+  }
+}
+
+// Specialization for half2
+__global__ void scale_buffer_half2_k(const __half* input, __half* output, int64_t num_elements, const __half scale_factor) {
+
+  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
+
+#if __CUDA_ARCH__ > 530
+  const __half2* input_h2 = reinterpret_cast<const __half2 *>(input);
+  __half2* output_h2 = reinterpret_cast<__half2 *>(output);
+  const __half2 scale_factor_h2 = __halves2half2(scale_factor, scale_factor);
+
+  for (size_t i = idx; i < num_elements / 2; i += gridDim.x * blockDim.x) {
+    output_h2[i] = __hmul2(scale_factor_h2, input_h2[i]);
+  }
+
+  // Deal with last element if num_elements is odd
+  if (idx == 0 && num_elements % 2) {
+    output[num_elements - 1] = __hmul(scale_factor, input[num_elements - 1]);
+  }
+#else
+  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
+    output[i] = __float2half(__half2float(scale_factor) * __half2float(input[i]));
+  }
+#endif
+}
+
+// Specialization for architectures without __half compute
+template<>
+__global__ void scale_buffer_k(const __half* input, __half* output, int64_t num_elements, const __half scale_factor) {
+
+  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
+
+#if __CUDA_ARCH__ > 530
+  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
+    output[i] = scale_factor * input[i];
+  }
+#else
+  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
+    output[i] = __float2half(__half2float(scale_factor) * __half2float(input[i]));
+  }
+#endif
+}
+
+#define NTHREADS_SCALE_BUFFER_KERNEL 512
+void ScaleBufferCudaImpl(const void* fused_input_data, void* buffer_data, const int64_t num_elements, double scale_factor,
+                         DataType dtype, cudaStream_t stream) {
+  const int64_t blocks = (num_elements + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
+  const int threads = NTHREADS_SCALE_BUFFER_KERNEL;
+  switch (dtype) {
+    case DataType::BLUEFOG_UINT8:
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((const uint8_t*) fused_input_data, (uint8_t*) buffer_data,
+                                                     num_elements, scale_factor);
+      break;
+    case DataType::BLUEFOG_INT8:
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((const int8_t*) fused_input_data, (int8_t*) buffer_data,
+                                                     num_elements, scale_factor);
+      break;
+    case DataType::BLUEFOG_INT32:
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((const int32_t*) fused_input_data, (int32_t*) buffer_data,
+                                                     num_elements, scale_factor);
+      break;
+    case DataType::BLUEFOG_INT64:
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((const int64_t*) fused_input_data, (int64_t*) buffer_data,
+                                                     num_elements, scale_factor);
+      break;
+    case DataType::BLUEFOG_FLOAT16:
+    {
+      __half scale_factor_half = __float2half((float) scale_factor);
+      if ((size_t) fused_input_data % 4 == 0 && (size_t) buffer_data % 4 == 0) {
+        // If alignment allows, use half2 specialized kernel
+        int64_t num_elements_h2 = (num_elements + 1) / 2;
+        int64_t blocks_h2 = (num_elements_h2 + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
+        scale_buffer_half2_k<<<blocks_h2, threads, 0, stream>>>((const __half*) fused_input_data, (__half*) buffer_data,
+                                                          num_elements, scale_factor_half);
+      } else {
+        scale_buffer_k<<<blocks, threads, 0, stream>>>((const __half*) fused_input_data, (__half*) buffer_data,
+                                                       num_elements, scale_factor_half);
+     }
+      break;
+    }
+    case DataType::BLUEFOG_FLOAT32:
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((const float*) fused_input_data, (float*) buffer_data,
+                                                     num_elements, (float) scale_factor);
+      break;
+    case DataType::BLUEFOG_FLOAT64:
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((const double*) fused_input_data, (double*) buffer_data,
+                                                     num_elements, scale_factor);
+      break;
+    default:
+      throw std::logic_error("Type " + DataType_Name(dtype) +
+                             " not supported by ScaleBufferCudaImpl.");
+  }
+}
+
+} // namespace common
+} // namespace bluefog
+
diff --git a/bluefog/common/cuda/cuda_kernels.h b/bluefog/common/cuda/cuda_kernels.h
new file mode 100644
index 00000000..3739e1ea
--- /dev/null
+++ b/bluefog/common/cuda/cuda_kernels.h
@@ -0,0 +1,33 @@
+// Copyright (C) 2020 NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef CUDA_KERNELS_H
+#define CUDA_KERNELS_H
+
+#include <cuda_runtime.h>
+
+#include "../common.h"
+
+namespace bluefog {
+namespace common {
+
+// Scales buffer by scalar
+void ScaleBufferCudaImpl(const void* fused_input_data, void* buffer_data, const int64_t num_elements,
+                         double scale_factor, DataType dtype, cudaStream_t stream);
+
+} // namespace common
+} // namespace bluefog
+
+#endif // CUDA_KERNELS_H
\ No newline at end of file

From 06c375d1630fb296dce27529d4deb92edf5a0ef5 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 4 Mar 2021 02:30:24 -0800
Subject: [PATCH 18/36] Scale buffer to modify itself

---
 bluefog/common/cuda/cuda_kernels.cu | 49 ++++++++++++-----------------
 bluefog/common/cuda/cuda_kernels.h  |  4 +--
 2 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/bluefog/common/cuda/cuda_kernels.cu b/bluefog/common/cuda/cuda_kernels.cu
index 0435125d..144b21e8 100644
--- a/bluefog/common/cuda/cuda_kernels.cu
+++ b/bluefog/common/cuda/cuda_kernels.cu
@@ -22,101 +22,92 @@ namespace bluefog {
 namespace common {
 
 template<typename T, typename TS>
-__global__ void scale_buffer_k(const T* input, T* output, int64_t num_elements, const TS scale_factor) {
+__global__ void scale_buffer_k(T* buffer, int64_t num_elements, const TS scale_factor) {
 
   const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
 
   for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
-    output[i] = scale_factor * input[i];
+    buffer[i] *= scale_factor;
   }
 }
 
 // Specialization for half2
-__global__ void scale_buffer_half2_k(const __half* input, __half* output, int64_t num_elements, const __half scale_factor) {
+__global__ void scale_buffer_half2_k(__half* buffer, int64_t num_elements, const __half scale_factor) {
 
   const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
 
 #if __CUDA_ARCH__ > 530
-  const __half2* input_h2 = reinterpret_cast<const __half2 *>(input);
-  __half2* output_h2 = reinterpret_cast<__half2 *>(output);
+  __half2* buffer_h2 = reinterpret_cast<__half2 *>(buffer);
   const __half2 scale_factor_h2 = __halves2half2(scale_factor, scale_factor);
 
   for (size_t i = idx; i < num_elements / 2; i += gridDim.x * blockDim.x) {
-    output_h2[i] = __hmul2(scale_factor_h2, input_h2[i]);
+    buffer_h2[i] = __hmul2(scale_factor_h2, buffer_h2[i]);
   }
 
   // Deal with last element if num_elements is odd
   if (idx == 0 && num_elements % 2) {
-    output[num_elements - 1] = __hmul(scale_factor, input[num_elements - 1]);
+    buffer[num_elements - 1] = __hmul(scale_factor, buffer[num_elements - 1]);
   }
 #else
   for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
-    output[i] = __float2half(__half2float(scale_factor) * __half2float(input[i]));
+    buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i]));
   }
 #endif
 }
 
 // Specialization for architectures without __half compute
 template<>
-__global__ void scale_buffer_k(const __half* input, __half* output, int64_t num_elements, const __half scale_factor) {
+__global__ void scale_buffer_k(__half* buffer, int64_t num_elements, const __half scale_factor) {
 
   const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
 
 #if __CUDA_ARCH__ > 530
   for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
-    output[i] = scale_factor * input[i];
+    buffer[i] *= scale_factor;
   }
 #else
   for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
-    output[i] = __float2half(__half2float(scale_factor) * __half2float(input[i]));
+    buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i]));
   }
 #endif
 }
 
 #define NTHREADS_SCALE_BUFFER_KERNEL 512
-void ScaleBufferCudaImpl(const void* fused_input_data, void* buffer_data, const int64_t num_elements, double scale_factor,
+void ScaleBufferCudaImpl(double scale_factor, void* buffer_data, const int64_t num_elements,
                          DataType dtype, cudaStream_t stream) {
   const int64_t blocks = (num_elements + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
   const int threads = NTHREADS_SCALE_BUFFER_KERNEL;
   switch (dtype) {
     case DataType::BLUEFOG_UINT8:
-      scale_buffer_k<<<blocks, threads, 0, stream>>>((const uint8_t*) fused_input_data, (uint8_t*) buffer_data,
-                                                     num_elements, scale_factor);
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((uint8_t*) buffer_data, num_elements, scale_factor);
       break;
     case DataType::BLUEFOG_INT8:
-      scale_buffer_k<<<blocks, threads, 0, stream>>>((const int8_t*) fused_input_data, (int8_t*) buffer_data,
-                                                     num_elements, scale_factor);
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((int8_t*) buffer_data, num_elements, scale_factor);
       break;
     case DataType::BLUEFOG_INT32:
-      scale_buffer_k<<<blocks, threads, 0, stream>>>((const int32_t*) fused_input_data, (int32_t*) buffer_data,
-                                                     num_elements, scale_factor);
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((int32_t*) buffer_data, num_elements, scale_factor);
       break;
     case DataType::BLUEFOG_INT64:
-      scale_buffer_k<<<blocks, threads, 0, stream>>>((const int64_t*) fused_input_data, (int64_t*) buffer_data,
-                                                     num_elements, scale_factor);
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((int64_t*) buffer_data, num_elements, scale_factor);
       break;
     case DataType::BLUEFOG_FLOAT16:
     {
       __half scale_factor_half = __float2half((float) scale_factor);
-      if ((size_t) fused_input_data % 4 == 0 && (size_t) buffer_data % 4 == 0) {
+      if ((size_t) buffer_data % 4 == 0) {
         // If alignment allows, use half2 specialized kernel
         int64_t num_elements_h2 = (num_elements + 1) / 2;
         int64_t blocks_h2 = (num_elements_h2 + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
-        scale_buffer_half2_k<<<blocks_h2, threads, 0, stream>>>((const __half*) fused_input_data, (__half*) buffer_data,
-                                                          num_elements, scale_factor_half);
+        scale_buffer_half2_k<<<blocks_h2, threads, 0, stream>>>((__half*) buffer_data, num_elements, scale_factor_half);
       } else {
-        scale_buffer_k<<<blocks, threads, 0, stream>>>((const __half*) fused_input_data, (__half*) buffer_data,
-                                                       num_elements, scale_factor_half);
+        scale_buffer_k<<<blocks, threads, 0, stream>>>((__half*) buffer_data, num_elements, scale_factor_half);
      }
       break;
     }
     case DataType::BLUEFOG_FLOAT32:
-      scale_buffer_k<<<blocks, threads, 0, stream>>>((const float*) fused_input_data, (float*) buffer_data,
-                                                     num_elements, (float) scale_factor);
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((float*) buffer_data, num_elements, (float) scale_factor);
       break;
     case DataType::BLUEFOG_FLOAT64:
-      scale_buffer_k<<<blocks, threads, 0, stream>>>((const double*) fused_input_data, (double*) buffer_data,
-                                                     num_elements, scale_factor);
+      scale_buffer_k<<<blocks, threads, 0, stream>>>((double*) buffer_data, num_elements, scale_factor);
       break;
     default:
       throw std::logic_error("Type " + DataType_Name(dtype) +
diff --git a/bluefog/common/cuda/cuda_kernels.h b/bluefog/common/cuda/cuda_kernels.h
index 3739e1ea..2ba27a09 100644
--- a/bluefog/common/cuda/cuda_kernels.h
+++ b/bluefog/common/cuda/cuda_kernels.h
@@ -24,8 +24,8 @@ namespace bluefog {
 namespace common {
 
 // Scales buffer by scalar
-void ScaleBufferCudaImpl(const void* fused_input_data, void* buffer_data, const int64_t num_elements,
-                         double scale_factor, DataType dtype, cudaStream_t stream);
+void ScaleBufferCudaImpl(double scale_factor, void* buffer_data, const int64_t num_elements,
+                         DataType dtype, cudaStream_t stream);
 
 } // namespace common
 } // namespace bluefog

From e4db92fd7bd6fd45f130605cd0f3ae114826677b Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 4 Mar 2021 22:40:05 -0800
Subject: [PATCH 19/36] Add .o file to .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 56029e30..bb14086e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ __pycache__/
 
 # C extensions
 *.so
+*.o
 
 # Distribution / packaging
 .Python

From ec2a3fc4a3d3783d2ebcb22f4ebd4f961dd602e0 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 4 Mar 2021 22:41:12 -0800
Subject: [PATCH 20/36] dst_weight using CUDA for fused entry & compile flow in
 Python setup.py

---
 bluefog/common/mpi_context.cc     |  7 +++++++
 bluefog/common/mpi_context.h      |  9 +++++++++
 bluefog/common/mpi_controller.cc  | 14 +++++++++++---
 bluefog/common/nccl_controller.cc | 11 ++++++++++-
 setup.py                          | 25 +++++++++++++++++++++++++
 test/torch_ops_test.py            |  3 +--
 6 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/bluefog/common/mpi_context.cc b/bluefog/common/mpi_context.cc
index 5d6cefc8..addd3b3a 100644
--- a/bluefog/common/mpi_context.cc
+++ b/bluefog/common/mpi_context.cc
@@ -332,6 +332,13 @@ void MPIContext::Initialize(const std::vector<int>& ranks,
 
   // Create custom MPI float16 summation op.
   MPI_Op_create(&float16_sum, 1, &mpi_float16_sum);
+
+#if HAVE_CUDA
+  int greatest_priority;
+  CUDACHECK(cudaDeviceGetStreamPriorityRange(NULL, &greatest_priority));
+  CUDACHECK(cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking,
+                                         greatest_priority));
+#endif
 }
 
 void MPIContext::Finalize(MPIContextManager& ctx_manager) {
diff --git a/bluefog/common/mpi_context.h b/bluefog/common/mpi_context.h
index db0c31b1..aaad85db 100644
--- a/bluefog/common/mpi_context.h
+++ b/bluefog/common/mpi_context.h
@@ -22,6 +22,10 @@
 #include <unordered_map>
 #include <vector>
 
+#if HAVE_CUDA
+#include "cuda_runtime.h"
+#endif
+
 #include "common.h"
 #include "mpi.h"
 
@@ -232,6 +236,11 @@ class MPIContext {
   // MPI Custom  data type for float16.
   MPI_Datatype mpi_float16_t;
   MPI_Op mpi_float16_sum;
+
+#if HAVE_CUDA
+  // CUDA Stream
+  cudaStream_t stream;
+#endif
 };
 
 std::string GenerateNeighborExchangeErrorMessage(const std::vector<MPI_Status>& statuses,
diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index 32b2b7b7..07fc3f0a 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -29,6 +29,10 @@
 #include "operations.h"
 #include "timeline.h"
 
+#if HAVE_CUDA
+#include "cuda/cuda_kernels.h"
+#endif
+
 namespace bluefog {
 namespace common {
 
@@ -507,8 +511,8 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
         }
         for (int i = 0; i < nsend; ++i) {
           int ret_code = MPI_Isend(
-              weighted_tensors[i].get()->data(), num_elements, mpi_ctx_.GetMPIDataType(entry.tensor),
-              entry.send_neighbors->at(i),
+              weighted_tensors[i].get()->data(), num_elements,
+              mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i),
               mpi_ctx_.rank_ + entry.send_neighbors->at(i),
               mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
           if (ret_code != MPI_SUCCESS) {
@@ -679,7 +683,8 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
                        first_entry.tensor->dtype());
       } else {
 #if HAVE_CUDA
-        // CUDA Weighting
+        ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements,
+                            first_entry.tensor->dtype(), mpi_ctx_.stream);
 #endif
       }
       offset += num_elements * element_size;
@@ -747,6 +752,9 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
           }
         }
       } else {
+#if HAVE_CUDA
+        cudaStreamSynchronize(mpi_ctx_.stream);
+#endif
         for (int i = 0; i < nsend; ++i) {
           void* sendbuf = 
               (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size);
diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index ff98a2fa..295dbf22 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -27,6 +27,8 @@
 #include "operations.h"
 #include "timeline.h"
 
+#include "cuda/cuda_kernels.h"
+
 namespace bluefog {
 namespace common {
 
@@ -1048,7 +1050,14 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
     MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(),
                                fused_input_data, num_elements, element_size,
                                first_entry.context, first_entry.device);
-    //TODO(hhb): add weighting implementation.
+    int64_t offset = 0;
+    for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) {
+      double dst_weight = first_entry.send_weights->at(i);
+      void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset;
+      ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements,
+                          first_entry.tensor->dtype(), nccl_ctx_.stream);
+      offset += num_elements * element_size;
+    }
     weighted_fused_input_data = weight_buffer_data;
   }
 
diff --git a/setup.py b/setup.py
index d06cb445..7e8e0726 100644
--- a/setup.py
+++ b/setup.py
@@ -437,6 +437,26 @@ def is_torch_cuda(build_ext, include_dirs, extra_compile_args):
         print('INFO: Above error indicates that this PyTorch installation does not support CUDA.')
         return False
 
+def build_nvcc_extra_objects():
+    # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -rdc=true -c cuda_kernels.cu
+    # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -dlink -o cuda_kernels_link.o \
+    #      cuda_kernels.o -lcudart
+    nvcc = 'nvcc'
+    import torch
+    common_flags = f"'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'"
+    nvcc_with_flags = f"{nvcc} --compiler-options {common_flags}"
+
+    extra_object_dir = 'bluefog/common/cuda/'
+    source = extra_object_dir+'cuda_kernels.cu'
+    object_file = extra_object_dir+'cuda_kernels.o'
+    object_link = extra_object_dir+'cuda_kernels_link.o'
+
+    command_object = f"{nvcc_with_flags} -rdc=true -c {source} -o {object_file}"
+    command_link   = f"{nvcc_with_flags} -dlink {object_file} -lcudart -o {object_link}"
+
+    subprocess.run([command_object], shell=True)
+    subprocess.run([command_link], shell=True)
+    return [object_file, object_link]
 
 def build_torch_extension(build_ext, global_options, torch_version):
     # Backup the options, preventing other plugins access libs that
@@ -447,6 +467,9 @@ def build_torch_extension(build_ext, global_options, torch_version):
     if have_cuda:
         cuda_include_dirs, cuda_lib_dirs = get_cuda_dirs(
             build_ext, options['COMPILE_FLAGS'])
+        cuda_extra_objects = build_nvcc_extra_objects()
+        options['EXTRA_OBJECTS'] += cuda_extra_objects
+
         options['INCLUDES'] += cuda_include_dirs
         options['LIBRARY_DIRS'] += cuda_lib_dirs
         options['LIBRARIES'] += ['cudart']
@@ -491,6 +514,7 @@ def build_torch_extension(build_ext, global_options, torch_version):
     else:
         # CUDAExtension fails with `ld: library not found for -lcudart` if CUDA is not present
         from torch.utils.cpp_extension import CppExtension as TorchExtension
+    bluefog_tensorflow_mpi_lib.extra_objects = options['EXTRA_OBJECTS']
 
     ext = TorchExtension(bluefog_torch_mpi_lib.name,
                          define_macros=updated_macros,
@@ -504,6 +528,7 @@ def build_torch_extension(build_ext, global_options, torch_version):
                          extra_compile_args=options['COMPILE_FLAGS'],
                          extra_link_args=options['LINK_FLAGS'],
                          library_dirs=options['LIBRARY_DIRS'],
+                         extra_objects=options['EXTRA_OBJECTS'],
                          libraries=options['LIBRARIES'])
 
     # Patch an existing bluefog_torch_mpi_lib extension object.
diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py
index 9cf95e76..974a1aca 100644
--- a/test/torch_ops_test.py
+++ b/test/torch_ops_test.py
@@ -1071,8 +1071,7 @@ def test_neighbor_allreduce_dst_weight_fusion(self):
 
         dtypes = [torch.FloatTensor, torch.DoubleTensor]
         if TEST_ON_GPU:
-            pass
-            #dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
+            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
 
         # By default, we use exponential two ring topology.
         num_indegree = int(np.ceil(np.log2(size)))

From adaa191258f1a23a1d5de051997042d2a41c6730 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 4 Mar 2021 23:08:22 -0800
Subject: [PATCH 21/36] make clean *.o files generated by nvcc

---
 Makefile | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 5b3ebdfb..dadbdf88 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ test_torch: test_torch_basic test_torch_ops test_torch_win_ops test_torch_optimi
 test_tensorflow: test_tensorflow_basic test_tensorflow_ops
 test_all: test_torch test_tensorflow
 
-clean: clean_build clean_so
+clean: clean_build clean_so clean_o
 
 .PHONY: test_torch_basic
 test_torch_basic:
@@ -51,8 +51,12 @@ test_tensorflow_ops:
 
 .PHONY: clean_build
 clean_build:
-	rm -R build
+	rm -fR build
 
 .PHONY: clean_so
 clean_so:
-	rm ./bluefog/torch/mpi_lib.*.so
+	rm -f ./bluefog/torch/mpi_lib.*.so
+
+.PHONY: clean_o
+clean_o:
+	rm -f ./bluefog/common/cuda/*.o
\ No newline at end of file

From 46e64313fa1aa94345ee33f9293558c9cce68b9e Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Sun, 14 Mar 2021 22:54:03 -0700
Subject: [PATCH 22/36] Add fix for NCCL single entry

---
 bluefog/common/nccl_controller.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index 295dbf22..bc823ddd 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -755,7 +755,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
     return; 
   }
 
-  std::shared_ptr<std::vector<std::unique_ptr<common::Tensor>>> weighted_tensors;
+  std::vector<std::shared_ptr<common::Tensor>> weighted_tensors;
   // Ensure the lifecycle of the weighted tensors are alive after communication.
 
 #if NCCL_MINOR > 6
@@ -777,7 +777,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
       if(entry.dst_weighting_enabled) {
         for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
           auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
-          weighted_tensors->push_back(std::move(weighted_tensor_ptr));
+          weighted_tensors.push_back(std::move(weighted_tensor_ptr));
         }
       }
       std::shared_ptr<common::ReadyEvent> ready_event =
@@ -797,7 +797,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
           }
         }
         for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
-          NCCLCHECK(ncclSend((*weighted_tensors)[i].get()->data(), num_elements,
+          NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements,
                              GetNCCLDataType(entry.tensor), entry.send_neighbors->at(i),
                              nccl_ctx_.nccl_comm, nccl_ctx_.stream));
         }

From ae57e75d74291210e43bd0ae904b33e0b57abd03 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Sun, 14 Mar 2021 23:00:05 -0700
Subject: [PATCH 23/36] Make setup.py more robust

---
 setup.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index 7e8e0726..6857b9bc 100644
--- a/setup.py
+++ b/setup.py
@@ -437,13 +437,12 @@ def is_torch_cuda(build_ext, include_dirs, extra_compile_args):
         print('INFO: Above error indicates that this PyTorch installation does not support CUDA.')
         return False
 
-def build_nvcc_extra_objects():
+def build_nvcc_extra_objects(cxx11_abi: bool):
     # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -rdc=true -c cuda_kernels.cu
     # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -dlink -o cuda_kernels_link.o \
     #      cuda_kernels.o -lcudart
     nvcc = 'nvcc'
-    import torch
-    common_flags = f"'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'"
+    common_flags = f"'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}'"
     nvcc_with_flags = f"{nvcc} --compiler-options {common_flags}"
 
     extra_object_dir = 'bluefog/common/cuda/'
@@ -454,20 +453,23 @@ def build_nvcc_extra_objects():
     command_object = f"{nvcc_with_flags} -rdc=true -c {source} -o {object_file}"
     command_link   = f"{nvcc_with_flags} -dlink {object_file} -lcudart -o {object_link}"
 
-    subprocess.run([command_object], shell=True)
-    subprocess.run([command_link], shell=True)
+    subprocess.check_call([command_object], shell=True)
+    subprocess.check_call([command_link], shell=True)
     return [object_file, object_link]
 
 def build_torch_extension(build_ext, global_options, torch_version):
     # Backup the options, preventing other plugins access libs that
     # compiled with compiler of this plugin
+    import torch
+    is_cxx11_abi = torch.compiled_with_cxx11_abi()
+
     options = copy.deepcopy(global_options)
     have_cuda = is_torch_cuda(build_ext, include_dirs=options['INCLUDES'],
                               extra_compile_args=options['COMPILE_FLAGS'])
     if have_cuda:
         cuda_include_dirs, cuda_lib_dirs = get_cuda_dirs(
             build_ext, options['COMPILE_FLAGS'])
-        cuda_extra_objects = build_nvcc_extra_objects()
+        cuda_extra_objects = build_nvcc_extra_objects(is_cxx11_abi)
         options['EXTRA_OBJECTS'] += cuda_extra_objects
 
         options['INCLUDES'] += cuda_include_dirs
@@ -501,9 +503,8 @@ def build_torch_extension(build_ext, global_options, torch_version):
         updated_macros, 'TORCH_VERSION', str(torch_version))
 
     # Always set _GLIBCXX_USE_CXX11_ABI, since PyTorch can only detect whether it was set to 1.
-    import torch
     updated_macros = set_macro(updated_macros, '_GLIBCXX_USE_CXX11_ABI',
-                               str(int(torch.compiled_with_cxx11_abi())))
+                               str(int(is_cxx11_abi)))
 
     # PyTorch requires -DTORCH_API_INCLUDE_EXTENSION_H
     updated_macros = set_macro(

From 62dc99db6f2332e89f9e68885c05854237b7b474 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Tue, 16 Mar 2021 23:03:56 -0700
Subject: [PATCH 24/36] Add timeout and cuda check

---
 examples/test_all_example.sh | 71 ++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/examples/test_all_example.sh b/examples/test_all_example.sh
index af9aa491..1e2c92d9 100755
--- a/examples/test_all_example.sh
+++ b/examples/test_all_example.sh
@@ -2,24 +2,41 @@
 NUM_PROC=4
 BFRUN="bfrun -np ${NUM_PROC}"
 RUN_DIR="$( pwd )"
-EXAMPLE_DIR="$( cd "$( dirname "$0" )" && pwd )"
+EXAMPLE_DIR="$( cd "$( dirname "$0" )" && pwd )/../examples"
 
 die() { echo >&2 -e "\nERROR: $@\n"; exit 1; }
 check() {
-    "$@" >>/dev/null 2>&1;
-    exit_code=$?;
+    timeout 2m "$@" >>/dev/null 2>&1;
+    local exit_code=$?;
     [ $exit_code -eq 0 ] \
         && echo "Command [$*] succeed" \
         || die "Command [$*] failed with error code $exit_code";
 }
 
+# check GPU exists
+nvidia-smi >>/dev/null 2>&1
+gpu_exit_code=$?
+if [[ $gpu_exit_code -eq 0 ]]; then
+    isgpu=1
+else
+    isgpu=0
+fi
+
+if [[ $isgpu -eq 1 ]]; then
+    echo "GPU Detected"
+else
+    echo "No GPU Detected."
+fi
+
 # PyTorch Average Concensus Cases
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --enable-dynamic-topology
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --asynchronous-mode
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --enable-dynamic-topology
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --asynchronous-mode
+if [[ $isgpu -eq 1 ]]; then
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --enable-dynamic-topology
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --asynchronous-mode
+fi
 
 # PyTorch Optimization Cases
 [ -f "${RUN_DIR}/plot.png" ] && EXIST_PLOT_PNG=1 || EXIST_PLOT_PNG=0
@@ -39,15 +56,17 @@ check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimize
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=win_put
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=win_put --disable-dynamic-topology
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology
+if [[ $isgpu -eq 1 ]]; then
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology
+fi
 
 # PyTorch Benchmark Cases
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=gradient_allreduce
@@ -59,15 +78,17 @@ check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-it
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=win_put
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=win_put --disable-dynamic-topology
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=gradient_allreduce
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce --atc-style
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put
-check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology
+if [[ $isgpu -eq 1 ]]; then
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=gradient_allreduce
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce --atc-style
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put
+    check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology
+fi
 
 # PyTorch ResNet Cases
 [ -d "${RUN_DIR}/logs" ] && EXIST_LOGS_DIR=1 || EXIST_LOGS_DIR=0
@@ -80,6 +101,7 @@ check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --e
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=win_put
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=win_put --disable-dynamic-topology
+if [[ $isgpu -eq 1 ]]; then
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=allreduce
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style
@@ -89,4 +111,5 @@ check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --e
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=win_put
 check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology
+fi
 [ "${EXIST_LOGS_DIR}" == 0 ] && rm -rf ${RUN_DIR}/logs
\ No newline at end of file

From a836b021c82ddd12e3cfaaa5e5951a180d085bfa Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Tue, 16 Mar 2021 23:04:19 -0700
Subject: [PATCH 25/36] Move test example

---
 {examples => test}/test_all_example.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {examples => test}/test_all_example.sh (100%)

diff --git a/examples/test_all_example.sh b/test/test_all_example.sh
similarity index 100%
rename from examples/test_all_example.sh
rename to test/test_all_example.sh

From e02a3d055f68963df201ff7e57acc4566b462627 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 18 Mar 2021 01:40:39 -0700
Subject: [PATCH 26/36] Fix NCCL side dst_weight fusion bug

---
 bluefog/common/nccl_controller.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index bc823ddd..9fdce4e5 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -1101,12 +1101,12 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
                              nccl_ctx_.nccl_comm, nccl_ctx_.stream));
         }
       } else {
-        size_t i = 0;
-        for (int send_rank : *first_entry.send_neighbors) {
+        for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) {
           void* sendbuf = 
               (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size);
           NCCLCHECK(ncclSend(sendbuf, num_elements,
-                             GetNCCLDataType(first_entry.tensor), send_rank,
+                             GetNCCLDataType(first_entry.tensor),
+                             first_entry.send_neighbors->at(i),
                              nccl_ctx_.nccl_comm, nccl_ctx_.stream));
         }
       }
@@ -1951,8 +1951,8 @@ void NCCLController::MemcpyInWeightFusionBuffer(
   int64_t offset = 0;
   for (size_t i = 0; i < num_dst; ++i) {
     void* weight_buffer_data_at_offset = (uint8_t*)weight_buffer_data + offset;
-    CUDACHECK(cudaMemcpy(weight_buffer_data_at_offset, buffer_data, data_size,
-                         cudaMemcpyDeviceToDevice));
+    CUDACHECK(cudaMemcpyAsync(weight_buffer_data_at_offset, buffer_data, data_size,
+                              cudaMemcpyDeviceToDevice, nccl_ctx_.stream));
     offset += data_size;
   }
 }

From 8e42ca2f0e9d1b6b0408eb5fffff59ad60e73891 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 18 Mar 2021 02:11:18 -0700
Subject: [PATCH 27/36] Add agg to make matplotlib more stable

---
 examples/pytorch_optimization.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/pytorch_optimization.py b/examples/pytorch_optimization.py
index 162f9040..f731cc4b 100644
--- a/examples/pytorch_optimization.py
+++ b/examples/pytorch_optimization.py
@@ -15,6 +15,8 @@
 
 import os
 import torch
+import matplotlib
+matplotlib.use('agg')
 import matplotlib.pyplot as plt
 import argparse
 

From e5f8722fca71b44eecee172352cd5234523cca50 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Sun, 21 Mar 2021 01:25:03 -0700
Subject: [PATCH 28/36] Address comments for setup.py

---
 examples/pytorch_optimization.py |  2 +-
 setup.py                         | 22 ++++++++++++++--------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/examples/pytorch_optimization.py b/examples/pytorch_optimization.py
index f731cc4b..658cd485 100644
--- a/examples/pytorch_optimization.py
+++ b/examples/pytorch_optimization.py
@@ -16,7 +16,7 @@
 import os
 import torch
 import matplotlib
-matplotlib.use('agg')
+matplotlib.use('agg') # Make matplotlib more robust when interface plotting is impossible.
 import matplotlib.pyplot as plt
 import argparse
 
diff --git a/setup.py b/setup.py
index 6857b9bc..f4e3bffb 100644
--- a/setup.py
+++ b/setup.py
@@ -22,6 +22,7 @@
 import sys
 import textwrap
 import traceback
+from typing import List
 
 
 from distutils.errors import CompileError, DistutilsError, \
@@ -437,21 +438,26 @@ def is_torch_cuda(build_ext, include_dirs, extra_compile_args):
         print('INFO: Above error indicates that this PyTorch installation does not support CUDA.')
         return False
 
-def build_nvcc_extra_objects(cxx11_abi: bool):
+def get_nvcc_cmd() -> str:
+    from shutil import which
+    nvcc_cmd = which('nvcc')
+    if nvcc_cmd is None:
+        raise DistutilsPlatformError('Unable to find NVCC compiler')
+    return nvcc_cmd
+
+def build_nvcc_extra_objects(nvcc_cmd: str, cxx11_abi: bool) -> List[str]:
     # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -rdc=true -c cuda_kernels.cu
     # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -dlink -o cuda_kernels_link.o \
     #      cuda_kernels.o -lcudart
-    nvcc = 'nvcc'
-    common_flags = f"'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}'"
-    nvcc_with_flags = f"{nvcc} --compiler-options {common_flags}"
+    nvcc_flags = f"--compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}'"
 
     extra_object_dir = 'bluefog/common/cuda/'
     source = extra_object_dir+'cuda_kernels.cu'
     object_file = extra_object_dir+'cuda_kernels.o'
     object_link = extra_object_dir+'cuda_kernels_link.o'
 
-    command_object = f"{nvcc_with_flags} -rdc=true -c {source} -o {object_file}"
-    command_link   = f"{nvcc_with_flags} -dlink {object_file} -lcudart -o {object_link}"
+    command_object = f"{nvcc_cmd} {nvcc_flags} -rdc=true -c {source} -o {object_file}"
+    command_link   = f"{nvcc_cmd} {nvcc_flags} -dlink {object_file} -lcudart -o {object_link}"
 
     subprocess.check_call([command_object], shell=True)
     subprocess.check_call([command_link], shell=True)
@@ -469,7 +475,8 @@ def build_torch_extension(build_ext, global_options, torch_version):
     if have_cuda:
         cuda_include_dirs, cuda_lib_dirs = get_cuda_dirs(
             build_ext, options['COMPILE_FLAGS'])
-        cuda_extra_objects = build_nvcc_extra_objects(is_cxx11_abi)
+        nvcc_cmd = get_nvcc_cmd()
+        cuda_extra_objects = build_nvcc_extra_objects(nvcc_cmd, is_cxx11_abi)
         options['EXTRA_OBJECTS'] += cuda_extra_objects
 
         options['INCLUDES'] += cuda_include_dirs
@@ -515,7 +522,6 @@ def build_torch_extension(build_ext, global_options, torch_version):
     else:
         # CUDAExtension fails with `ld: library not found for -lcudart` if CUDA is not present
         from torch.utils.cpp_extension import CppExtension as TorchExtension
-    bluefog_tensorflow_mpi_lib.extra_objects = options['EXTRA_OBJECTS']
 
     ext = TorchExtension(bluefog_torch_mpi_lib.name,
                          define_macros=updated_macros,

From 9f2f55d78371fc2356381bea4d1ff62e8baccee8 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Sun, 21 Mar 2021 01:46:50 -0700
Subject: [PATCH 29/36] Simpler logic for dst_weighting_enabled and
 weighted_average_computation

---
 bluefog/torch/mpi_ops.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py
index dc508a34..e5e106ae 100644
--- a/bluefog/torch/mpi_ops.py
+++ b/bluefog/torch/mpi_ops.py
@@ -17,6 +17,7 @@
 from contextlib import contextmanager
 from typing import List, Dict, Union, Optional
 
+import numpy as np
 import torch
 
 from bluefog.torch import mpi_lib  # C library
@@ -488,10 +489,9 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
                          "enabling dynamic topology.")
     else:
         dynamic_neighbors_enabled = True
-        dst_weighting_enabled = True
         if isinstance(dst_weights, list):
             dst_weights = {dst:1.0 for dst in dst_weights}
-            dst_weighting_enabled = False
+        dst_weighting_enabled = not np.allclose(list(dst_weights.values()), 1.0)
     if self_weight is None and src_weights is None:
         # Implying this is static graph.
         if is_topo_weighted():
@@ -515,13 +515,8 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
             raise ValueError("The key of weights should only contain the ranks that belong to "
                              " in-neighbors and self rank.")
         uniform_weights = 1.0/(len(src_weights)+1)
-        weighted_average_computation = False
-        if abs(self_weight - uniform_weights) > 1e-6:
-            weighted_average_computation = True
-        for n_weights in src_weights.values():
-            if abs(n_weights - uniform_weights) > 1e-6:
-                weighted_average_computation = True
-                break
+        weighted_average_computation = not(np.isclose(self_weight, uniform_weights) and
+                                           np.allclose(list(src_weights.values()), uniform_weights))
     else:
         raise ValueError("Arguments self_weight and neighbor_weights have to be presented at "
                          "the same time")

From d7a93103a306fcc07d86194a7cf7f192e37edba9 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Sun, 21 Mar 2021 01:56:58 -0700
Subject: [PATCH 30/36] Better consideration for weight buffer size

---
 bluefog/common/operations.cc   | 2 ++
 bluefog/common/tensor_queue.cc | 7 ++++---
 bluefog/common/tensor_queue.h  | 4 ++--
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc
index 84d18aba..3738f9a3 100644
--- a/bluefog/common/operations.cc
+++ b/bluefog/common/operations.cc
@@ -780,6 +780,8 @@ void PerformOperationWithFusion(std::vector<TensorTableEntry>& entries) {
       [&]() { timeline.ActivityStartAll(entries, "INIT_FUSION_BUFFER"); },
       [&]() { timeline.ActivityEndAll(entries); });
   
+  // As the dst_weight requires extra memory to scale the tensor for each destination, therefore,
+  // extra memory is required.
   Status status_dst_weight = Status::OK();
   if (first_entry.dst_weighting_enabled) {
     status_dst_weight = bluefog_global.fusion_buffer.InitializeWeightBuffer(
diff --git a/bluefog/common/tensor_queue.cc b/bluefog/common/tensor_queue.cc
index b0f67241..0737db91 100644
--- a/bluefog/common/tensor_queue.cc
+++ b/bluefog/common/tensor_queue.cc
@@ -155,19 +155,20 @@ std::shared_ptr<PersistentBuffer> FusionBufferManager::GetBuffer(int device) {
 }
 
 Status FusionBufferManager::InitializeWeightBuffer(
-    int64_t threshold, int mpi_size, int device, std::shared_ptr<OpContext> context,
+    int64_t threshold, int world_size, int device, std::shared_ptr<OpContext> context,
     std::function<void()> on_start_init, std::function<void()> on_end_init) {
   auto& elem = weight_tensor_fusion_buffers_[device];
   auto& buffer = elem.first;
   int64_t& size = elem.second;
-  if (size != threshold*mpi_size) {
+  // threshold * (world_size-1) is the upper bound for buffer
+  if (size != threshold*(world_size-1)) {
     buffer.reset();
     size = 0;
   }
 
   if (buffer == nullptr) {
     on_start_init();
-    size = threshold*mpi_size;
+    size = threshold*(world_size-1);
 
     // Lazily allocate persistent buffer for Tensor Fusion and keep it
     // forever per device.
diff --git a/bluefog/common/tensor_queue.h b/bluefog/common/tensor_queue.h
index 2c54c88f..47d74f88 100644
--- a/bluefog/common/tensor_queue.h
+++ b/bluefog/common/tensor_queue.h
@@ -86,13 +86,13 @@ class FusionBufferManager {
   //
   // Args:
   //  threshold: Size of the buffer in bytes.
-  //  mpi_size: Size of MPI nodes.
+  //  world_size: Size of MPI nodes.
   //  device: Device ID to associate the buffer.
   //  context: Framework used to create the buffer and associate it.
   //  on_start_init: Callback on starting buffer initialization.
   //  on_end_init: Callback on completing buffer initialization.
   Status InitializeWeightBuffer(int64_t threshold,
-                                int mpi_size,
+                                int world_size,
                                 int device,
                                 std::shared_ptr<OpContext> context,
                                 std::function<void()> on_start_init,

From 562b231bf5213f0b575f57657ca59fcc36f58c64 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 25 Mar 2021 22:23:36 -0700
Subject: [PATCH 31/36] Make src_weights as std::map, and simplify logic for
 PerformNeighborAllreduceCallback

---
 bluefog/torch/mpi_ops.cc | 63 +++++++++++-----------------------------
 1 file changed, 17 insertions(+), 46 deletions(-)

diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc
index 16a410d6..636da5f0 100644
--- a/bluefog/torch/mpi_ops.cc
+++ b/bluefog/torch/mpi_ops.cc
@@ -98,13 +98,12 @@ std::function<std::function<void(const Status&)>(std::function<void()>)>
 
 void PerformNeighborAllreduceCallback(::torch::Tensor tensor, ::torch::Tensor output,
                                       double self_weight,
-                                      const std::unordered_map<int, double>& src_weights,
-                                      const std::vector<int>& src_neighbors,
+                                      const std::map<int, double>& src_weights,
                                       bool avg_computation,
                                       bool dynamic_neighbors_enabled,
                                       bool is_hierarchical) {
   int src_size = bluefog_neighbor_size();
-  if (dynamic_neighbors_enabled) src_size = src_neighbors.size();
+  if (dynamic_neighbors_enabled) src_size = src_weights.size();
   if (src_size > 0) {
     ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output);
     ::torch::Tensor tensor_buffer = MaybeCopyToTensorBuffer(tensor);
@@ -118,40 +117,17 @@ void PerformNeighborAllreduceCallback(::torch::Tensor tensor, ::torch::Tensor ou
 
     // if avg_computation is set to be False, sum computation will be taken place.
     if (avg_computation) {
-      // 1) For a distributed graph topology, created with
-      // MPI_Dist_graph_create, the sequence of neighbors in the send and
-      // receive buffers at each process is defined as the sequence returned
-      // by MPI_Dist_graph_neighbors for destinations and sources,
-      // respectively. 2) MPI_Dist_graph_neighbors: If the communicator was
-      // created with MPI_Dist_graph_create_adjacent then the order of the
-      // values in sources and destinations is identical to the input that
-      // was used by the process with the same rank in comm_old in the
-      // creation call.
-      int* sources_ptr = nullptr;
-      if (!dynamic_neighbors_enabled) {
-        int indgree = 0;
-        int outdegree = 0;
-        int* destinations_ptr = nullptr;
-        bluefog_load_topology(&indgree, sources_ptr, &outdegree,
-                              destinations_ptr);
-      }
       auto output_reduced = output_buffer.slice(0, 0, first_dim);
-      for (int i = 0; i < src_size; i++) {
-        double weight = 0.0;
-        int src_rank;
-        if (!dynamic_neighbors_enabled) src_rank = *(sources_ptr+i);
-        else src_rank = src_neighbors[i];
-        auto it = src_weights.find(src_rank);
-        if (it != src_weights.end()) {
-          weight = it->second;
-        }
-
+      int i = 0;
+      for (auto kv : src_weights) {
+        double weight = kv.second;
         if (i == 0) {
           output_reduced.mul_(weight);
         } else {
           output_reduced.add_(
               output_buffer.slice(0, i * first_dim, (i + 1) * first_dim), weight);
         }
+        ++i;
       }
       output_buffer.resize_(shape_vector);
       output_buffer.add_(tensor_buffer, self_weight);
@@ -431,10 +407,12 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
 
   auto callback_wrapper = GetCallbackWrapper(handle, timeline_ptr, op_name, tid);
 
-  std::vector<int> src_neighbors;
+  std::map<int, double> src_weights_map;
   for (auto kv : src_weights)
+    src_weights_map.insert(kv);
+  std::vector<int> src_neighbors;
+  for (auto kv : src_weights_map)
     src_neighbors.push_back(kv.first);
-  std::sort(src_neighbors.begin(), src_neighbors.end());
 
   std::vector<int> dst_neighbors;
   for (auto kv : dst_weights)
@@ -462,16 +440,12 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
         bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec,
         dynamic_neighbors_enabled, dst_weighting_enabled, is_hierarchical,
         enable_topo_check, op_name, CPU_DEVICE_ID,
-        callback_wrapper([self_weight, src_weights, avg_computation,
-                          cpu_output, tensor, src_neighbors,
-                          dynamic_neighbors_enabled, is_hierarchical, output,
-                          device]() mutable {
+        callback_wrapper([self_weight, src_weights_map, avg_computation, cpu_output, tensor,
+                          dynamic_neighbors_enabled, is_hierarchical, output, device]() mutable {
           with_device device_guard(device);
           output.copy_(cpu_output);
-          PerformNeighborAllreduceCallback(tensor, output, self_weight,
-                                           src_weights, src_neighbors,
-                                           avg_computation,
-                                           dynamic_neighbors_enabled,
+          PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_map,
+                                           avg_computation, dynamic_neighbors_enabled,
                                            is_hierarchical);
         }));
 
@@ -486,13 +460,10 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
         bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec,
         dynamic_neighbors_enabled, dst_weighting_enabled,
         is_hierarchical, enable_topo_check, op_name, device,
-        callback_wrapper([self_weight, src_weights, avg_computation,
-                          src_neighbors, dynamic_neighbors_enabled,
+        callback_wrapper([self_weight, src_weights_map, avg_computation, dynamic_neighbors_enabled,
                           is_hierarchical, tensor, output]() mutable {
-          PerformNeighborAllreduceCallback(tensor, output, self_weight,
-                                           src_weights, src_neighbors,
-                                           avg_computation,
-                                           dynamic_neighbors_enabled,
+          PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_map,
+                                           avg_computation, dynamic_neighbors_enabled,
                                            is_hierarchical);
         }));
     ThrowIfError(enqueue_result);

From 6764efa44c75d9f064e0d50c4b4b61071129b742 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 25 Mar 2021 23:23:12 -0700
Subject: [PATCH 32/36] Add TODO #80 and #81, and simplify the logic for
 dst_weight

---
 bluefog/common/global_state.h     |   1 +
 bluefog/common/mpi_context.h      |   1 +
 bluefog/common/mpi_controller.cc  | 134 ++++++++++--------------------
 bluefog/common/nccl_controller.cc |  53 ++++++------
 bluefog/common/operations.cc      |   2 +-
 bluefog/torch/mpi_ops.py          |   2 +-
 6 files changed, 72 insertions(+), 121 deletions(-)

diff --git a/bluefog/common/global_state.h b/bluefog/common/global_state.h
index d9addcac..6f940769 100644
--- a/bluefog/common/global_state.h
+++ b/bluefog/common/global_state.h
@@ -80,6 +80,7 @@ struct BluefogGlobalState {
   // Threshold for Tensor Fusion.  All tensors that occupy memory beyond this
   // threshold will be fused.
   int64_t tensor_fusion_threshold = 8 * 1024 * 1024;
+  int64_t tensor_fusion_threshold_for_dst_weight = 8 * 1024 * 1024;
   FusionBufferManager fusion_buffer;
 
   // Because setting topology happens in the main thread instead of communication
diff --git a/bluefog/common/mpi_context.h b/bluefog/common/mpi_context.h
index aaad85db..77f28e48 100644
--- a/bluefog/common/mpi_context.h
+++ b/bluefog/common/mpi_context.h
@@ -237,6 +237,7 @@ class MPIContext {
   MPI_Datatype mpi_float16_t;
   MPI_Op mpi_float16_sum;
 
+  // TODO(hhb): #80 We should use a common context for MPI and NCCL controller for CUDA usage.
 #if HAVE_CUDA
   // CUDA Stream
   cudaStream_t stream;
diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index 07fc3f0a..88f17516 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -459,16 +459,10 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
 
   if (!entry.is_hierarchical) {
     if (!entry.dynamic_neighbors_enabled) {
-      int ret_code = MPI_Neighbor_allgather(
+      MPICHECK(MPI_Neighbor_allgather(
           sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor),
           buffer_data, num_elements, mpi_ctx_.GetMPIDataType(entry.output),
-          mpi_ctx_.GetMPICommunicator(Communicator::GRAPH));
-      if (ret_code != MPI_SUCCESS) {
-        throw std::runtime_error(
-            "MPI_Neighbor_allreduce (through neighbor_allgather) failed, see "
-            "MPI "
-            "output for details.");
-      }
+          mpi_ctx_.GetMPICommunicator(Communicator::GRAPH)));
     } else {
       int nsend = entry.send_neighbors->size();
       int nrecv = entry.recv_neighbors->size();
@@ -490,17 +484,10 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
       for (int i = 0; i < nrecv; ++i) {
         void* recvbuf = (void*)(static_cast<const char*>(entry.output->data()) +
                                 num_elements * i * element_size);
-        int ret_code = MPI_Irecv(
-            recvbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.output),
-            entry.recv_neighbors->at(i),
-            mpi_ctx_.rank_ + entry.recv_neighbors->at(i),
-            mpi_ctx_.GetMPICommunicator(Communicator::GRAPH),
-            &requests[i + nsend]);
-        if (ret_code != MPI_SUCCESS) {
-          throw std::runtime_error(
-              "MPI_Irecv (for dynamic neighbor_allreduce) failed, see MPI "
-              "output for details.");
-        }
+        MPICHECK(MPI_Irecv(recvbuf, num_elements,
+            mpi_ctx_.GetMPIDataType(entry.output), entry.recv_neighbors->at(i),
+            /*tag=*/mpi_ctx_.rank_ + entry.recv_neighbors->at(i),
+            mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL), &requests[i + nsend]));
       }
 
       if (entry.dst_weighting_enabled) {
@@ -509,33 +496,16 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
             std::this_thread::sleep_for(std::chrono::nanoseconds(100));
           }
         }
-        for (int i = 0; i < nsend; ++i) {
-          int ret_code = MPI_Isend(
-              weighted_tensors[i].get()->data(), num_elements,
-              mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i),
-              mpi_ctx_.rank_ + entry.send_neighbors->at(i),
-              mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
-          if (ret_code != MPI_SUCCESS) {
-            throw std::runtime_error(
-                "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI "
-                "output for details.");
-          }
-        }
-      } else {
-        for (int i = 0; i < nsend; ++i) {
-          int ret_code = MPI_Isend(
-              sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor),
-              entry.send_neighbors->at(i),
-              mpi_ctx_.rank_ + entry.send_neighbors->at(i),
-              mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
-          if (ret_code != MPI_SUCCESS) {
-            throw std::runtime_error(
-                "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI "
-                "output for details.");
-          }
-        }
       }
-
+      for (int i = 0; i < nsend; ++i) {
+        const void* buffer_send = sendbuf;
+        if (entry.dst_weighting_enabled)
+          buffer_send = weighted_tensors[i].get()->data();
+        MPICHECK(MPI_Isend(buffer_send, num_elements,
+            mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i),
+            /*tag=*/mpi_ctx_.rank_ + entry.send_neighbors->at(i),
+            mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL), &requests[i]));
+      }
       MPI_Waitall(nsend + nrecv, requests.data(), statuses.data());
       error_message =
           GenerateNeighborExchangeErrorMessage(statuses, nsend, nrecv);
@@ -551,6 +521,11 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
           "Local size is smaller than 2, in this case, you should use "
           "neighbor_allreduce instead of hierarchical_neighbor_allreduce.");
     }
+    if (entry.dst_weighting_enabled) {
+      throw std::runtime_error(
+          "Under hierarchical neighbor_allreduce, argument "
+          "dst_weight should not be enabled for now.");
+    }
     // 1. In-place allreduce
     MPI_Allreduce(MPI_IN_PLACE, (void*)sendbuf, num_elements,
                   mpi_ctx_.GetMPIDataType(entry.tensor), MPI_SUM,
@@ -644,8 +619,8 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) {
     if (first_entry.is_hierarchical) {
       // TODO: support check.
-      BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
-                  << "allreduce ops but it is not supported yet.";
+      BFLOG(WARNING) << "Request to check topology for hierarchical neighbor "
+                     << "allreduce ops but it is not supported yet.";
     }
     is_topo_check_fail = CheckNeighborSendRecvPattern(
         first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name,
@@ -709,15 +684,10 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
 
   if (!first_entry.is_hierarchical) {
     if (!first_entry.dynamic_neighbors_enabled) {
-      int ret_code = MPI_Neighbor_allgather(
+      MPICHECK(MPI_Neighbor_allgather(
           fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor),
           buffer_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.output),
-          mpi_ctx_.GetMPICommunicator(Communicator::GRAPH));
-      if (ret_code != MPI_SUCCESS) {
-        throw std::runtime_error(
-            "MPI_Neighbor_allreduce (through neighbor_allgather) failed, see MPI "
-            "output for details.");
-      }
+          mpi_ctx_.GetMPICommunicator(Communicator::GRAPH)));
     } else {
       int nsend = first_entry.send_neighbors->size();
       int nrecv = first_entry.recv_neighbors->size();
@@ -726,48 +696,25 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
       for (int i = 0; i < nrecv; ++i) {
         void* recvbuf =
             (void*)((uint8_t*)buffer_data + num_elements * i * element_size);
-        int ret_code = MPI_Irecv(recvbuf, num_elements,
-                                mpi_ctx_.GetMPIDataType(first_entry.output),
-                                first_entry.recv_neighbors->at(i),
-                                /*tag=*/mpi_ctx_.rank_ + first_entry.recv_neighbors->at(i),
-                                mpi_ctx_.GetMPICommunicator(Communicator::GRAPH),
-                                &requests[i + nsend]);
-        if (ret_code != MPI_SUCCESS) {
-          throw std::runtime_error(
-              "MPI_Irecv (for dynamic neighbor_allreduce) failed, see MPI output "
-              "for details.");
-        }
+        MPICHECK(MPI_Irecv(recvbuf, num_elements,
+            mpi_ctx_.GetMPIDataType(first_entry.output), first_entry.recv_neighbors->at(i),
+            /*tag=*/mpi_ctx_.rank_ + first_entry.recv_neighbors->at(i),
+            mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL), &requests[i + nsend]));
       }
-      if (!first_entry.dst_weighting_enabled) {
-        for (int i = 0; i < nsend; ++i) {
-          int ret_code = MPI_Isend(
-              fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor),
-              first_entry.send_neighbors->at(i),
-              /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i),
-              mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
-          if (ret_code != MPI_SUCCESS) {
-            throw std::runtime_error(
-                "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output "
-                "for details.");
-          }
-        }
-      } else {
 #if HAVE_CUDA
+      if (first_entry.dst_weighting_enabled && first_entry.device != CPU_DEVICE_ID) {
         cudaStreamSynchronize(mpi_ctx_.stream);
+      }
 #endif
-        for (int i = 0; i < nsend; ++i) {
-          void* sendbuf = 
+      for (int i = 0; i < nsend; ++i) {
+        const void* sendbuf = fused_input_data;
+        if (first_entry.dst_weighting_enabled)
+          sendbuf = 
               (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size);
-          int ret_code = MPI_Isend(sendbuf, num_elements,
-              mpi_ctx_.GetMPIDataType(first_entry.tensor), first_entry.send_neighbors->at(i),
-              /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i),
-              mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]);
-          if (ret_code != MPI_SUCCESS) {
-            throw std::runtime_error(
-                "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output "
-                "for details.");
-          }
-        }
+        MPICHECK(MPI_Isend(sendbuf, num_elements,
+            mpi_ctx_.GetMPIDataType(first_entry.tensor), first_entry.send_neighbors->at(i),
+            /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i),
+            mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL), &requests[i]));
       }
       MPI_Waitall(nsend + nrecv, requests.data(), statuses.data());
       error_message =
@@ -784,6 +731,11 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
           "Local size is smaller than 2, in this case, you should use "
           "neighbor_allreduce instead of hierarchical_neighbor_allreduce.");
     }
+    if (first_entry.dst_weighting_enabled) {
+      throw std::runtime_error(
+          "Under hierarchical neighbor_allreduce, argument "
+          "dst_weight should not be enabled for now.");
+    }
     // 1. In-place allreduce
     MPI_Allreduce(MPI_IN_PLACE, (void*)fused_input_data, num_elements,
                   mpi_ctx_.GetMPIDataType(first_entry.tensor), MPI_SUM,
diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index 9fdce4e5..7bdd9acf 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -789,23 +789,19 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
         NCCLCHECK(ncclRecv(recvbuf, num_elements, GetNCCLDataType(entry.tensor),
                           recv_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
-      if(entry.dst_weighting_enabled)
-      {
+      if(entry.dst_weighting_enabled) {
         if (ready_event != nullptr) {
           while (!ready_event->Ready()) {
             std::this_thread::sleep_for(std::chrono::nanoseconds(100));
           }
         }
-        for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
-          NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements,
-                             GetNCCLDataType(entry.tensor), entry.send_neighbors->at(i),
-                             nccl_ctx_.nccl_comm, nccl_ctx_.stream));
-        }
-      } else {
-        for (int send_rank : *entry.send_neighbors) {
-          NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(entry.tensor),
-                             send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream));
-        }
+      }
+      for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
+        const void* buffer_send = sendbuf;
+        if (entry.dst_weighting_enabled)
+          buffer_send = weighted_tensors[i].get()->data();
+        NCCLCHECK(ncclSend(buffer_send, num_elements, GetNCCLDataType(entry.tensor),
+                           entry.send_neighbors->at(i), nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
     }
     ncclGroupEnd();
@@ -816,6 +812,11 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
           "send_machine_neighbors should "
           "not be empty.");
     }
+    if (entry.dst_weighting_enabled) {
+      throw std::runtime_error(
+          "Under hierarchical neighbor_allreduce, argument "
+          "dst_weight should not be enabled for now.");
+    }
     // 1. In-place allreduce for all local ranks. Note it is sum, so we need to
     // divided by local size at call back stage.
     NCCLCHECK(ncclAllReduce(sendbuf, (void*)sendbuf, num_elements,
@@ -1093,22 +1094,13 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
                            GetNCCLDataType(first_entry.tensor), recv_rank,
                            nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
-      if (!first_entry.dst_weighting_enabled)
-      {
-        for (int send_rank : *first_entry.send_neighbors) {
-          NCCLCHECK(ncclSend(fused_input_data, num_elements,
-                             GetNCCLDataType(first_entry.tensor), send_rank,
-                             nccl_ctx_.nccl_comm, nccl_ctx_.stream));
-        }
-      } else {
-        for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) {
-          void* sendbuf = 
-              (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size);
-          NCCLCHECK(ncclSend(sendbuf, num_elements,
-                             GetNCCLDataType(first_entry.tensor),
-                             first_entry.send_neighbors->at(i),
-                             nccl_ctx_.nccl_comm, nccl_ctx_.stream));
-        }
+      for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) {
+        const void* sendbuf = fused_input_data;
+        if (first_entry.dst_weighting_enabled)
+          sendbuf = (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size);
+        NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(first_entry.tensor),
+                           first_entry.send_neighbors->at(i),
+                           nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
     }
     ncclGroupEnd();
@@ -1124,6 +1116,11 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
         "neighbor_allreduce instead of hierarchical_neighbor_allreduce."
       );
     }
+    if (first_entry.dst_weighting_enabled) {
+      throw std::runtime_error(
+          "Under hierarchical neighbor_allreduce, argument "
+          "dst_weight should not be enabled for now.");
+    }
 
     // 1. In-place allreduce for all local ranks. Note it is sum, so we need to
     // divided by local size at call back stage.
diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc
index 3738f9a3..ddf90cf5 100644
--- a/bluefog/common/operations.cc
+++ b/bluefog/common/operations.cc
@@ -785,7 +785,7 @@ void PerformOperationWithFusion(std::vector<TensorTableEntry>& entries) {
   Status status_dst_weight = Status::OK();
   if (first_entry.dst_weighting_enabled) {
     status_dst_weight = bluefog_global.fusion_buffer.InitializeWeightBuffer(
-      bluefog_global.tensor_fusion_threshold, mpi_context.size_,
+      bluefog_global.tensor_fusion_threshold_for_dst_weight, mpi_context.size_,
       first_entry.device, first_entry.context,
       [&]() { timeline.ActivityStartAll(entries, "INIT_WEIGHT_FUSION_BUFFER"); },
       [&]() { timeline.ActivityEndAll(entries); });
diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py
index e5e106ae..1b82b664 100644
--- a/bluefog/torch/mpi_ops.py
+++ b/bluefog/torch/mpi_ops.py
@@ -638,7 +638,7 @@ def neighbor_allreduce_nonblocking(tensor: torch.Tensor, *,
     return _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights,
                                            dst_weights, enable_topo_check, name=name)
 
-
+# TODO(hanbinhu) #81 Add dst_weight for hierarchical neighbor allreduce.
 def hierarchical_neighbor_allreduce(tensor: torch.Tensor,
                                     self_weight: float = None,
                                     neighbor_machine_weights: Dict[int, float] = None,

From 1e8db23a397950705090bed7f709d9cadec0dfd0 Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Thu, 25 Mar 2021 23:47:53 -0700
Subject: [PATCH 33/36] Wrap CheckNeighborSendRecvPattern again

---
 bluefog/common/mpi_context.cc     |  2 +-
 bluefog/common/mpi_context.h      |  2 +-
 bluefog/common/mpi_controller.cc  | 74 ++++++++++++-------------------
 bluefog/common/mpi_controller.h   |  4 +-
 bluefog/common/nccl_controller.cc | 39 +++-------------
 5 files changed, 38 insertions(+), 83 deletions(-)

diff --git a/bluefog/common/mpi_context.cc b/bluefog/common/mpi_context.cc
index addd3b3a..7a68ce39 100644
--- a/bluefog/common/mpi_context.cc
+++ b/bluefog/common/mpi_context.cc
@@ -222,7 +222,7 @@ MPI_Op MPIContext::GetMPISumOp(DataType dtype) {
   return dtype == DataType::BLUEFOG_FLOAT16 ? mpi_float16_sum : MPI_SUM;
 }
 
-MPI_Comm MPIContext::GetMPICommunicator(Communicator comm) {
+MPI_Comm MPIContext::GetMPICommunicator(Communicator comm) const {
   switch (comm) {
     case Communicator::GLOBAL:
       return mpi_comm;
diff --git a/bluefog/common/mpi_context.h b/bluefog/common/mpi_context.h
index 77f28e48..c220d497 100644
--- a/bluefog/common/mpi_context.h
+++ b/bluefog/common/mpi_context.h
@@ -148,7 +148,7 @@ class MPIContext {
 
   MPI_Op GetMPISumOp(DataType dtype);
 
-  MPI_Comm GetMPICommunicator(Communicator comm);
+  MPI_Comm GetMPICommunicator(Communicator comm) const;
 
   int GetMPITypeSize(DataType dtype);
 
diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index 88f17516..5d358476 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -294,23 +294,13 @@ void MPIController::NeighborAllgather(TensorTableEntry& entry) {
     displcmnts = new int[mpi_ctx_.neighbor_indgree_];
     status = mpi_ctx_.AllocateOutput(entry, recvcounts, Communicator::GRAPH);
   } else {
-    bool is_topo_check_fail = false;
-    if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
-      if (entry.is_hierarchical) {
-        // TODO: support check.
-        BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
-                    << "allreduce ops but it is not supported yet.";
-      }
-      is_topo_check_fail = CheckNeighborSendRecvPattern(
-          entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name,
-          mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
-    }
-
+    bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(entry, mpi_ctx_, timeline_ptr);
     if (is_topo_check_fail) {
       entry.callback(Status::InvalidArgument(
           "Src and dst neighbor ranks do not match"));
       return;
     }
+
     recvcounts = new int[entry.recv_neighbors->size()];
     displcmnts = new int[entry.recv_neighbors->size()];
     status = mpi_ctx_.AllocateOutput(entry, recvcounts, Communicator::DYNAMIC,
@@ -372,7 +362,7 @@ void MPIController::NeighborAllgather(TensorTableEntry& entry) {
 
 // Function to check if the sending and receiving neighbors match in the topology.
 bool CheckNeighborSendRecvPattern(
-  const std::vector<int>* send_neighbors, const std::vector<int>* recv_neighbors,
+  const std::vector<int>& send_neighbors, const std::vector<int>& recv_neighbors,
   const std::string& tensor_name, int size, Timeline* timeline_ptr, const MPI_Comm& comm) {
   bool res = false;
   timeline_ptr->ActivityStart(tensor_name, "NEGOTIATION");
@@ -381,17 +371,12 @@ bool CheckNeighborSendRecvPattern(
   bool* send_check_buf = new bool[2 * size];
   std::fill_n(send_check_buf, 2 * size, false);
   bool* recv_check_buf = new bool[2 * size * size];
-  for (int send_rank : *(send_neighbors))
+  for (int send_rank : send_neighbors)
     send_check_buf[send_rank] = true;
-  for (int recv_rank : *(recv_neighbors))
+  for (int recv_rank : recv_neighbors)
     send_check_buf[size + recv_rank] = true;
-  int ret_code = MPI_Allgather(send_check_buf, size * 2, MPI_C_BOOL,
-                               recv_check_buf, size * 2, MPI_C_BOOL, comm);
-  if (ret_code != MPI_SUCCESS) {
-    throw std::runtime_error(
-        "MPI_Allgather (for dynamic neighbor_allreduce negotiation) failed, "
-        "see MPI output for details.");
-  }
+  MPICHECK(MPI_Allgather(send_check_buf, size * 2, MPI_C_BOOL,
+                         recv_check_buf, size * 2, MPI_C_BOOL, comm));
   // This checks that send matrix and transposed recv matrix should be the
   // same. If same, the topology is good to go. If not, there is mismatch edge
   // to be fixed.
@@ -413,6 +398,24 @@ bool CheckNeighborSendRecvPattern(
   return res;
 }
 
+bool CheckNeighborSendRecvPatternForEntry(const TensorTableEntry& entry,
+                                          const MPIContext& mpi_ctx,
+                                          Timeline* timeline_ptr) {
+  bool is_topo_check_fail = false;
+  if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
+    if (entry.is_hierarchical) {
+      // TODO: support check.
+      BFLOG(WARNING) << "Request to check topology for hierarchical neighbor "
+                     << "allreduce ops but it is not supported yet.";
+    } else {
+      is_topo_check_fail = CheckNeighborSendRecvPattern(
+          *entry.send_neighbors, *entry.recv_neighbors, entry.tensor_name,
+          mpi_ctx.size_, timeline_ptr, mpi_ctx.GetMPICommunicator(Communicator::GLOBAL));
+    }
+  }
+  return is_topo_check_fail;
+}
+
 void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
   const void* sendbuf = entry.tensor->data();
   int num_elements = entry.tensor->shape().num_elements();
@@ -432,18 +435,7 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
 
   // If only partial sending is enabled, the following code block checks whether the sending
   // and recieving neighbors match each other when enable_topo_check is set to be True.
-  bool is_topo_check_fail = false;
-  if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
-    if (entry.is_hierarchical) {
-      // TODO: support check.
-      BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
-                  << "allreduce ops but it is not supported yet.";
-    }
-    is_topo_check_fail = CheckNeighborSendRecvPattern(
-        entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name,
-        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
-  }
-
+  bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(entry, mpi_ctx_, timeline_ptr);
   if (is_topo_check_fail) {
     entry.callback(Status::InvalidArgument(
         "Src(recv from) and dst(send to) neighbor ranks do not match"));
@@ -615,18 +607,8 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   // If only partial sending is enabled, the following code block checks whether
   // the sending and recieving neighbors match each other when enable_topo_check
   // is set to be True.
-  bool is_topo_check_fail = false;
-  if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) {
-    if (first_entry.is_hierarchical) {
-      // TODO: support check.
-      BFLOG(WARNING) << "Request to check topology for hierarchical neighbor "
-                     << "allreduce ops but it is not supported yet.";
-    }
-    is_topo_check_fail = CheckNeighborSendRecvPattern(
-        first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name,
-        mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
-  }
-
+  bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(first_entry, mpi_ctx_,
+                                                                 timeline_ptr);
   if (is_topo_check_fail) {
     for (auto& entry : entries) {
       entry.callback(Status::InvalidArgument(
diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h
index 3f5af419..be5daa34 100644
--- a/bluefog/common/mpi_controller.h
+++ b/bluefog/common/mpi_controller.h
@@ -28,8 +28,10 @@ namespace common {
 // Function to check if the sending and receiving neighbors match in the
 // topology.
 bool CheckNeighborSendRecvPattern(
-  const std::vector<int>* send_neighbors, const std::vector<int>* recv_neighbors,
+  const std::vector<int>& send_neighbors, const std::vector<int>& recv_neighbors,
   const std::string& tensor_name, int size, Timeline* timeline_ptr, const MPI_Comm& comm);
+bool CheckNeighborSendRecvPatternForEntry(
+  const TensorTableEntry& entry, const MPIContext& mpi_ctx, Timeline* timeline_ptr);
 
 class MPIController {
  public:
diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index 7bdd9acf..7f2d52e5 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -579,23 +579,13 @@ void NCCLController::NeighborAllgather(TensorTableEntry& entry) {
     displcmnts = new int[mpi_ctx_.neighbor_indgree_];
     status = mpi_ctx_.AllocateOutput(entry, recvcounts, Communicator::GRAPH);
   } else {
-    bool is_topo_check_fail = false;
-    if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
-      if (entry.is_hierarchical) {
-        // TODO: support check.
-        BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
-                    << "allreduce ops but it is not supported yet.";
-      }
-      is_topo_check_fail = CheckNeighborSendRecvPattern(
-          entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name,
-          mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
-    }
-
+    bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(entry, mpi_ctx_, timeline_ptr_);
     if (is_topo_check_fail) {
       entry.callback(Status::InvalidArgument(
           "Src and dst neighbor ranks do not match"));
       return;
     }
+
     recvcounts = new int[entry.recv_neighbors->size()];
     displcmnts = new int[entry.recv_neighbors->size()];
     status = mpi_ctx_.AllocateOutput(entry, recvcounts, Communicator::DYNAMIC,
@@ -737,17 +727,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
   // If only partial sending is enabled, the following code block checks whether
   // the sending and recieving neighbors match each other when enable_topo_check
   // is set to be True.
-  bool is_topo_check_fail = false;
-  if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) {
-    if (entry.is_hierarchical) {
-      // TODO: support check.
-      BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
-                  << "allreduce ops but it is not supported yet.";
-    }
-    is_topo_check_fail = CheckNeighborSendRecvPattern(
-        entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name,
-        mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
-  }
+  bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(entry, mpi_ctx_, timeline_ptr_);
   if (is_topo_check_fail) {
     entry.callback(Status::InvalidArgument(
         "Send and recv neighbors dont' match in neighbor "
@@ -1017,17 +997,8 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   // If only partial sending is enabled, the following code block checks whether
   // the sending and recieving neighbors match each other when enable_topo_check
   // is set to be True.
-  bool is_topo_check_fail = false;
-  if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) {
-    if (first_entry.is_hierarchical) {
-      // TODO: support check.
-      BFLOG(INFO) << "Request to check topology for hierarchical neighbor "
-                  << "allreduce ops but it is not supported yet.";
-    }
-    is_topo_check_fail = CheckNeighborSendRecvPattern(
-        first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name,
-        mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL));
-  }
+  bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(first_entry, mpi_ctx_,
+                                                                 timeline_ptr_);
   if (is_topo_check_fail) {
     for (auto& entry : entries) {
       entry.callback(Status::InvalidArgument(

From 3d90087d562a41fe3123865d8a6d620b0c4bd0cd Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Fri, 26 Mar 2021 00:55:05 -0700
Subject: [PATCH 34/36] Add two more TODOs

---
 bluefog/common/mpi_controller.cc | 3 +++
 bluefog/torch/mpi_ops.py         | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index 5d358476..832f6be5 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -467,6 +467,9 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
           weighted_tensors.push_back(std::move(weighted_tensor_ptr));
         }
       }
+      // TODO(ybc) #83 Better design pattern for data_weight synchronization
+      // This ready event make sure the data_weight computation is done before communication, as
+      // Pytorch CUDA stream is not synchronized with our CUDA stream.
       std::shared_ptr<common::ReadyEvent> ready_event =
           entry.context->RecordReadyEvent(entry.device);
 
diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py
index 1b82b664..4c6b1dbc 100644
--- a/bluefog/torch/mpi_ops.py
+++ b/bluefog/torch/mpi_ops.py
@@ -570,6 +570,7 @@ def neighbor_allreduce(tensor: torch.Tensor, *,
 
     Note: self_weight and neighbor_weights must be presented at the same time.
     """
+    # TODO(hanbinhu) #82 Symmetrical argument for self_weight, src_weights, dst_weights
     if (self_weight is None and src_weights is not None) or \
        (self_weight is not None and src_weights is None):
         raise ValueError("Arguments self_weight and src_weights have to be presented at "
@@ -625,6 +626,7 @@ def neighbor_allreduce_nonblocking(tensor: torch.Tensor, *,
 
     Note: self_weight and neighbor_weights must be presented at the same time.
     """
+    # TODO(hanbinhu) #82 Symmetrical argument for self_weight, src_weights, dst_weights
     if (self_weight is None and src_weights is not None) or \
        (self_weight is not None and src_weights is None):
         raise ValueError("Arguments self_weight and src_weights have to be presented at "

From 96887b5063953784400b933a841bcb8d414c97ac Mon Sep 17 00:00:00 2001
From: Hanbin Hu <hanbinhu2016@gmail.com>
Date: Sat, 27 Mar 2021 12:49:53 -0700
Subject: [PATCH 35/36] Address review comments

---
 bluefog/common/global_state.h     |  2 +-
 bluefog/common/mpi_controller.cc  | 92 +++++++++++++++++--------------
 bluefog/common/mpi_controller.h   |  4 ++
 bluefog/common/nccl_controller.cc | 82 +++++++++++++++------------
 bluefog/common/nccl_controller.h  |  4 ++
 bluefog/common/operations.cc      |  2 +
 bluefog/common/tensor_queue.h     |  3 +
 bluefog/torch/mpi_ops.cc          | 19 ++++---
 setup.py                          | 16 ++++--
 9 files changed, 135 insertions(+), 89 deletions(-)

diff --git a/bluefog/common/global_state.h b/bluefog/common/global_state.h
index 6f940769..2be439a3 100644
--- a/bluefog/common/global_state.h
+++ b/bluefog/common/global_state.h
@@ -80,7 +80,7 @@ struct BluefogGlobalState {
   // Threshold for Tensor Fusion.  All tensors that occupy memory beyond this
   // threshold will be fused.
   int64_t tensor_fusion_threshold = 8 * 1024 * 1024;
-  int64_t tensor_fusion_threshold_for_dst_weight = 8 * 1024 * 1024;
+  int64_t tensor_fusion_threshold_for_dst_weight = tensor_fusion_threshold;
   FusionBufferManager fusion_buffer;
 
   // Because setting topology happens in the main thread instead of communication
diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
index 832f6be5..cd632049 100644
--- a/bluefog/common/mpi_controller.cc
+++ b/bluefog/common/mpi_controller.cc
@@ -449,13 +449,13 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
   // including itself is more intuitive.
   std::string error_message = "";
 
-  if (!entry.is_hierarchical) {
-    if (!entry.dynamic_neighbors_enabled) {
+  if (!entry.is_hierarchical) {  // neighbor allreduce without hierarchy
+    if (!entry.dynamic_neighbors_enabled) {  // static topology
       MPICHECK(MPI_Neighbor_allgather(
           sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor),
           buffer_data, num_elements, mpi_ctx_.GetMPIDataType(entry.output),
           mpi_ctx_.GetMPICommunicator(Communicator::GRAPH)));
-    } else {
+    } else {  // dynamic topology
       int nsend = entry.send_neighbors->size();
       int nrecv = entry.recv_neighbors->size();
 
@@ -468,8 +468,9 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
         }
       }
       // TODO(ybc) #83 Better design pattern for data_weight synchronization
-      // This ready event make sure the data_weight computation is done before communication, as
-      // Pytorch CUDA stream is not synchronized with our CUDA stream.
+      // This ready event makes sure the data_weight computation is done before communication, as
+      // Pytorch CUDA stream is not synchronized with our CUDA stream, and it does nothing when
+      // it is running on CPU.
       std::shared_ptr<common::ReadyEvent> ready_event =
           entry.context->RecordReadyEvent(entry.device);
 
@@ -486,16 +487,14 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
       }
 
       if (entry.dst_weighting_enabled) {
-        if (ready_event != nullptr) {
-          while (!ready_event->Ready()) {
-            std::this_thread::sleep_for(std::chrono::nanoseconds(100));
-          }
+        while ((ready_event != nullptr) && !ready_event->Ready()) {
+          std::this_thread::sleep_for(std::chrono::nanoseconds(100));
         }
       }
       for (int i = 0; i < nsend; ++i) {
-        const void* buffer_send = sendbuf;
-        if (entry.dst_weighting_enabled)
-          buffer_send = weighted_tensors[i].get()->data();
+        const void* buffer_send = (entry.dst_weighting_enabled)
+                                      ? weighted_tensors[i]->data()
+                                      : sendbuf;
         MPICHECK(MPI_Isend(buffer_send, num_elements,
             mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i),
             /*tag=*/mpi_ctx_.rank_ + entry.send_neighbors->at(i),
@@ -505,7 +504,7 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) {
       error_message =
           GenerateNeighborExchangeErrorMessage(statuses, nsend, nrecv);
     }
-  } else {
+  } else {  // hierarchical neighbor allreduce
     if (entry.send_neighbors->empty()) {
       throw std::runtime_error(
           "Under hierarchical neighbor_allreduce, argument "
@@ -630,26 +629,8 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   if (first_entry.dst_weighting_enabled) {
     // Generate weighted data fusion for sending
     timeline_ptr->ActivityStartAll(entries, "MEMCPY_IN_WEIGHT_FUSION_BUFFER");
-    void* weight_buffer_data;
-    MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(),
-                               fused_input_data, num_elements, element_size,
-                               first_entry.context, first_entry.device);
-    int64_t offset = 0;
-    for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) {
-      double dst_weight = first_entry.send_weights->at(i);
-      void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset;
-      if (first_entry.device == CPU_DEVICE_ID) {
-        ScaleCPUBuffer(dst_weight, weight_buffer_data_offset, num_elements,
-                       first_entry.tensor->dtype());
-      } else {
-#if HAVE_CUDA
-        ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements,
-                            first_entry.tensor->dtype(), mpi_ctx_.stream);
-#endif
-      }
-      offset += num_elements * element_size;
-    }
-    weighted_fused_input_data = weight_buffer_data;
+    weighted_fused_input_data = GenerateWeightedFusedInputData(fused_input_data, first_entry,
+                                                               num_elements, element_size);
     timeline_ptr->ActivityEndAll(entries);
   }
 
@@ -667,13 +648,13 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   // including itself is more intuitive.
   std::string error_message = "";
 
-  if (!first_entry.is_hierarchical) {
-    if (!first_entry.dynamic_neighbors_enabled) {
+  if (!first_entry.is_hierarchical) {  // neighbor allreduce without hierarchy
+    if (!first_entry.dynamic_neighbors_enabled) {  // static topology
       MPICHECK(MPI_Neighbor_allgather(
           fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor),
           buffer_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.output),
           mpi_ctx_.GetMPICommunicator(Communicator::GRAPH)));
-    } else {
+    } else {  // dynamic topology
       int nsend = first_entry.send_neighbors->size();
       int nrecv = first_entry.recv_neighbors->size();
       std::vector<MPI_Request> requests(nsend + nrecv);
@@ -692,10 +673,10 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
       }
 #endif
       for (int i = 0; i < nsend; ++i) {
-        const void* sendbuf = fused_input_data;
-        if (first_entry.dst_weighting_enabled)
-          sendbuf = 
-              (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size);
+        const void* sendbuf =
+            (first_entry.dst_weighting_enabled)
+                ? (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size)
+                : fused_input_data;
         MPICHECK(MPI_Isend(sendbuf, num_elements,
             mpi_ctx_.GetMPIDataType(first_entry.tensor), first_entry.send_neighbors->at(i),
             /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i),
@@ -705,7 +686,7 @@ void MPIController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
       error_message =
           GenerateNeighborExchangeErrorMessage(statuses, nsend, nrecv);
     }
-  } else {
+  } else {  // hierarchical neighbor allreduce
     if (first_entry.send_neighbors->empty()) {
       throw std::runtime_error(
           "Under hierarchical neighbor_allreduce, argument "
@@ -1442,6 +1423,35 @@ void MPIController::MemcpyInWeightFusionBuffer(
   }
 }
 
+const void* MPIController::GenerateWeightedFusedInputData(const void* fused_input_data,
+                                                          const TensorTableEntry& entry,
+                                                          int64_t num_elements, int element_size) {
+  // Given a fused_input_data like [t_1, t_2], the storage for neighbor_allreduce in
+  // weighted fused input data is like [t_1_w1, t_2_w1 | t_1_w2, t_2_w2 | t_1_w3, t_2_w3].
+  // Here t_1 and t_2  means self tensor 1 and 2 and _w1, _w2, and _w3 means the
+  // destination weights to destination 1, 2, and 3.
+  void* weight_buffer_data;
+  MemcpyInWeightFusionBuffer(weight_buffer_data, entry.send_neighbors->size(),
+                             fused_input_data, num_elements, element_size,
+                             entry.context, entry.device);
+  int64_t offset = 0;
+  for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
+    double dst_weight = entry.send_weights->at(i);
+    void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset;
+    if (entry.device == CPU_DEVICE_ID) {
+      ScaleCPUBuffer(dst_weight, weight_buffer_data_offset, num_elements,
+                     entry.tensor->dtype());
+    } else {
+#if HAVE_CUDA
+      ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements,
+                          entry.tensor->dtype(), mpi_ctx_.stream);
+#endif
+    }
+    offset += num_elements * element_size;
+  }
+  return weight_buffer_data;
+}
+
 void MPIController::MemcpyInFusionBuffer(
     const std::vector<TensorTableEntry>& entries, void*& buffer_data,
     size_t& buffer_len) {
diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h
index be5daa34..b3eda810 100644
--- a/bluefog/common/mpi_controller.h
+++ b/bluefog/common/mpi_controller.h
@@ -127,6 +127,10 @@ class MPIController {
                                   const void* buffer_data, int64_t num_elements, int element_size,
                                   std::shared_ptr<OpContext> context, int device);
 
+  const void* GenerateWeightedFusedInputData(const void* fused_input_data,
+                                             const TensorTableEntry& entry,
+                                             int64_t num_elements, int element_size);
+
   void MemcpyOutFusionBufferForInputs(const void* fused_input_data,
                                       std::vector<TensorTableEntry>& entries);
 
diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc
index 7f2d52e5..f5fbd1df 100644
--- a/bluefog/common/nccl_controller.cc
+++ b/bluefog/common/nccl_controller.cc
@@ -739,9 +739,9 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
   // Ensure the lifecycle of the weighted tensors are alive after communication.
 
 #if NCCL_MINOR > 6
-  if (!entry.is_hierarchical) {
+  if (!entry.is_hierarchical) {  // neighbor allreduce without hierarchy
     ncclGroupStart();
-    if (!entry.dynamic_neighbors_enabled) {
+    if (!entry.dynamic_neighbors_enabled) {  // static topology
       for (int i = 0; i < mpi_ctx_.neighbor_indgree_; i++) {
         int recv_rank = mpi_ctx_.neighbor_in_ranks_[i];
         void* recvbuf = (void*)(static_cast<const char*>(entry.output->data()) +
@@ -753,13 +753,17 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
         NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(entry.tensor),
                           send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
-    } else {
+    } else {  // dynamic topology
       if(entry.dst_weighting_enabled) {
         for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
           auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i));
           weighted_tensors.push_back(std::move(weighted_tensor_ptr));
         }
       }
+      // TODO(ybc) #83 Better design pattern for data_weight synchronization
+      // This ready event makes sure the data_weight computation is done before communication, as
+      // Pytorch CUDA stream is not synchronized with our CUDA stream, and it does nothing when
+      // it is running on CPU.
       std::shared_ptr<common::ReadyEvent> ready_event =
           entry.context->RecordReadyEvent(entry.device);
       for (size_t i = 0; i < entry.recv_neighbors->size(); ++i) {
@@ -770,22 +774,20 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) {
                           recv_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
       if(entry.dst_weighting_enabled) {
-        if (ready_event != nullptr) {
-          while (!ready_event->Ready()) {
-            std::this_thread::sleep_for(std::chrono::nanoseconds(100));
-          }
+        while ((ready_event != nullptr) && !ready_event->Ready()) {
+          std::this_thread::sleep_for(std::chrono::nanoseconds(100));
         }
       }
       for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
-        const void* buffer_send = sendbuf;
-        if (entry.dst_weighting_enabled)
-          buffer_send = weighted_tensors[i].get()->data();
+        const void* buffer_send = (entry.dst_weighting_enabled)
+                                      ? weighted_tensors[i]->data()
+                                      : sendbuf;
         NCCLCHECK(ncclSend(buffer_send, num_elements, GetNCCLDataType(entry.tensor),
                            entry.send_neighbors->at(i), nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
     }
     ncclGroupEnd();
-  } else {
+  } else {  // hierarchical neighbor allreduce
     if (entry.send_neighbors->empty()) {
       throw std::runtime_error(
           "Under hierarchical neighbor_allreduce, argument "
@@ -1016,22 +1018,11 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
     RecordEvent(event_queue, "MEM_CPY_IN");
   }
 
-  const void* weighted_fused_input_data = nullptr;
-  if (first_entry.dst_weighting_enabled) {
-    void* weight_buffer_data = nullptr;
-    MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(),
-                               fused_input_data, num_elements, element_size,
-                               first_entry.context, first_entry.device);
-    int64_t offset = 0;
-    for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) {
-      double dst_weight = first_entry.send_weights->at(i);
-      void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset;
-      ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements,
-                          first_entry.tensor->dtype(), nccl_ctx_.stream);
-      offset += num_elements * element_size;
-    }
-    weighted_fused_input_data = weight_buffer_data;
-  }
+  const void* weighted_fused_input_data =
+      (first_entry.dst_weighting_enabled)
+          ? GenerateWeightedFusedInputData(fused_input_data, first_entry,
+                                           num_elements, element_size)
+          : nullptr;
 
   // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer
   // is like [t_1, t_2 | t_1_n1, t_2_n1, t_1_n2, t_2_n2].
@@ -1040,9 +1031,9 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
   // Hence, we need to offset the buffer data to location for neighbors.
   buffer_data = (uint8_t*)buffer_data + num_elements * element_size;
 
-  if (!first_entry.is_hierarchical) {
+  if (!first_entry.is_hierarchical) {  // neighbor allreduce without hierarchy
     ncclGroupStart();
-    if (!first_entry.dynamic_neighbors_enabled) {
+    if (!first_entry.dynamic_neighbors_enabled) {  // static topology
       for (int i = 0; i < mpi_ctx_.neighbor_indgree_; i++) {
         int recv_rank = mpi_ctx_.neighbor_in_ranks_[i];
         void* recvbuf =
@@ -1056,7 +1047,7 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
                            GetNCCLDataType(first_entry.tensor), send_rank,
                            nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
-    } else {
+    } else {  // dynamic topology
       for (size_t i = 0; i < first_entry.recv_neighbors->size(); ++i) {
         int recv_rank = first_entry.recv_neighbors->at(i);
         void* recvbuf =
@@ -1066,16 +1057,17 @@ void NCCLController::NeighborAllreduce(std::vector<TensorTableEntry>& entries) {
                            nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
       for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) {
-        const void* sendbuf = fused_input_data;
-        if (first_entry.dst_weighting_enabled)
-          sendbuf = (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size);
+        const void* sendbuf =
+            (first_entry.dst_weighting_enabled)
+                ? (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size)
+                : fused_input_data;
         NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(first_entry.tensor),
                            first_entry.send_neighbors->at(i),
                            nccl_ctx_.nccl_comm, nccl_ctx_.stream));
       }
     }
     ncclGroupEnd();
-  } else {
+  } else {  // hierarchical neighbor allreduce
     if (first_entry.send_neighbors->empty()) {
       throw std::runtime_error(
           "Under hierarchical neighbor_allreduce, argument "
@@ -1925,6 +1917,28 @@ void NCCLController::MemcpyInWeightFusionBuffer(
   }
 }
 
+const void* NCCLController::GenerateWeightedFusedInputData(const void* fused_input_data,
+                                                           const TensorTableEntry& entry,
+                                                           int64_t num_elements, int element_size) {
+  // Given a fused_input_data like [t_1, t_2], the storage for neighbor_allreduce in
+  // weighted fused input data is like [t_1_w1, t_2_w1 | t_1_w2, t_2_w2 | t_1_w3, t_2_w3].
+  // Here t_1 and t_2  means self tensor 1 and 2 and _w1, _w2, and _w3 means the
+  // destination weights to destination 1, 2, and 3.
+  void* weight_buffer_data = nullptr;
+  MemcpyInWeightFusionBuffer(weight_buffer_data, entry.send_neighbors->size(),
+                             fused_input_data, num_elements, element_size,
+                             entry.context, entry.device);
+  int64_t offset = 0;
+  for (size_t i = 0; i < entry.send_neighbors->size(); ++i) {
+    double dst_weight = entry.send_weights->at(i);
+    void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset;
+    ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements,
+                        entry.tensor->dtype(), nccl_ctx_.stream);
+    offset += num_elements * element_size;
+  }
+  return weight_buffer_data;
+}
+
 void NCCLController::MemcpyOutFusionBuffer(
     const void* buffer_data, std::vector<TensorTableEntry>& entries) {
   int64_t offset = 0;
diff --git a/bluefog/common/nccl_controller.h b/bluefog/common/nccl_controller.h
index 0a2b1013..952cad5e 100644
--- a/bluefog/common/nccl_controller.h
+++ b/bluefog/common/nccl_controller.h
@@ -199,6 +199,10 @@ class NCCLController {
                                   const void* buffer_data, int64_t num_elements, int element_size,
                                   std::shared_ptr<OpContext> context, int device);
 
+  const void* GenerateWeightedFusedInputData(const void* fused_input_data,
+                                             const TensorTableEntry& entry,
+                                             int64_t num_elements, int element_size);
+
   void MemcpyOutFusionBufferForInputs(const void* fused_input_data,
                                       std::vector<TensorTableEntry>& entries);
 
diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc
index ddf90cf5..bb6940cb 100644
--- a/bluefog/common/operations.cc
+++ b/bluefog/common/operations.cc
@@ -482,6 +482,8 @@ void BackgroundThreadLoop(BluefogGlobalState& state) {
   if (bluefog_fusion_threshold != nullptr) {
     state.tensor_fusion_threshold =
         std::strtol(bluefog_fusion_threshold, nullptr, 10);
+    state.tensor_fusion_threshold_for_dst_weight =
+        state.tensor_fusion_threshold;
   }
 
   // Initialize the tensor count table. No tensors are available yet.
diff --git a/bluefog/common/tensor_queue.h b/bluefog/common/tensor_queue.h
index 47d74f88..a0302496 100644
--- a/bluefog/common/tensor_queue.h
+++ b/bluefog/common/tensor_queue.h
@@ -83,6 +83,9 @@ class FusionBufferManager {
                           std::function<void()> on_end_init);
 
   // Initializes a buffer of the given threshold size times MPI size if not already cached.
+  // There is one constraint to noticed here. We need WeightBuffer is always larger than
+  // (size-1)*fusion Buffer since we don't want to tensor being able to put into the fusion
+  // buffer but not able to put into weightbuffer.
   //
   // Args:
   //  threshold: Size of the buffer in bytes.
diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc
index 636da5f0..9a980d54 100644
--- a/bluefog/torch/mpi_ops.cc
+++ b/bluefog/torch/mpi_ops.cc
@@ -407,11 +407,14 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
 
   auto callback_wrapper = GetCallbackWrapper(handle, timeline_ptr, op_name, tid);
 
-  std::map<int, double> src_weights_map;
+  // src_neighbors, dst_neighbors --> list of ranks only used in Enqueue
+  // src_weights_ordered_map --> used in callback only
+  // dst_weights_vec --> used in Enqueue for sending (same order of dst_neighbors)
+  std::map<int, double> src_weights_ordered_map;
   for (auto kv : src_weights)
-    src_weights_map.insert(kv);
+    src_weights_ordered_map.insert(kv);
   std::vector<int> src_neighbors;
-  for (auto kv : src_weights_map)
+  for (auto kv : src_weights_ordered_map)
     src_neighbors.push_back(kv.first);
 
   std::vector<int> dst_neighbors;
@@ -440,11 +443,11 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
         bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec,
         dynamic_neighbors_enabled, dst_weighting_enabled, is_hierarchical,
         enable_topo_check, op_name, CPU_DEVICE_ID,
-        callback_wrapper([self_weight, src_weights_map, avg_computation, cpu_output, tensor,
+        callback_wrapper([self_weight, src_weights_ordered_map, avg_computation, cpu_output, tensor,
                           dynamic_neighbors_enabled, is_hierarchical, output, device]() mutable {
           with_device device_guard(device);
           output.copy_(cpu_output);
-          PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_map,
+          PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_ordered_map,
                                            avg_computation, dynamic_neighbors_enabled,
                                            is_hierarchical);
         }));
@@ -460,9 +463,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output,
         bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec,
         dynamic_neighbors_enabled, dst_weighting_enabled,
         is_hierarchical, enable_topo_check, op_name, device,
-        callback_wrapper([self_weight, src_weights_map, avg_computation, dynamic_neighbors_enabled,
-                          is_hierarchical, tensor, output]() mutable {
-          PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_map,
+        callback_wrapper([self_weight, src_weights_ordered_map, avg_computation,
+                          dynamic_neighbors_enabled, is_hierarchical, tensor, output]() mutable {
+          PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_ordered_map,
                                            avg_computation, dynamic_neighbors_enabled,
                                            is_hierarchical);
         }));
diff --git a/setup.py b/setup.py
index f4e3bffb..fa736de5 100644
--- a/setup.py
+++ b/setup.py
@@ -21,6 +21,7 @@
 import subprocess
 import sys
 import textwrap
+import shlex
 import traceback
 from typing import List
 
@@ -449,18 +450,23 @@ def build_nvcc_extra_objects(nvcc_cmd: str, cxx11_abi: bool) -> List[str]:
     # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -rdc=true -c cuda_kernels.cu
     # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -dlink -o cuda_kernels_link.o \
     #      cuda_kernels.o -lcudart
-    nvcc_flags = f"--compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}'"
+    nvcc_flags = f'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}'
 
     extra_object_dir = 'bluefog/common/cuda/'
     source = extra_object_dir+'cuda_kernels.cu'
     object_file = extra_object_dir+'cuda_kernels.o'
     object_link = extra_object_dir+'cuda_kernels_link.o'
 
-    command_object = f"{nvcc_cmd} {nvcc_flags} -rdc=true -c {source} -o {object_file}"
-    command_link   = f"{nvcc_cmd} {nvcc_flags} -dlink {object_file} -lcudart -o {object_link}"
+    command_object = [nvcc_cmd, '--compiler-options', nvcc_flags,
+                      '-rdc=true', '-c', source, '-o', object_file]
+    command_link = [nvcc_cmd, '--compiler-options', nvcc_flags,
+                    '-dlink', object_file, '-lcudart', '-o', object_link]
+
+    command_object_str = ' '.join(shlex.quote(par) for par in command_object)
+    command_link_str = ' '.join(shlex.quote(par) for par in command_link)
 
-    subprocess.check_call([command_object], shell=True)
-    subprocess.check_call([command_link], shell=True)
+    subprocess.check_call(command_object_str, shell=True)
+    subprocess.check_call(command_link_str, shell=True)
     return [object_file, object_link]
 
 def build_torch_extension(build_ext, global_options, torch_version):

From 36c073abf9c5352e85de4a8dd9f96e2c95085c29 Mon Sep 17 00:00:00 2001
From: YBC <bichengying@gmail.com>
Date: Sat, 10 Apr 2021 19:52:04 -0700
Subject: [PATCH 36/36] Add condition variable to control the loop (#88)

* Add condition variable to control the loop

* Minor update on topology_setting in global_state

* Add missing <condition_variable> header

* Change cv.wait to cv.wait_for 10 seconds

* Address comment and remove adjusting resetVersionWinMem in ibfrun
---
 bluefog/common/global_state.h  |  13 +++-
 bluefog/common/mpi_context.cc  |   2 +-
 bluefog/common/operations.cc   | 116 +++++++++++++++------------------
 bluefog/common/tensor_queue.h  |  36 +++++-----
 bluefog/run/interactive_run.py |   3 +-
 test/torch_ops_test.py         |   8 ++-
 6 files changed, 91 insertions(+), 87 deletions(-)

diff --git a/bluefog/common/global_state.h b/bluefog/common/global_state.h
index 2be439a3..2683ad14 100644
--- a/bluefog/common/global_state.h
+++ b/bluefog/common/global_state.h
@@ -18,6 +18,7 @@
 #define BLUEFOG_COMMON_GLOBAL_STATE_H
 
 #include <atomic>
+#include <condition_variable>
 #include <chrono>
 #include <memory>
 #include <queue>
@@ -54,6 +55,14 @@ struct BluefogGlobalState {
   // Whether collective context has been completed on the background thread.
   std::atomic_bool initialization_done{false};
 
+  // Condition variable and its mutex for main loop in communication thread.
+  std::condition_variable loop_cv;
+  std::mutex loop_mutex;
+
+  // Under negotiation, the entries sends to master first and wait until it 
+  // returns ok to run. This variable keeps the records of that.
+  std::atomic_int unfinished_enqueued_entries{0};
+
   // Timeline writer.
   Timeline timeline;
 
@@ -84,10 +93,8 @@ struct BluefogGlobalState {
   FusionBufferManager fusion_buffer;
 
   // Because setting topology happens in the main thread instead of communication
-  // thread. Following three variables are to sync between them.
+  // thread. Not really used since the condition variable refactor.
   std::atomic_bool setting_topology{false};
-  std::atomic_bool setting_topology_done{false};
-  std::atomic_bool ready_to_setting_topology{false};
 
   // Only exists on the coordinator node (rank zero). Maintains a vector of
   // requests to allreduce every tensor (keyed by tensor name).
diff --git a/bluefog/common/mpi_context.cc b/bluefog/common/mpi_context.cc
index 7a68ce39..e5ef0e6c 100644
--- a/bluefog/common/mpi_context.cc
+++ b/bluefog/common/mpi_context.cc
@@ -75,7 +75,7 @@ bool WindowManager::InitializeMutexWin(const MPI_Comm& mpi_comm) {
 std::vector<int> WindowManager::GetVersionMemoryCopy() { return version_mem_; }
 
 void WindowManager::resetVersionWinMem(int initialValue /*=0*/) {
-  for (int i = 0; i < version_mem_.size(); i++) {
+  for (size_t i = 0; i < version_mem_.size(); i++) {
     version_mem_[i] = initialValue;
   }
 }
diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc
index bb6940cb..578291e0 100644
--- a/bluefog/common/operations.cc
+++ b/bluefog/common/operations.cc
@@ -438,7 +438,7 @@ bool IsSameList (std::shared_ptr<std::vector<T>> n1, std::shared_ptr<std::vector
   if (n1 == nullptr || n2 == nullptr) return false;
   if (n1->size() != n2->size()) return false;
   // The order matters as well.
-  for (int i = 0; i < n1->size(); i++) {
+  for (size_t i = 0; i < n1->size(); i++) {
     if (n1->at(i) != n2->at(i)) {
       return false;
     }
@@ -853,7 +853,9 @@ void PerformOperationWithFusion(std::vector<TensorTableEntry>& entries) {
 void NegotiateOfRequestOfMaster(BluefogGlobalState& state,
                                 std::deque<Request>& message_queue_buffer,
                                 bool& should_change_topo,
-                                bool& should_shut_down) {
+                                bool& should_shut_down) {  
+  state.unfinished_enqueued_entries.fetch_add(message_queue_buffer.size());
+
   std::vector<std::string> ready_to_reduce;
   RequestList message_list;
   message_list.set_shutdown(should_shut_down);
@@ -1035,6 +1037,7 @@ void NegotiateOfRequestOfMaster(BluefogGlobalState& state,
     } else {
       PerformOperation(nego_entries);
     }
+    state.unfinished_enqueued_entries.fetch_sub(nego_entries.size());
   }
 
   // Check for stalled tensors.
@@ -1049,6 +1052,7 @@ void NegotiateOfRequestOfSlave(BluefogGlobalState& state,
                                std::deque<Request>& message_queue_buffer,
                                bool& should_change_topo,
                                bool& should_shut_down) {
+  state.unfinished_enqueued_entries.fetch_add(message_queue_buffer.size());
   std::string encoded_message;
   RequestList message_list;
   message_list.set_shutdown(state.shut_down);
@@ -1057,6 +1061,7 @@ void NegotiateOfRequestOfSlave(BluefogGlobalState& state,
     message_list.add_request(message_queue_buffer.front());
     message_queue_buffer.pop_front();
   }
+
   RequestList::SerializeToString(message_list, encoded_message);
   int encoded_message_length = (int)encoded_message.length() + 1;
   MPI_Gather(&encoded_message_length, 1, MPI_INT, nullptr, 1, MPI_INT,
@@ -1084,6 +1089,7 @@ void NegotiateOfRequestOfSlave(BluefogGlobalState& state,
     } else {
       PerformOperation(nego_entries);
     }
+    state.unfinished_enqueued_entries.fetch_sub(nego_entries.size());
   }
 
   if (response_list.shutdown()) {
@@ -1097,6 +1103,8 @@ void NegotiateOfRequestOfSlave(BluefogGlobalState& state,
 void NegotiationOfRequest(BluefogGlobalState& state,
                           std::deque<Request>& message_queue_buffer,
                           bool& should_change_topo, bool& should_shut_down) {
+  // TODO(ybc) should_change_topo has no effect after condition variable refactor. 
+  // Just keep it for a while. will remove.
   if (bluefog_rank() == COORDINATE_RANK) {
     NegotiateOfRequestOfMaster(state, message_queue_buffer, should_change_topo,
                                should_shut_down);
@@ -1111,6 +1119,15 @@ bool RunLoopOnce(BluefogGlobalState& state) {
   bool should_shut_down = state.shut_down;
   bool should_change_topo = state.setting_topology;
 
+  std::unique_lock<std::mutex> lk(state.loop_mutex);
+  // The real mutex for queue is the on under TensorQueue.
+  state.loop_cv.wait_for(lk, std::chrono::seconds(10), [&state] {
+    // When we requesting shut_down, or any unfinished entries waiting in the
+    // negotiation we should not wait.
+    return state.shut_down || (state.unfinished_enqueued_entries > 0) ||
+           (state.tensor_queue.size() > 0);
+  });
+
   // This delay determines thread frequency and MPI message latency
   auto sleep_duration =
       state.last_cycle_start +
@@ -1131,7 +1148,7 @@ bool RunLoopOnce(BluefogGlobalState& state) {
   std::vector<TensorTableEntry> entries;
   auto IsRequestConvertToEntryDirectly = [](const Request& request) -> bool {
     return global_skip_negotiate_stage ||
-           (request.request_type() != Request::ALLREDUCE &&
+            (request.request_type() != Request::ALLREDUCE &&
             request.request_type() != Request::ALLGATHER &&
             request.request_type() != Request::BROADCAST &&
             request.request_type() != Request::NEIGHBOR_ALLREDUCE &&
@@ -1150,7 +1167,8 @@ bool RunLoopOnce(BluefogGlobalState& state) {
       std::remove_if(message_queue_buffer.begin(), message_queue_buffer.end(),
                      IsRequestConvertToEntryDirectly),
       message_queue_buffer.end());
-
+   
+  lk.unlock();  // Never hold the mutex when there is remote function.
   PerformOperation(entries);
 
   // For the rest requests, they needs to coordinate and neogiate.
@@ -1163,20 +1181,6 @@ bool RunLoopOnce(BluefogGlobalState& state) {
     NegotiationOfRequest(state, message_queue_buffer, should_change_topo,
                          should_shut_down);
   }
-  // Seperate the setting topology and negotiate communnication.
-  // TODO(ybc) Use conditional variable and mutex to re-implement this.
-  if (should_change_topo) {
-    bluefog_global.ready_to_setting_topology = true;
-    while (!bluefog_global.setting_topology_done) {
-      std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION);
-    }
-    bluefog_global.ready_to_setting_topology = false;
-    // Wait for main thread reset.
-    while (bluefog_global.setting_topology_done) {
-      std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION);
-    }
-  }
-
   return !should_shut_down;
 }
 
@@ -1215,6 +1219,7 @@ void bluefog_init() { InitializeBluefogOnce(); }
 void bluefog_shutdown() {
   if (bluefog_global.background_thread.joinable()) {
     bluefog_global.shut_down = true;
+    bluefog_global.loop_cv.notify_all();
     bluefog_global.background_thread.join();
     // Reset the initialization flag to allow restarting with bluefog_init(...)
     //bluefog_global.initialize_flag.clear();
@@ -1290,36 +1295,21 @@ int bluefog_set_topology(int indegree, const int* sources, int outdegree,
     return -1;
   }
 #endif
-  bluefog_global.setting_topology = true;
-  while (!bluefog_global.ready_to_setting_topology.load()) {
-    std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION);
-  }
-  bluefog_global.tensor_queue.LockTensorQueue();
-  if (bluefog_global.tensor_queue.size() > 0) {
-    BFLOG(ERROR)
-        << "Cannot set the topology because there are unfinished MPI ops.";
-    bluefog_global.tensor_queue.UnlockTensorQueue();
-    return -1;
-  }
-
-  bool mpi_result = bluefog_global.controller->SetTopology(
-      indegree, sources, outdegree, destinations);
+  bool mpi_result;
+  // When we change the topology, there should be no entries being processed at
+  // same time.
+  {
+    std::lock_guard<std::mutex> lk(bluefog_global.loop_mutex);
+    mpi_result = bluefog_global.controller->SetTopology(
+        indegree, sources, outdegree, destinations);
 #if HAVE_NCCL && NCCL_MINOR < 7
-  if (mpi_result && nccl_context.is_initialized) {
-    bluefog_global.nccl_controller->DestroyPeerCommunicators();
-    bluefog_global.nccl_controller->InitPeerCommunicators();
-  }
+    if (mpi_result && nccl_context.is_initialized) {
+      bluefog_global.nccl_controller->DestroyPeerCommunicators();
+      bluefog_global.nccl_controller->InitPeerCommunicators();
+    }
 #endif
-  bluefog_global.tensor_queue.UnlockTensorQueue();
-
-  bluefog_global.setting_topology = false;
-  bluefog_global.setting_topology_done = true;
-  // Wait for the background thread receive the setting_topology_done and
-  // close the ready_to_setting_topology epoch.
-  while (bluefog_global.ready_to_setting_topology) {
-    std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION);
   }
-  bluefog_global.setting_topology_done = false;
+  bluefog_global.loop_cv.notify_one();
   return mpi_result;
 }
 
@@ -1447,6 +1437,7 @@ Status EnqueueTensorAllreduce(std::shared_ptr<Tensor> tensor,
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1482,6 +1473,7 @@ Status EnqueueTensorBroadcast(std::shared_ptr<Tensor> tensor,
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1516,6 +1508,7 @@ Status EnqueueTensorAllgather(std::shared_ptr<Tensor> tensor,
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1560,6 +1553,7 @@ Status EnqueueTensorNeighborAllgather(std::shared_ptr<Tensor> tensor,
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1611,6 +1605,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr<Tensor> tensor,
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1651,6 +1646,7 @@ Status EnqueueTensorPairGossip(std::shared_ptr<Tensor> tensor,
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1683,6 +1679,7 @@ Status EnqueueTensorWindowCreate(
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1707,6 +1704,7 @@ Status EnqueueTensorWindowFree(const std::string& name, int device,
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1740,6 +1738,7 @@ Status EnqueueTensorWindowPut(std::shared_ptr<Tensor> tensor,
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1771,6 +1770,7 @@ Status EnqueueTensorWindowAccumulate(
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -1798,6 +1798,7 @@ Status EnqueueTensorWindowGet(const std::string& name,
     return SUSPEND_ERROR;
   }
   Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message);
+  bluefog_global.loop_cv.notify_one();
   return status;
 }
 
@@ -2040,28 +2041,15 @@ void SetSkipNegotiateStageState(bool value) {
   if (value == global_skip_negotiate_stage) {
     return;
   }
-  if (value) {
-    // From running negotiate to skipping negotiate, we need to properly turn
-    // off negotiate stage. Otherwise, it may hang the processes. Use setting
-    // topology flag to suspend the negotiate stage then skip it.
-    bluefog_global.setting_topology = true;
-    while (!bluefog_global.ready_to_setting_topology.load()) {
-      std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION);
-    }
-
-    global_skip_negotiate_stage = value;
 
-    bluefog_global.setting_topology = false;
-    bluefog_global.setting_topology_done = true;
-    // Wait for the background thread receive the setting_topology_done and
-    // close the ready_to_setting_topology epoch.
-    while (bluefog_global.ready_to_setting_topology) {
-      std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION);
-    }
-    bluefog_global.setting_topology_done = false;
-  } else {
+  // From running negotiate to skipping negotiate, we need to properly turn
+  // off negotiate stage. Otherwise, it may hang the processes. Use setting
+  // topology flag to suspend the negotiate stage then skip it.
+  {
+    std::lock_guard<std::mutex> lk(bluefog_global.loop_mutex);
     global_skip_negotiate_stage = value;
   }
+  bluefog_global.loop_cv.notify_one();
 }
 
 bool GetSkipNegotiateStageState() {
diff --git a/bluefog/common/tensor_queue.h b/bluefog/common/tensor_queue.h
index a0302496..de476743 100644
--- a/bluefog/common/tensor_queue.h
+++ b/bluefog/common/tensor_queue.h
@@ -46,15 +46,20 @@ class TensorQueue {
 
   void PushMessageToQueue(Request& message);
 
-  // Used when setting Topology, which require the tensor queue should be empty always.
+  // Used when setting Topology, which require the tensor queue should be empty
+  // always.
   inline void LockTensorQueue() { mutex_.lock(); }
   inline void UnlockTensorQueue() { mutex_.unlock(); }
-  inline size_t size() { return message_queue_.size(); }
+  inline size_t size() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    return message_queue_.size();
+  }
 
  protected:
   // Tensors waiting to be processed.
-  // Key is based upon the message name since tensor_name in table entry for win ops
-  // is for window and we need to add "win_put."/"win_create." before it in message.
+  // Key is based upon the message name since tensor_name in table entry for win
+  // ops is for window and we need to add "win_put."/"win_create." before it in
+  // message.
   std::unordered_map<std::string, TensorTableEntry> tensor_table_;
 
   // Queue of MPI requests waiting to be sent to the coordinator node.
@@ -65,8 +70,8 @@ class TensorQueue {
   mutable std::mutex mutex_;
 };
 
-// Encapsulates the process of creating and destroying fusion buffers as the requested
-// threshold is changed.
+// Encapsulates the process of creating and destroying fusion buffers as the
+// requested threshold is changed.
 class FusionBufferManager {
  public:
   // Initializes a buffer of the given threshold size if not already cached.
@@ -82,10 +87,11 @@ class FusionBufferManager {
                           std::function<void()> on_start_init,
                           std::function<void()> on_end_init);
 
-  // Initializes a buffer of the given threshold size times MPI size if not already cached.
-  // There is one constraint to noticed here. We need WeightBuffer is always larger than
-  // (size-1)*fusion Buffer since we don't want to tensor being able to put into the fusion
-  // buffer but not able to put into weightbuffer.
+  // Initializes a buffer of the given threshold size times MPI size if not
+  // already cached. There is one constraint to noticed here. We need
+  // WeightBuffer is always larger than (size-1)*fusion Buffer since we don't
+  // want to tensor being able to put into the fusion buffer but not able to put
+  // into weightbuffer.
   //
   // Args:
   //  threshold: Size of the buffer in bytes.
@@ -94,9 +100,7 @@ class FusionBufferManager {
   //  context: Framework used to create the buffer and associate it.
   //  on_start_init: Callback on starting buffer initialization.
   //  on_end_init: Callback on completing buffer initialization.
-  Status InitializeWeightBuffer(int64_t threshold,
-                                int world_size,
-                                int device,
+  Status InitializeWeightBuffer(int64_t threshold, int world_size, int device,
                                 std::shared_ptr<OpContext> context,
                                 std::function<void()> on_start_init,
                                 std::function<void()> on_end_init);
@@ -104,7 +108,8 @@ class FusionBufferManager {
   // Returns the buffer associated with the given device and framework, or null.
   std::shared_ptr<PersistentBuffer> GetBuffer(int device);
 
-  // Returns the weight buffer associated with the given device and framework, or null.
+  // Returns the weight buffer associated with the given device and framework,
+  // or null.
   std::shared_ptr<PersistentBuffer> GetWeightBuffer(int device);
 
  private:
@@ -112,7 +117,8 @@ class FusionBufferManager {
   std::unordered_map<int, std::pair<std::shared_ptr<PersistentBuffer>, int64_t>>
       tensor_fusion_buffers_;
 
-  // Memory buffers for Tensor Fusion with dst weight.  They are keyed by device ID.
+  // Memory buffers for Tensor Fusion with dst weight.  They are keyed by device
+  // ID.
   std::unordered_map<int, std::pair<std::shared_ptr<PersistentBuffer>, int64_t>>
       weight_tensor_fusion_buffers_;
 };
diff --git a/bluefog/run/interactive_run.py b/bluefog/run/interactive_run.py
index ccfc0c79..6682c8e6 100644
--- a/bluefog/run/interactive_run.py
+++ b/bluefog/run/interactive_run.py
@@ -406,7 +406,8 @@ def handler(signum, frame):
     signal.signal(signal.SIGINT, handler)
 
     env = os.environ.copy()
-    env['BLUEFOG_CYCLE_TIME'] = str(20)  # Increase the cycle time
+    # No longer needed after using condition variable.
+    # env['BLUEFOG_CYCLE_TIME'] = str(20)  # Increase the cycle time
 
     # action of stop
     if args.action == "stop":
diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py
index 974a1aca..c8c8ca5a 100644
--- a/test/torch_ops_test.py
+++ b/test/torch_ops_test.py
@@ -171,6 +171,7 @@ def test_allreduce_sum(self):
 
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
+            torch.manual_seed(123456)
             tensor = torch.FloatTensor(*([23] * dim)).random_(-100, 100)
             tensor = self.cast_and_place(tensor, dtype)
             name = "allreduce_tensor_{}_{}".format(dim, dtype)
@@ -1207,7 +1208,7 @@ def test_neighbor_allgather_dynamic_variable_size(self):
                   torch.ByteTensor, torch.CharTensor, torch.ShortTensor, torch.HalfTensor]
         if TEST_ON_GPU:
             dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
-        
+
         # Connect to all other ranks
         neighbor_ranks = [i for i in range(size) if i != rank]
         dims = [1, 2, 3]
@@ -1222,10 +1223,11 @@ def test_neighbor_allgather_dynamic_variable_size(self):
             tensor = torch.FloatTensor(
                 *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
             tensor = self.cast_and_place(tensor, dtype)
-            gathered = bf.neighbor_allgather(tensor, dst_ranks=neighbor_ranks, src_ranks=neighbor_ranks)
+            gathered = bf.neighbor_allgather(
+                tensor, dst_ranks=neighbor_ranks, src_ranks=neighbor_ranks)
             tensor, gathered = self.convert_cpu_fp16_to_fp32(tensor, gathered)
 
-            tensor_sizes[rank] = 0 # remove self-size since neighbor_allgather does not include self.
+            tensor_sizes[rank] = 0  # remove self since neighbor_allgather does not include self
             expected_size = sum(tensor_sizes)
             assert list(gathered.shape) == [expected_size] + [17] * (dim - 1)