From 61b472ccebd94299878ef49de9819610a55679d4 Mon Sep 17 00:00:00 2001 From: ybc Date: Sun, 7 Feb 2021 21:06:29 -0800 Subject: [PATCH 01/36] Fixed the self_weight under emtpy receiving case --- bluefog/torch/mpi_ops.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc index f50e272d..124d41cd 100644 --- a/bluefog/torch/mpi_ops.cc +++ b/bluefog/torch/mpi_ops.cc @@ -439,6 +439,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, } output.resize_(shape_vector); MaybeCopyBufferBack(output, output_buffer); + } else { // recv_size == 0 + output.mul_(self_weight); } })); @@ -529,8 +531,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, } output.resize_(shape_vector); MaybeCopyBufferBack(output, output_buffer); - } else { - output.set_(tensor); + } else { // recv_size == 0 + output.set_(tensor.mul(self_weight)); } })); ThrowIfError(enqueue_result); From 7e8ea66a1d047b362b1fb9a1e1bf91d9bd45d725 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 11 Feb 2021 00:52:30 -0800 Subject: [PATCH 02/36] Enable empty send neighbors and fix HalfTensor for recv_size==0 --- bluefog/torch/mpi_ops.cc | 9 +++++++-- bluefog/torch/mpi_ops.py | 3 --- test/torch_ops_test.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc index 124d41cd..4e517ab3 100644 --- a/bluefog/torch/mpi_ops.cc +++ b/bluefog/torch/mpi_ops.cc @@ -440,7 +440,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, output.resize_(shape_vector); MaybeCopyBufferBack(output, output_buffer); } else { // recv_size == 0 - output.mul_(self_weight); + ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); + output_buffer.mul_(self_weight); + MaybeCopyBufferBack(output, output_buffer); } })); @@ -532,7 +534,10 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, output.resize_(shape_vector); MaybeCopyBufferBack(output, output_buffer); } else { // recv_size == 0 - output.set_(tensor.mul(self_weight)); + output.set_(tensor); + ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); + output_buffer.mul_(self_weight); + MaybeCopyBufferBack(output, output_buffer); } })); ThrowIfError(enqueue_result); diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py index fd1d103b..fa52dcdd 100644 --- a/bluefog/torch/mpi_ops.py +++ b/bluefog/torch/mpi_ops.py @@ -441,9 +441,6 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weight elif self_weight is None or neighbor_weights is None: raise ValueError("Arguments self_weight and neighbor_weights should be presented if " "enabling dynamic topology.") - elif not send_neighbors: - raise ValueError("Argument send_neighbors cannot be empty list but we plan to support " - "it in future.") else: dynamic_neighbors_enabled = True if self_weight is None and neighbor_weights is None: diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py index 7dda2708..4a555563 100644 --- a/test/torch_ops_test.py +++ b/test/torch_ops_test.py @@ -521,7 +521,7 @@ def test_neighbor_allreduce_dynamic_topo_move(self): (reduced_tensor.data - (rank-1) % size).abs().max() < eps ), "bf.neighbor_allreduce (move) produces incorrect reduced tensor" - @unittest.skip("Haven't fully clear on the usage due to sync issues. Temporarily disabled") + #@unittest.skip("Haven't fully clear on the usage due to sync issues. Temporarily disabled") def test_neighbor_allreduce_dynamic_topo_with_empty_send_neighbors(self): """Test that the neighbor all reduce (avg) 1D, 2D, 3D tensors correctly with empty send_neighbors.""" From 1d43096ed136dd6ffac1eaa5e34a4bb4a42362c3 Mon Sep 17 00:00:00 2001 From: ybc Date: Sun, 7 Feb 2021 21:06:29 -0800 Subject: [PATCH 03/36] Fixed the self_weight under emtpy receiving case --- bluefog/torch/mpi_ops.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc index f50e272d..124d41cd 100644 --- a/bluefog/torch/mpi_ops.cc +++ b/bluefog/torch/mpi_ops.cc @@ -439,6 +439,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, } output.resize_(shape_vector); MaybeCopyBufferBack(output, output_buffer); + } else { // recv_size == 0 + output.mul_(self_weight); } })); @@ -529,8 +531,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, } output.resize_(shape_vector); MaybeCopyBufferBack(output, output_buffer); - } else { - output.set_(tensor); + } else { // recv_size == 0 + output.set_(tensor.mul(self_weight)); } })); ThrowIfError(enqueue_result); From c082ce93c8e6949e3d9804b3a6a10680678d1fa0 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 11 Feb 2021 00:52:30 -0800 Subject: [PATCH 04/36] Enable empty send neighbors and fix HalfTensor for recv_size==0 --- bluefog/torch/mpi_ops.cc | 9 +++++++-- bluefog/torch/mpi_ops.py | 3 --- test/torch_ops_test.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc index 124d41cd..4e517ab3 100644 --- a/bluefog/torch/mpi_ops.cc +++ b/bluefog/torch/mpi_ops.cc @@ -440,7 +440,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, output.resize_(shape_vector); MaybeCopyBufferBack(output, output_buffer); } else { // recv_size == 0 - output.mul_(self_weight); + ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); + output_buffer.mul_(self_weight); + MaybeCopyBufferBack(output, output_buffer); } })); @@ -532,7 +534,10 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, output.resize_(shape_vector); MaybeCopyBufferBack(output, output_buffer); } else { // recv_size == 0 - output.set_(tensor.mul(self_weight)); + output.set_(tensor); + ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); + output_buffer.mul_(self_weight); + MaybeCopyBufferBack(output, output_buffer); } })); ThrowIfError(enqueue_result); diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py index fd1d103b..fa52dcdd 100644 --- a/bluefog/torch/mpi_ops.py +++ b/bluefog/torch/mpi_ops.py @@ -441,9 +441,6 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weight elif self_weight is None or neighbor_weights is None: raise ValueError("Arguments self_weight and neighbor_weights should be presented if " "enabling dynamic topology.") - elif not send_neighbors: - raise ValueError("Argument send_neighbors cannot be empty list but we plan to support " - "it in future.") else: dynamic_neighbors_enabled = True if self_weight is None and neighbor_weights is None: diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py index 7dda2708..4a555563 100644 --- a/test/torch_ops_test.py +++ b/test/torch_ops_test.py @@ -521,7 +521,7 @@ def test_neighbor_allreduce_dynamic_topo_move(self): (reduced_tensor.data - (rank-1) % size).abs().max() < eps ), "bf.neighbor_allreduce (move) produces incorrect reduced tensor" - @unittest.skip("Haven't fully clear on the usage due to sync issues. Temporarily disabled") + #@unittest.skip("Haven't fully clear on the usage due to sync issues. Temporarily disabled") def test_neighbor_allreduce_dynamic_topo_with_empty_send_neighbors(self): """Test that the neighbor all reduce (avg) 1D, 2D, 3D tensors correctly with empty send_neighbors.""" From a3365b30f095255c8f775cbd88a5f0f19e4ad831 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 18 Feb 2021 23:12:47 -0800 Subject: [PATCH 05/36] Rename neighbor_weights to src_weights, and send_neighbors to dst_weights for neighbor_allreduce --- bluefog/torch/mpi_ops.py | 98 ++++++++++++++------------- bluefog/torch/optimizers.py | 16 ++--- examples/pytorch_average_consensus.py | 4 +- examples/pytorch_benchmark.py | 4 +- examples/pytorch_mnist.py | 4 +- examples/pytorch_optimization.py | 17 +++-- examples/pytorch_resnet.py | 4 +- test/torch_ops_test.py | 18 ++--- test/torch_optimizer_test.py | 4 +- 9 files changed, 88 insertions(+), 81 deletions(-) diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py index fa52dcdd..82c55e07 100644 --- a/bluefog/torch/mpi_ops.py +++ b/bluefog/torch/mpi_ops.py @@ -15,7 +15,7 @@ # ============================================================================== from contextlib import contextmanager -from typing import List, Dict, Optional +from typing import List, Dict, Union, Optional import torch @@ -430,46 +430,46 @@ def _neighbor_allreduce_function_factory(tensor): return 'bluefog_torch_neighbor_allreduce_nonblocking_' + tensor.type().replace('.', '_') -def _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weights, - send_neighbors, enable_topo_check, name): +def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, + dst_weights, enable_topo_check, name): function = _check_function(_neighbor_allreduce_function_factory, tensor) - if send_neighbors is None: - send_neighbors = [] + if dst_weights is None: + dst_weights = [] dynamic_neighbors_enabled = False - elif len(set(send_neighbors)) != len(send_neighbors): - raise ValueError("Argument send_neighbors should only contain the unique ranks.") - elif self_weight is None or neighbor_weights is None: - raise ValueError("Arguments self_weight and neighbor_weights should be presented if " + elif len(set(dst_weights)) != len(dst_weights): + raise ValueError("Argument dst_weights should only contain the unique ranks.") + elif self_weight is None or src_weights is None: + raise ValueError("Arguments self_weight and src_weights should be presented if " "enabling dynamic topology.") else: dynamic_neighbors_enabled = True - if self_weight is None and neighbor_weights is None: + if self_weight is None and src_weights is None: # Implying this is static graph. if is_topo_weighted(): topology = load_topology() - self_weight, neighbor_weights = GetRecvWeights(topology, rank()) + self_weight, src_weights = GetRecvWeights(topology, rank()) weighted_average_computation = True else: weight = 1.0/(len(in_neighbor_ranks())+1) self_weight = weight - neighbor_weights = {r: weight for r in in_neighbor_ranks()} + src_weights = {r: weight for r in in_neighbor_ranks()} weighted_average_computation = False - elif self_weight is not None and neighbor_weights is not None: - if not isinstance(neighbor_weights, dict): + elif self_weight is not None and src_weights is not None: + if not isinstance(src_weights, dict): raise ValueError("Argument neighbor_weights has to be a dictionary map from the " "(in-)neighbor rank to the weights.") if not isinstance(self_weight, float): raise ValueError( "Argument self_weight has to be a float for self rank.") if not dynamic_neighbors_enabled and \ - not set(neighbor_weights.keys()).issubset(set(in_neighbor_ranks())): + not set(src_weights.keys()).issubset(set(in_neighbor_ranks())): raise ValueError("The key of weights should only contain the ranks that belong to " " in-neighbors and self rank.") - uniform_weights = 1.0/(len(neighbor_weights)+1) + uniform_weights = 1.0/(len(src_weights)+1) weighted_average_computation = False if abs(self_weight - uniform_weights) > 1e-6: weighted_average_computation = True - for n_weights in neighbor_weights.values(): + for n_weights in src_weights.values(): if abs(n_weights - uniform_weights) > 1e-6: weighted_average_computation = True break @@ -477,18 +477,18 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weight raise ValueError("Arguments self_weight and neighbor_weights have to be presented at " "the same time") is_hierarchical = False - handle = getattr(mpi_lib, function)(tensor, output, self_weight, neighbor_weights, - send_neighbors, dynamic_neighbors_enabled, + handle = getattr(mpi_lib, function)(tensor, output, self_weight, src_weights, + dst_weights, dynamic_neighbors_enabled, enable_topo_check, weighted_average_computation, is_hierarchical, name.encode() if name is not None else "") _handle_map[handle] = (tensor, output) return handle -def neighbor_allreduce(tensor: torch.Tensor, +def neighbor_allreduce(tensor: torch.Tensor, *, self_weight: Optional[float] = None, - neighbor_weights: Optional[Dict[int, float]] = None, - send_neighbors: Optional[List[int]] = None, + src_weights: Optional[Dict[int, float]] = None, + dst_weights: Optional[Union[Dict[int, float], List[int]]] = None, enable_topo_check: bool = True, name: Optional[str] = None) -> torch.Tensor: """ @@ -505,17 +505,17 @@ def neighbor_allreduce(tensor: torch.Tensor, Arguments: tensor: A tensor to execute weighted average with neighbors. self_weight: The weight for self node, used with neighbor_weights. - neighbor_weights: The weights for in-neighbor nodes, used with self weight. + src_weights: The weights for in-neighbor nodes, used with self weight. If neighbor_weights is presented, the return tensor will return the weighted average defined by these weights and the self_weight. If not, the return tensor will return the weighted average defined by the topology weights is provided or uniformly average. The data structure of weights should be {rank : weight} and rank has to belong to the (in-)neighbors. - send_neighbors: The list of neighbor nodes to be sent to. If set to be None, assume the + dst_weights: The weights for out-neighbor nodes. If set to be None, assume the the current node sends to all of its (out-)neighbors. If having values, assume only - part of (out-)neighbors will be sent to. In this mode, this node sends its value to - partial neighbors listed in this variable in a dynamic graph, and `self_weight` and - `neighbor_weights` must be present. + part of (out-)neighbors will be sent to. If set to be a list, assume all the weights + are one. In this mode, this node sends its value to partial neighbors listed in this + variable in a dynamic graph, and `self_weight` and `src_weights` must be present. enable_topo_check: When send_neighbors is present, enabling this option checks if the sending and recieving neighbors match with each other. Disabling this check can boost the performance. @@ -526,19 +526,23 @@ def neighbor_allreduce(tensor: torch.Tensor, Note: self_weight and neighbor_weights must be presented at the same time. """ - if (self_weight is None and neighbor_weights is not None) or \ - (self_weight is not None and neighbor_weights is None): - raise ValueError("Arguments self_weight and neighbor_weights have to be presented at " + if (self_weight is None and src_weights is not None) or \ + (self_weight is not None and src_weights is None): + raise ValueError("Arguments self_weight and src_weights have to be presented at " "the same time") - handle = neighbor_allreduce_nonblocking(tensor, self_weight, neighbor_weights, - send_neighbors, enable_topo_check, name) + handle = neighbor_allreduce_nonblocking(tensor, + self_weight=self_weight, + src_weights=src_weights, + dst_weights=dst_weights, + enable_topo_check=enable_topo_check, + name=name) return synchronize(handle) -def neighbor_allreduce_nonblocking(tensor: torch.Tensor, +def neighbor_allreduce_nonblocking(tensor: torch.Tensor, *, self_weight: Optional[float] = None, - neighbor_weights: Optional[Dict[int, float]] = None, - send_neighbors: Optional[List[int]] = None, + src_weights: Optional[Dict[int, float]] = None, + dst_weights: Optional[Union[Dict[int, float], List[int]]] = None, enable_topo_check: bool = True, name: Optional[str] = None) -> int: """ @@ -555,17 +559,17 @@ def neighbor_allreduce_nonblocking(tensor: torch.Tensor, Arguments: tensor: A tensor to execute weighted average with neighbors. self_weight: The weight for self node, used with neighbor_weights. - neighbor_weights: The weights for in-neighbor nodes, used with self weight. + src_weights: The weights for in-neighbor nodes, used with self weight. If neighbor_weights is presented, the return tensor will return the weighted average defined by these weights and the self_weight. If not, the return tensor will return the weighted average defined by the topology weights is provided or uniformly average. The data structure of weights should be {rank : weight} and rank has to belong to the (in-)neighbors. - send_neighbors: The list of neighbor nodes to be sent to. If set to be None, assume the + dst_weights: The weights for out-neighbor nodes. If set to be None, assume the the current node sends to all of its (out-)neighbors. If having values, assume only - part of (out-)neighbors will be sent to. In this mode, this node sends its value to - partial neighbors listed in this variable in a dynamic graph, and `self_weight` and - `neighbor_weights` must be present. + part of (out-)neighbors will be sent to. If set to be a list, assume all the weights + are one. In this mode, this node sends its value to partial neighbors listed in this + variable in a dynamic graph, and `self_weight` and `src_weights` must be present. enable_topo_check: When send_neighbors is present, enabling this option checks if the sending and recieving neighbors match with each other. Disabling this check can boost the performance. @@ -577,18 +581,18 @@ def neighbor_allreduce_nonblocking(tensor: torch.Tensor, Note: self_weight and neighbor_weights must be presented at the same time. """ - if (self_weight is None and neighbor_weights is not None) or \ - (self_weight is not None and neighbor_weights is None): - raise ValueError("Arguments self_weight and neighbor_weights have to be presented at " + if (self_weight is None and src_weights is not None) or \ + (self_weight is not None and src_weights is None): + raise ValueError("Arguments self_weight and src_weights have to be presented at " "the same time") - if send_neighbors is None: + if dst_weights is None: first_dim = tensor.shape[0] * len(in_neighbor_ranks()) else: - first_dim = tensor.shape[0] * len(neighbor_weights) + first_dim = tensor.shape[0] * len(src_weights) new_shape = torch.Size([first_dim] + list(tensor.shape[1:])) output = tensor.new(new_shape) # Pre-allocate the memory for the output. - return _neighbor_allreduce_nonblocking(tensor, output, self_weight, neighbor_weights, - send_neighbors, enable_topo_check, name=name) + return _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, + dst_weights, enable_topo_check, name=name) def hierarchical_neighbor_allreduce(tensor: torch.Tensor, diff --git a/bluefog/torch/optimizers.py b/bluefog/torch/optimizers.py index ceeb5a94..82fc677c 100644 --- a/bluefog/torch/optimizers.py +++ b/bluefog/torch/optimizers.py @@ -324,8 +324,8 @@ def __init__(self, params, model, communication_type, num_steps_per_communicatio named_parameters, models = _check_named_parameters(self, model) # knobs for neighbor communication behavior self.self_weight = None - self.neighbor_weights = None - self.send_neighbors = None + self.src_weights = None + self.dst_weights = None self.neighbor_machine_weights = None self.send_neighbor_machines = None self.enable_topo_check = False @@ -394,8 +394,8 @@ def hook(model, *unused): def _neighbor_allreduce_data_async(self, p): name = self._parameter_names.get(p) handle = bf.neighbor_allreduce_nonblocking(p.data, name=name, self_weight=self.self_weight, - neighbor_weights=self.neighbor_weights, - send_neighbors=self.send_neighbors, + src_weights=self.src_weights, + dst_weights=self.dst_weights, enable_topo_check=self.enable_topo_check) return handle @@ -489,8 +489,8 @@ def __init__(self, params, model, communication_type, backward_passes_per_step=1 named_parameters, models = _check_named_parameters(self, model) # knobs for neighbor communication behavior self.self_weight = None - self.neighbor_weights = None - self.send_neighbors = None + self.src_weights = None + self.dst_weights = None self.neighbor_machine_weights = None self.send_neighbor_machines = None self.enable_topo_check = False @@ -762,8 +762,8 @@ def _adadelta_step(self, p, grad, param_group): def _neighbor_allreduce_data_async(self, p): name = self._parameter_names.get(p) handle = bf.neighbor_allreduce_nonblocking(p.data, name=name, self_weight=self.self_weight, - neighbor_weights=self.neighbor_weights, - send_neighbors=self.send_neighbors, + src_weights=self.src_weights, + dst_weights=self.dst_weights, enable_topo_check=self.enable_topo_check) return handle diff --git a/examples/pytorch_average_consensus.py b/examples/pytorch_average_consensus.py index 0b1b2f1e..95711596 100644 --- a/examples/pytorch_average_consensus.py +++ b/examples/pytorch_average_consensus.py @@ -99,8 +99,8 @@ self_weight = 1 / (len(recv_neighbors) + 1) x = bf.neighbor_allreduce(x, name='x', self_weight=self_weight, - neighbor_weights=neighbor_weights, - send_neighbors=send_neighbors, enable_topo_check=False) + src_weights=neighbor_weights, + dst_weights=send_neighbors, enable_topo_check=False) mse.append(torch.norm(x-x_bar, p=2) / torch.norm(x_bar, p=2)) else: outdegree = len(bf.out_neighbor_ranks()) diff --git a/examples/pytorch_benchmark.py b/examples/pytorch_benchmark.py index 958e6571..64cf1eb2 100644 --- a/examples/pytorch_benchmark.py +++ b/examples/pytorch_benchmark.py @@ -186,8 +186,8 @@ def dynamic_topology_update(batch_idx): optimizer.dst_weights = {sent_neighbor: 1.0} elif args.dist_optimizer == 'neighbor_allreduce': send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen) - optimizer.send_neighbors = send_neighbors - optimizer.neighbor_weights = { + optimizer.dst_weights = send_neighbors + optimizer.src_weights = { r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors} optimizer.self_weight = 1 / (len(recv_neighbors) + 1) optimizer.enable_topo_check = False diff --git a/examples/pytorch_mnist.py b/examples/pytorch_mnist.py index 8315dffe..7643369c 100644 --- a/examples/pytorch_mnist.py +++ b/examples/pytorch_mnist.py @@ -215,8 +215,8 @@ def dynamic_topology_update(epoch, batch_idx): optimizer.dst_weights = {sent_neighbor: 1.0} elif args.dist_optimizer == 'neighbor_allreduce': send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen) - optimizer.send_neighbors = send_neighbors - optimizer.neighbor_weights = {r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors} + optimizer.dst_weights = send_neighbors + optimizer.src_weights = {r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors} optimizer.self_weight = 1 / (len(recv_neighbors) + 1) optimizer.enable_topo_check = False elif args.dist_optimizer == 'hierarchical_neighbor_allreduce': diff --git a/examples/pytorch_optimization.py b/examples/pytorch_optimization.py index 058a1ca2..162f9040 100644 --- a/examples/pytorch_optimization.py +++ b/examples/pytorch_optimization.py @@ -199,11 +199,13 @@ def diffusion(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs): loss_step(X, y, w, tensor_name='neighbor.allreduce.local_variable', loss=loss, rho=rho) + # diffusion with torch.no_grad(): - # diffusion phi = w - alpha * w.grad.data - w.data = bf.neighbor_allreduce( - phi, self_weight, neighbor_weights, name='local variable') + w.data = bf.neighbor_allreduce(phi, + self_weight=self_weight, + src_weights=neighbor_weights, + name='local variable') w.grad.data.zero_() # record convergence @@ -271,8 +273,10 @@ def exact_diffusion(X, y, w_opt, loss, maxite=2000, alpha=1e-1, use_Abar=True, * with torch.no_grad(): psi = w - alpha * w.grad.data phi = psi + w.data - psi_prev - w.data = bf.neighbor_allreduce( - phi, self_weight, neighbor_weights, name='local variable') + w.data = bf.neighbor_allreduce(phi, + self_weight=self_weight, + src_weights=neighbor_weights, + name='local variable') psi_prev = psi.clone() w.grad.data.zero_() @@ -330,8 +334,7 @@ def gradient_tracking(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs): # q^{k+1} = neighbor_allreduce(q^k) + grad(w^{k+1}) - grad(w^k) # Notice the communication of neighbor_allreduce can overlap with gradient computation. - w_handle = bf.neighbor_allreduce_nonblocking( - w.data, name='Grad.Tracking.w') + w_handle = bf.neighbor_allreduce_nonblocking(w.data, name='Grad.Tracking.w') q_handle = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q') w.data = bf.synchronize(w_handle) - alpha * q # calculate local gradient diff --git a/examples/pytorch_resnet.py b/examples/pytorch_resnet.py index b0ab4ce4..62f3eb01 100644 --- a/examples/pytorch_resnet.py +++ b/examples/pytorch_resnet.py @@ -361,8 +361,8 @@ def dynamic_topology_update(epoch, batch_idx): optimizer.dst_weights = {sent_neighbor: 1.0} elif args.dist_optimizer == 'neighbor_allreduce': send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen) - optimizer.send_neighbors = send_neighbors - optimizer.neighbor_weights = {r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors} + optimizer.dst_weights = send_neighbors + optimizer.src_weights = {r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors} optimizer.self_weight = 1 / (len(recv_neighbors) + 1) optimizer.enable_topo_check = False elif args.dist_optimizer == 'hierarchical_neighbor_allreduce': diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py index 4a555563..f3ec3240 100644 --- a/test/torch_ops_test.py +++ b/test/torch_ops_test.py @@ -387,7 +387,7 @@ def test_neighbor_allreduce_sum_precision(self): name = "neighbor_allreduce_{}_{}".format(dim, dtype) nw = {i: 1.0 for i in neighbor_ranks} reduced_tensor = bf.neighbor_allreduce(tensor, self_weight=1.0, - neighbor_weights=nw, name=name) + src_weights=nw, name=name) assert ( list(reduced_tensor.shape) == [23] * dim ), "bf.neighbor_allreduce (avg) produces incorrect reduced shape" @@ -450,7 +450,7 @@ def test_neighbor_allreduce_dynamic_topo_check(self): name = "neighbor_allreduce_{}_{}".format(dim, dtype) with pytest.raises(ValueError): bf.neighbor_allreduce(tensor, name=name, self_weight=self_weight, - neighbor_weights=neighbor_weights, send_neighbors=send_ranks) + src_weights=neighbor_weights, dst_weights=send_ranks) def test_neighbor_allreduce_dynamic_topo_outside_static_topo_move(self): """Test that the neighbor all reduce (move) 1D, 2D, 3D tensors correctly @@ -477,7 +477,7 @@ def test_neighbor_allreduce_dynamic_topo_outside_static_topo_move(self): name = "neighbor_allreduce_{}_{}".format(dim, dtype) reduced_tensor = bf.neighbor_allreduce( tensor, name=name, self_weight=self_weight, - neighbor_weights=neighbor_weights, send_neighbors=send_ranks) + src_weights=neighbor_weights, dst_weights=send_ranks) eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor) assert ( @@ -511,7 +511,7 @@ def test_neighbor_allreduce_dynamic_topo_move(self): name = "neighbor_allreduce_{}_{}".format(dim, dtype) reduced_tensor = bf.neighbor_allreduce( tensor, name=name, self_weight=self_weight, - neighbor_weights=neighbor_weights, send_neighbors=send_ranks) + src_weights=neighbor_weights, dst_weights=send_ranks) eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor) assert ( @@ -553,7 +553,7 @@ def test_neighbor_allreduce_dynamic_topo_with_empty_send_neighbors(self): name = "neighbor_allreduce_{}_{}".format(dim, dtype) reduced_tensor = bf.neighbor_allreduce( tensor, name=name, self_weight=self_weight, - neighbor_weights=neighbor_weights, send_neighbors=send_ranks) + src_weights=neighbor_weights, dst_weights=send_ranks) eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor) assert ( @@ -591,7 +591,7 @@ def test_neighbor_allreduce_dynamic_topo_avg(self): name = "neighbor_allreduce_{}_{}".format(dim, dtype) reduced_tensor = bf.neighbor_allreduce( tensor, name=name, self_weight=self_weight, - neighbor_weights=neighbor_weights, send_neighbors=send_ranks) + src_weights=neighbor_weights, dst_weights=send_ranks) eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor) assert ( @@ -818,7 +818,7 @@ def test_neighbor_allreduce_sum(self): tensor = self.cast_and_place(tensor, dtype) nw = {i: 1.0 for i in neighbor_ranks} reduced_tensor = bf.neighbor_allreduce(tensor, self_weight=1.0, - neighbor_weights=nw) + src_weights=nw) tensor, reduced_tensor = self.convert_cpu_fp16_to_fp32(tensor, reduced_tensor) assert ( list(reduced_tensor.shape) == [23] * dim @@ -994,10 +994,10 @@ def test_neighbor_allreduce_dynamic_topo_fusion(self): handle_1 = bf.neighbor_allreduce_nonblocking( tensor_1, name=name_1, self_weight=self_weight, - neighbor_weights=neighbor_weights, send_neighbors=send_ranks) + src_weights=neighbor_weights, dst_weights=send_ranks) handle_2 = bf.neighbor_allreduce_nonblocking( tensor_2, name=name_2, self_weight=self_weight, - neighbor_weights=neighbor_weights, send_neighbors=send_ranks) + src_weights=neighbor_weights, dst_weights=send_ranks) output_1 = bf.synchronize(handle_1) output_2 = bf.synchronize(handle_2) diff --git a/test/torch_optimizer_test.py b/test/torch_optimizer_test.py index 8da25053..7508a4b9 100644 --- a/test/torch_optimizer_test.py +++ b/test/torch_optimizer_test.py @@ -226,8 +226,8 @@ def dynamic_neighbor_allreduce_train(model, optimizer, dataloader, isCUDA, dynam model.train() for data, target in dataloader: send_neighbors, recv_neighbors = next(dynamic_topo_gen) - optimizer.send_neighbors = send_neighbors - optimizer.neighbor_weights = { + optimizer.dst_weights = send_neighbors + optimizer.src_weights = { r: 1/(len(recv_neighbors) + 1) for r in recv_neighbors} optimizer.self_weight = 1 / (len(recv_neighbors) + 1) From cf178757de19adae43509f32d8e18bd6f6524d96 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 18 Feb 2021 23:13:19 -0800 Subject: [PATCH 06/36] A script to test existing examples --- examples/test_all_example.sh | 92 ++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100755 examples/test_all_example.sh diff --git a/examples/test_all_example.sh b/examples/test_all_example.sh new file mode 100755 index 00000000..af9aa491 --- /dev/null +++ b/examples/test_all_example.sh @@ -0,0 +1,92 @@ +#!/bin/bash +NUM_PROC=4 +BFRUN="bfrun -np ${NUM_PROC}" +RUN_DIR="$( pwd )" +EXAMPLE_DIR="$( cd "$( dirname "$0" )" && pwd )" + +die() { echo >&2 -e "\nERROR: $@\n"; exit 1; } +check() { + "$@" >>/dev/null 2>&1; + exit_code=$?; + [ $exit_code -eq 0 ] \ + && echo "Command [$*] succeed" \ + || die "Command [$*] failed with error code $exit_code"; +} + +# PyTorch Average Concensus Cases +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --enable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --asynchronous-mode +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --enable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --asynchronous-mode + +# PyTorch Optimization Cases +[ -f "${RUN_DIR}/plot.png" ] && EXIST_PLOT_PNG=1 || EXIST_PLOT_PNG=0 +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_optimization.py --method=diffusion +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_optimization.py --method=exact_diffusion +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_optimization.py --method=gradient_tracking +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_optimization.py --method=push_diging +[ "${EXIST_PLOT_PNG}" == 0 ] && rm -f ${RUN_DIR}/plot.png + +# PyTorch MNIST Cases +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=gradient_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=win_put +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=win_put --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology + +# PyTorch Benchmark Cases +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=gradient_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=win_put +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=win_put --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=gradient_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology + +# PyTorch ResNet Cases +[ -d "${RUN_DIR}/logs" ] && EXIST_LOGS_DIR=1 || EXIST_LOGS_DIR=0 +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=gradient_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=win_put +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=win_put --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=win_put +check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology +[ "${EXIST_LOGS_DIR}" == 0 ] && rm -rf ${RUN_DIR}/logs \ No newline at end of file From 923df09c72f09f2e4964f62a852bd3c257743942 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Fri, 19 Feb 2021 01:30:20 -0800 Subject: [PATCH 07/36] Accept dst_weights as Dict, and reorganize DoNeighborAllreduce --- bluefog/torch/mpi_ops.cc | 317 ++++++++++++++++----------------------- bluefog/torch/mpi_ops.h | 14 +- bluefog/torch/mpi_ops.py | 4 +- 3 files changed, 140 insertions(+), 195 deletions(-) diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc index 4e517ab3..55fd3e34 100644 --- a/bluefog/torch/mpi_ops.cc +++ b/bluefog/torch/mpi_ops.cc @@ -82,8 +82,6 @@ void MaybeCopyBufferBack(::torch::Tensor tensor, ::torch::Tensor buffer) { if (IsCPUHalfTensor(tensor)) tensor.copy_(buffer.to(::torch::kFloat16)); } -} // namespace - std::function(std::function)> GetCallbackWrapper(int handle, Timeline* timeline_ptr, const std::string& op_name, std::thread::id tid) { @@ -98,6 +96,99 @@ std::function(std::function)> }; } +void PerformNeighborAllreduceCallback(::torch::Tensor tensor, ::torch::Tensor output, + double self_weight, + const std::unordered_map& src_weights, + const std::vector& src_neighbors, + bool avg_computation, + bool dynamic_neighbors_enabled, + bool is_hierarchical) { + int src_size = bluefog_neighbor_size(); + if (dynamic_neighbors_enabled) src_size = src_neighbors.size(); + if (src_size > 0) { + ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); + ::torch::Tensor tensor_buffer = MaybeCopyToTensorBuffer(tensor); + + int first_dim = output_buffer.size(0) / src_size; + std::vector shape_vector; + shape_vector.push_back(first_dim); + for (int idx = 1; idx < tensor_buffer.dim(); ++idx) { + shape_vector.push_back(tensor_buffer.size(idx)); + } + + // if avg_computation is set to be False, sum computation will be taken place. + if (avg_computation) { + // 1) For a distributed graph topology, created with + // MPI_Dist_graph_create, the sequence of neighbors in the send and + // receive buffers at each process is defined as the sequence returned + // by MPI_Dist_graph_neighbors for destinations and sources, + // respectively. 2) MPI_Dist_graph_neighbors: If the communicator was + // created with MPI_Dist_graph_create_adjacent then the order of the + // values in sources and destinations is identical to the input that + // was used by the process with the same rank in comm_old in the + // creation call. + int* sources_ptr = nullptr; + if (!dynamic_neighbors_enabled) { + int indgree = 0; + int outdegree = 0; + int* destinations_ptr = nullptr; + bluefog_load_topology(&indgree, sources_ptr, &outdegree, + destinations_ptr); + } + auto output_reduced = output_buffer.slice(0, 0, first_dim); + for (int i = 0; i < src_size; i++) { + double weight = 0.0; + int src_rank; + if (!dynamic_neighbors_enabled) src_rank = *(sources_ptr+i); + else src_rank = src_neighbors[i]; + auto it = src_weights.find(src_rank); + if (it != src_weights.end()) { + weight = it->second; + } + + if (i == 0) { + output_reduced.mul_(weight); + } else { + output_reduced.add_( + output_buffer.slice(0, i * first_dim, (i + 1) * first_dim), weight); + } + } + output_buffer.resize_(shape_vector); + output_buffer.add_(tensor_buffer, self_weight); + if (is_hierarchical){ + // Because there is ncclAllreduce just take sum. + output_buffer.div_(bluefog_local_size()); + } + } else { // avg_computation is False, using sum operation + if (src_size > 1) { + auto output_reduced = output_buffer.slice(0, 0, first_dim); + for (int i = 1; i < src_size; i++) { + output_reduced.add_( + output_buffer.slice(0, i * first_dim, (i + 1) * first_dim)); + } + output_buffer.resize_(shape_vector); + } + // Include self data as well. + output_buffer.add_(tensor_buffer); + if (is_hierarchical){ + // Because there is ncclAllreduce just take sum. + output_buffer.div_(bluefog_local_size() * (src_size + 1)); + } else { + output_buffer.div_(src_size + 1); + } + } + output.resize_(shape_vector); + MaybeCopyBufferBack(output, output_buffer); + } else { // recv_size == 0 + output.set_(tensor); + ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); + output_buffer.mul_(self_weight); + MaybeCopyBufferBack(output, output_buffer); + } +} + +} // namespace + int DoAllreduce(::torch::Tensor tensor, ::torch::Tensor output, int average, bool is_hierarchical_local, const std::string& name) { ThrowIfError(common::CheckInitialized()); @@ -315,10 +406,11 @@ int DoNeighborAllgather(::torch::Tensor tensor, ::torch::Tensor output, } int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, - double self_weight, const std::unordered_map& neighbor_weights, - const std::vector& send_neighbors, bool dynamic_neighbors_enabled, - bool enable_topo_check, bool avg_computation, bool is_hierarchical, - const std::string& name) { + double self_weight, + const std::unordered_map& src_weights, + const std::unordered_map& dst_weights, + bool dynamic_neighbors_enabled, bool enable_topo_check, + bool avg_computation, bool is_hierarchical, const std::string& name) { ThrowIfError(common::CheckInitialized()); auto handle = handle_manager.AllocateHandle(); @@ -333,117 +425,43 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, auto callback_wrapper = GetCallbackWrapper(handle, timeline_ptr, op_name, tid); - std::vector recv_neighbors; - for (auto kv : neighbor_weights) - recv_neighbors.push_back(kv.first); - std::sort(recv_neighbors.begin(), recv_neighbors.end()); + std::vector src_neighbors; + for (auto kv : src_weights) + src_neighbors.push_back(kv.first); + std::sort(src_neighbors.begin(), src_neighbors.end()); + + std::vector dst_neighbors; + for (auto kv : dst_weights) + dst_neighbors.push_back(kv.first); + std::sort(dst_neighbors.begin(), dst_neighbors.end()); + auto bf_src_neighbors = std::make_shared>(src_neighbors); + auto bf_dst_neighbors = std::make_shared>(dst_neighbors); + auto ready_event = RecordReadyEvent(device); if (OPS_ON_CPU && tensor.device().is_cuda()) { ::torch::Tensor cpu_buffer = tensor.to(::torch::Device(::torch::kCPU), /*non_blocking=*/false); ::torch::Tensor cpu_output = output.to(::torch::Device(::torch::kCPU), /*non_blocking=*/false); auto bf_tensor = std::make_shared(cpu_buffer); - auto bf_context = - std::make_shared(CPU_DEVICE_ID, cpu_output); + auto bf_context = std::make_shared(CPU_DEVICE_ID, cpu_output); auto bf_output = std::make_shared(cpu_output); - auto bf_recv_neighbors = std::make_shared>(recv_neighbors); - auto bf_send_neighbors = std::make_shared>(send_neighbors); - auto ready_event = RecordReadyEvent(device); + auto enqueue_result = EnqueueTensorNeighborAllreduce( - bf_tensor, bf_output, bf_context, ready_event, bf_recv_neighbors, - bf_send_neighbors, dynamic_neighbors_enabled, is_hierarchical, + bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors, + bf_dst_neighbors, dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, CPU_DEVICE_ID, - callback_wrapper([self_weight, neighbor_weights, avg_computation, - cpu_output, tensor, recv_neighbors, send_neighbors, + callback_wrapper([self_weight, src_weights, avg_computation, + cpu_output, tensor, src_neighbors, dynamic_neighbors_enabled, is_hierarchical, output, device]() mutable { with_device device_guard(device); output.copy_(cpu_output); - int recv_size = bluefog_neighbor_size(); - if (dynamic_neighbors_enabled) recv_size = recv_neighbors.size(); - if (recv_size > 0) { - ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); - ::torch::Tensor tensor_buffer = MaybeCopyToTensorBuffer(tensor); - - int first_dim = output_buffer.size(0) / recv_size; - std::vector shape_vector; - shape_vector.push_back(first_dim); - for (int idx = 1; idx < tensor_buffer.dim(); ++idx) { - shape_vector.push_back(tensor_buffer.size(idx)); - } - - // if avg_computation is set to be False, sum computation will be taken place. - if (avg_computation) { - // 1) For a distributed graph topology, created with - // MPI_Dist_graph_create, the sequence of neighbors in the send and - // receive buffers at each process is defined as the sequence returned - // by MPI_Dist_graph_neighbors for destinations and sources, - // respectively. 2) MPI_Dist_graph_neighbors: If the communicator was - // created with MPI_Dist_graph_create_adjacent then the order of the - // values in sources and destinations is identical to the input that - // was used by the process with the same rank in comm_old in the - // creation call. - int indgree = 0; - int outdegree = 0; - int* sources_ptr = nullptr; - int* destinations_ptr = nullptr; - bluefog_load_topology(&indgree, sources_ptr, &outdegree, - destinations_ptr); - - auto output_reduced = output_buffer.slice(0, 0, first_dim); - if (dynamic_neighbors_enabled) indgree = recv_neighbors.size(); - for (int i = 0; i < indgree; i++) { - double weight = 0.0; - int recv_rank; - if (!dynamic_neighbors_enabled) recv_rank = *(sources_ptr+i); - else recv_rank = recv_neighbors[i]; - auto it = neighbor_weights.find(recv_rank); - if (it != neighbor_weights.end()) { - weight = it->second; - } - - if (i == 0) { - output_reduced.mul_(weight); - } else { - output_reduced.add_( - output_buffer.slice(0, i * first_dim, (i + 1) * first_dim), weight); - } - } - output_buffer.resize_(shape_vector); - output_buffer.add_(tensor_buffer, self_weight); - if (is_hierarchical){ - // Because there is ncclAllreduce just take sum. - output_buffer.div_(bluefog_local_size()); - } - } else { - int neighbor_size = !dynamic_neighbors_enabled - ? bluefog_neighbor_size() - : recv_neighbors.size(); - if (neighbor_size > 1) { - auto output_reduced = output_buffer.slice(0, 0, first_dim); - for (int i = 1; i < neighbor_size; i++) { - output_reduced.add_( - output_buffer.slice(0, i * first_dim, (i + 1) * first_dim)); - } - output_buffer.resize_(shape_vector); - } - // Include self data as well. - output_buffer.add_(tensor_buffer); - if (is_hierarchical){ - // Because there is ncclAllreduce just take sum. - output_buffer.div_(bluefog_local_size() * (neighbor_size + 1)); - } else { - output_buffer.div_(neighbor_size + 1); - } - } - output.resize_(shape_vector); - MaybeCopyBufferBack(output, output_buffer); - } else { // recv_size == 0 - ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); - output_buffer.mul_(self_weight); - MaybeCopyBufferBack(output, output_buffer); - } + PerformNeighborAllreduceCallback(tensor, output, self_weight, + src_weights, src_neighbors, + avg_computation, + dynamic_neighbors_enabled, + is_hierarchical); })); ThrowIfError(enqueue_result); @@ -451,94 +469,19 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, auto bf_tensor = std::make_shared(tensor); auto bf_context = std::make_shared(device, output); auto bf_output = std::make_shared(output); - auto bf_recv_neighbors = std::make_shared>(recv_neighbors); - auto bf_send_neighbors = std::make_shared>(send_neighbors); - auto ready_event = RecordReadyEvent(device); auto enqueue_result = EnqueueTensorNeighborAllreduce( - bf_tensor, bf_output, bf_context, ready_event, bf_recv_neighbors, - bf_send_neighbors, dynamic_neighbors_enabled, is_hierarchical, + bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors, + bf_dst_neighbors, dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, device, - callback_wrapper([self_weight, neighbor_weights, avg_computation, - recv_neighbors, send_neighbors, dynamic_neighbors_enabled, + callback_wrapper([self_weight, src_weights, avg_computation, + src_neighbors, dynamic_neighbors_enabled, is_hierarchical, tensor, output]() mutable { - int recv_size = bluefog_neighbor_size(); - if (dynamic_neighbors_enabled) recv_size = recv_neighbors.size(); - if (recv_size > 0) { - ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); - ::torch::Tensor tensor_buffer = MaybeCopyToTensorBuffer(tensor); - - int first_dim = output_buffer.size(0) / recv_size; - std::vector shape_vector; - shape_vector.push_back(first_dim); - for (int idx = 1; idx < tensor_buffer.dim(); ++idx) { - shape_vector.push_back(tensor_buffer.size(idx)); - } - // if avg_computation is set to be True, average computation will be taken place. - if (avg_computation) { - int indgree = 0; - int outdegree = 0; - int* sources_ptr = nullptr; - int* destinations_ptr = nullptr; - auto output_reduced = output_buffer.slice(0, 0, first_dim); - if (!dynamic_neighbors_enabled) { - bluefog_load_topology(&indgree, sources_ptr, &outdegree, - destinations_ptr); - } else { - indgree = recv_neighbors.size(); - } - for (int i = 0; i < indgree; i++) { - double weight = 0.0; - int recv_rank; - if (!dynamic_neighbors_enabled) recv_rank = *(sources_ptr+i); - else recv_rank = recv_neighbors[i]; - auto it = neighbor_weights.find(recv_rank); - if (it != neighbor_weights.end()) { - weight = it->second; - } - - if (i == 0) { - output_reduced.mul_(weight); - } else { - output_reduced.add_( - output_buffer.slice(0, i * first_dim, (i + 1) * first_dim), weight); - } - } - output_buffer.resize_(shape_vector); - output_buffer.add_(tensor_buffer, self_weight); - if (is_hierarchical){ - // Because there is ncclAllreduce just take sum. - output_buffer.div_(bluefog_local_size()); - } - } else { - int neighbor_size = !dynamic_neighbors_enabled - ? bluefog_neighbor_size() - : recv_neighbors.size(); - if (neighbor_size > 1) { - auto output_reduced = output_buffer.slice(0, 0, first_dim); - for (int i = 1; i < neighbor_size; i++) { - output_reduced.add_( - output.slice(0, i * first_dim, (i + 1) * first_dim)); - } - } - output_buffer.resize_(shape_vector); - // Include self data as well. - output_buffer.add_(tensor_buffer); - if (is_hierarchical){ - // Because there is ncclAllreduce just take sum. - output_buffer.div_(bluefog_local_size() * (neighbor_size + 1)); - } else { - output_buffer.div_(neighbor_size + 1); - } - } - output.resize_(shape_vector); - MaybeCopyBufferBack(output, output_buffer); - } else { // recv_size == 0 - output.set_(tensor); - ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); - output_buffer.mul_(self_weight); - MaybeCopyBufferBack(output, output_buffer); - } + PerformNeighborAllreduceCallback(tensor, output, self_weight, + src_weights, src_neighbors, + avg_computation, + dynamic_neighbors_enabled, + is_hierarchical); })); ThrowIfError(enqueue_result); } diff --git a/bluefog/torch/mpi_ops.h b/bluefog/torch/mpi_ops.h index 5dc2cd7b..66ecb38c 100644 --- a/bluefog/torch/mpi_ops.h +++ b/bluefog/torch/mpi_ops.h @@ -111,13 +111,13 @@ NEIGHBOR_ALLGATHER_H(torch_cuda_FloatTensor, THCudaTensor) NEIGHBOR_ALLGATHER_H(torch_cuda_DoubleTensor, THCudaDoubleTensor) #endif -#define NEIGHBOR_ALLREDUCE_H(torch_Tensor, THTensor) \ - extern "C" int bluefog_torch_neighbor_allreduce_nonblocking_##torch_Tensor( \ - THTensor* tensor, THTensor* output, double self_weight, \ - const std::unordered_map& neighbor_weights, \ - const std::vector& send_neighbors, bool dynamic_neighbors_enabled, \ - bool enable_topo_check, bool avg_computation, bool is_hierarchical, \ - char* name); +#define NEIGHBOR_ALLREDUCE_H(torch_Tensor, THTensor) \ + extern "C" int bluefog_torch_neighbor_allreduce_nonblocking_##torch_Tensor( \ + THTensor* tensor, THTensor* output, double self_weight, \ + const std::unordered_map& src_weights, \ + const std::unordered_map& dst_weights, \ + bool dynamic_neighbors_enabled, bool enable_topo_check, \ + bool avg_computation, bool is_hierarchical, char* name); NEIGHBOR_ALLREDUCE_H(torch_HalfTensor, THHalfTensor) NEIGHBOR_ALLREDUCE_H(torch_FloatTensor, THFloatTensor) diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py index 82c55e07..3720dd6a 100644 --- a/bluefog/torch/mpi_ops.py +++ b/bluefog/torch/mpi_ops.py @@ -434,7 +434,7 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, dst_weights, enable_topo_check, name): function = _check_function(_neighbor_allreduce_function_factory, tensor) if dst_weights is None: - dst_weights = [] + dst_weights = {} dynamic_neighbors_enabled = False elif len(set(dst_weights)) != len(dst_weights): raise ValueError("Argument dst_weights should only contain the unique ranks.") @@ -443,6 +443,8 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, "enabling dynamic topology.") else: dynamic_neighbors_enabled = True + if isinstance(dst_weights, list): + dst_weights = {dst:1.0 for dst in dst_weights} if self_weight is None and src_weights is None: # Implying this is static graph. if is_topo_weighted(): From 42d0af797216ea234dec8521a44a70c0f4839cb6 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Fri, 19 Feb 2021 02:51:14 -0800 Subject: [PATCH 08/36] Reorganize CheckNeighborSendRecvPattern --- bluefog/common/mpi_controller.cc | 109 ++++++++++++++++-------------- bluefog/common/mpi_controller.h | 7 +- bluefog/common/nccl_controller.cc | 28 ++++++-- 3 files changed, 86 insertions(+), 58 deletions(-) diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index b52451b9..26093260 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -293,54 +293,47 @@ void MPIController::NeighborAllgather(TensorTableEntry& entry) { } // Function to check if the sending and receiving neighbors match in the topology. -bool CheckNeighborSendRecvPattern(int size, const TensorTableEntry& entry, - Timeline* timeline_ptr, const MPI_Comm& comm) { +bool CheckNeighborSendRecvPattern( + std::shared_ptr> send_neighbors, + std::shared_ptr> recv_neighbors, + const std::string& tensor_name, + int size, Timeline* timeline_ptr, const MPI_Comm& comm) { bool res = false; - // enabled the check if enable_topo_check is true and partial - // neighbor_allreduce is activated. - if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { - if (entry.is_hierarchical) { - // TODO: support check. - BFLOG(INFO) << "Request to check topology for hierarchical neighbor " - << "allreduce ops but it is not supported yet."; - return res; - } - timeline_ptr->ActivityStart(entry.tensor_name, "NEGOTIATION"); - // Put all the send and recv neighbors in a single vector, and obtain a send - // matrix and a recv matrix through MPI_Allgather. - bool* send_check_buf = new bool[2 * size]; - std::fill_n(send_check_buf, 2 * size, false); - bool* recv_check_buf = new bool[2 * size * size]; - for (int send_rank : *(entry.send_neighbors)) - send_check_buf[send_rank] = true; - for (int recv_rank : *(entry.recv_neighbors)) - send_check_buf[size + recv_rank] = true; - int ret_code = MPI_Allgather(send_check_buf, size * 2, MPI_C_BOOL, - recv_check_buf, size * 2, MPI_C_BOOL, comm); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Allgather (for dynamic neighbor_allreduce negotiation) failed, " - "see MPI output for details."); - } - // This checks that send matrix and transposed recv matrix should be the - // same. If same, the topology is good to go. If not, there is mismatch edge - // to be fixed. - auto GetSendIndex = [size](int i, int j) -> int { return 2*size*i+j; }; - auto GetRecvIndex = [size](int i, int j) -> int { return 2*size*i+j+size; }; - for (int i = 0; i < size; ++i) { - if (res) break; - for (int j = 0; j < size; ++j) { - if (recv_check_buf[GetSendIndex(i, j)] != - recv_check_buf[GetRecvIndex(j, i)]) { - res = true; - break; - } + timeline_ptr->ActivityStart(tensor_name, "NEGOTIATION"); + // Put all the send and recv neighbors in a single vector, and obtain a send + // matrix and a recv matrix through MPI_Allgather. + bool* send_check_buf = new bool[2 * size]; + std::fill_n(send_check_buf, 2 * size, false); + bool* recv_check_buf = new bool[2 * size * size]; + for (int send_rank : *(send_neighbors)) + send_check_buf[send_rank] = true; + for (int recv_rank : *(recv_neighbors)) + send_check_buf[size + recv_rank] = true; + int ret_code = MPI_Allgather(send_check_buf, size * 2, MPI_C_BOOL, + recv_check_buf, size * 2, MPI_C_BOOL, comm); + if (ret_code != MPI_SUCCESS) { + throw std::runtime_error( + "MPI_Allgather (for dynamic neighbor_allreduce negotiation) failed, " + "see MPI output for details."); + } + // This checks that send matrix and transposed recv matrix should be the + // same. If same, the topology is good to go. If not, there is mismatch edge + // to be fixed. + auto GetSendIndex = [size](int i, int j) -> int { return 2*size*i+j; }; + auto GetRecvIndex = [size](int i, int j) -> int { return 2*size*i+j+size; }; + for (int i = 0; i < size; ++i) { + if (res) break; + for (int j = 0; j < size; ++j) { + if (recv_check_buf[GetSendIndex(i, j)] != + recv_check_buf[GetRecvIndex(j, i)]) { + res = true; + break; } } - delete [] send_check_buf; - delete [] recv_check_buf; - timeline_ptr->ActivityEnd(entry.tensor_name); } + delete [] send_check_buf; + delete [] recv_check_buf; + timeline_ptr->ActivityEnd(tensor_name); return res; } @@ -385,9 +378,17 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { // If only partial sending is enabled, the following code block checks whether the sending // and recieving neighbors match each other when enable_topo_check is set to be True. - bool is_topo_check_fail = CheckNeighborSendRecvPattern( - mpi_ctx_.size_, entry, timeline_ptr, - mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + bool is_topo_check_fail = false; + if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { + if (entry.is_hierarchical) { + // TODO: support check. + BFLOG(INFO) << "Request to check topology for hierarchical neighbor " + << "allreduce ops but it is not supported yet."; + } + is_topo_check_fail = CheckNeighborSendRecvPattern( + entry.send_neighbors, entry.recv_neighbors, entry.tensor_name, + mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + } if (is_topo_check_fail) { entry.callback(Status::InvalidArgument( @@ -583,9 +584,17 @@ void MPIController::NeighborAllreduce(std::vector& entries) { // If only partial sending is enabled, the following code block checks whether // the sending and recieving neighbors match each other when enable_topo_check // is set to be True. - bool is_topo_check_fail = CheckNeighborSendRecvPattern( - mpi_ctx_.size_, first_entry, timeline_ptr, - mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + bool is_topo_check_fail = false; + if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) { + if (first_entry.is_hierarchical) { + // TODO: support check. + BFLOG(INFO) << "Request to check topology for hierarchical neighbor " + << "allreduce ops but it is not supported yet."; + } + is_topo_check_fail = CheckNeighborSendRecvPattern( + first_entry.send_neighbors, first_entry.recv_neighbors, first_entry.tensor_name, + mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + } if (is_topo_check_fail) { for (auto& entry : entries) { diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h index f42fc182..8aacb071 100644 --- a/bluefog/common/mpi_controller.h +++ b/bluefog/common/mpi_controller.h @@ -27,8 +27,11 @@ namespace common { // Function to check if the sending and receiving neighbors match in the // topology. -bool CheckNeighborSendRecvPattern(int size, const TensorTableEntry& entry, - Timeline* timeline_ptr, const MPI_Comm& comm); +bool CheckNeighborSendRecvPattern( + std::shared_ptr> send_neighbors, + std::shared_ptr> recv_neighbors, + const std::string& tensor_name, + int size, Timeline* timeline_ptr, const MPI_Comm& comm); class MPIController { public: diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index 5389f341..cbc63bb7 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -661,9 +661,17 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { // If only partial sending is enabled, the following code block checks whether // the sending and recieving neighbors match each other when enable_topo_check // is set to be True. - bool is_topo_check_fail = CheckNeighborSendRecvPattern( - mpi_ctx_.size_, entry, timeline_ptr_, - mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + bool is_topo_check_fail = false; + if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { + if (entry.is_hierarchical) { + // TODO: support check. + BFLOG(INFO) << "Request to check topology for hierarchical neighbor " + << "allreduce ops but it is not supported yet."; + } + is_topo_check_fail = CheckNeighborSendRecvPattern( + entry.send_neighbors, entry.recv_neighbors, entry.tensor_name, + mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + } if (is_topo_check_fail) { entry.callback(Status::InvalidArgument( "Send and recv neighbors dont' match in neighbor " @@ -907,9 +915,17 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { // If only partial sending is enabled, the following code block checks whether // the sending and recieving neighbors match each other when enable_topo_check // is set to be True. - bool is_topo_check_fail = CheckNeighborSendRecvPattern( - mpi_ctx_.size_, first_entry, timeline_ptr_, - mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + bool is_topo_check_fail = false; + if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { + if (entry.is_hierarchical) { + // TODO: support check. + BFLOG(INFO) << "Request to check topology for hierarchical neighbor " + << "allreduce ops but it is not supported yet."; + } + is_topo_check_fail = CheckNeighborSendRecvPattern( + entry.send_neighbors, entry.recv_neighbors, entry.tensor_name, + mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + } if (is_topo_check_fail) { for (auto& entry : entries) { entry.callback(Status::InvalidArgument( From 3184e17054d430014dcba7a18df8b2dc9d1086b8 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Fri, 19 Feb 2021 03:01:04 -0800 Subject: [PATCH 09/36] Fix timeline_ptr for NCCL --- bluefog/common/nccl_controller.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index cbc63bb7..27b5e9c8 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -670,7 +670,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { } is_topo_check_fail = CheckNeighborSendRecvPattern( entry.send_neighbors, entry.recv_neighbors, entry.tensor_name, - mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); } if (is_topo_check_fail) { entry.callback(Status::InvalidArgument( @@ -924,7 +924,7 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { } is_topo_check_fail = CheckNeighborSendRecvPattern( entry.send_neighbors, entry.recv_neighbors, entry.tensor_name, - mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); } if (is_topo_check_fail) { for (auto& entry : entries) { From c85cbd50ff509c0c56816baa91a2c756ce394e24 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Fri, 19 Feb 2021 03:01:04 -0800 Subject: [PATCH 10/36] Fix timeline_ptr for NCCL --- bluefog/common/nccl_controller.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index cbc63bb7..e47f4857 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -670,7 +670,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { } is_topo_check_fail = CheckNeighborSendRecvPattern( entry.send_neighbors, entry.recv_neighbors, entry.tensor_name, - mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); } if (is_topo_check_fail) { entry.callback(Status::InvalidArgument( @@ -916,15 +916,15 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { // the sending and recieving neighbors match each other when enable_topo_check // is set to be True. bool is_topo_check_fail = false; - if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { - if (entry.is_hierarchical) { + if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) { + if (first_entry.is_hierarchical) { // TODO: support check. BFLOG(INFO) << "Request to check topology for hierarchical neighbor " << "allreduce ops but it is not supported yet."; } is_topo_check_fail = CheckNeighborSendRecvPattern( - entry.send_neighbors, entry.recv_neighbors, entry.tensor_name, - mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); + first_entry.send_neighbors, first_entry.recv_neighbors, first_entry.tensor_name, + mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); } if (is_topo_check_fail) { for (auto& entry : entries) { From 7d60585054e7af06938a851e394be77d9c854384 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Fri, 19 Feb 2021 11:29:10 -0800 Subject: [PATCH 11/36] Put dst_weights information into TensorTableEntry --- bluefog/common/common.h | 1 + bluefog/common/operations.cc | 37 ++++++++++++++++++------------------ bluefog/common/operations.h | 1 + bluefog/torch/mpi_ops.cc | 20 +++++++++++-------- 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/bluefog/common/common.h b/bluefog/common/common.h index 00d2a204..a27dde68 100644 --- a/bluefog/common/common.h +++ b/bluefog/common/common.h @@ -278,6 +278,7 @@ struct TensorTableEntry { // Neighbors for dynamic neighbor_allreduce. std::shared_ptr> send_neighbors; std::shared_ptr> recv_neighbors; + std::shared_ptr> send_weights; // Boolean value if dynamic neighbor is enabled. bool dynamic_neighbors_enabled = false; diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc index c9daa4ea..312dc8d6 100644 --- a/bluefog/common/operations.cc +++ b/bluefog/common/operations.cc @@ -432,6 +432,20 @@ void CheckForStalledTensors(BluefogGlobalState& state) { } } +template +bool IsSameList (std::shared_ptr> n1, std::shared_ptr> n2) { + if (n1 == nullptr && n2 == nullptr) return true; + if (n1 == nullptr || n2 == nullptr) return false; + if (n1->size() != n2->size()) return false; + // The order matters as well. + for (int i = 0; i < n1->size(); i++) { + if (n1->at(i) != n2->at(i)) { + return false; + } + } + return true; +} + } // namespace bool RunLoopOnce(BluefogGlobalState& state); @@ -948,20 +962,6 @@ void NegotiateOfRequestOfMaster(BluefogGlobalState& state, // Attempt to add more responses to this fused response. const TensorTableEntry& entry = state.tensor_queue.GetTensorEntry(response.tensor_names()[0]); - auto IsSameNeighborList = - [](std::shared_ptr> n1, - std::shared_ptr> n2) -> bool { - if (n1 == nullptr && n2 == nullptr) return true; - if (n1 == nullptr || n2 == nullptr) return false; - if (n1->size() != n2->size()) return false; - // The order matters as well. - for (int i = 0; i < n1->size(); i++) { - if (n1->at(i) != n2->at(i)) { - return false; - } - } - return true; - }; // Recall that send_neighbors is empty or not determines we use partial // neighbor allreduce or not. int num_recv_neighbors = !entry.dynamic_neighbors_enabled @@ -985,10 +985,9 @@ void NegotiateOfRequestOfMaster(BluefogGlobalState& state, entry.tensor->dtype() == new_entry.tensor->dtype() && entry.dynamic_neighbors_enabled == new_entry.dynamic_neighbors_enabled && entry.is_hierarchical == new_entry.is_hierarchical && - IsSameNeighborList(entry.send_neighbors, - new_entry.send_neighbors) && - IsSameNeighborList(entry.recv_neighbors, - new_entry.recv_neighbors) && + IsSameList(entry.send_neighbors, new_entry.send_neighbors) && + IsSameList(entry.send_weights, new_entry.send_weights) && + IsSameList(entry.recv_neighbors, new_entry.recv_neighbors) && tensor_size + new_tensor_size <= state.tensor_fusion_threshold) { // These tensors will fuse together well. tensor_size += new_tensor_size; @@ -1545,6 +1544,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr tensor, std::shared_ptr ready_event, std::shared_ptr> recv_neighbors, std::shared_ptr> send_neighbors, + std::shared_ptr> send_weights, bool dynamic_neighbors_enabled, bool is_hierarchical, bool enable_topo_check, @@ -1569,6 +1569,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr tensor, e.ready_event = ready_event; e.recv_neighbors = recv_neighbors; e.send_neighbors = send_neighbors; + e.send_weights = send_weights; e.dynamic_neighbors_enabled = dynamic_neighbors_enabled; e.is_hierarchical = is_hierarchical; e.enable_topo_check = enable_topo_check; diff --git a/bluefog/common/operations.h b/bluefog/common/operations.h index 66c06031..1d2a53d2 100644 --- a/bluefog/common/operations.h +++ b/bluefog/common/operations.h @@ -144,6 +144,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr tensor, std::shared_ptr ready_event, std::shared_ptr> recv_neighbors, std::shared_ptr> send_neighbors, + std::shared_ptr> send_weights, bool dynamic_neighbors_enabled, bool is_hierarchical, bool enable_topo_check, diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc index 55fd3e34..a5527d57 100644 --- a/bluefog/torch/mpi_ops.cc +++ b/bluefog/torch/mpi_ops.cc @@ -427,16 +427,20 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, std::vector src_neighbors; for (auto kv : src_weights) - src_neighbors.push_back(kv.first); + src_neighbors.push_back(kv.first); std::sort(src_neighbors.begin(), src_neighbors.end()); std::vector dst_neighbors; for (auto kv : dst_weights) - dst_neighbors.push_back(kv.first); + dst_neighbors.push_back(kv.first); std::sort(dst_neighbors.begin(), dst_neighbors.end()); + std::vector dst_weights_vec; + for (int rank : dst_neighbors) + dst_weights_vec.push_back(dst_weights.at(rank)); auto bf_src_neighbors = std::make_shared>(src_neighbors); auto bf_dst_neighbors = std::make_shared>(dst_neighbors); + auto bf_dst_weights_vec = std::make_shared>(dst_weights_vec); auto ready_event = RecordReadyEvent(device); if (OPS_ON_CPU && tensor.device().is_cuda()) { ::torch::Tensor cpu_buffer = @@ -448,9 +452,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, auto bf_output = std::make_shared(cpu_output); auto enqueue_result = EnqueueTensorNeighborAllreduce( - bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors, - bf_dst_neighbors, dynamic_neighbors_enabled, is_hierarchical, - enable_topo_check, op_name, CPU_DEVICE_ID, + bf_tensor, bf_output, bf_context, ready_event, + bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec, + dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, CPU_DEVICE_ID, callback_wrapper([self_weight, src_weights, avg_computation, cpu_output, tensor, src_neighbors, dynamic_neighbors_enabled, is_hierarchical, output, @@ -471,9 +475,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, auto bf_output = std::make_shared(output); auto enqueue_result = EnqueueTensorNeighborAllreduce( - bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors, - bf_dst_neighbors, dynamic_neighbors_enabled, is_hierarchical, - enable_topo_check, op_name, device, + bf_tensor, bf_output, bf_context, ready_event, + bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec, + dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, device, callback_wrapper([self_weight, src_weights, avg_computation, src_neighbors, dynamic_neighbors_enabled, is_hierarchical, tensor, output]() mutable { From 79c57b36310ae36c5f5ee030521fb46ef7a628b5 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Fri, 26 Feb 2021 18:38:09 -0800 Subject: [PATCH 12/36] First Version of neighbor_allreduce dst_weight, existing problem: Fusion Not Implemented, CUDA data_weight problem --- bluefog/common/common.h | 7 ++- bluefog/common/mpi_controller.cc | 101 ++++++++++++++++++++++-------- bluefog/common/mpi_controller.h | 6 +- bluefog/common/nccl_controller.cc | 58 +++++++++++++---- bluefog/common/operations.cc | 5 +- bluefog/common/operations.h | 3 +- bluefog/tensorflow/adapter.cc | 2 +- bluefog/tensorflow/adapter.h | 2 +- bluefog/torch/adapter.cc | 6 +- bluefog/torch/adapter.h | 2 +- bluefog/torch/mpi_ops.cc | 15 +++-- bluefog/torch/mpi_ops.h | 5 +- bluefog/torch/mpi_ops.py | 7 ++- test/torch_ops_test.py | 93 +++++++++++++++++++++++++++ 14 files changed, 251 insertions(+), 61 deletions(-) diff --git a/bluefog/common/common.h b/bluefog/common/common.h index a27dde68..f9e1413f 100644 --- a/bluefog/common/common.h +++ b/bluefog/common/common.h @@ -211,7 +211,7 @@ class Tensor { virtual const DataType dtype() const = 0; virtual const TensorShape shape() const = 0; virtual const void* data() const = 0; - virtual std::shared_ptr data_weight(float weight) = 0; + virtual std::unique_ptr data_weight(float weight) = 0; virtual int64_t size() const = 0; virtual ~Tensor() = default; }; @@ -278,11 +278,14 @@ struct TensorTableEntry { // Neighbors for dynamic neighbor_allreduce. std::shared_ptr> send_neighbors; std::shared_ptr> recv_neighbors; - std::shared_ptr> send_weights; + std::shared_ptr> send_weights; // Boolean value if dynamic neighbor is enabled. bool dynamic_neighbors_enabled = false; + // Boolean value for enabling destination(send) weighting operation or not. + bool dst_weighting_enabled = false; + // Boolean value for enabling topology check. bool enable_topo_check = false; diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index 26093260..788ecdd7 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -294,10 +294,8 @@ void MPIController::NeighborAllgather(TensorTableEntry& entry) { // Function to check if the sending and receiving neighbors match in the topology. bool CheckNeighborSendRecvPattern( - std::shared_ptr> send_neighbors, - std::shared_ptr> recv_neighbors, - const std::string& tensor_name, - int size, Timeline* timeline_ptr, const MPI_Comm& comm) { + const std::vector* send_neighbors, const std::vector* recv_neighbors, + const std::string& tensor_name, int size, Timeline* timeline_ptr, const MPI_Comm& comm) { bool res = false; timeline_ptr->ActivityStart(tensor_name, "NEGOTIATION"); // Put all the send and recv neighbors in a single vector, and obtain a send @@ -386,7 +384,7 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { << "allreduce ops but it is not supported yet."; } is_topo_check_fail = CheckNeighborSendRecvPattern( - entry.send_neighbors, entry.recv_neighbors, entry.tensor_name, + entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name, mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); } @@ -437,18 +435,41 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { "output for details."); } } - for (int i = 0; i < nsend; ++i) { - int ret_code = MPI_Isend( - sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), - entry.send_neighbors->at(i), - mpi_ctx_.rank_ + entry.send_neighbors->at(i), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI " - "output for details."); + + std::vector> weighted_tensors; + // Ensure the lifecycle of the weighted tensors are alive after communication. + + if (entry.dst_weighting_enabled) { + for (int i = 0; i < nsend; ++i) { + int ret_code; + auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); + weighted_tensors.push_back(std::move(weighted_tensor_ptr)); + ret_code = MPI_Isend( + weighted_tensors[i].get()->data(), num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), + entry.send_neighbors->at(i), + mpi_ctx_.rank_ + entry.send_neighbors->at(i), + mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); + if (ret_code != MPI_SUCCESS) { + throw std::runtime_error( + "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI " + "output for details."); + } + } + } else { + for (int i = 0; i < nsend; ++i) { + int ret_code = MPI_Isend( + sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), + entry.send_neighbors->at(i), + mpi_ctx_.rank_ + entry.send_neighbors->at(i), + mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); + if (ret_code != MPI_SUCCESS) { + throw std::runtime_error( + "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI " + "output for details."); + } } } + MPI_Waitall(nsend + nrecv, requests.data(), statuses.data()); error_message = GenerateNeighborAllreduceErrorMessage(statuses, nsend, nrecv); @@ -592,7 +613,7 @@ void MPIController::NeighborAllreduce(std::vector& entries) { << "allreduce ops but it is not supported yet."; } is_topo_check_fail = CheckNeighborSendRecvPattern( - first_entry.send_neighbors, first_entry.recv_neighbors, first_entry.tensor_name, + first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name, mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); } @@ -610,6 +631,18 @@ void MPIController::NeighborAllreduce(std::vector& entries) { timeline_ptr->ActivityEndAll(entries); const void* fused_input_data = buffer_data; + const void* weighted_fused_input_data = nullptr; + if (first_entry.dst_weighting_enabled) { + // Generate weighted data fusion for sending + BFLOG(INFO) << "Should not reach here"; + throw std::runtime_error("Not Implemented for dst_weight fusion"); + if (first_entry.device == CPU_DEVICE_ID) { + + } else { + + } + } + // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer // is like [t_1, t_2 | t_1_n1, t_2_n1, t_1_n2, t_2_n2]. // Here t_1 and t_2 means self tensor 1 and 2 and _n1 and _n2 means the @@ -655,16 +688,32 @@ void MPIController::NeighborAllreduce(std::vector& entries) { "for details."); } } - for (int i = 0; i < nsend; ++i) { - int ret_code = MPI_Isend( - fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor), - first_entry.send_neighbors->at(i), - /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output " - "for details."); + if (!first_entry.dst_weighting_enabled) { + for (int i = 0; i < nsend; ++i) { + int ret_code = MPI_Isend( + fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor), + first_entry.send_neighbors->at(i), + /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i), + mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); + if (ret_code != MPI_SUCCESS) { + throw std::runtime_error( + "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output " + "for details."); + } + } + } else { + for (int i = 0; i < nsend; ++i) { + void* sendbuf = + (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size); + int ret_code = MPI_Isend(sendbuf, num_elements, + mpi_ctx_.GetMPIDataType(first_entry.tensor), first_entry.send_neighbors->at(i), + /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i), + mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); + if (ret_code != MPI_SUCCESS) { + throw std::runtime_error( + "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output " + "for details."); + } } } MPI_Waitall(nsend + nrecv, requests.data(), statuses.data()); diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h index 8aacb071..2a850c01 100644 --- a/bluefog/common/mpi_controller.h +++ b/bluefog/common/mpi_controller.h @@ -28,10 +28,8 @@ namespace common { // Function to check if the sending and receiving neighbors match in the // topology. bool CheckNeighborSendRecvPattern( - std::shared_ptr> send_neighbors, - std::shared_ptr> recv_neighbors, - const std::string& tensor_name, - int size, Timeline* timeline_ptr, const MPI_Comm& comm); + const std::vector* send_neighbors, const std::vector* recv_neighbors, + const std::string& tensor_name, int size, Timeline* timeline_ptr, const MPI_Comm& comm); class MPIController { public: diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index e47f4857..921a8d5b 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -669,7 +669,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { << "allreduce ops but it is not supported yet."; } is_topo_check_fail = CheckNeighborSendRecvPattern( - entry.send_neighbors, entry.recv_neighbors, entry.tensor_name, + entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name, mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); } if (is_topo_check_fail) { @@ -679,6 +679,9 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { return; } + std::vector> weighted_tensors; + // Ensure the lifecycle of the weighted tensors are alive after communication. + #if NCCL_MINOR > 6 if (!entry.is_hierarchical) { ncclGroupStart(); @@ -702,9 +705,20 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { NCCLCHECK(ncclRecv(recvbuf, num_elements, GetNCCLDataType(entry.tensor), recv_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } - for (int send_rank : *entry.send_neighbors) { - NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(entry.tensor), - send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); + if(entry.dst_weighting_enabled) + { + for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { + auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); + weighted_tensors.push_back(std::move(weighted_tensor_ptr)); + NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements, + GetNCCLDataType(entry.tensor), entry.send_neighbors->at(i), + nccl_ctx_.nccl_comm, nccl_ctx_.stream)); + } + } else { + for (int send_rank : *entry.send_neighbors) { + NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(entry.tensor), + send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); + } } } ncclGroupEnd(); @@ -754,7 +768,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { auto tid = std::this_thread::get_id(); nccl_ctx_.finalizer_thread_pool.execute( - [this, entry, event_queue, tid]() mutable { + [this, entry, event_queue, tid, buffer_data, weighted_tensors]() mutable { with_device device_guard(entry.device); WaitForEvents(event_queue, {entry}, this->timeline_ptr_, tid); @@ -923,7 +937,7 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { << "allreduce ops but it is not supported yet."; } is_topo_check_fail = CheckNeighborSendRecvPattern( - first_entry.send_neighbors, first_entry.recv_neighbors, first_entry.tensor_name, + first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name, mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); } if (is_topo_check_fail) { @@ -943,6 +957,16 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { RecordEvent(event_queue, "MEM_CPY_IN"); } + const void* weighted_fused_input_data = nullptr; + if (first_entry.dst_weighting_enabled) { + throw std::runtime_error("Not Implemented for dst_weight fusion"); + if (first_entry.device == CPU_DEVICE_ID) { + throw std::runtime_error("NCCL Neighbor Allreduce failed: perform operation on CPU data."); + } else { + + } + } + // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer // is like [t_1, t_2 | t_1_n1, t_2_n1, t_1_n2, t_2_n2]. // Here t_1 and t_2 means self tensor 1 and 2 and _n1 and _n2 means the @@ -975,10 +999,22 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { GetNCCLDataType(first_entry.tensor), recv_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } - for (int send_rank : *first_entry.send_neighbors) { - NCCLCHECK(ncclSend(fused_input_data, num_elements, - GetNCCLDataType(first_entry.tensor), send_rank, - nccl_ctx_.nccl_comm, nccl_ctx_.stream)); + if (!first_entry.dst_weighting_enabled) + { + for (int send_rank : *first_entry.send_neighbors) { + NCCLCHECK(ncclSend(fused_input_data, num_elements, + GetNCCLDataType(first_entry.tensor), send_rank, + nccl_ctx_.nccl_comm, nccl_ctx_.stream)); + } + } else { + size_t i = 0; + for (int send_rank : *first_entry.send_neighbors) { + void* sendbuf = + (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size); + NCCLCHECK(ncclSend(sendbuf, num_elements, + GetNCCLDataType(first_entry.tensor), send_rank, + nccl_ctx_.nccl_comm, nccl_ctx_.stream)); + } } } ncclGroupEnd(); @@ -1053,7 +1089,7 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { auto tid = std::this_thread::get_id(); nccl_ctx_.finalizer_thread_pool.execute( - [this, entries, event_queue, tid, buffer_data]() mutable { + [this, entries, event_queue, tid, buffer_data, weighted_fused_input_data]() mutable { auto& first_entry = entries[0]; with_device device_guard(first_entry.device); WaitForEvents(event_queue, entries, this->timeline_ptr_, tid); diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc index 312dc8d6..efec9ff8 100644 --- a/bluefog/common/operations.cc +++ b/bluefog/common/operations.cc @@ -984,6 +984,7 @@ void NegotiateOfRequestOfMaster(BluefogGlobalState& state, response.devices() == new_response.devices() && entry.tensor->dtype() == new_entry.tensor->dtype() && entry.dynamic_neighbors_enabled == new_entry.dynamic_neighbors_enabled && + entry.dst_weighting_enabled == new_entry.dst_weighting_enabled && entry.is_hierarchical == new_entry.is_hierarchical && IsSameList(entry.send_neighbors, new_entry.send_neighbors) && IsSameList(entry.send_weights, new_entry.send_weights) && @@ -1544,8 +1545,9 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr tensor, std::shared_ptr ready_event, std::shared_ptr> recv_neighbors, std::shared_ptr> send_neighbors, - std::shared_ptr> send_weights, + std::shared_ptr> send_weights, bool dynamic_neighbors_enabled, + bool dst_weighting_enabled, bool is_hierarchical, bool enable_topo_check, const std::string& name, const int device, @@ -1571,6 +1573,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr tensor, e.send_neighbors = send_neighbors; e.send_weights = send_weights; e.dynamic_neighbors_enabled = dynamic_neighbors_enabled; + e.dst_weighting_enabled = dst_weighting_enabled; e.is_hierarchical = is_hierarchical; e.enable_topo_check = enable_topo_check; e.device = device; diff --git a/bluefog/common/operations.h b/bluefog/common/operations.h index 1d2a53d2..d94a4e26 100644 --- a/bluefog/common/operations.h +++ b/bluefog/common/operations.h @@ -144,8 +144,9 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr tensor, std::shared_ptr ready_event, std::shared_ptr> recv_neighbors, std::shared_ptr> send_neighbors, - std::shared_ptr> send_weights, + std::shared_ptr> send_weights, bool dynamic_neighbors_enabled, + bool dst_weighting_enabled, bool is_hierarchical, bool enable_topo_check, const std::string& name, const int device, diff --git a/bluefog/tensorflow/adapter.cc b/bluefog/tensorflow/adapter.cc index f7864cbc..313a48ed 100644 --- a/bluefog/tensorflow/adapter.cc +++ b/bluefog/tensorflow/adapter.cc @@ -128,7 +128,7 @@ const void* TFTensor::data() const { return (const void*)tensor_.tensor_data().data(); } -std::shared_ptr TFTensor::data_weight(float weight) { +std::unique_ptr TFTensor::data_weight(float weight) { throw std::runtime_error("Tensorflow with weight is not implemented yet."); }; diff --git a/bluefog/tensorflow/adapter.h b/bluefog/tensorflow/adapter.h index c61c88ad..02e74ddc 100644 --- a/bluefog/tensorflow/adapter.h +++ b/bluefog/tensorflow/adapter.h @@ -50,7 +50,7 @@ class TFTensor : public common::Tensor { virtual const common::DataType dtype() const override; virtual const common::TensorShape shape() const override; virtual const void* data() const override; - virtual std::shared_ptr data_weight(float weight) override; + virtual std::unique_ptr data_weight(float weight) override; virtual int64_t size() const override; protected: diff --git a/bluefog/torch/adapter.cc b/bluefog/torch/adapter.cc index 3cb38aab..aff49a90 100644 --- a/bluefog/torch/adapter.cc +++ b/bluefog/torch/adapter.cc @@ -76,15 +76,15 @@ const common::TensorShape TorchTensor::shape() const { const void* TorchTensor::data() const { return tensor_.data_ptr(); } -std::shared_ptr TorchTensor::data_weight(float weight) { +std::unique_ptr TorchTensor::data_weight(float weight) { if (weight == 1.0) { - return std::make_shared(tensor_); + return std::make_unique(tensor_); } else { int device = tensor_.device().is_cuda() ? tensor_.device().index() : CPU_DEVICE_ID; with_device device_context(device); // Note we call mul instead of mul_ - return std::make_shared(tensor_.mul(weight)); + return std::make_unique(tensor_.mul(weight)); } } diff --git a/bluefog/torch/adapter.h b/bluefog/torch/adapter.h index 4e192ed2..eb6526f2 100644 --- a/bluefog/torch/adapter.h +++ b/bluefog/torch/adapter.h @@ -35,7 +35,7 @@ class TorchTensor : public common::Tensor { virtual const common::DataType dtype() const override; virtual const common::TensorShape shape() const override; virtual const void* data() const override; - virtual std::shared_ptr data_weight(float weight) override; + virtual std::unique_ptr data_weight(float weight) override; virtual int64_t size() const override; // TODO(ybc) Figure out a better encapsulated way to do it. diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc index a5527d57..56168d0f 100644 --- a/bluefog/torch/mpi_ops.cc +++ b/bluefog/torch/mpi_ops.cc @@ -409,8 +409,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, double self_weight, const std::unordered_map& src_weights, const std::unordered_map& dst_weights, - bool dynamic_neighbors_enabled, bool enable_topo_check, - bool avg_computation, bool is_hierarchical, const std::string& name) { + bool dynamic_neighbors_enabled, bool dst_weighting_enabled, + bool enable_topo_check, bool avg_computation, bool is_hierarchical, + const std::string& name) { ThrowIfError(common::CheckInitialized()); auto handle = handle_manager.AllocateHandle(); @@ -434,13 +435,13 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, for (auto kv : dst_weights) dst_neighbors.push_back(kv.first); std::sort(dst_neighbors.begin(), dst_neighbors.end()); - std::vector dst_weights_vec; + std::vector dst_weights_vec; for (int rank : dst_neighbors) dst_weights_vec.push_back(dst_weights.at(rank)); auto bf_src_neighbors = std::make_shared>(src_neighbors); auto bf_dst_neighbors = std::make_shared>(dst_neighbors); - auto bf_dst_weights_vec = std::make_shared>(dst_weights_vec); + auto bf_dst_weights_vec = std::make_shared>(dst_weights_vec); auto ready_event = RecordReadyEvent(device); if (OPS_ON_CPU && tensor.device().is_cuda()) { ::torch::Tensor cpu_buffer = @@ -454,7 +455,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, auto enqueue_result = EnqueueTensorNeighborAllreduce( bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec, - dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, CPU_DEVICE_ID, + dynamic_neighbors_enabled, dst_weighting_enabled, is_hierarchical, + enable_topo_check, op_name, CPU_DEVICE_ID, callback_wrapper([self_weight, src_weights, avg_computation, cpu_output, tensor, src_neighbors, dynamic_neighbors_enabled, is_hierarchical, output, @@ -477,7 +479,8 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, auto enqueue_result = EnqueueTensorNeighborAllreduce( bf_tensor, bf_output, bf_context, ready_event, bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec, - dynamic_neighbors_enabled, is_hierarchical, enable_topo_check, op_name, device, + dynamic_neighbors_enabled, dst_weighting_enabled, + is_hierarchical, enable_topo_check, op_name, device, callback_wrapper([self_weight, src_weights, avg_computation, src_neighbors, dynamic_neighbors_enabled, is_hierarchical, tensor, output]() mutable { diff --git a/bluefog/torch/mpi_ops.h b/bluefog/torch/mpi_ops.h index 66ecb38c..605afa41 100644 --- a/bluefog/torch/mpi_ops.h +++ b/bluefog/torch/mpi_ops.h @@ -116,8 +116,9 @@ NEIGHBOR_ALLGATHER_H(torch_cuda_DoubleTensor, THCudaDoubleTensor) THTensor* tensor, THTensor* output, double self_weight, \ const std::unordered_map& src_weights, \ const std::unordered_map& dst_weights, \ - bool dynamic_neighbors_enabled, bool enable_topo_check, \ - bool avg_computation, bool is_hierarchical, char* name); + bool dynamic_neighbors_enabled, bool dst_weighting_enabled, \ + bool enable_topo_check, bool avg_computation, bool is_hierarchical, \ + char* name); NEIGHBOR_ALLREDUCE_H(torch_HalfTensor, THHalfTensor) NEIGHBOR_ALLREDUCE_H(torch_FloatTensor, THFloatTensor) diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py index 3720dd6a..f34270d4 100644 --- a/bluefog/torch/mpi_ops.py +++ b/bluefog/torch/mpi_ops.py @@ -436,6 +436,7 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, if dst_weights is None: dst_weights = {} dynamic_neighbors_enabled = False + dst_weighting_enabled = False elif len(set(dst_weights)) != len(dst_weights): raise ValueError("Argument dst_weights should only contain the unique ranks.") elif self_weight is None or src_weights is None: @@ -443,8 +444,10 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, "enabling dynamic topology.") else: dynamic_neighbors_enabled = True + dst_weighting_enabled = True if isinstance(dst_weights, list): dst_weights = {dst:1.0 for dst in dst_weights} + dst_weighting_enabled = False if self_weight is None and src_weights is None: # Implying this is static graph. if is_topo_weighted(): @@ -479,8 +482,8 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, raise ValueError("Arguments self_weight and neighbor_weights have to be presented at " "the same time") is_hierarchical = False - handle = getattr(mpi_lib, function)(tensor, output, self_weight, src_weights, - dst_weights, dynamic_neighbors_enabled, + handle = getattr(mpi_lib, function)(tensor, output, self_weight, src_weights, dst_weights, + dynamic_neighbors_enabled, dst_weighting_enabled, enable_topo_check, weighted_average_computation, is_hierarchical, name.encode() if name is not None else "") _handle_map[handle] = (tensor, output) diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py index f3ec3240..e76b8770 100644 --- a/test/torch_ops_test.py +++ b/test/torch_ops_test.py @@ -830,6 +830,46 @@ def test_neighbor_allreduce_sum(self): reduced_tensor.data.max() == sum_value ), "bf.neighbor_allreduce (sum) produces incorrect reduced tensor" + def test_neighbor_allreduce_dst_weight(self): + """Test that the neighbor allreduce with destination weights works correctly.""" + size = bf.size() + rank = bf.rank() + if size <= 1: + fname = inspect.currentframe().f_code.co_name + warnings.warn("Skip {} due to size 1".format(fname)) + return + + #dtypes = [torch.FloatTensor, torch.DoubleTensor] + dtypes = [] + if TEST_ON_GPU: + dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] + + # By default, we use exponential two ring topology. + num_indegree = int(np.ceil(np.log2(size))) + neighbor_ranks = [(rank - 2**i) % size for i in range(num_indegree)] + dst_weight = 0.5 + expected_value = (np.sum(neighbor_ranks) + rank)*dst_weight + + self_weight = 0.5 + src_weights = {i: 1.0 for i in neighbor_ranks} + dst_weights = {(rank + 2**i) % size : dst_weight for i in range(num_indegree)} + + dims = [1,2,3] + for dtype, dim in itertools.product(dtypes, dims): + tensor = torch.FloatTensor(*([23] * dim)).fill_(1).mul_(rank) + tensor = self.cast_and_place(tensor, dtype) + name = "neighbor_allreduce_{}_{}".format(dim, dtype) + reduced_tensor = bf.neighbor_allreduce( + tensor, name=name, self_weight=self_weight, + src_weights=src_weights, dst_weights=dst_weights) + eps = EPSILON if tensor.dtype != torch.float16 else LOOSE_EPSILON + assert ( + list(reduced_tensor.shape) == [23] * dim + ), "bf.neighbor_allreduce (avg) produces incorrect reduced shape" + assert ( + (reduced_tensor - expected_value).abs().max() < eps + ), "bf.neighbor_allreduce (avg) produces incorrect reduced tensor" + def test_neighbor_allreduce_weighted_avg(self): """Test that the neighbor all reduce (avg) 1D, 2D, 3D tensors correctly.""" size = bf.size() @@ -1020,6 +1060,59 @@ def test_neighbor_allreduce_dynamic_topo_fusion(self): sum_value).abs().max() < eps ), "bf.neighbor_allreduce_2 (fusion) produces incorrect reduced tensor" + def test_neighbor_allreduce_dst_weight_fusion(self): + """Test neighbor allreduce works with destination weights under tensor fusion.""" + size = bf.size() + rank = bf.rank() + K = 50 # number of tensors send in short time + if size <= 1: + fname = inspect.currentframe().f_code.co_name + warnings.warn("Skip {} due to size 1".format(fname)) + return + + dtypes = [torch.FloatTensor, torch.DoubleTensor] + if TEST_ON_GPU: + dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] + + # By default, we use exponential two ring topology. + num_indegree = int(np.ceil(np.log2(size))) + neighbor_ranks = [(rank - 2**i) % size for i in range(num_indegree)] + sum_value = (np.sum(neighbor_ranks)+rank)*K + + dst_weight = 0.5 + self_weight = dst_weight + src_weights = {i: 1.0 for i in neighbor_ranks} + dst_weights = {(rank + 2**i) % size: dst_weight for i in range(num_indegree)} + + dims = [1, 2, 3] + for dtype, dim in itertools.product(dtypes, dims): + tensor_list, handles, names = [], [], [] + for i in range(K): + tensor = torch.FloatTensor(*([23] * dim)).fill_(i + rank*K) + tensor = self.cast_and_place(tensor, dtype) + tensor_list.append(tensor) + names.append("index{}_{}_{}".format(i, dtype, dim)) + + for i in range(K): + handle = bf.neighbor_allreduce_nonblocking( + tensor_list[i], name=names[i], + self_weight=self_weight, src_weights=src_weights, dst_weights=dst_weights) + handles.append(handle) + + outputs = [] + for i in range(K): + output = bf.synchronize(handles[i]) + outputs.append(output) + + for i in range(K): + assert ( + list(outputs[i].shape) == [23] * dim + ), f"{names[i]} (fusion) produces incorrect reduced shape" + output_normalized = outputs[i]/dst_weight - i*(num_indegree+1) + assert ( + (output_normalized - sum_value).abs().max() < EPSILON + ), f"{names[i]} (fusion) produces incorrect reduced tensor" + def test_neighbor_allgather(self): """Test that the neighbor all gather 1D, 2D, 3D tensors correctly.""" size = bf.size() From 29a239c758396e32eb713ad49df6ee91e6ef649a Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Fri, 26 Feb 2021 18:45:31 -0800 Subject: [PATCH 13/36] Add some delay after data_weight as a temporary solution --- bluefog/common/mpi_controller.cc | 2 ++ test/torch_ops_test.py | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index 788ecdd7..eb2ffe52 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -444,6 +444,8 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { int ret_code; auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); weighted_tensors.push_back(std::move(weighted_tensor_ptr)); + //TODO(hhb): Temporary Solution for Ready Event Issue with data_weight + std::this_thread::sleep_for(std::chrono::microseconds(100)); ret_code = MPI_Isend( weighted_tensors[i].get()->data(), num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i), diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py index e76b8770..c8f3d3ba 100644 --- a/test/torch_ops_test.py +++ b/test/torch_ops_test.py @@ -839,8 +839,7 @@ def test_neighbor_allreduce_dst_weight(self): warnings.warn("Skip {} due to size 1".format(fname)) return - #dtypes = [torch.FloatTensor, torch.DoubleTensor] - dtypes = [] + dtypes = [torch.FloatTensor, torch.DoubleTensor] if TEST_ON_GPU: dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] From 97517b3c0bbbbe383c84a242655ffcd566de0154 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Sat, 27 Feb 2021 00:12:57 -0800 Subject: [PATCH 14/36] CPU Fusion for dst_weighted added --- bluefog/common/mpi_controller.cc | 85 ++++++++++++++++++++++++++++--- bluefog/common/mpi_controller.h | 4 ++ bluefog/common/nccl_controller.cc | 37 +++++++++++--- bluefog/common/nccl_controller.h | 4 ++ bluefog/common/operations.cc | 12 ++++- bluefog/common/tensor_queue.cc | 29 +++++++++++ bluefog/common/tensor_queue.h | 23 +++++++++ test/torch_ops_test.py | 6 ++- 8 files changed, 185 insertions(+), 15 deletions(-) diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index eb2ffe52..05850e15 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -32,6 +32,33 @@ namespace bluefog { namespace common { +namespace { + +template +void ScaleBufferCPUImpl(T* buffer, int64_t num_elements, TS scale_factor) { + for (int64_t i = 0; i < num_elements; ++i) { + buffer[i] = buffer[i] * scale_factor; + } +} + +void ScaleCPUBuffer(double scale_factor, void* weighted_fused_input_data, + int64_t num_elements, DataType dtype) { + switch (dtype) { + // TODO(hhb): FLOAT16 support + case DataType::BLUEFOG_FLOAT32: + ScaleBufferCPUImpl((float*) weighted_fused_input_data, num_elements, (float) scale_factor); + break; + case DataType::BLUEFOG_FLOAT64: + ScaleBufferCPUImpl((double*) weighted_fused_input_data, num_elements, scale_factor); + break; + default: + throw std::logic_error("Type " + DataType_Name(dtype) + + " not supported by ScaleBufferCPUImpl."); + } +} + +} + // It may be because the win_create is called at different // threads from the win_put, win_get, etc. After moving win_create into // communicaiton thread, it resolved. (works in Openmpi=4.0.2 and MPICH). @@ -636,13 +663,27 @@ void MPIController::NeighborAllreduce(std::vector& entries) { const void* weighted_fused_input_data = nullptr; if (first_entry.dst_weighting_enabled) { // Generate weighted data fusion for sending - BFLOG(INFO) << "Should not reach here"; - throw std::runtime_error("Not Implemented for dst_weight fusion"); - if (first_entry.device == CPU_DEVICE_ID) { - - } else { - + timeline_ptr->ActivityStartAll(entries, "MEMCPY_IN_WEIGHT_FUSION_BUFFER"); + void* weight_buffer_data; + MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(), + fused_input_data, num_elements, element_size, + first_entry.context, first_entry.device); + int64_t offset = 0; + for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) { + double dst_weight = first_entry.send_weights->at(i); + void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset; + if (first_entry.device == CPU_DEVICE_ID) { + ScaleCPUBuffer(dst_weight, weight_buffer_data_offset, num_elements, + first_entry.tensor->dtype()); + } else { +#if HAVE_CUDA + // CUDA Weighting +#endif + } + offset += num_elements * element_size; } + weighted_fused_input_data = weight_buffer_data; + timeline_ptr->ActivityEndAll(entries); } // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer @@ -1452,6 +1493,38 @@ Status MPIController::GetWindowVersionValue(const std::string& name, return Status::OK(); } +void MPIController::MemcpyInWeightFusionBuffer( + void*& weight_buffer_data, size_t num_dst, + const void* buffer_data, int64_t num_elements, int element_size, + std::shared_ptr context, int device) { + // Access the fusion buffer. + FusionBufferManager* buffer_manager; + auto fusion_status = GetBluefogFusionBuffer(buffer_manager); + if (!fusion_status.ok()){ + throw std::runtime_error(fusion_status.reason()); + } + std::shared_ptr buffer = + buffer_manager->GetWeightBuffer(device); + weight_buffer_data = const_cast(buffer->AccessData(context)); + size_t data_size = num_elements * element_size; + + int64_t offset = 0; + for (size_t i = 0; i < num_dst; ++i) { + void* weight_buffer_data_at_offset = (uint8_t*)weight_buffer_data + offset; +#if HAVE_CUDA + if (device != CPU_DEVICE_ID) { + CUDACHECK(cudaMemcpy(weight_buffer_data_at_offset, buffer_data, data_size, + cudaMemcpyDeviceToDevice)); + } else { +#endif + std::memcpy(weight_buffer_data_at_offset, buffer_data, data_size); +#if HAVE_CUDA + } +#endif + offset += data_size; + } +} + void MPIController::MemcpyInFusionBuffer( const std::vector& entries, void*& buffer_data, size_t& buffer_len) { diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h index 2a850c01..3f5af419 100644 --- a/bluefog/common/mpi_controller.h +++ b/bluefog/common/mpi_controller.h @@ -121,6 +121,10 @@ class MPIController { const int num_recv_neighbors, const int64_t fused_data_size); + void MemcpyInWeightFusionBuffer(void*& weight_buffer_data, size_t num_dst, + const void* buffer_data, int64_t num_elements, int element_size, + std::shared_ptr context, int device); + void MemcpyOutFusionBufferForInputs(const void* fused_input_data, std::vector& entries); diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index 921a8d5b..2c68c3e9 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -708,6 +708,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { if(entry.dst_weighting_enabled) { for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { + // Add ready event auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); weighted_tensors.push_back(std::move(weighted_tensor_ptr)); NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements, @@ -959,12 +960,12 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { const void* weighted_fused_input_data = nullptr; if (first_entry.dst_weighting_enabled) { - throw std::runtime_error("Not Implemented for dst_weight fusion"); - if (first_entry.device == CPU_DEVICE_ID) { - throw std::runtime_error("NCCL Neighbor Allreduce failed: perform operation on CPU data."); - } else { - - } + void* weight_buffer_data = nullptr; + MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(), + fused_input_data, num_elements, element_size, + first_entry.context, first_entry.device); + //TODO(hhb): add weighting implementation. + weighted_fused_input_data = weight_buffer_data; } // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer @@ -1839,6 +1840,30 @@ void NCCLController::MemcpyInFusionBuffer( buffer_len = (size_t)offset; } +void NCCLController::MemcpyInWeightFusionBuffer( + void*& weight_buffer_data, size_t num_dst, + const void* buffer_data, int64_t num_elements, int element_size, + std::shared_ptr context, int device) { + // Access the fusion buffer. + FusionBufferManager* buffer_manager; + auto fusion_status = GetBluefogFusionBuffer(buffer_manager); + if (!fusion_status.ok()){ + throw std::runtime_error(fusion_status.reason()); + } + std::shared_ptr buffer = + buffer_manager->GetWeightBuffer(device); + weight_buffer_data = const_cast(buffer->AccessData(context)); + size_t data_size = num_elements * element_size; + + int64_t offset = 0; + for (size_t i = 0; i < num_dst; ++i) { + void* weight_buffer_data_at_offset = (uint8_t*)weight_buffer_data + offset; + CUDACHECK(cudaMemcpy(weight_buffer_data_at_offset, buffer_data, data_size, + cudaMemcpyDeviceToDevice)); + offset += data_size; + } +} + void NCCLController::MemcpyOutFusionBuffer( const void* buffer_data, std::vector& entries) { int64_t offset = 0; diff --git a/bluefog/common/nccl_controller.h b/bluefog/common/nccl_controller.h index 3835a3d4..71380b08 100644 --- a/bluefog/common/nccl_controller.h +++ b/bluefog/common/nccl_controller.h @@ -188,6 +188,10 @@ class NCCLController { const int num_recv_neighbors, const int64_t fused_data_size); + void MemcpyInWeightFusionBuffer(void*& weight_buffer_data, size_t num_dst, + const void* buffer_data, int64_t num_elements, int element_size, + std::shared_ptr context, int device); + void MemcpyOutFusionBufferForInputs(const void* fused_input_data, std::vector& entries); diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc index efec9ff8..a165f32a 100644 --- a/bluefog/common/operations.cc +++ b/bluefog/common/operations.cc @@ -779,7 +779,17 @@ void PerformOperationWithFusion(std::vector& entries) { first_entry.context, [&]() { timeline.ActivityStartAll(entries, "INIT_FUSION_BUFFER"); }, [&]() { timeline.ActivityEndAll(entries); }); - if (!status.ok()) { + + Status status_dst_weight = Status::OK(); + if (first_entry.dst_weighting_enabled) { + status_dst_weight = bluefog_global.fusion_buffer.InitializeWeightBuffer( + bluefog_global.tensor_fusion_threshold, mpi_context.size_, + first_entry.device, first_entry.context, + [&]() { timeline.ActivityStartAll(entries, "INIT_WEIGHT_FUSION_BUFFER"); }, + [&]() { timeline.ActivityEndAll(entries); }); + } + + if (!status.ok() || !status_dst_weight.ok()) { for (auto& e : entries) { e.callback(status); } diff --git a/bluefog/common/tensor_queue.cc b/bluefog/common/tensor_queue.cc index 513459f1..b0f67241 100644 --- a/bluefog/common/tensor_queue.cc +++ b/bluefog/common/tensor_queue.cc @@ -154,5 +154,34 @@ std::shared_ptr FusionBufferManager::GetBuffer(int device) { return tensor_fusion_buffers_[device].first; } +Status FusionBufferManager::InitializeWeightBuffer( + int64_t threshold, int mpi_size, int device, std::shared_ptr context, + std::function on_start_init, std::function on_end_init) { + auto& elem = weight_tensor_fusion_buffers_[device]; + auto& buffer = elem.first; + int64_t& size = elem.second; + if (size != threshold*mpi_size) { + buffer.reset(); + size = 0; + } + + if (buffer == nullptr) { + on_start_init(); + size = threshold*mpi_size; + + // Lazily allocate persistent buffer for Tensor Fusion and keep it + // forever per device. + Status status = context->AllocatePersistent(size, &buffer); + on_end_init(); + + return status; + } + + return Status::OK(); +} + +std::shared_ptr FusionBufferManager::GetWeightBuffer(int device) { + return weight_tensor_fusion_buffers_[device].first; +} } // namespace common } // namespace bluefog \ No newline at end of file diff --git a/bluefog/common/tensor_queue.h b/bluefog/common/tensor_queue.h index 50740164..2c54c88f 100644 --- a/bluefog/common/tensor_queue.h +++ b/bluefog/common/tensor_queue.h @@ -82,13 +82,36 @@ class FusionBufferManager { std::function on_start_init, std::function on_end_init); + // Initializes a buffer of the given threshold size times MPI size if not already cached. + // + // Args: + // threshold: Size of the buffer in bytes. + // mpi_size: Size of MPI nodes. + // device: Device ID to associate the buffer. + // context: Framework used to create the buffer and associate it. + // on_start_init: Callback on starting buffer initialization. + // on_end_init: Callback on completing buffer initialization. + Status InitializeWeightBuffer(int64_t threshold, + int mpi_size, + int device, + std::shared_ptr context, + std::function on_start_init, + std::function on_end_init); + // Returns the buffer associated with the given device and framework, or null. std::shared_ptr GetBuffer(int device); + // Returns the weight buffer associated with the given device and framework, or null. + std::shared_ptr GetWeightBuffer(int device); + private: // Memory buffers for Tensor Fusion. They are keyed by device ID. std::unordered_map, int64_t>> tensor_fusion_buffers_; + + // Memory buffers for Tensor Fusion with dst weight. They are keyed by device ID. + std::unordered_map, int64_t>> + weight_tensor_fusion_buffers_; }; } // namespace common diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py index c8f3d3ba..71d49d63 100644 --- a/test/torch_ops_test.py +++ b/test/torch_ops_test.py @@ -841,7 +841,8 @@ def test_neighbor_allreduce_dst_weight(self): dtypes = [torch.FloatTensor, torch.DoubleTensor] if TEST_ON_GPU: - dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] + pass + #dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] # By default, we use exponential two ring topology. num_indegree = int(np.ceil(np.log2(size))) @@ -1071,7 +1072,8 @@ def test_neighbor_allreduce_dst_weight_fusion(self): dtypes = [torch.FloatTensor, torch.DoubleTensor] if TEST_ON_GPU: - dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] + pass + #dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] # By default, we use exponential two ring topology. num_indegree = int(np.ceil(np.log2(size))) From cc9faf459c30ab55a1351fe3907c164a40da54eb Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 4 Mar 2021 00:37:07 -0800 Subject: [PATCH 15/36] Add ReadyEvent for dst_weight for single entry neighbor_allreduce --- bluefog/common/common.h | 1 + bluefog/common/mpi_controller.cc | 27 ++++++++++++++++++--------- bluefog/common/nccl_controller.cc | 22 ++++++++++++++++------ bluefog/torch/adapter.cc | 4 ++++ bluefog/torch/adapter.h | 1 + test/torch_ops_test.py | 3 +-- 6 files changed, 41 insertions(+), 17 deletions(-) diff --git a/bluefog/common/common.h b/bluefog/common/common.h index cf12616d..25edf8bd 100644 --- a/bluefog/common/common.h +++ b/bluefog/common/common.h @@ -241,6 +241,7 @@ class OpContext { std::shared_ptr* tensor) = 0; virtual Status AllocateZeros(int64_t num_elements, DataType dtype, std::shared_ptr* tensor) = 0; + virtual std::shared_ptr RecordReadyEvent(int device) = 0; virtual Framework framework() const = 0; virtual ~OpContext() = default; }; diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index c4efe82e..32b2b7b7 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -468,6 +468,18 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { } else { int nsend = entry.send_neighbors->size(); int nrecv = entry.recv_neighbors->size(); + + // Ensure the lifecycle of the weighted tensors are alive after communication. + std::vector> weighted_tensors; + if (entry.dst_weighting_enabled) { + for (int i = 0; i < nsend; ++i) { + auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); + weighted_tensors.push_back(std::move(weighted_tensor_ptr)); + } + } + std::shared_ptr ready_event = + entry.context->RecordReadyEvent(entry.device); + std::vector requests(nsend + nrecv); std::vector statuses(nsend + nrecv); int element_size = mpi_ctx_.GetMPITypeSize(entry.output->dtype()); @@ -487,17 +499,14 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { } } - std::vector> weighted_tensors; - // Ensure the lifecycle of the weighted tensors are alive after communication. - if (entry.dst_weighting_enabled) { + if (ready_event != nullptr) { + while (!ready_event->Ready()) { + std::this_thread::sleep_for(std::chrono::nanoseconds(100)); + } + } for (int i = 0; i < nsend; ++i) { - int ret_code; - auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); - weighted_tensors.push_back(std::move(weighted_tensor_ptr)); - //TODO(hhb): Temporary Solution for Ready Event Issue with data_weight - std::this_thread::sleep_for(std::chrono::microseconds(100)); - ret_code = MPI_Isend( + int ret_code = MPI_Isend( weighted_tensors[i].get()->data(), num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i), mpi_ctx_.rank_ + entry.send_neighbors->at(i), diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index 30a052d7..ff98a2fa 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -753,7 +753,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { return; } - std::vector> weighted_tensors; + std::shared_ptr>> weighted_tensors; // Ensure the lifecycle of the weighted tensors are alive after communication. #if NCCL_MINOR > 6 @@ -772,6 +772,14 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } } else { + if(entry.dst_weighting_enabled) { + for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { + auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); + weighted_tensors->push_back(std::move(weighted_tensor_ptr)); + } + } + std::shared_ptr ready_event = + entry.context->RecordReadyEvent(entry.device); for (size_t i = 0; i < entry.recv_neighbors->size(); ++i) { int recv_rank = entry.recv_neighbors->at(i); void* recvbuf = (void*)(static_cast(entry.output->data()) + @@ -781,11 +789,13 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { } if(entry.dst_weighting_enabled) { + if (ready_event != nullptr) { + while (!ready_event->Ready()) { + std::this_thread::sleep_for(std::chrono::nanoseconds(100)); + } + } for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { - // Add ready event - auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); - weighted_tensors.push_back(std::move(weighted_tensor_ptr)); - NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements, + NCCLCHECK(ncclSend((*weighted_tensors)[i].get()->data(), num_elements, GetNCCLDataType(entry.tensor), entry.send_neighbors->at(i), nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } @@ -843,7 +853,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { auto tid = std::this_thread::get_id(); nccl_ctx_.finalizer_thread_pool.execute( - [this, entry, event_queue, tid, buffer_data, weighted_tensors]() mutable { + [this, entry, event_queue, tid, weighted_tensors]() mutable { with_device device_guard(entry.device); WaitForEvents(event_queue, {entry}, this->timeline_ptr_, tid); diff --git a/bluefog/torch/adapter.cc b/bluefog/torch/adapter.cc index aff49a90..bdccf556 100644 --- a/bluefog/torch/adapter.cc +++ b/bluefog/torch/adapter.cc @@ -178,6 +178,10 @@ Status TorchOpContext::AllocateZeros(int64_t num_elements, DataType dtype, Framework TorchOpContext::framework() const { return Framework::PYTORCH; } +std::shared_ptr TorchOpContext::RecordReadyEvent(int device) { + return torch::RecordReadyEvent(device); +} + #if HAVE_CUDA struct ReadyEventRegistry { std::unordered_map> cuda_events; diff --git a/bluefog/torch/adapter.h b/bluefog/torch/adapter.h index eb6526f2..3b7ccf61 100644 --- a/bluefog/torch/adapter.h +++ b/bluefog/torch/adapter.h @@ -70,6 +70,7 @@ class TorchOpContext : public common::OpContext { virtual common::Status AllocateZeros( int64_t num_elements, common::DataType dtype, std::shared_ptr* tensor) override; + virtual std::shared_ptr RecordReadyEvent(int device) override; virtual common::Framework framework() const override; private: diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py index f690764a..9cf95e76 100644 --- a/test/torch_ops_test.py +++ b/test/torch_ops_test.py @@ -841,8 +841,7 @@ def test_neighbor_allreduce_dst_weight(self): dtypes = [torch.FloatTensor, torch.DoubleTensor] if TEST_ON_GPU: - pass - #dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] + dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] # By default, we use exponential two ring topology. num_indegree = int(np.ceil(np.log2(size))) From 7f47ce4ce33ecfdfaaac767cf25db86fe6d4c4e0 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 4 Mar 2021 01:10:58 -0800 Subject: [PATCH 16/36] Remove const identifier for tensor dtype as it is meaningless --- bluefog/common/common.h | 2 +- bluefog/tensorflow/adapter.cc | 2 +- bluefog/tensorflow/adapter.h | 2 +- bluefog/torch/adapter.cc | 2 +- bluefog/torch/adapter.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bluefog/common/common.h b/bluefog/common/common.h index 25edf8bd..ab78dac9 100644 --- a/bluefog/common/common.h +++ b/bluefog/common/common.h @@ -209,7 +209,7 @@ class TensorShape { class Tensor { public: - virtual const DataType dtype() const = 0; + virtual DataType dtype() const = 0; virtual const TensorShape shape() const = 0; virtual const void* data() const = 0; virtual std::unique_ptr data_weight(float weight) = 0; diff --git a/bluefog/tensorflow/adapter.cc b/bluefog/tensorflow/adapter.cc index 313a48ed..278cc426 100644 --- a/bluefog/tensorflow/adapter.cc +++ b/bluefog/tensorflow/adapter.cc @@ -89,7 +89,7 @@ const void* TFPersistentBuffer::AccessData( TFTensor::TFTensor(::tensorflow::Tensor& tensor) : tensor_(tensor) {} -const common::DataType TFTensor::dtype() const { +common::DataType TFTensor::dtype() const { switch (tensor_.dtype()) { case ::tensorflow::DT_UINT8: return common::DataType::BLUEFOG_UINT8; diff --git a/bluefog/tensorflow/adapter.h b/bluefog/tensorflow/adapter.h index 02e74ddc..428a0a3e 100644 --- a/bluefog/tensorflow/adapter.h +++ b/bluefog/tensorflow/adapter.h @@ -47,7 +47,7 @@ class TFPersistentBuffer : public common::PersistentBuffer { class TFTensor : public common::Tensor { public: TFTensor(::tensorflow::Tensor& tensor); - virtual const common::DataType dtype() const override; + virtual common::DataType dtype() const override; virtual const common::TensorShape shape() const override; virtual const void* data() const override; virtual std::unique_ptr data_weight(float weight) override; diff --git a/bluefog/torch/adapter.cc b/bluefog/torch/adapter.cc index bdccf556..8dd8560c 100644 --- a/bluefog/torch/adapter.cc +++ b/bluefog/torch/adapter.cc @@ -43,7 +43,7 @@ using ::bluefog::common::with_device; TorchTensor::TorchTensor(::torch::Tensor tensor) : tensor_(tensor) {} -const DataType TorchTensor::dtype() const { +DataType TorchTensor::dtype() const { switch (tensor_.scalar_type()) { case ::torch::kByte: return DataType::BLUEFOG_UINT8; diff --git a/bluefog/torch/adapter.h b/bluefog/torch/adapter.h index 3b7ccf61..85f07994 100644 --- a/bluefog/torch/adapter.h +++ b/bluefog/torch/adapter.h @@ -32,7 +32,7 @@ namespace torch { class TorchTensor : public common::Tensor { public: TorchTensor(::torch::Tensor tensor); - virtual const common::DataType dtype() const override; + virtual common::DataType dtype() const override; virtual const common::TensorShape shape() const override; virtual const void* data() const override; virtual std::unique_ptr data_weight(float weight) override; From d84138be373bd57f6292ccb8590fe3036c5d704c Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 4 Mar 2021 01:14:41 -0800 Subject: [PATCH 17/36] Add cuda source for scalebuffer --- bluefog/common/cuda/cuda_kernels.cu | 129 ++++++++++++++++++++++++++++ bluefog/common/cuda/cuda_kernels.h | 33 +++++++ 2 files changed, 162 insertions(+) create mode 100644 bluefog/common/cuda/cuda_kernels.cu create mode 100644 bluefog/common/cuda/cuda_kernels.h diff --git a/bluefog/common/cuda/cuda_kernels.cu b/bluefog/common/cuda/cuda_kernels.cu new file mode 100644 index 00000000..0435125d --- /dev/null +++ b/bluefog/common/cuda/cuda_kernels.cu @@ -0,0 +1,129 @@ +// Copyright (C) 2020 NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#include "cuda_kernels.h" + +#include +#include + +namespace bluefog { +namespace common { + +template +__global__ void scale_buffer_k(const T* input, T* output, int64_t num_elements, const TS scale_factor) { + + const size_t idx = static_cast(blockDim.x) * blockIdx.x + threadIdx.x; + + for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) { + output[i] = scale_factor * input[i]; + } +} + +// Specialization for half2 +__global__ void scale_buffer_half2_k(const __half* input, __half* output, int64_t num_elements, const __half scale_factor) { + + const size_t idx = static_cast(blockDim.x) * blockIdx.x + threadIdx.x; + +#if __CUDA_ARCH__ > 530 + const __half2* input_h2 = reinterpret_cast(input); + __half2* output_h2 = reinterpret_cast<__half2 *>(output); + const __half2 scale_factor_h2 = __halves2half2(scale_factor, scale_factor); + + for (size_t i = idx; i < num_elements / 2; i += gridDim.x * blockDim.x) { + output_h2[i] = __hmul2(scale_factor_h2, input_h2[i]); + } + + // Deal with last element if num_elements is odd + if (idx == 0 && num_elements % 2) { + output[num_elements - 1] = __hmul(scale_factor, input[num_elements - 1]); + } +#else + for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) { + output[i] = __float2half(__half2float(scale_factor) * __half2float(input[i])); + } +#endif +} + +// Specialization for architectures without __half compute +template<> +__global__ void scale_buffer_k(const __half* input, __half* output, int64_t num_elements, const __half scale_factor) { + + const size_t idx = static_cast(blockDim.x) * blockIdx.x + threadIdx.x; + +#if __CUDA_ARCH__ > 530 + for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) { + output[i] = scale_factor * input[i]; + } +#else + for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) { + output[i] = __float2half(__half2float(scale_factor) * __half2float(input[i])); + } +#endif +} + +#define NTHREADS_SCALE_BUFFER_KERNEL 512 +void ScaleBufferCudaImpl(const void* fused_input_data, void* buffer_data, const int64_t num_elements, double scale_factor, + DataType dtype, cudaStream_t stream) { + const int64_t blocks = (num_elements + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL; + const int threads = NTHREADS_SCALE_BUFFER_KERNEL; + switch (dtype) { + case DataType::BLUEFOG_UINT8: + scale_buffer_k<<>>((const uint8_t*) fused_input_data, (uint8_t*) buffer_data, + num_elements, scale_factor); + break; + case DataType::BLUEFOG_INT8: + scale_buffer_k<<>>((const int8_t*) fused_input_data, (int8_t*) buffer_data, + num_elements, scale_factor); + break; + case DataType::BLUEFOG_INT32: + scale_buffer_k<<>>((const int32_t*) fused_input_data, (int32_t*) buffer_data, + num_elements, scale_factor); + break; + case DataType::BLUEFOG_INT64: + scale_buffer_k<<>>((const int64_t*) fused_input_data, (int64_t*) buffer_data, + num_elements, scale_factor); + break; + case DataType::BLUEFOG_FLOAT16: + { + __half scale_factor_half = __float2half((float) scale_factor); + if ((size_t) fused_input_data % 4 == 0 && (size_t) buffer_data % 4 == 0) { + // If alignment allows, use half2 specialized kernel + int64_t num_elements_h2 = (num_elements + 1) / 2; + int64_t blocks_h2 = (num_elements_h2 + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL; + scale_buffer_half2_k<<>>((const __half*) fused_input_data, (__half*) buffer_data, + num_elements, scale_factor_half); + } else { + scale_buffer_k<<>>((const __half*) fused_input_data, (__half*) buffer_data, + num_elements, scale_factor_half); + } + break; + } + case DataType::BLUEFOG_FLOAT32: + scale_buffer_k<<>>((const float*) fused_input_data, (float*) buffer_data, + num_elements, (float) scale_factor); + break; + case DataType::BLUEFOG_FLOAT64: + scale_buffer_k<<>>((const double*) fused_input_data, (double*) buffer_data, + num_elements, scale_factor); + break; + default: + throw std::logic_error("Type " + DataType_Name(dtype) + + " not supported by ScaleBufferCudaImpl."); + } +} + +} // namespace common +} // namespace bluefog + diff --git a/bluefog/common/cuda/cuda_kernels.h b/bluefog/common/cuda/cuda_kernels.h new file mode 100644 index 00000000..3739e1ea --- /dev/null +++ b/bluefog/common/cuda/cuda_kernels.h @@ -0,0 +1,33 @@ +// Copyright (C) 2020 NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#ifndef CUDA_KERNELS_H +#define CUDA_KERNELS_H + +#include + +#include "../common.h" + +namespace bluefog { +namespace common { + +// Scales buffer by scalar +void ScaleBufferCudaImpl(const void* fused_input_data, void* buffer_data, const int64_t num_elements, + double scale_factor, DataType dtype, cudaStream_t stream); + +} // namespace common +} // namespace bluefog + +#endif // CUDA_KERNELS_H \ No newline at end of file From 06c375d1630fb296dce27529d4deb92edf5a0ef5 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 4 Mar 2021 02:30:24 -0800 Subject: [PATCH 18/36] Scale buffer to modify itself --- bluefog/common/cuda/cuda_kernels.cu | 49 ++++++++++++----------------- bluefog/common/cuda/cuda_kernels.h | 4 +-- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/bluefog/common/cuda/cuda_kernels.cu b/bluefog/common/cuda/cuda_kernels.cu index 0435125d..144b21e8 100644 --- a/bluefog/common/cuda/cuda_kernels.cu +++ b/bluefog/common/cuda/cuda_kernels.cu @@ -22,101 +22,92 @@ namespace bluefog { namespace common { template -__global__ void scale_buffer_k(const T* input, T* output, int64_t num_elements, const TS scale_factor) { +__global__ void scale_buffer_k(T* buffer, int64_t num_elements, const TS scale_factor) { const size_t idx = static_cast(blockDim.x) * blockIdx.x + threadIdx.x; for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) { - output[i] = scale_factor * input[i]; + buffer[i] *= scale_factor; } } // Specialization for half2 -__global__ void scale_buffer_half2_k(const __half* input, __half* output, int64_t num_elements, const __half scale_factor) { +__global__ void scale_buffer_half2_k(__half* buffer, int64_t num_elements, const __half scale_factor) { const size_t idx = static_cast(blockDim.x) * blockIdx.x + threadIdx.x; #if __CUDA_ARCH__ > 530 - const __half2* input_h2 = reinterpret_cast(input); - __half2* output_h2 = reinterpret_cast<__half2 *>(output); + __half2* buffer_h2 = reinterpret_cast<__half2 *>(buffer); const __half2 scale_factor_h2 = __halves2half2(scale_factor, scale_factor); for (size_t i = idx; i < num_elements / 2; i += gridDim.x * blockDim.x) { - output_h2[i] = __hmul2(scale_factor_h2, input_h2[i]); + buffer_h2[i] = __hmul2(scale_factor_h2, buffer_h2[i]); } // Deal with last element if num_elements is odd if (idx == 0 && num_elements % 2) { - output[num_elements - 1] = __hmul(scale_factor, input[num_elements - 1]); + buffer[num_elements - 1] = __hmul(scale_factor, buffer[num_elements - 1]); } #else for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) { - output[i] = __float2half(__half2float(scale_factor) * __half2float(input[i])); + buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i])); } #endif } // Specialization for architectures without __half compute template<> -__global__ void scale_buffer_k(const __half* input, __half* output, int64_t num_elements, const __half scale_factor) { +__global__ void scale_buffer_k(__half* buffer, int64_t num_elements, const __half scale_factor) { const size_t idx = static_cast(blockDim.x) * blockIdx.x + threadIdx.x; #if __CUDA_ARCH__ > 530 for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) { - output[i] = scale_factor * input[i]; + buffer[i] *= scale_factor; } #else for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) { - output[i] = __float2half(__half2float(scale_factor) * __half2float(input[i])); + buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i])); } #endif } #define NTHREADS_SCALE_BUFFER_KERNEL 512 -void ScaleBufferCudaImpl(const void* fused_input_data, void* buffer_data, const int64_t num_elements, double scale_factor, +void ScaleBufferCudaImpl(double scale_factor, void* buffer_data, const int64_t num_elements, DataType dtype, cudaStream_t stream) { const int64_t blocks = (num_elements + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL; const int threads = NTHREADS_SCALE_BUFFER_KERNEL; switch (dtype) { case DataType::BLUEFOG_UINT8: - scale_buffer_k<<>>((const uint8_t*) fused_input_data, (uint8_t*) buffer_data, - num_elements, scale_factor); + scale_buffer_k<<>>((uint8_t*) buffer_data, num_elements, scale_factor); break; case DataType::BLUEFOG_INT8: - scale_buffer_k<<>>((const int8_t*) fused_input_data, (int8_t*) buffer_data, - num_elements, scale_factor); + scale_buffer_k<<>>((int8_t*) buffer_data, num_elements, scale_factor); break; case DataType::BLUEFOG_INT32: - scale_buffer_k<<>>((const int32_t*) fused_input_data, (int32_t*) buffer_data, - num_elements, scale_factor); + scale_buffer_k<<>>((int32_t*) buffer_data, num_elements, scale_factor); break; case DataType::BLUEFOG_INT64: - scale_buffer_k<<>>((const int64_t*) fused_input_data, (int64_t*) buffer_data, - num_elements, scale_factor); + scale_buffer_k<<>>((int64_t*) buffer_data, num_elements, scale_factor); break; case DataType::BLUEFOG_FLOAT16: { __half scale_factor_half = __float2half((float) scale_factor); - if ((size_t) fused_input_data % 4 == 0 && (size_t) buffer_data % 4 == 0) { + if ((size_t) buffer_data % 4 == 0) { // If alignment allows, use half2 specialized kernel int64_t num_elements_h2 = (num_elements + 1) / 2; int64_t blocks_h2 = (num_elements_h2 + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL; - scale_buffer_half2_k<<>>((const __half*) fused_input_data, (__half*) buffer_data, - num_elements, scale_factor_half); + scale_buffer_half2_k<<>>((__half*) buffer_data, num_elements, scale_factor_half); } else { - scale_buffer_k<<>>((const __half*) fused_input_data, (__half*) buffer_data, - num_elements, scale_factor_half); + scale_buffer_k<<>>((__half*) buffer_data, num_elements, scale_factor_half); } break; } case DataType::BLUEFOG_FLOAT32: - scale_buffer_k<<>>((const float*) fused_input_data, (float*) buffer_data, - num_elements, (float) scale_factor); + scale_buffer_k<<>>((float*) buffer_data, num_elements, (float) scale_factor); break; case DataType::BLUEFOG_FLOAT64: - scale_buffer_k<<>>((const double*) fused_input_data, (double*) buffer_data, - num_elements, scale_factor); + scale_buffer_k<<>>((double*) buffer_data, num_elements, scale_factor); break; default: throw std::logic_error("Type " + DataType_Name(dtype) + diff --git a/bluefog/common/cuda/cuda_kernels.h b/bluefog/common/cuda/cuda_kernels.h index 3739e1ea..2ba27a09 100644 --- a/bluefog/common/cuda/cuda_kernels.h +++ b/bluefog/common/cuda/cuda_kernels.h @@ -24,8 +24,8 @@ namespace bluefog { namespace common { // Scales buffer by scalar -void ScaleBufferCudaImpl(const void* fused_input_data, void* buffer_data, const int64_t num_elements, - double scale_factor, DataType dtype, cudaStream_t stream); +void ScaleBufferCudaImpl(double scale_factor, void* buffer_data, const int64_t num_elements, + DataType dtype, cudaStream_t stream); } // namespace common } // namespace bluefog From e4db92fd7bd6fd45f130605cd0f3ae114826677b Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 4 Mar 2021 22:40:05 -0800 Subject: [PATCH 19/36] Add .o file to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 56029e30..bb14086e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ # C extensions *.so +*.o # Distribution / packaging .Python From ec2a3fc4a3d3783d2ebcb22f4ebd4f961dd602e0 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 4 Mar 2021 22:41:12 -0800 Subject: [PATCH 20/36] dst_weight using CUDA for fused entry & compile flow in Python setup.py --- bluefog/common/mpi_context.cc | 7 +++++++ bluefog/common/mpi_context.h | 9 +++++++++ bluefog/common/mpi_controller.cc | 14 +++++++++++--- bluefog/common/nccl_controller.cc | 11 ++++++++++- setup.py | 25 +++++++++++++++++++++++++ test/torch_ops_test.py | 3 +-- 6 files changed, 63 insertions(+), 6 deletions(-) diff --git a/bluefog/common/mpi_context.cc b/bluefog/common/mpi_context.cc index 5d6cefc8..addd3b3a 100644 --- a/bluefog/common/mpi_context.cc +++ b/bluefog/common/mpi_context.cc @@ -332,6 +332,13 @@ void MPIContext::Initialize(const std::vector& ranks, // Create custom MPI float16 summation op. MPI_Op_create(&float16_sum, 1, &mpi_float16_sum); + +#if HAVE_CUDA + int greatest_priority; + CUDACHECK(cudaDeviceGetStreamPriorityRange(NULL, &greatest_priority)); + CUDACHECK(cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, + greatest_priority)); +#endif } void MPIContext::Finalize(MPIContextManager& ctx_manager) { diff --git a/bluefog/common/mpi_context.h b/bluefog/common/mpi_context.h index db0c31b1..aaad85db 100644 --- a/bluefog/common/mpi_context.h +++ b/bluefog/common/mpi_context.h @@ -22,6 +22,10 @@ #include #include +#if HAVE_CUDA +#include "cuda_runtime.h" +#endif + #include "common.h" #include "mpi.h" @@ -232,6 +236,11 @@ class MPIContext { // MPI Custom data type for float16. MPI_Datatype mpi_float16_t; MPI_Op mpi_float16_sum; + +#if HAVE_CUDA + // CUDA Stream + cudaStream_t stream; +#endif }; std::string GenerateNeighborExchangeErrorMessage(const std::vector& statuses, diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index 32b2b7b7..07fc3f0a 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -29,6 +29,10 @@ #include "operations.h" #include "timeline.h" +#if HAVE_CUDA +#include "cuda/cuda_kernels.h" +#endif + namespace bluefog { namespace common { @@ -507,8 +511,8 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { } for (int i = 0; i < nsend; ++i) { int ret_code = MPI_Isend( - weighted_tensors[i].get()->data(), num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), - entry.send_neighbors->at(i), + weighted_tensors[i].get()->data(), num_elements, + mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i), mpi_ctx_.rank_ + entry.send_neighbors->at(i), mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); if (ret_code != MPI_SUCCESS) { @@ -679,7 +683,8 @@ void MPIController::NeighborAllreduce(std::vector& entries) { first_entry.tensor->dtype()); } else { #if HAVE_CUDA - // CUDA Weighting + ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements, + first_entry.tensor->dtype(), mpi_ctx_.stream); #endif } offset += num_elements * element_size; @@ -747,6 +752,9 @@ void MPIController::NeighborAllreduce(std::vector& entries) { } } } else { +#if HAVE_CUDA + cudaStreamSynchronize(mpi_ctx_.stream); +#endif for (int i = 0; i < nsend; ++i) { void* sendbuf = (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size); diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index ff98a2fa..295dbf22 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -27,6 +27,8 @@ #include "operations.h" #include "timeline.h" +#include "cuda/cuda_kernels.h" + namespace bluefog { namespace common { @@ -1048,7 +1050,14 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(), fused_input_data, num_elements, element_size, first_entry.context, first_entry.device); - //TODO(hhb): add weighting implementation. + int64_t offset = 0; + for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) { + double dst_weight = first_entry.send_weights->at(i); + void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset; + ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements, + first_entry.tensor->dtype(), nccl_ctx_.stream); + offset += num_elements * element_size; + } weighted_fused_input_data = weight_buffer_data; } diff --git a/setup.py b/setup.py index d06cb445..7e8e0726 100644 --- a/setup.py +++ b/setup.py @@ -437,6 +437,26 @@ def is_torch_cuda(build_ext, include_dirs, extra_compile_args): print('INFO: Above error indicates that this PyTorch installation does not support CUDA.') return False +def build_nvcc_extra_objects(): + # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -rdc=true -c cuda_kernels.cu + # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -dlink -o cuda_kernels_link.o \ + # cuda_kernels.o -lcudart + nvcc = 'nvcc' + import torch + common_flags = f"'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'" + nvcc_with_flags = f"{nvcc} --compiler-options {common_flags}" + + extra_object_dir = 'bluefog/common/cuda/' + source = extra_object_dir+'cuda_kernels.cu' + object_file = extra_object_dir+'cuda_kernels.o' + object_link = extra_object_dir+'cuda_kernels_link.o' + + command_object = f"{nvcc_with_flags} -rdc=true -c {source} -o {object_file}" + command_link = f"{nvcc_with_flags} -dlink {object_file} -lcudart -o {object_link}" + + subprocess.run([command_object], shell=True) + subprocess.run([command_link], shell=True) + return [object_file, object_link] def build_torch_extension(build_ext, global_options, torch_version): # Backup the options, preventing other plugins access libs that @@ -447,6 +467,9 @@ def build_torch_extension(build_ext, global_options, torch_version): if have_cuda: cuda_include_dirs, cuda_lib_dirs = get_cuda_dirs( build_ext, options['COMPILE_FLAGS']) + cuda_extra_objects = build_nvcc_extra_objects() + options['EXTRA_OBJECTS'] += cuda_extra_objects + options['INCLUDES'] += cuda_include_dirs options['LIBRARY_DIRS'] += cuda_lib_dirs options['LIBRARIES'] += ['cudart'] @@ -491,6 +514,7 @@ def build_torch_extension(build_ext, global_options, torch_version): else: # CUDAExtension fails with `ld: library not found for -lcudart` if CUDA is not present from torch.utils.cpp_extension import CppExtension as TorchExtension + bluefog_tensorflow_mpi_lib.extra_objects = options['EXTRA_OBJECTS'] ext = TorchExtension(bluefog_torch_mpi_lib.name, define_macros=updated_macros, @@ -504,6 +528,7 @@ def build_torch_extension(build_ext, global_options, torch_version): extra_compile_args=options['COMPILE_FLAGS'], extra_link_args=options['LINK_FLAGS'], library_dirs=options['LIBRARY_DIRS'], + extra_objects=options['EXTRA_OBJECTS'], libraries=options['LIBRARIES']) # Patch an existing bluefog_torch_mpi_lib extension object. diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py index 9cf95e76..974a1aca 100644 --- a/test/torch_ops_test.py +++ b/test/torch_ops_test.py @@ -1071,8 +1071,7 @@ def test_neighbor_allreduce_dst_weight_fusion(self): dtypes = [torch.FloatTensor, torch.DoubleTensor] if TEST_ON_GPU: - pass - #dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] + dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] # By default, we use exponential two ring topology. num_indegree = int(np.ceil(np.log2(size))) From adaa191258f1a23a1d5de051997042d2a41c6730 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 4 Mar 2021 23:08:22 -0800 Subject: [PATCH 21/36] make clean *.o files generated by nvcc --- Makefile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 5b3ebdfb..dadbdf88 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ test_torch: test_torch_basic test_torch_ops test_torch_win_ops test_torch_optimi test_tensorflow: test_tensorflow_basic test_tensorflow_ops test_all: test_torch test_tensorflow -clean: clean_build clean_so +clean: clean_build clean_so clean_o .PHONY: test_torch_basic test_torch_basic: @@ -51,8 +51,12 @@ test_tensorflow_ops: .PHONY: clean_build clean_build: - rm -R build + rm -fR build .PHONY: clean_so clean_so: - rm ./bluefog/torch/mpi_lib.*.so + rm -f ./bluefog/torch/mpi_lib.*.so + +.PHONY: clean_o +clean_o: + rm -f ./bluefog/common/cuda/*.o \ No newline at end of file From 46e64313fa1aa94345ee33f9293558c9cce68b9e Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Sun, 14 Mar 2021 22:54:03 -0700 Subject: [PATCH 22/36] Add fix for NCCL single entry --- bluefog/common/nccl_controller.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index 295dbf22..bc823ddd 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -755,7 +755,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { return; } - std::shared_ptr>> weighted_tensors; + std::vector> weighted_tensors; // Ensure the lifecycle of the weighted tensors are alive after communication. #if NCCL_MINOR > 6 @@ -777,7 +777,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { if(entry.dst_weighting_enabled) { for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); - weighted_tensors->push_back(std::move(weighted_tensor_ptr)); + weighted_tensors.push_back(std::move(weighted_tensor_ptr)); } } std::shared_ptr ready_event = @@ -797,7 +797,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { } } for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { - NCCLCHECK(ncclSend((*weighted_tensors)[i].get()->data(), num_elements, + NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements, GetNCCLDataType(entry.tensor), entry.send_neighbors->at(i), nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } From ae57e75d74291210e43bd0ae904b33e0b57abd03 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Sun, 14 Mar 2021 23:00:05 -0700 Subject: [PATCH 23/36] Make setup.py more robust --- setup.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 7e8e0726..6857b9bc 100644 --- a/setup.py +++ b/setup.py @@ -437,13 +437,12 @@ def is_torch_cuda(build_ext, include_dirs, extra_compile_args): print('INFO: Above error indicates that this PyTorch installation does not support CUDA.') return False -def build_nvcc_extra_objects(): +def build_nvcc_extra_objects(cxx11_abi: bool): # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -rdc=true -c cuda_kernels.cu # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -dlink -o cuda_kernels_link.o \ # cuda_kernels.o -lcudart nvcc = 'nvcc' - import torch - common_flags = f"'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'" + common_flags = f"'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}'" nvcc_with_flags = f"{nvcc} --compiler-options {common_flags}" extra_object_dir = 'bluefog/common/cuda/' @@ -454,20 +453,23 @@ def build_nvcc_extra_objects(): command_object = f"{nvcc_with_flags} -rdc=true -c {source} -o {object_file}" command_link = f"{nvcc_with_flags} -dlink {object_file} -lcudart -o {object_link}" - subprocess.run([command_object], shell=True) - subprocess.run([command_link], shell=True) + subprocess.check_call([command_object], shell=True) + subprocess.check_call([command_link], shell=True) return [object_file, object_link] def build_torch_extension(build_ext, global_options, torch_version): # Backup the options, preventing other plugins access libs that # compiled with compiler of this plugin + import torch + is_cxx11_abi = torch.compiled_with_cxx11_abi() + options = copy.deepcopy(global_options) have_cuda = is_torch_cuda(build_ext, include_dirs=options['INCLUDES'], extra_compile_args=options['COMPILE_FLAGS']) if have_cuda: cuda_include_dirs, cuda_lib_dirs = get_cuda_dirs( build_ext, options['COMPILE_FLAGS']) - cuda_extra_objects = build_nvcc_extra_objects() + cuda_extra_objects = build_nvcc_extra_objects(is_cxx11_abi) options['EXTRA_OBJECTS'] += cuda_extra_objects options['INCLUDES'] += cuda_include_dirs @@ -501,9 +503,8 @@ def build_torch_extension(build_ext, global_options, torch_version): updated_macros, 'TORCH_VERSION', str(torch_version)) # Always set _GLIBCXX_USE_CXX11_ABI, since PyTorch can only detect whether it was set to 1. - import torch updated_macros = set_macro(updated_macros, '_GLIBCXX_USE_CXX11_ABI', - str(int(torch.compiled_with_cxx11_abi()))) + str(int(is_cxx11_abi))) # PyTorch requires -DTORCH_API_INCLUDE_EXTENSION_H updated_macros = set_macro( From 62dc99db6f2332e89f9e68885c05854237b7b474 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Tue, 16 Mar 2021 23:03:56 -0700 Subject: [PATCH 24/36] Add timeout and cuda check --- examples/test_all_example.sh | 71 ++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/examples/test_all_example.sh b/examples/test_all_example.sh index af9aa491..1e2c92d9 100755 --- a/examples/test_all_example.sh +++ b/examples/test_all_example.sh @@ -2,24 +2,41 @@ NUM_PROC=4 BFRUN="bfrun -np ${NUM_PROC}" RUN_DIR="$( pwd )" -EXAMPLE_DIR="$( cd "$( dirname "$0" )" && pwd )" +EXAMPLE_DIR="$( cd "$( dirname "$0" )" && pwd )/../examples" die() { echo >&2 -e "\nERROR: $@\n"; exit 1; } check() { - "$@" >>/dev/null 2>&1; - exit_code=$?; + timeout 2m "$@" >>/dev/null 2>&1; + local exit_code=$?; [ $exit_code -eq 0 ] \ && echo "Command [$*] succeed" \ || die "Command [$*] failed with error code $exit_code"; } +# check GPU exists +nvidia-smi >>/dev/null 2>&1 +gpu_exit_code=$? +if [[ $gpu_exit_code -eq 0 ]]; then + isgpu=1 +else + isgpu=0 +fi + +if [[ $isgpu -eq 1 ]]; then + echo "GPU Detected" +else + echo "No GPU Detected." +fi + # PyTorch Average Concensus Cases check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --enable-dynamic-topology check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --asynchronous-mode -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --enable-dynamic-topology -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --asynchronous-mode +if [[ $isgpu -eq 1 ]]; then + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --enable-dynamic-topology + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_average_consensus.py --no-cuda --asynchronous-mode +fi # PyTorch Optimization Cases [ -f "${RUN_DIR}/plot.png" ] && EXIST_PLOT_PNG=1 || EXIST_PLOT_PNG=0 @@ -39,15 +56,17 @@ check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimize check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=win_put check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --dist-optimizer=win_put --disable-dynamic-topology -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology +if [[ $isgpu -eq 1 ]]; then + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_mnist.py --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology +fi # PyTorch Benchmark Cases check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=gradient_allreduce @@ -59,15 +78,17 @@ check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-it check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=win_put check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --dist-optimizer=win_put --disable-dynamic-topology -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=gradient_allreduce -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce --atc-style -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put -check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology +if [[ $isgpu -eq 1 ]]; then + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=gradient_allreduce + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=allreduce --atc-style + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --disable-dynamic-topology + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put + check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_benchmark.py --model=lenet --num-iters=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology +fi # PyTorch ResNet Cases [ -d "${RUN_DIR}/logs" ] && EXIST_LOGS_DIR=1 || EXIST_LOGS_DIR=0 @@ -80,6 +101,7 @@ check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --e check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=win_put check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --dist-optimizer=win_put --disable-dynamic-topology +if [[ $isgpu -eq 1 ]]; then check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=gradient_allreduce check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=allreduce check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=allreduce --atc-style @@ -89,4 +111,5 @@ check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --e check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=neighbor_allreduce --atc-style --disable-dynamic-topology check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=win_put check ${BFRUN} python ${EXAMPLE_DIR}/pytorch_resnet.py --model=squeezenet1_0 --epochs=1 --no-cuda --dist-optimizer=win_put --disable-dynamic-topology +fi [ "${EXIST_LOGS_DIR}" == 0 ] && rm -rf ${RUN_DIR}/logs \ No newline at end of file From a836b021c82ddd12e3cfaaa5e5951a180d085bfa Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Tue, 16 Mar 2021 23:04:19 -0700 Subject: [PATCH 25/36] Move test example --- {examples => test}/test_all_example.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {examples => test}/test_all_example.sh (100%) diff --git a/examples/test_all_example.sh b/test/test_all_example.sh similarity index 100% rename from examples/test_all_example.sh rename to test/test_all_example.sh From e02a3d055f68963df201ff7e57acc4566b462627 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 18 Mar 2021 01:40:39 -0700 Subject: [PATCH 26/36] Fix NCCL side dst_weight fusion bug --- bluefog/common/nccl_controller.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index bc823ddd..9fdce4e5 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -1101,12 +1101,12 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } } else { - size_t i = 0; - for (int send_rank : *first_entry.send_neighbors) { + for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) { void* sendbuf = (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size); NCCLCHECK(ncclSend(sendbuf, num_elements, - GetNCCLDataType(first_entry.tensor), send_rank, + GetNCCLDataType(first_entry.tensor), + first_entry.send_neighbors->at(i), nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } } @@ -1951,8 +1951,8 @@ void NCCLController::MemcpyInWeightFusionBuffer( int64_t offset = 0; for (size_t i = 0; i < num_dst; ++i) { void* weight_buffer_data_at_offset = (uint8_t*)weight_buffer_data + offset; - CUDACHECK(cudaMemcpy(weight_buffer_data_at_offset, buffer_data, data_size, - cudaMemcpyDeviceToDevice)); + CUDACHECK(cudaMemcpyAsync(weight_buffer_data_at_offset, buffer_data, data_size, + cudaMemcpyDeviceToDevice, nccl_ctx_.stream)); offset += data_size; } } From 8e42ca2f0e9d1b6b0408eb5fffff59ad60e73891 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 18 Mar 2021 02:11:18 -0700 Subject: [PATCH 27/36] Add agg to make matplotlib more stable --- examples/pytorch_optimization.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/pytorch_optimization.py b/examples/pytorch_optimization.py index 162f9040..f731cc4b 100644 --- a/examples/pytorch_optimization.py +++ b/examples/pytorch_optimization.py @@ -15,6 +15,8 @@ import os import torch +import matplotlib +matplotlib.use('agg') import matplotlib.pyplot as plt import argparse From e5f8722fca71b44eecee172352cd5234523cca50 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Sun, 21 Mar 2021 01:25:03 -0700 Subject: [PATCH 28/36] Address comments for setup.py --- examples/pytorch_optimization.py | 2 +- setup.py | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/pytorch_optimization.py b/examples/pytorch_optimization.py index f731cc4b..658cd485 100644 --- a/examples/pytorch_optimization.py +++ b/examples/pytorch_optimization.py @@ -16,7 +16,7 @@ import os import torch import matplotlib -matplotlib.use('agg') +matplotlib.use('agg') # Make matplotlib more robust when interface plotting is impossible. import matplotlib.pyplot as plt import argparse diff --git a/setup.py b/setup.py index 6857b9bc..f4e3bffb 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ import sys import textwrap import traceback +from typing import List from distutils.errors import CompileError, DistutilsError, \ @@ -437,21 +438,26 @@ def is_torch_cuda(build_ext, include_dirs, extra_compile_args): print('INFO: Above error indicates that this PyTorch installation does not support CUDA.') return False -def build_nvcc_extra_objects(cxx11_abi: bool): +def get_nvcc_cmd() -> str: + from shutil import which + nvcc_cmd = which('nvcc') + if nvcc_cmd is None: + raise DistutilsPlatformError('Unable to find NVCC compiler') + return nvcc_cmd + +def build_nvcc_extra_objects(nvcc_cmd: str, cxx11_abi: bool) -> List[str]: # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -rdc=true -c cuda_kernels.cu # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -dlink -o cuda_kernels_link.o \ # cuda_kernels.o -lcudart - nvcc = 'nvcc' - common_flags = f"'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}'" - nvcc_with_flags = f"{nvcc} --compiler-options {common_flags}" + nvcc_flags = f"--compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}'" extra_object_dir = 'bluefog/common/cuda/' source = extra_object_dir+'cuda_kernels.cu' object_file = extra_object_dir+'cuda_kernels.o' object_link = extra_object_dir+'cuda_kernels_link.o' - command_object = f"{nvcc_with_flags} -rdc=true -c {source} -o {object_file}" - command_link = f"{nvcc_with_flags} -dlink {object_file} -lcudart -o {object_link}" + command_object = f"{nvcc_cmd} {nvcc_flags} -rdc=true -c {source} -o {object_file}" + command_link = f"{nvcc_cmd} {nvcc_flags} -dlink {object_file} -lcudart -o {object_link}" subprocess.check_call([command_object], shell=True) subprocess.check_call([command_link], shell=True) @@ -469,7 +475,8 @@ def build_torch_extension(build_ext, global_options, torch_version): if have_cuda: cuda_include_dirs, cuda_lib_dirs = get_cuda_dirs( build_ext, options['COMPILE_FLAGS']) - cuda_extra_objects = build_nvcc_extra_objects(is_cxx11_abi) + nvcc_cmd = get_nvcc_cmd() + cuda_extra_objects = build_nvcc_extra_objects(nvcc_cmd, is_cxx11_abi) options['EXTRA_OBJECTS'] += cuda_extra_objects options['INCLUDES'] += cuda_include_dirs @@ -515,7 +522,6 @@ def build_torch_extension(build_ext, global_options, torch_version): else: # CUDAExtension fails with `ld: library not found for -lcudart` if CUDA is not present from torch.utils.cpp_extension import CppExtension as TorchExtension - bluefog_tensorflow_mpi_lib.extra_objects = options['EXTRA_OBJECTS'] ext = TorchExtension(bluefog_torch_mpi_lib.name, define_macros=updated_macros, From 9f2f55d78371fc2356381bea4d1ff62e8baccee8 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Sun, 21 Mar 2021 01:46:50 -0700 Subject: [PATCH 29/36] Simpler logic for dst_weighting_enabled and weighted_average_computation --- bluefog/torch/mpi_ops.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py index dc508a34..e5e106ae 100644 --- a/bluefog/torch/mpi_ops.py +++ b/bluefog/torch/mpi_ops.py @@ -17,6 +17,7 @@ from contextlib import contextmanager from typing import List, Dict, Union, Optional +import numpy as np import torch from bluefog.torch import mpi_lib # C library @@ -488,10 +489,9 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, "enabling dynamic topology.") else: dynamic_neighbors_enabled = True - dst_weighting_enabled = True if isinstance(dst_weights, list): dst_weights = {dst:1.0 for dst in dst_weights} - dst_weighting_enabled = False + dst_weighting_enabled = not np.allclose(list(dst_weights.values()), 1.0) if self_weight is None and src_weights is None: # Implying this is static graph. if is_topo_weighted(): @@ -515,13 +515,8 @@ def _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, raise ValueError("The key of weights should only contain the ranks that belong to " " in-neighbors and self rank.") uniform_weights = 1.0/(len(src_weights)+1) - weighted_average_computation = False - if abs(self_weight - uniform_weights) > 1e-6: - weighted_average_computation = True - for n_weights in src_weights.values(): - if abs(n_weights - uniform_weights) > 1e-6: - weighted_average_computation = True - break + weighted_average_computation = not(np.isclose(self_weight, uniform_weights) and + np.allclose(list(src_weights.values()), uniform_weights)) else: raise ValueError("Arguments self_weight and neighbor_weights have to be presented at " "the same time") From d7a93103a306fcc07d86194a7cf7f192e37edba9 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Sun, 21 Mar 2021 01:56:58 -0700 Subject: [PATCH 30/36] Better consideration for weight buffer size --- bluefog/common/operations.cc | 2 ++ bluefog/common/tensor_queue.cc | 7 ++++--- bluefog/common/tensor_queue.h | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc index 84d18aba..3738f9a3 100644 --- a/bluefog/common/operations.cc +++ b/bluefog/common/operations.cc @@ -780,6 +780,8 @@ void PerformOperationWithFusion(std::vector& entries) { [&]() { timeline.ActivityStartAll(entries, "INIT_FUSION_BUFFER"); }, [&]() { timeline.ActivityEndAll(entries); }); + // As the dst_weight requires extra memory to scale the tensor for each destination, therefore, + // extra memory is required. Status status_dst_weight = Status::OK(); if (first_entry.dst_weighting_enabled) { status_dst_weight = bluefog_global.fusion_buffer.InitializeWeightBuffer( diff --git a/bluefog/common/tensor_queue.cc b/bluefog/common/tensor_queue.cc index b0f67241..0737db91 100644 --- a/bluefog/common/tensor_queue.cc +++ b/bluefog/common/tensor_queue.cc @@ -155,19 +155,20 @@ std::shared_ptr FusionBufferManager::GetBuffer(int device) { } Status FusionBufferManager::InitializeWeightBuffer( - int64_t threshold, int mpi_size, int device, std::shared_ptr context, + int64_t threshold, int world_size, int device, std::shared_ptr context, std::function on_start_init, std::function on_end_init) { auto& elem = weight_tensor_fusion_buffers_[device]; auto& buffer = elem.first; int64_t& size = elem.second; - if (size != threshold*mpi_size) { + // threshold * (world_size-1) is the upper bound for buffer + if (size != threshold*(world_size-1)) { buffer.reset(); size = 0; } if (buffer == nullptr) { on_start_init(); - size = threshold*mpi_size; + size = threshold*(world_size-1); // Lazily allocate persistent buffer for Tensor Fusion and keep it // forever per device. diff --git a/bluefog/common/tensor_queue.h b/bluefog/common/tensor_queue.h index 2c54c88f..47d74f88 100644 --- a/bluefog/common/tensor_queue.h +++ b/bluefog/common/tensor_queue.h @@ -86,13 +86,13 @@ class FusionBufferManager { // // Args: // threshold: Size of the buffer in bytes. - // mpi_size: Size of MPI nodes. + // world_size: Size of MPI nodes. // device: Device ID to associate the buffer. // context: Framework used to create the buffer and associate it. // on_start_init: Callback on starting buffer initialization. // on_end_init: Callback on completing buffer initialization. Status InitializeWeightBuffer(int64_t threshold, - int mpi_size, + int world_size, int device, std::shared_ptr context, std::function on_start_init, From 562b231bf5213f0b575f57657ca59fcc36f58c64 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 25 Mar 2021 22:23:36 -0700 Subject: [PATCH 31/36] Make src_weights as std::map, and simplify logic for PerformNeighborAllreduceCallback --- bluefog/torch/mpi_ops.cc | 63 +++++++++++----------------------------- 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc index 16a410d6..636da5f0 100644 --- a/bluefog/torch/mpi_ops.cc +++ b/bluefog/torch/mpi_ops.cc @@ -98,13 +98,12 @@ std::function(std::function)> void PerformNeighborAllreduceCallback(::torch::Tensor tensor, ::torch::Tensor output, double self_weight, - const std::unordered_map& src_weights, - const std::vector& src_neighbors, + const std::map& src_weights, bool avg_computation, bool dynamic_neighbors_enabled, bool is_hierarchical) { int src_size = bluefog_neighbor_size(); - if (dynamic_neighbors_enabled) src_size = src_neighbors.size(); + if (dynamic_neighbors_enabled) src_size = src_weights.size(); if (src_size > 0) { ::torch::Tensor output_buffer = MaybeCopyToTensorBuffer(output); ::torch::Tensor tensor_buffer = MaybeCopyToTensorBuffer(tensor); @@ -118,40 +117,17 @@ void PerformNeighborAllreduceCallback(::torch::Tensor tensor, ::torch::Tensor ou // if avg_computation is set to be False, sum computation will be taken place. if (avg_computation) { - // 1) For a distributed graph topology, created with - // MPI_Dist_graph_create, the sequence of neighbors in the send and - // receive buffers at each process is defined as the sequence returned - // by MPI_Dist_graph_neighbors for destinations and sources, - // respectively. 2) MPI_Dist_graph_neighbors: If the communicator was - // created with MPI_Dist_graph_create_adjacent then the order of the - // values in sources and destinations is identical to the input that - // was used by the process with the same rank in comm_old in the - // creation call. - int* sources_ptr = nullptr; - if (!dynamic_neighbors_enabled) { - int indgree = 0; - int outdegree = 0; - int* destinations_ptr = nullptr; - bluefog_load_topology(&indgree, sources_ptr, &outdegree, - destinations_ptr); - } auto output_reduced = output_buffer.slice(0, 0, first_dim); - for (int i = 0; i < src_size; i++) { - double weight = 0.0; - int src_rank; - if (!dynamic_neighbors_enabled) src_rank = *(sources_ptr+i); - else src_rank = src_neighbors[i]; - auto it = src_weights.find(src_rank); - if (it != src_weights.end()) { - weight = it->second; - } - + int i = 0; + for (auto kv : src_weights) { + double weight = kv.second; if (i == 0) { output_reduced.mul_(weight); } else { output_reduced.add_( output_buffer.slice(0, i * first_dim, (i + 1) * first_dim), weight); } + ++i; } output_buffer.resize_(shape_vector); output_buffer.add_(tensor_buffer, self_weight); @@ -431,10 +407,12 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, auto callback_wrapper = GetCallbackWrapper(handle, timeline_ptr, op_name, tid); - std::vector src_neighbors; + std::map src_weights_map; for (auto kv : src_weights) + src_weights_map.insert(kv); + std::vector src_neighbors; + for (auto kv : src_weights_map) src_neighbors.push_back(kv.first); - std::sort(src_neighbors.begin(), src_neighbors.end()); std::vector dst_neighbors; for (auto kv : dst_weights) @@ -462,16 +440,12 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec, dynamic_neighbors_enabled, dst_weighting_enabled, is_hierarchical, enable_topo_check, op_name, CPU_DEVICE_ID, - callback_wrapper([self_weight, src_weights, avg_computation, - cpu_output, tensor, src_neighbors, - dynamic_neighbors_enabled, is_hierarchical, output, - device]() mutable { + callback_wrapper([self_weight, src_weights_map, avg_computation, cpu_output, tensor, + dynamic_neighbors_enabled, is_hierarchical, output, device]() mutable { with_device device_guard(device); output.copy_(cpu_output); - PerformNeighborAllreduceCallback(tensor, output, self_weight, - src_weights, src_neighbors, - avg_computation, - dynamic_neighbors_enabled, + PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_map, + avg_computation, dynamic_neighbors_enabled, is_hierarchical); })); @@ -486,13 +460,10 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec, dynamic_neighbors_enabled, dst_weighting_enabled, is_hierarchical, enable_topo_check, op_name, device, - callback_wrapper([self_weight, src_weights, avg_computation, - src_neighbors, dynamic_neighbors_enabled, + callback_wrapper([self_weight, src_weights_map, avg_computation, dynamic_neighbors_enabled, is_hierarchical, tensor, output]() mutable { - PerformNeighborAllreduceCallback(tensor, output, self_weight, - src_weights, src_neighbors, - avg_computation, - dynamic_neighbors_enabled, + PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_map, + avg_computation, dynamic_neighbors_enabled, is_hierarchical); })); ThrowIfError(enqueue_result); From 6764efa44c75d9f064e0d50c4b4b61071129b742 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 25 Mar 2021 23:23:12 -0700 Subject: [PATCH 32/36] Add TODO #80 and #81, and simplify the logic for dst_weight --- bluefog/common/global_state.h | 1 + bluefog/common/mpi_context.h | 1 + bluefog/common/mpi_controller.cc | 134 ++++++++++-------------------- bluefog/common/nccl_controller.cc | 53 ++++++------ bluefog/common/operations.cc | 2 +- bluefog/torch/mpi_ops.py | 2 +- 6 files changed, 72 insertions(+), 121 deletions(-) diff --git a/bluefog/common/global_state.h b/bluefog/common/global_state.h index d9addcac..6f940769 100644 --- a/bluefog/common/global_state.h +++ b/bluefog/common/global_state.h @@ -80,6 +80,7 @@ struct BluefogGlobalState { // Threshold for Tensor Fusion. All tensors that occupy memory beyond this // threshold will be fused. int64_t tensor_fusion_threshold = 8 * 1024 * 1024; + int64_t tensor_fusion_threshold_for_dst_weight = 8 * 1024 * 1024; FusionBufferManager fusion_buffer; // Because setting topology happens in the main thread instead of communication diff --git a/bluefog/common/mpi_context.h b/bluefog/common/mpi_context.h index aaad85db..77f28e48 100644 --- a/bluefog/common/mpi_context.h +++ b/bluefog/common/mpi_context.h @@ -237,6 +237,7 @@ class MPIContext { MPI_Datatype mpi_float16_t; MPI_Op mpi_float16_sum; + // TODO(hhb): #80 We should use a common context for MPI and NCCL controller for CUDA usage. #if HAVE_CUDA // CUDA Stream cudaStream_t stream; diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index 07fc3f0a..88f17516 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -459,16 +459,10 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { if (!entry.is_hierarchical) { if (!entry.dynamic_neighbors_enabled) { - int ret_code = MPI_Neighbor_allgather( + MPICHECK(MPI_Neighbor_allgather( sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), buffer_data, num_elements, mpi_ctx_.GetMPIDataType(entry.output), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH)); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Neighbor_allreduce (through neighbor_allgather) failed, see " - "MPI " - "output for details."); - } + mpi_ctx_.GetMPICommunicator(Communicator::GRAPH))); } else { int nsend = entry.send_neighbors->size(); int nrecv = entry.recv_neighbors->size(); @@ -490,17 +484,10 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { for (int i = 0; i < nrecv; ++i) { void* recvbuf = (void*)(static_cast(entry.output->data()) + num_elements * i * element_size); - int ret_code = MPI_Irecv( - recvbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.output), - entry.recv_neighbors->at(i), - mpi_ctx_.rank_ + entry.recv_neighbors->at(i), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), - &requests[i + nsend]); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Irecv (for dynamic neighbor_allreduce) failed, see MPI " - "output for details."); - } + MPICHECK(MPI_Irecv(recvbuf, num_elements, + mpi_ctx_.GetMPIDataType(entry.output), entry.recv_neighbors->at(i), + /*tag=*/mpi_ctx_.rank_ + entry.recv_neighbors->at(i), + mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL), &requests[i + nsend])); } if (entry.dst_weighting_enabled) { @@ -509,33 +496,16 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { std::this_thread::sleep_for(std::chrono::nanoseconds(100)); } } - for (int i = 0; i < nsend; ++i) { - int ret_code = MPI_Isend( - weighted_tensors[i].get()->data(), num_elements, - mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i), - mpi_ctx_.rank_ + entry.send_neighbors->at(i), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI " - "output for details."); - } - } - } else { - for (int i = 0; i < nsend; ++i) { - int ret_code = MPI_Isend( - sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), - entry.send_neighbors->at(i), - mpi_ctx_.rank_ + entry.send_neighbors->at(i), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI " - "output for details."); - } - } } - + for (int i = 0; i < nsend; ++i) { + const void* buffer_send = sendbuf; + if (entry.dst_weighting_enabled) + buffer_send = weighted_tensors[i].get()->data(); + MPICHECK(MPI_Isend(buffer_send, num_elements, + mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i), + /*tag=*/mpi_ctx_.rank_ + entry.send_neighbors->at(i), + mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL), &requests[i])); + } MPI_Waitall(nsend + nrecv, requests.data(), statuses.data()); error_message = GenerateNeighborExchangeErrorMessage(statuses, nsend, nrecv); @@ -551,6 +521,11 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { "Local size is smaller than 2, in this case, you should use " "neighbor_allreduce instead of hierarchical_neighbor_allreduce."); } + if (entry.dst_weighting_enabled) { + throw std::runtime_error( + "Under hierarchical neighbor_allreduce, argument " + "dst_weight should not be enabled for now."); + } // 1. In-place allreduce MPI_Allreduce(MPI_IN_PLACE, (void*)sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), MPI_SUM, @@ -644,8 +619,8 @@ void MPIController::NeighborAllreduce(std::vector& entries) { if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) { if (first_entry.is_hierarchical) { // TODO: support check. - BFLOG(INFO) << "Request to check topology for hierarchical neighbor " - << "allreduce ops but it is not supported yet."; + BFLOG(WARNING) << "Request to check topology for hierarchical neighbor " + << "allreduce ops but it is not supported yet."; } is_topo_check_fail = CheckNeighborSendRecvPattern( first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name, @@ -709,15 +684,10 @@ void MPIController::NeighborAllreduce(std::vector& entries) { if (!first_entry.is_hierarchical) { if (!first_entry.dynamic_neighbors_enabled) { - int ret_code = MPI_Neighbor_allgather( + MPICHECK(MPI_Neighbor_allgather( fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor), buffer_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.output), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH)); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Neighbor_allreduce (through neighbor_allgather) failed, see MPI " - "output for details."); - } + mpi_ctx_.GetMPICommunicator(Communicator::GRAPH))); } else { int nsend = first_entry.send_neighbors->size(); int nrecv = first_entry.recv_neighbors->size(); @@ -726,48 +696,25 @@ void MPIController::NeighborAllreduce(std::vector& entries) { for (int i = 0; i < nrecv; ++i) { void* recvbuf = (void*)((uint8_t*)buffer_data + num_elements * i * element_size); - int ret_code = MPI_Irecv(recvbuf, num_elements, - mpi_ctx_.GetMPIDataType(first_entry.output), - first_entry.recv_neighbors->at(i), - /*tag=*/mpi_ctx_.rank_ + first_entry.recv_neighbors->at(i), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), - &requests[i + nsend]); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Irecv (for dynamic neighbor_allreduce) failed, see MPI output " - "for details."); - } + MPICHECK(MPI_Irecv(recvbuf, num_elements, + mpi_ctx_.GetMPIDataType(first_entry.output), first_entry.recv_neighbors->at(i), + /*tag=*/mpi_ctx_.rank_ + first_entry.recv_neighbors->at(i), + mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL), &requests[i + nsend])); } - if (!first_entry.dst_weighting_enabled) { - for (int i = 0; i < nsend; ++i) { - int ret_code = MPI_Isend( - fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor), - first_entry.send_neighbors->at(i), - /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output " - "for details."); - } - } - } else { #if HAVE_CUDA + if (first_entry.dst_weighting_enabled && first_entry.device != CPU_DEVICE_ID) { cudaStreamSynchronize(mpi_ctx_.stream); + } #endif - for (int i = 0; i < nsend; ++i) { - void* sendbuf = + for (int i = 0; i < nsend; ++i) { + const void* sendbuf = fused_input_data; + if (first_entry.dst_weighting_enabled) + sendbuf = (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size); - int ret_code = MPI_Isend(sendbuf, num_elements, - mpi_ctx_.GetMPIDataType(first_entry.tensor), first_entry.send_neighbors->at(i), - /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i), - mpi_ctx_.GetMPICommunicator(Communicator::GRAPH), &requests[i]); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Isend (for dynamic neighbor_allreduce) failed, see MPI output " - "for details."); - } - } + MPICHECK(MPI_Isend(sendbuf, num_elements, + mpi_ctx_.GetMPIDataType(first_entry.tensor), first_entry.send_neighbors->at(i), + /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i), + mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL), &requests[i])); } MPI_Waitall(nsend + nrecv, requests.data(), statuses.data()); error_message = @@ -784,6 +731,11 @@ void MPIController::NeighborAllreduce(std::vector& entries) { "Local size is smaller than 2, in this case, you should use " "neighbor_allreduce instead of hierarchical_neighbor_allreduce."); } + if (first_entry.dst_weighting_enabled) { + throw std::runtime_error( + "Under hierarchical neighbor_allreduce, argument " + "dst_weight should not be enabled for now."); + } // 1. In-place allreduce MPI_Allreduce(MPI_IN_PLACE, (void*)fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor), MPI_SUM, diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index 9fdce4e5..7bdd9acf 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -789,23 +789,19 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { NCCLCHECK(ncclRecv(recvbuf, num_elements, GetNCCLDataType(entry.tensor), recv_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } - if(entry.dst_weighting_enabled) - { + if(entry.dst_weighting_enabled) { if (ready_event != nullptr) { while (!ready_event->Ready()) { std::this_thread::sleep_for(std::chrono::nanoseconds(100)); } } - for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { - NCCLCHECK(ncclSend(weighted_tensors[i].get()->data(), num_elements, - GetNCCLDataType(entry.tensor), entry.send_neighbors->at(i), - nccl_ctx_.nccl_comm, nccl_ctx_.stream)); - } - } else { - for (int send_rank : *entry.send_neighbors) { - NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(entry.tensor), - send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); - } + } + for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { + const void* buffer_send = sendbuf; + if (entry.dst_weighting_enabled) + buffer_send = weighted_tensors[i].get()->data(); + NCCLCHECK(ncclSend(buffer_send, num_elements, GetNCCLDataType(entry.tensor), + entry.send_neighbors->at(i), nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } } ncclGroupEnd(); @@ -816,6 +812,11 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { "send_machine_neighbors should " "not be empty."); } + if (entry.dst_weighting_enabled) { + throw std::runtime_error( + "Under hierarchical neighbor_allreduce, argument " + "dst_weight should not be enabled for now."); + } // 1. In-place allreduce for all local ranks. Note it is sum, so we need to // divided by local size at call back stage. NCCLCHECK(ncclAllReduce(sendbuf, (void*)sendbuf, num_elements, @@ -1093,22 +1094,13 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { GetNCCLDataType(first_entry.tensor), recv_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } - if (!first_entry.dst_weighting_enabled) - { - for (int send_rank : *first_entry.send_neighbors) { - NCCLCHECK(ncclSend(fused_input_data, num_elements, - GetNCCLDataType(first_entry.tensor), send_rank, - nccl_ctx_.nccl_comm, nccl_ctx_.stream)); - } - } else { - for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) { - void* sendbuf = - (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size); - NCCLCHECK(ncclSend(sendbuf, num_elements, - GetNCCLDataType(first_entry.tensor), - first_entry.send_neighbors->at(i), - nccl_ctx_.nccl_comm, nccl_ctx_.stream)); - } + for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) { + const void* sendbuf = fused_input_data; + if (first_entry.dst_weighting_enabled) + sendbuf = (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size); + NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(first_entry.tensor), + first_entry.send_neighbors->at(i), + nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } } ncclGroupEnd(); @@ -1124,6 +1116,11 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { "neighbor_allreduce instead of hierarchical_neighbor_allreduce." ); } + if (first_entry.dst_weighting_enabled) { + throw std::runtime_error( + "Under hierarchical neighbor_allreduce, argument " + "dst_weight should not be enabled for now."); + } // 1. In-place allreduce for all local ranks. Note it is sum, so we need to // divided by local size at call back stage. diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc index 3738f9a3..ddf90cf5 100644 --- a/bluefog/common/operations.cc +++ b/bluefog/common/operations.cc @@ -785,7 +785,7 @@ void PerformOperationWithFusion(std::vector& entries) { Status status_dst_weight = Status::OK(); if (first_entry.dst_weighting_enabled) { status_dst_weight = bluefog_global.fusion_buffer.InitializeWeightBuffer( - bluefog_global.tensor_fusion_threshold, mpi_context.size_, + bluefog_global.tensor_fusion_threshold_for_dst_weight, mpi_context.size_, first_entry.device, first_entry.context, [&]() { timeline.ActivityStartAll(entries, "INIT_WEIGHT_FUSION_BUFFER"); }, [&]() { timeline.ActivityEndAll(entries); }); diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py index e5e106ae..1b82b664 100644 --- a/bluefog/torch/mpi_ops.py +++ b/bluefog/torch/mpi_ops.py @@ -638,7 +638,7 @@ def neighbor_allreduce_nonblocking(tensor: torch.Tensor, *, return _neighbor_allreduce_nonblocking(tensor, output, self_weight, src_weights, dst_weights, enable_topo_check, name=name) - +# TODO(hanbinhu) #81 Add dst_weight for hierarchical neighbor allreduce. def hierarchical_neighbor_allreduce(tensor: torch.Tensor, self_weight: float = None, neighbor_machine_weights: Dict[int, float] = None, From 1e8db23a397950705090bed7f709d9cadec0dfd0 Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Thu, 25 Mar 2021 23:47:53 -0700 Subject: [PATCH 33/36] Wrap CheckNeighborSendRecvPattern again --- bluefog/common/mpi_context.cc | 2 +- bluefog/common/mpi_context.h | 2 +- bluefog/common/mpi_controller.cc | 74 ++++++++++++------------------- bluefog/common/mpi_controller.h | 4 +- bluefog/common/nccl_controller.cc | 39 +++------------- 5 files changed, 38 insertions(+), 83 deletions(-) diff --git a/bluefog/common/mpi_context.cc b/bluefog/common/mpi_context.cc index addd3b3a..7a68ce39 100644 --- a/bluefog/common/mpi_context.cc +++ b/bluefog/common/mpi_context.cc @@ -222,7 +222,7 @@ MPI_Op MPIContext::GetMPISumOp(DataType dtype) { return dtype == DataType::BLUEFOG_FLOAT16 ? mpi_float16_sum : MPI_SUM; } -MPI_Comm MPIContext::GetMPICommunicator(Communicator comm) { +MPI_Comm MPIContext::GetMPICommunicator(Communicator comm) const { switch (comm) { case Communicator::GLOBAL: return mpi_comm; diff --git a/bluefog/common/mpi_context.h b/bluefog/common/mpi_context.h index 77f28e48..c220d497 100644 --- a/bluefog/common/mpi_context.h +++ b/bluefog/common/mpi_context.h @@ -148,7 +148,7 @@ class MPIContext { MPI_Op GetMPISumOp(DataType dtype); - MPI_Comm GetMPICommunicator(Communicator comm); + MPI_Comm GetMPICommunicator(Communicator comm) const; int GetMPITypeSize(DataType dtype); diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index 88f17516..5d358476 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -294,23 +294,13 @@ void MPIController::NeighborAllgather(TensorTableEntry& entry) { displcmnts = new int[mpi_ctx_.neighbor_indgree_]; status = mpi_ctx_.AllocateOutput(entry, recvcounts, Communicator::GRAPH); } else { - bool is_topo_check_fail = false; - if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { - if (entry.is_hierarchical) { - // TODO: support check. - BFLOG(INFO) << "Request to check topology for hierarchical neighbor " - << "allreduce ops but it is not supported yet."; - } - is_topo_check_fail = CheckNeighborSendRecvPattern( - entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name, - mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); - } - + bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(entry, mpi_ctx_, timeline_ptr); if (is_topo_check_fail) { entry.callback(Status::InvalidArgument( "Src and dst neighbor ranks do not match")); return; } + recvcounts = new int[entry.recv_neighbors->size()]; displcmnts = new int[entry.recv_neighbors->size()]; status = mpi_ctx_.AllocateOutput(entry, recvcounts, Communicator::DYNAMIC, @@ -372,7 +362,7 @@ void MPIController::NeighborAllgather(TensorTableEntry& entry) { // Function to check if the sending and receiving neighbors match in the topology. bool CheckNeighborSendRecvPattern( - const std::vector* send_neighbors, const std::vector* recv_neighbors, + const std::vector& send_neighbors, const std::vector& recv_neighbors, const std::string& tensor_name, int size, Timeline* timeline_ptr, const MPI_Comm& comm) { bool res = false; timeline_ptr->ActivityStart(tensor_name, "NEGOTIATION"); @@ -381,17 +371,12 @@ bool CheckNeighborSendRecvPattern( bool* send_check_buf = new bool[2 * size]; std::fill_n(send_check_buf, 2 * size, false); bool* recv_check_buf = new bool[2 * size * size]; - for (int send_rank : *(send_neighbors)) + for (int send_rank : send_neighbors) send_check_buf[send_rank] = true; - for (int recv_rank : *(recv_neighbors)) + for (int recv_rank : recv_neighbors) send_check_buf[size + recv_rank] = true; - int ret_code = MPI_Allgather(send_check_buf, size * 2, MPI_C_BOOL, - recv_check_buf, size * 2, MPI_C_BOOL, comm); - if (ret_code != MPI_SUCCESS) { - throw std::runtime_error( - "MPI_Allgather (for dynamic neighbor_allreduce negotiation) failed, " - "see MPI output for details."); - } + MPICHECK(MPI_Allgather(send_check_buf, size * 2, MPI_C_BOOL, + recv_check_buf, size * 2, MPI_C_BOOL, comm)); // This checks that send matrix and transposed recv matrix should be the // same. If same, the topology is good to go. If not, there is mismatch edge // to be fixed. @@ -413,6 +398,24 @@ bool CheckNeighborSendRecvPattern( return res; } +bool CheckNeighborSendRecvPatternForEntry(const TensorTableEntry& entry, + const MPIContext& mpi_ctx, + Timeline* timeline_ptr) { + bool is_topo_check_fail = false; + if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { + if (entry.is_hierarchical) { + // TODO: support check. + BFLOG(WARNING) << "Request to check topology for hierarchical neighbor " + << "allreduce ops but it is not supported yet."; + } else { + is_topo_check_fail = CheckNeighborSendRecvPattern( + *entry.send_neighbors, *entry.recv_neighbors, entry.tensor_name, + mpi_ctx.size_, timeline_ptr, mpi_ctx.GetMPICommunicator(Communicator::GLOBAL)); + } + } + return is_topo_check_fail; +} + void MPIController::NeighborAllreduce(TensorTableEntry& entry) { const void* sendbuf = entry.tensor->data(); int num_elements = entry.tensor->shape().num_elements(); @@ -432,18 +435,7 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { // If only partial sending is enabled, the following code block checks whether the sending // and recieving neighbors match each other when enable_topo_check is set to be True. - bool is_topo_check_fail = false; - if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { - if (entry.is_hierarchical) { - // TODO: support check. - BFLOG(INFO) << "Request to check topology for hierarchical neighbor " - << "allreduce ops but it is not supported yet."; - } - is_topo_check_fail = CheckNeighborSendRecvPattern( - entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name, - mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); - } - + bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(entry, mpi_ctx_, timeline_ptr); if (is_topo_check_fail) { entry.callback(Status::InvalidArgument( "Src(recv from) and dst(send to) neighbor ranks do not match")); @@ -615,18 +607,8 @@ void MPIController::NeighborAllreduce(std::vector& entries) { // If only partial sending is enabled, the following code block checks whether // the sending and recieving neighbors match each other when enable_topo_check // is set to be True. - bool is_topo_check_fail = false; - if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) { - if (first_entry.is_hierarchical) { - // TODO: support check. - BFLOG(WARNING) << "Request to check topology for hierarchical neighbor " - << "allreduce ops but it is not supported yet."; - } - is_topo_check_fail = CheckNeighborSendRecvPattern( - first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name, - mpi_ctx_.size_, timeline_ptr, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); - } - + bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(first_entry, mpi_ctx_, + timeline_ptr); if (is_topo_check_fail) { for (auto& entry : entries) { entry.callback(Status::InvalidArgument( diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h index 3f5af419..be5daa34 100644 --- a/bluefog/common/mpi_controller.h +++ b/bluefog/common/mpi_controller.h @@ -28,8 +28,10 @@ namespace common { // Function to check if the sending and receiving neighbors match in the // topology. bool CheckNeighborSendRecvPattern( - const std::vector* send_neighbors, const std::vector* recv_neighbors, + const std::vector& send_neighbors, const std::vector& recv_neighbors, const std::string& tensor_name, int size, Timeline* timeline_ptr, const MPI_Comm& comm); +bool CheckNeighborSendRecvPatternForEntry( + const TensorTableEntry& entry, const MPIContext& mpi_ctx, Timeline* timeline_ptr); class MPIController { public: diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index 7bdd9acf..7f2d52e5 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -579,23 +579,13 @@ void NCCLController::NeighborAllgather(TensorTableEntry& entry) { displcmnts = new int[mpi_ctx_.neighbor_indgree_]; status = mpi_ctx_.AllocateOutput(entry, recvcounts, Communicator::GRAPH); } else { - bool is_topo_check_fail = false; - if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { - if (entry.is_hierarchical) { - // TODO: support check. - BFLOG(INFO) << "Request to check topology for hierarchical neighbor " - << "allreduce ops but it is not supported yet."; - } - is_topo_check_fail = CheckNeighborSendRecvPattern( - entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name, - mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); - } - + bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(entry, mpi_ctx_, timeline_ptr_); if (is_topo_check_fail) { entry.callback(Status::InvalidArgument( "Src and dst neighbor ranks do not match")); return; } + recvcounts = new int[entry.recv_neighbors->size()]; displcmnts = new int[entry.recv_neighbors->size()]; status = mpi_ctx_.AllocateOutput(entry, recvcounts, Communicator::DYNAMIC, @@ -737,17 +727,7 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { // If only partial sending is enabled, the following code block checks whether // the sending and recieving neighbors match each other when enable_topo_check // is set to be True. - bool is_topo_check_fail = false; - if (entry.enable_topo_check && entry.dynamic_neighbors_enabled) { - if (entry.is_hierarchical) { - // TODO: support check. - BFLOG(INFO) << "Request to check topology for hierarchical neighbor " - << "allreduce ops but it is not supported yet."; - } - is_topo_check_fail = CheckNeighborSendRecvPattern( - entry.send_neighbors.get(), entry.recv_neighbors.get(), entry.tensor_name, - mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); - } + bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(entry, mpi_ctx_, timeline_ptr_); if (is_topo_check_fail) { entry.callback(Status::InvalidArgument( "Send and recv neighbors dont' match in neighbor " @@ -1017,17 +997,8 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { // If only partial sending is enabled, the following code block checks whether // the sending and recieving neighbors match each other when enable_topo_check // is set to be True. - bool is_topo_check_fail = false; - if (first_entry.enable_topo_check && first_entry.dynamic_neighbors_enabled) { - if (first_entry.is_hierarchical) { - // TODO: support check. - BFLOG(INFO) << "Request to check topology for hierarchical neighbor " - << "allreduce ops but it is not supported yet."; - } - is_topo_check_fail = CheckNeighborSendRecvPattern( - first_entry.send_neighbors.get(), first_entry.recv_neighbors.get(), first_entry.tensor_name, - mpi_ctx_.size_, timeline_ptr_, mpi_ctx_.GetMPICommunicator(Communicator::GLOBAL)); - } + bool is_topo_check_fail = CheckNeighborSendRecvPatternForEntry(first_entry, mpi_ctx_, + timeline_ptr_); if (is_topo_check_fail) { for (auto& entry : entries) { entry.callback(Status::InvalidArgument( From 3d90087d562a41fe3123865d8a6d620b0c4bd0cd Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Fri, 26 Mar 2021 00:55:05 -0700 Subject: [PATCH 34/36] Add two more TODOs --- bluefog/common/mpi_controller.cc | 3 +++ bluefog/torch/mpi_ops.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index 5d358476..832f6be5 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -467,6 +467,9 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { weighted_tensors.push_back(std::move(weighted_tensor_ptr)); } } + // TODO(ybc) #83 Better design pattern for data_weight synchronization + // This ready event make sure the data_weight computation is done before communication, as + // Pytorch CUDA stream is not synchronized with our CUDA stream. std::shared_ptr ready_event = entry.context->RecordReadyEvent(entry.device); diff --git a/bluefog/torch/mpi_ops.py b/bluefog/torch/mpi_ops.py index 1b82b664..4c6b1dbc 100644 --- a/bluefog/torch/mpi_ops.py +++ b/bluefog/torch/mpi_ops.py @@ -570,6 +570,7 @@ def neighbor_allreduce(tensor: torch.Tensor, *, Note: self_weight and neighbor_weights must be presented at the same time. """ + # TODO(hanbinhu) #82 Symmetrical argument for self_weight, src_weights, dst_weights if (self_weight is None and src_weights is not None) or \ (self_weight is not None and src_weights is None): raise ValueError("Arguments self_weight and src_weights have to be presented at " @@ -625,6 +626,7 @@ def neighbor_allreduce_nonblocking(tensor: torch.Tensor, *, Note: self_weight and neighbor_weights must be presented at the same time. """ + # TODO(hanbinhu) #82 Symmetrical argument for self_weight, src_weights, dst_weights if (self_weight is None and src_weights is not None) or \ (self_weight is not None and src_weights is None): raise ValueError("Arguments self_weight and src_weights have to be presented at " From 96887b5063953784400b933a841bcb8d414c97ac Mon Sep 17 00:00:00 2001 From: Hanbin Hu Date: Sat, 27 Mar 2021 12:49:53 -0700 Subject: [PATCH 35/36] Address review comments --- bluefog/common/global_state.h | 2 +- bluefog/common/mpi_controller.cc | 92 +++++++++++++++++-------------- bluefog/common/mpi_controller.h | 4 ++ bluefog/common/nccl_controller.cc | 82 +++++++++++++++------------ bluefog/common/nccl_controller.h | 4 ++ bluefog/common/operations.cc | 2 + bluefog/common/tensor_queue.h | 3 + bluefog/torch/mpi_ops.cc | 19 ++++--- setup.py | 16 ++++-- 9 files changed, 135 insertions(+), 89 deletions(-) diff --git a/bluefog/common/global_state.h b/bluefog/common/global_state.h index 6f940769..2be439a3 100644 --- a/bluefog/common/global_state.h +++ b/bluefog/common/global_state.h @@ -80,7 +80,7 @@ struct BluefogGlobalState { // Threshold for Tensor Fusion. All tensors that occupy memory beyond this // threshold will be fused. int64_t tensor_fusion_threshold = 8 * 1024 * 1024; - int64_t tensor_fusion_threshold_for_dst_weight = 8 * 1024 * 1024; + int64_t tensor_fusion_threshold_for_dst_weight = tensor_fusion_threshold; FusionBufferManager fusion_buffer; // Because setting topology happens in the main thread instead of communication diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc index 832f6be5..cd632049 100644 --- a/bluefog/common/mpi_controller.cc +++ b/bluefog/common/mpi_controller.cc @@ -449,13 +449,13 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { // including itself is more intuitive. std::string error_message = ""; - if (!entry.is_hierarchical) { - if (!entry.dynamic_neighbors_enabled) { + if (!entry.is_hierarchical) { // neighbor allreduce without hierarchy + if (!entry.dynamic_neighbors_enabled) { // static topology MPICHECK(MPI_Neighbor_allgather( sendbuf, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), buffer_data, num_elements, mpi_ctx_.GetMPIDataType(entry.output), mpi_ctx_.GetMPICommunicator(Communicator::GRAPH))); - } else { + } else { // dynamic topology int nsend = entry.send_neighbors->size(); int nrecv = entry.recv_neighbors->size(); @@ -468,8 +468,9 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { } } // TODO(ybc) #83 Better design pattern for data_weight synchronization - // This ready event make sure the data_weight computation is done before communication, as - // Pytorch CUDA stream is not synchronized with our CUDA stream. + // This ready event makes sure the data_weight computation is done before communication, as + // Pytorch CUDA stream is not synchronized with our CUDA stream, and it does nothing when + // it is running on CPU. std::shared_ptr ready_event = entry.context->RecordReadyEvent(entry.device); @@ -486,16 +487,14 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { } if (entry.dst_weighting_enabled) { - if (ready_event != nullptr) { - while (!ready_event->Ready()) { - std::this_thread::sleep_for(std::chrono::nanoseconds(100)); - } + while ((ready_event != nullptr) && !ready_event->Ready()) { + std::this_thread::sleep_for(std::chrono::nanoseconds(100)); } } for (int i = 0; i < nsend; ++i) { - const void* buffer_send = sendbuf; - if (entry.dst_weighting_enabled) - buffer_send = weighted_tensors[i].get()->data(); + const void* buffer_send = (entry.dst_weighting_enabled) + ? weighted_tensors[i]->data() + : sendbuf; MPICHECK(MPI_Isend(buffer_send, num_elements, mpi_ctx_.GetMPIDataType(entry.tensor), entry.send_neighbors->at(i), /*tag=*/mpi_ctx_.rank_ + entry.send_neighbors->at(i), @@ -505,7 +504,7 @@ void MPIController::NeighborAllreduce(TensorTableEntry& entry) { error_message = GenerateNeighborExchangeErrorMessage(statuses, nsend, nrecv); } - } else { + } else { // hierarchical neighbor allreduce if (entry.send_neighbors->empty()) { throw std::runtime_error( "Under hierarchical neighbor_allreduce, argument " @@ -630,26 +629,8 @@ void MPIController::NeighborAllreduce(std::vector& entries) { if (first_entry.dst_weighting_enabled) { // Generate weighted data fusion for sending timeline_ptr->ActivityStartAll(entries, "MEMCPY_IN_WEIGHT_FUSION_BUFFER"); - void* weight_buffer_data; - MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(), - fused_input_data, num_elements, element_size, - first_entry.context, first_entry.device); - int64_t offset = 0; - for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) { - double dst_weight = first_entry.send_weights->at(i); - void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset; - if (first_entry.device == CPU_DEVICE_ID) { - ScaleCPUBuffer(dst_weight, weight_buffer_data_offset, num_elements, - first_entry.tensor->dtype()); - } else { -#if HAVE_CUDA - ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements, - first_entry.tensor->dtype(), mpi_ctx_.stream); -#endif - } - offset += num_elements * element_size; - } - weighted_fused_input_data = weight_buffer_data; + weighted_fused_input_data = GenerateWeightedFusedInputData(fused_input_data, first_entry, + num_elements, element_size); timeline_ptr->ActivityEndAll(entries); } @@ -667,13 +648,13 @@ void MPIController::NeighborAllreduce(std::vector& entries) { // including itself is more intuitive. std::string error_message = ""; - if (!first_entry.is_hierarchical) { - if (!first_entry.dynamic_neighbors_enabled) { + if (!first_entry.is_hierarchical) { // neighbor allreduce without hierarchy + if (!first_entry.dynamic_neighbors_enabled) { // static topology MPICHECK(MPI_Neighbor_allgather( fused_input_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor), buffer_data, num_elements, mpi_ctx_.GetMPIDataType(first_entry.output), mpi_ctx_.GetMPICommunicator(Communicator::GRAPH))); - } else { + } else { // dynamic topology int nsend = first_entry.send_neighbors->size(); int nrecv = first_entry.recv_neighbors->size(); std::vector requests(nsend + nrecv); @@ -692,10 +673,10 @@ void MPIController::NeighborAllreduce(std::vector& entries) { } #endif for (int i = 0; i < nsend; ++i) { - const void* sendbuf = fused_input_data; - if (first_entry.dst_weighting_enabled) - sendbuf = - (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size); + const void* sendbuf = + (first_entry.dst_weighting_enabled) + ? (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size) + : fused_input_data; MPICHECK(MPI_Isend(sendbuf, num_elements, mpi_ctx_.GetMPIDataType(first_entry.tensor), first_entry.send_neighbors->at(i), /*tag=*/mpi_ctx_.rank_ + first_entry.send_neighbors->at(i), @@ -705,7 +686,7 @@ void MPIController::NeighborAllreduce(std::vector& entries) { error_message = GenerateNeighborExchangeErrorMessage(statuses, nsend, nrecv); } - } else { + } else { // hierarchical neighbor allreduce if (first_entry.send_neighbors->empty()) { throw std::runtime_error( "Under hierarchical neighbor_allreduce, argument " @@ -1442,6 +1423,35 @@ void MPIController::MemcpyInWeightFusionBuffer( } } +const void* MPIController::GenerateWeightedFusedInputData(const void* fused_input_data, + const TensorTableEntry& entry, + int64_t num_elements, int element_size) { + // Given a fused_input_data like [t_1, t_2], the storage for neighbor_allreduce in + // weighted fused input data is like [t_1_w1, t_2_w1 | t_1_w2, t_2_w2 | t_1_w3, t_2_w3]. + // Here t_1 and t_2 means self tensor 1 and 2 and _w1, _w2, and _w3 means the + // destination weights to destination 1, 2, and 3. + void* weight_buffer_data; + MemcpyInWeightFusionBuffer(weight_buffer_data, entry.send_neighbors->size(), + fused_input_data, num_elements, element_size, + entry.context, entry.device); + int64_t offset = 0; + for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { + double dst_weight = entry.send_weights->at(i); + void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset; + if (entry.device == CPU_DEVICE_ID) { + ScaleCPUBuffer(dst_weight, weight_buffer_data_offset, num_elements, + entry.tensor->dtype()); + } else { +#if HAVE_CUDA + ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements, + entry.tensor->dtype(), mpi_ctx_.stream); +#endif + } + offset += num_elements * element_size; + } + return weight_buffer_data; +} + void MPIController::MemcpyInFusionBuffer( const std::vector& entries, void*& buffer_data, size_t& buffer_len) { diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h index be5daa34..b3eda810 100644 --- a/bluefog/common/mpi_controller.h +++ b/bluefog/common/mpi_controller.h @@ -127,6 +127,10 @@ class MPIController { const void* buffer_data, int64_t num_elements, int element_size, std::shared_ptr context, int device); + const void* GenerateWeightedFusedInputData(const void* fused_input_data, + const TensorTableEntry& entry, + int64_t num_elements, int element_size); + void MemcpyOutFusionBufferForInputs(const void* fused_input_data, std::vector& entries); diff --git a/bluefog/common/nccl_controller.cc b/bluefog/common/nccl_controller.cc index 7f2d52e5..f5fbd1df 100644 --- a/bluefog/common/nccl_controller.cc +++ b/bluefog/common/nccl_controller.cc @@ -739,9 +739,9 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { // Ensure the lifecycle of the weighted tensors are alive after communication. #if NCCL_MINOR > 6 - if (!entry.is_hierarchical) { + if (!entry.is_hierarchical) { // neighbor allreduce without hierarchy ncclGroupStart(); - if (!entry.dynamic_neighbors_enabled) { + if (!entry.dynamic_neighbors_enabled) { // static topology for (int i = 0; i < mpi_ctx_.neighbor_indgree_; i++) { int recv_rank = mpi_ctx_.neighbor_in_ranks_[i]; void* recvbuf = (void*)(static_cast(entry.output->data()) + @@ -753,13 +753,17 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(entry.tensor), send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } - } else { + } else { // dynamic topology if(entry.dst_weighting_enabled) { for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { auto weighted_tensor_ptr = entry.tensor->data_weight(entry.send_weights->at(i)); weighted_tensors.push_back(std::move(weighted_tensor_ptr)); } } + // TODO(ybc) #83 Better design pattern for data_weight synchronization + // This ready event makes sure the data_weight computation is done before communication, as + // Pytorch CUDA stream is not synchronized with our CUDA stream, and it does nothing when + // it is running on CPU. std::shared_ptr ready_event = entry.context->RecordReadyEvent(entry.device); for (size_t i = 0; i < entry.recv_neighbors->size(); ++i) { @@ -770,22 +774,20 @@ void NCCLController::NeighborAllreduce(TensorTableEntry& entry) { recv_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } if(entry.dst_weighting_enabled) { - if (ready_event != nullptr) { - while (!ready_event->Ready()) { - std::this_thread::sleep_for(std::chrono::nanoseconds(100)); - } + while ((ready_event != nullptr) && !ready_event->Ready()) { + std::this_thread::sleep_for(std::chrono::nanoseconds(100)); } } for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { - const void* buffer_send = sendbuf; - if (entry.dst_weighting_enabled) - buffer_send = weighted_tensors[i].get()->data(); + const void* buffer_send = (entry.dst_weighting_enabled) + ? weighted_tensors[i]->data() + : sendbuf; NCCLCHECK(ncclSend(buffer_send, num_elements, GetNCCLDataType(entry.tensor), entry.send_neighbors->at(i), nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } } ncclGroupEnd(); - } else { + } else { // hierarchical neighbor allreduce if (entry.send_neighbors->empty()) { throw std::runtime_error( "Under hierarchical neighbor_allreduce, argument " @@ -1016,22 +1018,11 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { RecordEvent(event_queue, "MEM_CPY_IN"); } - const void* weighted_fused_input_data = nullptr; - if (first_entry.dst_weighting_enabled) { - void* weight_buffer_data = nullptr; - MemcpyInWeightFusionBuffer(weight_buffer_data, first_entry.send_neighbors->size(), - fused_input_data, num_elements, element_size, - first_entry.context, first_entry.device); - int64_t offset = 0; - for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) { - double dst_weight = first_entry.send_weights->at(i); - void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset; - ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements, - first_entry.tensor->dtype(), nccl_ctx_.stream); - offset += num_elements * element_size; - } - weighted_fused_input_data = weight_buffer_data; - } + const void* weighted_fused_input_data = + (first_entry.dst_weighting_enabled) + ? GenerateWeightedFusedInputData(fused_input_data, first_entry, + num_elements, element_size) + : nullptr; // Unlike allreduce, the storage for neighbor_allreduce in fusion buffer // is like [t_1, t_2 | t_1_n1, t_2_n1, t_1_n2, t_2_n2]. @@ -1040,9 +1031,9 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { // Hence, we need to offset the buffer data to location for neighbors. buffer_data = (uint8_t*)buffer_data + num_elements * element_size; - if (!first_entry.is_hierarchical) { + if (!first_entry.is_hierarchical) { // neighbor allreduce without hierarchy ncclGroupStart(); - if (!first_entry.dynamic_neighbors_enabled) { + if (!first_entry.dynamic_neighbors_enabled) { // static topology for (int i = 0; i < mpi_ctx_.neighbor_indgree_; i++) { int recv_rank = mpi_ctx_.neighbor_in_ranks_[i]; void* recvbuf = @@ -1056,7 +1047,7 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { GetNCCLDataType(first_entry.tensor), send_rank, nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } - } else { + } else { // dynamic topology for (size_t i = 0; i < first_entry.recv_neighbors->size(); ++i) { int recv_rank = first_entry.recv_neighbors->at(i); void* recvbuf = @@ -1066,16 +1057,17 @@ void NCCLController::NeighborAllreduce(std::vector& entries) { nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } for (size_t i = 0; i < first_entry.send_neighbors->size(); ++i) { - const void* sendbuf = fused_input_data; - if (first_entry.dst_weighting_enabled) - sendbuf = (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size); + const void* sendbuf = + (first_entry.dst_weighting_enabled) + ? (void*)((uint8_t*)weighted_fused_input_data + num_elements * i * element_size) + : fused_input_data; NCCLCHECK(ncclSend(sendbuf, num_elements, GetNCCLDataType(first_entry.tensor), first_entry.send_neighbors->at(i), nccl_ctx_.nccl_comm, nccl_ctx_.stream)); } } ncclGroupEnd(); - } else { + } else { // hierarchical neighbor allreduce if (first_entry.send_neighbors->empty()) { throw std::runtime_error( "Under hierarchical neighbor_allreduce, argument " @@ -1925,6 +1917,28 @@ void NCCLController::MemcpyInWeightFusionBuffer( } } +const void* NCCLController::GenerateWeightedFusedInputData(const void* fused_input_data, + const TensorTableEntry& entry, + int64_t num_elements, int element_size) { + // Given a fused_input_data like [t_1, t_2], the storage for neighbor_allreduce in + // weighted fused input data is like [t_1_w1, t_2_w1 | t_1_w2, t_2_w2 | t_1_w3, t_2_w3]. + // Here t_1 and t_2 means self tensor 1 and 2 and _w1, _w2, and _w3 means the + // destination weights to destination 1, 2, and 3. + void* weight_buffer_data = nullptr; + MemcpyInWeightFusionBuffer(weight_buffer_data, entry.send_neighbors->size(), + fused_input_data, num_elements, element_size, + entry.context, entry.device); + int64_t offset = 0; + for (size_t i = 0; i < entry.send_neighbors->size(); ++i) { + double dst_weight = entry.send_weights->at(i); + void* weight_buffer_data_offset = (uint8_t*)weight_buffer_data + offset; + ScaleBufferCudaImpl(dst_weight, weight_buffer_data_offset, num_elements, + entry.tensor->dtype(), nccl_ctx_.stream); + offset += num_elements * element_size; + } + return weight_buffer_data; +} + void NCCLController::MemcpyOutFusionBuffer( const void* buffer_data, std::vector& entries) { int64_t offset = 0; diff --git a/bluefog/common/nccl_controller.h b/bluefog/common/nccl_controller.h index 0a2b1013..952cad5e 100644 --- a/bluefog/common/nccl_controller.h +++ b/bluefog/common/nccl_controller.h @@ -199,6 +199,10 @@ class NCCLController { const void* buffer_data, int64_t num_elements, int element_size, std::shared_ptr context, int device); + const void* GenerateWeightedFusedInputData(const void* fused_input_data, + const TensorTableEntry& entry, + int64_t num_elements, int element_size); + void MemcpyOutFusionBufferForInputs(const void* fused_input_data, std::vector& entries); diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc index ddf90cf5..bb6940cb 100644 --- a/bluefog/common/operations.cc +++ b/bluefog/common/operations.cc @@ -482,6 +482,8 @@ void BackgroundThreadLoop(BluefogGlobalState& state) { if (bluefog_fusion_threshold != nullptr) { state.tensor_fusion_threshold = std::strtol(bluefog_fusion_threshold, nullptr, 10); + state.tensor_fusion_threshold_for_dst_weight = + state.tensor_fusion_threshold; } // Initialize the tensor count table. No tensors are available yet. diff --git a/bluefog/common/tensor_queue.h b/bluefog/common/tensor_queue.h index 47d74f88..a0302496 100644 --- a/bluefog/common/tensor_queue.h +++ b/bluefog/common/tensor_queue.h @@ -83,6 +83,9 @@ class FusionBufferManager { std::function on_end_init); // Initializes a buffer of the given threshold size times MPI size if not already cached. + // There is one constraint to noticed here. We need WeightBuffer is always larger than + // (size-1)*fusion Buffer since we don't want to tensor being able to put into the fusion + // buffer but not able to put into weightbuffer. // // Args: // threshold: Size of the buffer in bytes. diff --git a/bluefog/torch/mpi_ops.cc b/bluefog/torch/mpi_ops.cc index 636da5f0..9a980d54 100644 --- a/bluefog/torch/mpi_ops.cc +++ b/bluefog/torch/mpi_ops.cc @@ -407,11 +407,14 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, auto callback_wrapper = GetCallbackWrapper(handle, timeline_ptr, op_name, tid); - std::map src_weights_map; + // src_neighbors, dst_neighbors --> list of ranks only used in Enqueue + // src_weights_ordered_map --> used in callback only + // dst_weights_vec --> used in Enqueue for sending (same order of dst_neighbors) + std::map src_weights_ordered_map; for (auto kv : src_weights) - src_weights_map.insert(kv); + src_weights_ordered_map.insert(kv); std::vector src_neighbors; - for (auto kv : src_weights_map) + for (auto kv : src_weights_ordered_map) src_neighbors.push_back(kv.first); std::vector dst_neighbors; @@ -440,11 +443,11 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec, dynamic_neighbors_enabled, dst_weighting_enabled, is_hierarchical, enable_topo_check, op_name, CPU_DEVICE_ID, - callback_wrapper([self_weight, src_weights_map, avg_computation, cpu_output, tensor, + callback_wrapper([self_weight, src_weights_ordered_map, avg_computation, cpu_output, tensor, dynamic_neighbors_enabled, is_hierarchical, output, device]() mutable { with_device device_guard(device); output.copy_(cpu_output); - PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_map, + PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_ordered_map, avg_computation, dynamic_neighbors_enabled, is_hierarchical); })); @@ -460,9 +463,9 @@ int DoNeighborAllreduce(::torch::Tensor tensor, ::torch::Tensor output, bf_src_neighbors, bf_dst_neighbors, bf_dst_weights_vec, dynamic_neighbors_enabled, dst_weighting_enabled, is_hierarchical, enable_topo_check, op_name, device, - callback_wrapper([self_weight, src_weights_map, avg_computation, dynamic_neighbors_enabled, - is_hierarchical, tensor, output]() mutable { - PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_map, + callback_wrapper([self_weight, src_weights_ordered_map, avg_computation, + dynamic_neighbors_enabled, is_hierarchical, tensor, output]() mutable { + PerformNeighborAllreduceCallback(tensor, output, self_weight, src_weights_ordered_map, avg_computation, dynamic_neighbors_enabled, is_hierarchical); })); diff --git a/setup.py b/setup.py index f4e3bffb..fa736de5 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ import subprocess import sys import textwrap +import shlex import traceback from typing import List @@ -449,18 +450,23 @@ def build_nvcc_extra_objects(nvcc_cmd: str, cxx11_abi: bool) -> List[str]: # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -rdc=true -c cuda_kernels.cu # nvcc --compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI=0' -dlink -o cuda_kernels_link.o \ # cuda_kernels.o -lcudart - nvcc_flags = f"--compiler-options '-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}'" + nvcc_flags = f'-fPIC -D_GLIBCXX_USE_CXX11_ABI={int(cxx11_abi)}' extra_object_dir = 'bluefog/common/cuda/' source = extra_object_dir+'cuda_kernels.cu' object_file = extra_object_dir+'cuda_kernels.o' object_link = extra_object_dir+'cuda_kernels_link.o' - command_object = f"{nvcc_cmd} {nvcc_flags} -rdc=true -c {source} -o {object_file}" - command_link = f"{nvcc_cmd} {nvcc_flags} -dlink {object_file} -lcudart -o {object_link}" + command_object = [nvcc_cmd, '--compiler-options', nvcc_flags, + '-rdc=true', '-c', source, '-o', object_file] + command_link = [nvcc_cmd, '--compiler-options', nvcc_flags, + '-dlink', object_file, '-lcudart', '-o', object_link] + + command_object_str = ' '.join(shlex.quote(par) for par in command_object) + command_link_str = ' '.join(shlex.quote(par) for par in command_link) - subprocess.check_call([command_object], shell=True) - subprocess.check_call([command_link], shell=True) + subprocess.check_call(command_object_str, shell=True) + subprocess.check_call(command_link_str, shell=True) return [object_file, object_link] def build_torch_extension(build_ext, global_options, torch_version): From 36c073abf9c5352e85de4a8dd9f96e2c95085c29 Mon Sep 17 00:00:00 2001 From: YBC Date: Sat, 10 Apr 2021 19:52:04 -0700 Subject: [PATCH 36/36] Add condition variable to control the loop (#88) * Add condition variable to control the loop * Minor update on topology_setting in global_state * Add missing header * Change cv.wait to cv.wait_for 10 seconds * Address comment and remove adjusting resetVersionWinMem in ibfrun --- bluefog/common/global_state.h | 13 +++- bluefog/common/mpi_context.cc | 2 +- bluefog/common/operations.cc | 116 +++++++++++++++------------------ bluefog/common/tensor_queue.h | 36 +++++----- bluefog/run/interactive_run.py | 3 +- test/torch_ops_test.py | 8 ++- 6 files changed, 91 insertions(+), 87 deletions(-) diff --git a/bluefog/common/global_state.h b/bluefog/common/global_state.h index 2be439a3..2683ad14 100644 --- a/bluefog/common/global_state.h +++ b/bluefog/common/global_state.h @@ -18,6 +18,7 @@ #define BLUEFOG_COMMON_GLOBAL_STATE_H #include +#include #include #include #include @@ -54,6 +55,14 @@ struct BluefogGlobalState { // Whether collective context has been completed on the background thread. std::atomic_bool initialization_done{false}; + // Condition variable and its mutex for main loop in communication thread. + std::condition_variable loop_cv; + std::mutex loop_mutex; + + // Under negotiation, the entries sends to master first and wait until it + // returns ok to run. This variable keeps the records of that. + std::atomic_int unfinished_enqueued_entries{0}; + // Timeline writer. Timeline timeline; @@ -84,10 +93,8 @@ struct BluefogGlobalState { FusionBufferManager fusion_buffer; // Because setting topology happens in the main thread instead of communication - // thread. Following three variables are to sync between them. + // thread. Not really used since the condition variable refactor. std::atomic_bool setting_topology{false}; - std::atomic_bool setting_topology_done{false}; - std::atomic_bool ready_to_setting_topology{false}; // Only exists on the coordinator node (rank zero). Maintains a vector of // requests to allreduce every tensor (keyed by tensor name). diff --git a/bluefog/common/mpi_context.cc b/bluefog/common/mpi_context.cc index 7a68ce39..e5ef0e6c 100644 --- a/bluefog/common/mpi_context.cc +++ b/bluefog/common/mpi_context.cc @@ -75,7 +75,7 @@ bool WindowManager::InitializeMutexWin(const MPI_Comm& mpi_comm) { std::vector WindowManager::GetVersionMemoryCopy() { return version_mem_; } void WindowManager::resetVersionWinMem(int initialValue /*=0*/) { - for (int i = 0; i < version_mem_.size(); i++) { + for (size_t i = 0; i < version_mem_.size(); i++) { version_mem_[i] = initialValue; } } diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc index bb6940cb..578291e0 100644 --- a/bluefog/common/operations.cc +++ b/bluefog/common/operations.cc @@ -438,7 +438,7 @@ bool IsSameList (std::shared_ptr> n1, std::shared_ptrsize() != n2->size()) return false; // The order matters as well. - for (int i = 0; i < n1->size(); i++) { + for (size_t i = 0; i < n1->size(); i++) { if (n1->at(i) != n2->at(i)) { return false; } @@ -853,7 +853,9 @@ void PerformOperationWithFusion(std::vector& entries) { void NegotiateOfRequestOfMaster(BluefogGlobalState& state, std::deque& message_queue_buffer, bool& should_change_topo, - bool& should_shut_down) { + bool& should_shut_down) { + state.unfinished_enqueued_entries.fetch_add(message_queue_buffer.size()); + std::vector ready_to_reduce; RequestList message_list; message_list.set_shutdown(should_shut_down); @@ -1035,6 +1037,7 @@ void NegotiateOfRequestOfMaster(BluefogGlobalState& state, } else { PerformOperation(nego_entries); } + state.unfinished_enqueued_entries.fetch_sub(nego_entries.size()); } // Check for stalled tensors. @@ -1049,6 +1052,7 @@ void NegotiateOfRequestOfSlave(BluefogGlobalState& state, std::deque& message_queue_buffer, bool& should_change_topo, bool& should_shut_down) { + state.unfinished_enqueued_entries.fetch_add(message_queue_buffer.size()); std::string encoded_message; RequestList message_list; message_list.set_shutdown(state.shut_down); @@ -1057,6 +1061,7 @@ void NegotiateOfRequestOfSlave(BluefogGlobalState& state, message_list.add_request(message_queue_buffer.front()); message_queue_buffer.pop_front(); } + RequestList::SerializeToString(message_list, encoded_message); int encoded_message_length = (int)encoded_message.length() + 1; MPI_Gather(&encoded_message_length, 1, MPI_INT, nullptr, 1, MPI_INT, @@ -1084,6 +1089,7 @@ void NegotiateOfRequestOfSlave(BluefogGlobalState& state, } else { PerformOperation(nego_entries); } + state.unfinished_enqueued_entries.fetch_sub(nego_entries.size()); } if (response_list.shutdown()) { @@ -1097,6 +1103,8 @@ void NegotiateOfRequestOfSlave(BluefogGlobalState& state, void NegotiationOfRequest(BluefogGlobalState& state, std::deque& message_queue_buffer, bool& should_change_topo, bool& should_shut_down) { + // TODO(ybc) should_change_topo has no effect after condition variable refactor. + // Just keep it for a while. will remove. if (bluefog_rank() == COORDINATE_RANK) { NegotiateOfRequestOfMaster(state, message_queue_buffer, should_change_topo, should_shut_down); @@ -1111,6 +1119,15 @@ bool RunLoopOnce(BluefogGlobalState& state) { bool should_shut_down = state.shut_down; bool should_change_topo = state.setting_topology; + std::unique_lock lk(state.loop_mutex); + // The real mutex for queue is the on under TensorQueue. + state.loop_cv.wait_for(lk, std::chrono::seconds(10), [&state] { + // When we requesting shut_down, or any unfinished entries waiting in the + // negotiation we should not wait. + return state.shut_down || (state.unfinished_enqueued_entries > 0) || + (state.tensor_queue.size() > 0); + }); + // This delay determines thread frequency and MPI message latency auto sleep_duration = state.last_cycle_start + @@ -1131,7 +1148,7 @@ bool RunLoopOnce(BluefogGlobalState& state) { std::vector entries; auto IsRequestConvertToEntryDirectly = [](const Request& request) -> bool { return global_skip_negotiate_stage || - (request.request_type() != Request::ALLREDUCE && + (request.request_type() != Request::ALLREDUCE && request.request_type() != Request::ALLGATHER && request.request_type() != Request::BROADCAST && request.request_type() != Request::NEIGHBOR_ALLREDUCE && @@ -1150,7 +1167,8 @@ bool RunLoopOnce(BluefogGlobalState& state) { std::remove_if(message_queue_buffer.begin(), message_queue_buffer.end(), IsRequestConvertToEntryDirectly), message_queue_buffer.end()); - + + lk.unlock(); // Never hold the mutex when there is remote function. PerformOperation(entries); // For the rest requests, they needs to coordinate and neogiate. @@ -1163,20 +1181,6 @@ bool RunLoopOnce(BluefogGlobalState& state) { NegotiationOfRequest(state, message_queue_buffer, should_change_topo, should_shut_down); } - // Seperate the setting topology and negotiate communnication. - // TODO(ybc) Use conditional variable and mutex to re-implement this. - if (should_change_topo) { - bluefog_global.ready_to_setting_topology = true; - while (!bluefog_global.setting_topology_done) { - std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION); - } - bluefog_global.ready_to_setting_topology = false; - // Wait for main thread reset. - while (bluefog_global.setting_topology_done) { - std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION); - } - } - return !should_shut_down; } @@ -1215,6 +1219,7 @@ void bluefog_init() { InitializeBluefogOnce(); } void bluefog_shutdown() { if (bluefog_global.background_thread.joinable()) { bluefog_global.shut_down = true; + bluefog_global.loop_cv.notify_all(); bluefog_global.background_thread.join(); // Reset the initialization flag to allow restarting with bluefog_init(...) //bluefog_global.initialize_flag.clear(); @@ -1290,36 +1295,21 @@ int bluefog_set_topology(int indegree, const int* sources, int outdegree, return -1; } #endif - bluefog_global.setting_topology = true; - while (!bluefog_global.ready_to_setting_topology.load()) { - std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION); - } - bluefog_global.tensor_queue.LockTensorQueue(); - if (bluefog_global.tensor_queue.size() > 0) { - BFLOG(ERROR) - << "Cannot set the topology because there are unfinished MPI ops."; - bluefog_global.tensor_queue.UnlockTensorQueue(); - return -1; - } - - bool mpi_result = bluefog_global.controller->SetTopology( - indegree, sources, outdegree, destinations); + bool mpi_result; + // When we change the topology, there should be no entries being processed at + // same time. + { + std::lock_guard lk(bluefog_global.loop_mutex); + mpi_result = bluefog_global.controller->SetTopology( + indegree, sources, outdegree, destinations); #if HAVE_NCCL && NCCL_MINOR < 7 - if (mpi_result && nccl_context.is_initialized) { - bluefog_global.nccl_controller->DestroyPeerCommunicators(); - bluefog_global.nccl_controller->InitPeerCommunicators(); - } + if (mpi_result && nccl_context.is_initialized) { + bluefog_global.nccl_controller->DestroyPeerCommunicators(); + bluefog_global.nccl_controller->InitPeerCommunicators(); + } #endif - bluefog_global.tensor_queue.UnlockTensorQueue(); - - bluefog_global.setting_topology = false; - bluefog_global.setting_topology_done = true; - // Wait for the background thread receive the setting_topology_done and - // close the ready_to_setting_topology epoch. - while (bluefog_global.ready_to_setting_topology) { - std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION); } - bluefog_global.setting_topology_done = false; + bluefog_global.loop_cv.notify_one(); return mpi_result; } @@ -1447,6 +1437,7 @@ Status EnqueueTensorAllreduce(std::shared_ptr tensor, return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1482,6 +1473,7 @@ Status EnqueueTensorBroadcast(std::shared_ptr tensor, return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1516,6 +1508,7 @@ Status EnqueueTensorAllgather(std::shared_ptr tensor, return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1560,6 +1553,7 @@ Status EnqueueTensorNeighborAllgather(std::shared_ptr tensor, return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1611,6 +1605,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr tensor, return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1651,6 +1646,7 @@ Status EnqueueTensorPairGossip(std::shared_ptr tensor, return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1683,6 +1679,7 @@ Status EnqueueTensorWindowCreate( return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1707,6 +1704,7 @@ Status EnqueueTensorWindowFree(const std::string& name, int device, return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1740,6 +1738,7 @@ Status EnqueueTensorWindowPut(std::shared_ptr tensor, return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1771,6 +1770,7 @@ Status EnqueueTensorWindowAccumulate( return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -1798,6 +1798,7 @@ Status EnqueueTensorWindowGet(const std::string& name, return SUSPEND_ERROR; } Status status = bluefog_global.tensor_queue.AddToTensorQueue(e, message); + bluefog_global.loop_cv.notify_one(); return status; } @@ -2040,28 +2041,15 @@ void SetSkipNegotiateStageState(bool value) { if (value == global_skip_negotiate_stage) { return; } - if (value) { - // From running negotiate to skipping negotiate, we need to properly turn - // off negotiate stage. Otherwise, it may hang the processes. Use setting - // topology flag to suspend the negotiate stage then skip it. - bluefog_global.setting_topology = true; - while (!bluefog_global.ready_to_setting_topology.load()) { - std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION); - } - - global_skip_negotiate_stage = value; - bluefog_global.setting_topology = false; - bluefog_global.setting_topology_done = true; - // Wait for the background thread receive the setting_topology_done and - // close the ready_to_setting_topology epoch. - while (bluefog_global.ready_to_setting_topology) { - std::this_thread::sleep_for(SUSPEND_BACKGROUND_WAITTING_DURATION); - } - bluefog_global.setting_topology_done = false; - } else { + // From running negotiate to skipping negotiate, we need to properly turn + // off negotiate stage. Otherwise, it may hang the processes. Use setting + // topology flag to suspend the negotiate stage then skip it. + { + std::lock_guard lk(bluefog_global.loop_mutex); global_skip_negotiate_stage = value; } + bluefog_global.loop_cv.notify_one(); } bool GetSkipNegotiateStageState() { diff --git a/bluefog/common/tensor_queue.h b/bluefog/common/tensor_queue.h index a0302496..de476743 100644 --- a/bluefog/common/tensor_queue.h +++ b/bluefog/common/tensor_queue.h @@ -46,15 +46,20 @@ class TensorQueue { void PushMessageToQueue(Request& message); - // Used when setting Topology, which require the tensor queue should be empty always. + // Used when setting Topology, which require the tensor queue should be empty + // always. inline void LockTensorQueue() { mutex_.lock(); } inline void UnlockTensorQueue() { mutex_.unlock(); } - inline size_t size() { return message_queue_.size(); } + inline size_t size() { + std::lock_guard guard(mutex_); + return message_queue_.size(); + } protected: // Tensors waiting to be processed. - // Key is based upon the message name since tensor_name in table entry for win ops - // is for window and we need to add "win_put."/"win_create." before it in message. + // Key is based upon the message name since tensor_name in table entry for win + // ops is for window and we need to add "win_put."/"win_create." before it in + // message. std::unordered_map tensor_table_; // Queue of MPI requests waiting to be sent to the coordinator node. @@ -65,8 +70,8 @@ class TensorQueue { mutable std::mutex mutex_; }; -// Encapsulates the process of creating and destroying fusion buffers as the requested -// threshold is changed. +// Encapsulates the process of creating and destroying fusion buffers as the +// requested threshold is changed. class FusionBufferManager { public: // Initializes a buffer of the given threshold size if not already cached. @@ -82,10 +87,11 @@ class FusionBufferManager { std::function on_start_init, std::function on_end_init); - // Initializes a buffer of the given threshold size times MPI size if not already cached. - // There is one constraint to noticed here. We need WeightBuffer is always larger than - // (size-1)*fusion Buffer since we don't want to tensor being able to put into the fusion - // buffer but not able to put into weightbuffer. + // Initializes a buffer of the given threshold size times MPI size if not + // already cached. There is one constraint to noticed here. We need + // WeightBuffer is always larger than (size-1)*fusion Buffer since we don't + // want to tensor being able to put into the fusion buffer but not able to put + // into weightbuffer. // // Args: // threshold: Size of the buffer in bytes. @@ -94,9 +100,7 @@ class FusionBufferManager { // context: Framework used to create the buffer and associate it. // on_start_init: Callback on starting buffer initialization. // on_end_init: Callback on completing buffer initialization. - Status InitializeWeightBuffer(int64_t threshold, - int world_size, - int device, + Status InitializeWeightBuffer(int64_t threshold, int world_size, int device, std::shared_ptr context, std::function on_start_init, std::function on_end_init); @@ -104,7 +108,8 @@ class FusionBufferManager { // Returns the buffer associated with the given device and framework, or null. std::shared_ptr GetBuffer(int device); - // Returns the weight buffer associated with the given device and framework, or null. + // Returns the weight buffer associated with the given device and framework, + // or null. std::shared_ptr GetWeightBuffer(int device); private: @@ -112,7 +117,8 @@ class FusionBufferManager { std::unordered_map, int64_t>> tensor_fusion_buffers_; - // Memory buffers for Tensor Fusion with dst weight. They are keyed by device ID. + // Memory buffers for Tensor Fusion with dst weight. They are keyed by device + // ID. std::unordered_map, int64_t>> weight_tensor_fusion_buffers_; }; diff --git a/bluefog/run/interactive_run.py b/bluefog/run/interactive_run.py index ccfc0c79..6682c8e6 100644 --- a/bluefog/run/interactive_run.py +++ b/bluefog/run/interactive_run.py @@ -406,7 +406,8 @@ def handler(signum, frame): signal.signal(signal.SIGINT, handler) env = os.environ.copy() - env['BLUEFOG_CYCLE_TIME'] = str(20) # Increase the cycle time + # No longer needed after using condition variable. + # env['BLUEFOG_CYCLE_TIME'] = str(20) # Increase the cycle time # action of stop if args.action == "stop": diff --git a/test/torch_ops_test.py b/test/torch_ops_test.py index 974a1aca..c8c8ca5a 100644 --- a/test/torch_ops_test.py +++ b/test/torch_ops_test.py @@ -171,6 +171,7 @@ def test_allreduce_sum(self): dims = [1, 2, 3] for dtype, dim in itertools.product(dtypes, dims): + torch.manual_seed(123456) tensor = torch.FloatTensor(*([23] * dim)).random_(-100, 100) tensor = self.cast_and_place(tensor, dtype) name = "allreduce_tensor_{}_{}".format(dim, dtype) @@ -1207,7 +1208,7 @@ def test_neighbor_allgather_dynamic_variable_size(self): torch.ByteTensor, torch.CharTensor, torch.ShortTensor, torch.HalfTensor] if TEST_ON_GPU: dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor] - + # Connect to all other ranks neighbor_ranks = [i for i in range(size) if i != rank] dims = [1, 2, 3] @@ -1222,10 +1223,11 @@ def test_neighbor_allgather_dynamic_variable_size(self): tensor = torch.FloatTensor( *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank) tensor = self.cast_and_place(tensor, dtype) - gathered = bf.neighbor_allgather(tensor, dst_ranks=neighbor_ranks, src_ranks=neighbor_ranks) + gathered = bf.neighbor_allgather( + tensor, dst_ranks=neighbor_ranks, src_ranks=neighbor_ranks) tensor, gathered = self.convert_cpu_fp16_to_fp32(tensor, gathered) - tensor_sizes[rank] = 0 # remove self-size since neighbor_allgather does not include self. + tensor_sizes[rank] = 0 # remove self since neighbor_allgather does not include self expected_size = sum(tensor_sizes) assert list(gathered.shape) == [expected_size] + [17] * (dim - 1)