* Fixed the self_weight under empty receiving case
* Enable empty send neighbors and fix HalfTensor for recv_size==0
* Rename neighbor_weights to src_weights, and send_neighbors to dst_weights, for neighbor_allreduce
* A script to test existing examples
* Accept dst_weights as Dict, and reorganize DoNeighborAllreduce
* Reorganize CheckNeighborSendRecvPattern
* Fix timeline_ptr for NCCL
* Put dst_weights information into TensorTableEntry
* First version of neighbor_allreduce dst_weight; known problems: fusion not implemented, CUDA data_weight problem
* Add some delay after data_weight as a temporary solution
* CPU fusion for dst_weighted added
* Add ReadyEvent for dst_weight for single-entry neighbor_allreduce
* Remove const qualifier for tensor dtype, as it is meaningless
* Add CUDA source for scale buffer
* Scale buffer to modify itself
* Add .o files to .gitignore
* dst_weight using CUDA for fused entry, and compile flow in Python setup.py
* make clean *.o files generated by nvcc
* Add fix for NCCL single entry
* Make setup.py more robust
* Add timeout and CUDA check
* Move test example
* Fix NCCL-side dst_weight fusion bug
* Add agg backend to make matplotlib more stable
* Address comments for setup.py
* Simpler logic for dst_weighting_enabled and weighted_average_computation
* Better consideration for weight buffer size
* Make src_weights a std::map, and simplify logic for PerformNeighborAllreduceCallback (sketched below)
* Add TODO #80 and #81, and simplify the logic for dst_weight
* Wrap CheckNeighborSendRecvPattern again
* Add two more TODOs
* Address review comments
* Add condition variable to control the loop (#88)
* Minor update on topology_setting in global_state
* Add missing <condition_variable> header
* Change cv.wait to cv.wait_for with a 10-second timeout (sketched below)
* Address comment and remove adjusting resetVersionWinMem in ibfrun

Co-authored-by: ybc <bichengying@gmail.com>
Parent: 8bde896
Commit: 2f696ed

Showing 34 changed files with 1,233 additions and 508 deletions.
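The src_weights rename above changes how neighbor_allreduce weights received tensors: a std::map from neighbor rank to weight, combined with a self_weight for the local tensor. The following is a minimal sketch of that idea only; the names WeightedNeighborReduce and recv_buffers are hypothetical, and the commit's actual logic lives in PerformNeighborAllreduceCallback.

#include <map>
#include <vector>

// Hedged sketch: weighted average of the local tensor and the buffers
// received from in-neighbors, with per-rank weights from src_weights.
std::vector<double> WeightedNeighborReduce(
    const std::map<int, std::vector<double>>& recv_buffers,
    const std::map<int, double>& src_weights,
    const std::vector<double>& self_tensor, double self_weight) {
  std::vector<double> out(self_tensor.size());
  // Start from the weighted local contribution.
  for (size_t i = 0; i < out.size(); ++i) out[i] = self_weight * self_tensor[i];
  // Accumulate each neighbor's buffer, scaled by that neighbor's weight.
  for (const auto& kv : recv_buffers) {
    const double w = src_weights.at(kv.first);  // weight keyed by source rank
    for (size_t i = 0; i < out.size(); ++i) out[i] += w * kv.second[i];
  }
  return out;
}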
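The condition-variable items from PR #88 replace a spinning loop with a timed wait. Below is a minimal sketch of the cv.wait_for pattern, assuming hypothetical names (BackgroundLoop, work_ready, loop_mutex) in place of the commit's actual global_state members.

#include <chrono>
#include <condition_variable>
#include <mutex>

std::mutex loop_mutex;
std::condition_variable loop_cv;
bool work_ready = false;  // set by a producer thread under loop_mutex

void BackgroundLoop(bool& shutdown) {
  while (!shutdown) {
    std::unique_lock<std::mutex> lock(loop_mutex);
    // wait_for returns on notification or after the 10-second timeout,
    // so the loop still wakes up periodically to re-check its state
    // instead of busy-spinning.
    loop_cv.wait_for(lock, std::chrono::seconds(10),
                     [&] { return work_ready || shutdown; });
    if (work_ready) {
      work_ready = false;
      // ... process pending work ...
    }
  }
}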
.gitignore
@@ -5,6 +5,7 @@ __pycache__/

 # C extensions
 *.so
+*.o

 # Distribution / packaging
 .Python
cuda_kernels.cu (new file)
@@ -0,0 +1,120 @@
// Copyright (C) 2020 NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

#include "cuda_kernels.h"

#include <stdexcept>
#include <cuda_fp16.h>

namespace bluefog {
namespace common {

// Generic in-place scaling kernel. The grid-stride loop lets a fixed-size
// grid cover a buffer of any length.
template<typename T, typename TS>
__global__ void scale_buffer_k(T* buffer, int64_t num_elements, const TS scale_factor) {
  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] *= scale_factor;
  }
}

// Specialization for half2
__global__ void scale_buffer_half2_k(__half* buffer, int64_t num_elements, const __half scale_factor) {
  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ > 530
  // Reinterpret the buffer as half2 and scale two elements per iteration.
  __half2* buffer_h2 = reinterpret_cast<__half2 *>(buffer);
  const __half2 scale_factor_h2 = __halves2half2(scale_factor, scale_factor);
  for (size_t i = idx; i < num_elements / 2; i += gridDim.x * blockDim.x) {
    buffer_h2[i] = __hmul2(scale_factor_h2, buffer_h2[i]);
  }
  // Deal with last element if num_elements is odd
  if (idx == 0 && num_elements % 2) {
    buffer[num_elements - 1] = __hmul(scale_factor, buffer[num_elements - 1]);
  }
#else
  // No native __half arithmetic on older architectures: compute in float.
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i]));
  }
#endif
}

// Specialization for architectures without __half compute
template<>
__global__ void scale_buffer_k(__half* buffer, int64_t num_elements, const __half scale_factor) {
  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ > 530
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] *= scale_factor;
  }
#else
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i]));
  }
#endif
}

#define NTHREADS_SCALE_BUFFER_KERNEL 512
void ScaleBufferCudaImpl(double scale_factor, void* buffer_data, const int64_t num_elements,
                         DataType dtype, cudaStream_t stream) {
  const int64_t blocks = (num_elements + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
  const int threads = NTHREADS_SCALE_BUFFER_KERNEL;
  switch (dtype) {
    case DataType::BLUEFOG_UINT8:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((uint8_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_INT8:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((int8_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_INT32:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((int32_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_INT64:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((int64_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_FLOAT16:
    {
      __half scale_factor_half = __float2half((float) scale_factor);
      if ((size_t) buffer_data % 4 == 0) {
        // If alignment allows, use half2 specialized kernel
        int64_t num_elements_h2 = (num_elements + 1) / 2;
        int64_t blocks_h2 = (num_elements_h2 + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
        scale_buffer_half2_k<<<blocks_h2, threads, 0, stream>>>((__half*) buffer_data, num_elements, scale_factor_half);
      } else {
        scale_buffer_k<<<blocks, threads, 0, stream>>>((__half*) buffer_data, num_elements, scale_factor_half);
      }
      break;
    }
    case DataType::BLUEFOG_FLOAT32:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((float*) buffer_data, num_elements, (float) scale_factor);
      break;
    case DataType::BLUEFOG_FLOAT64:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((double*) buffer_data, num_elements, scale_factor);
      break;
    default:
      throw std::logic_error("Type " + DataType_Name(dtype) +
                             " not supported by ScaleBufferCudaImpl.");
  }
}

}  // namespace common
}  // namespace bluefog
cuda_kernels.h (new file)
@@ -0,0 +1,33 @@
// Copyright (C) 2020 NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

#ifndef CUDA_KERNELS_H
#define CUDA_KERNELS_H

#include <cuda_runtime.h>

#include "../common.h"

namespace bluefog {
namespace common {

// Scales buffer by scalar
void ScaleBufferCudaImpl(double scale_factor, void* buffer_data, const int64_t num_elements,
                         DataType dtype, cudaStream_t stream);

}  // namespace common
}  // namespace bluefog

#endif  // CUDA_KERNELS_H
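One caveat when calling this API: kernel launches are asynchronous and ScaleBufferCudaImpl does not itself check for launch failures. A thin wrapper like the following (hypothetical, not part of the commit) can surface configuration errors early:

#include <stdexcept>
#include <string>
#include <cuda_runtime.h>
#include "cuda_kernels.h"

// Hypothetical convenience wrapper: run the scale kernel, then turn any
// pending launch error into a C++ exception.
void ScaleBufferOrThrow(double scale_factor, void* buffer_data, int64_t num_elements,
                        bluefog::common::DataType dtype, cudaStream_t stream) {
  bluefog::common::ScaleBufferCudaImpl(scale_factor, buffer_data, num_elements, dtype, stream);
  cudaError_t err = cudaGetLastError();  // catches invalid launch configurations
  if (err != cudaSuccess) {
    throw std::runtime_error(std::string("scale_buffer launch failed: ") +
                             cudaGetErrorString(err));
  }
}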