* Fixed the self_weight under empty receiving case
* Enable empty send neighbors and fix HalfTensor for recv_size==0
* Rename neighbor_weights to src_weights, and send_neighbors to dst_weights, for neighbor_allreduce
* A script to test existing examples
* Accept dst_weights as Dict, and reorganize DoNeighborAllreduce
* Reorganize CheckNeighborSendRecvPattern
* Fix timeline_ptr for NCCL
* Put dst_weights information into TensorTableEntry
* First version of neighbor_allreduce dst_weight; known problems: fusion not implemented, CUDA data_weight problem
* Add some delay after data_weight as a temporary solution
* CPU fusion for dst_weighted added
* Add ReadyEvent for dst_weight for single-entry neighbor_allreduce
* Remove const qualifier for tensor dtype, as it is meaningless
* Add CUDA source for scale buffer
* Scale buffer to modify itself
* Add .o files to .gitignore
* dst_weight using CUDA for fused entry, and compile flow in Python setup.py
* make clean *.o files generated by nvcc
* Add fix for NCCL single entry
* Make setup.py more robust
* Add timeout and CUDA check
* Move test example
* Fix NCCL-side dst_weight fusion bug
* Add agg backend to make matplotlib more stable
* Address comments for setup.py
* Simpler logic for dst_weighting_enabled and weighted_average_computation
* Better consideration for weight buffer size
* Make src_weights a std::map, and simplify logic for PerformNeighborAllreduceCallback (sketched below)
* Add TODO #80 and #81, and simplify the logic for dst_weight
* Wrap CheckNeighborSendRecvPattern again
* Add two more TODOs
* Address review comments
* Add condition variable to control the loop (#88)
* Minor update on topology_setting in global_state
* Add missing <condition_variable> header
* Change cv.wait to cv.wait_for with a 10-second timeout (sketched below)
* Address comment and remove adjusting resetVersionWinMem in ibfrun

Co-authored-by: ybc <bichengying@gmail.com>
Parent: 8bde896
Commit: 2f696ed

Showing 34 changed files with 1,233 additions and 508 deletions.
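The src_weights rename above changes how neighbor_allreduce weights received tensors: a std::map from neighbor rank to weight, combined with a self_weight for the local tensor. The following is a minimal sketch of that idea only; the names WeightedNeighborReduce and recv_buffers are hypothetical, and the commit's actual logic lives in PerformNeighborAllreduceCallback.

#include <map>
#include <vector>

// Hedged sketch: weighted average of the local tensor and the buffers
// received from in-neighbors, with per-rank weights from src_weights.
std::vector<double> WeightedNeighborReduce(
    const std::map<int, std::vector<double>>& recv_buffers,
    const std::map<int, double>& src_weights,
    const std::vector<double>& self_tensor, double self_weight) {
  std::vector<double> out(self_tensor.size());
  // Start from the weighted local contribution.
  for (size_t i = 0; i < out.size(); ++i) out[i] = self_weight * self_tensor[i];
  // Accumulate each neighbor's buffer, scaled by that neighbor's weight.
  for (const auto& kv : recv_buffers) {
    const double w = src_weights.at(kv.first);  // weight keyed by source rank
    for (size_t i = 0; i < out.size(); ++i) out[i] += w * kv.second[i];
  }
  return out;
}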
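The condition-variable items from PR #88 replace a spinning loop with a timed wait. Below is a minimal sketch of the cv.wait_for pattern, assuming hypothetical names (BackgroundLoop, work_ready, loop_mutex) in place of the commit's actual global_state members.

#include <chrono>
#include <condition_variable>
#include <mutex>

std::mutex loop_mutex;
std::condition_variable loop_cv;
bool work_ready = false;  // set by a producer thread under loop_mutex

void BackgroundLoop(bool& shutdown) {
  while (!shutdown) {
    std::unique_lock<std::mutex> lock(loop_mutex);
    // wait_for returns on notification or after the 10-second timeout,
    // so the loop still wakes up periodically to re-check its state
    // instead of busy-spinning.
    loop_cv.wait_for(lock, std::chrono::seconds(10),
                     [&] { return work_ready || shutdown; });
    if (work_ready) {
      work_ready = false;
      // ... process pending work ...
    }
  }
}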
.gitignore
@@ -5,6 +5,7 @@ __pycache__/

 # C extensions
 *.so
+*.o

 # Distribution / packaging
 .Python
cuda_kernels.cu (new file)
@@ -0,0 +1,120 @@
// Copyright (C) 2020 NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

#include "cuda_kernels.h"

#include <stdexcept>
#include <cuda_fp16.h>

namespace bluefog {
namespace common {

// Generic in-place scaling kernel. The grid-stride loop lets a fixed-size
// grid cover a buffer of any length.
template<typename T, typename TS>
__global__ void scale_buffer_k(T* buffer, int64_t num_elements, const TS scale_factor) {
  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] *= scale_factor;
  }
}

// Specialization for half2
__global__ void scale_buffer_half2_k(__half* buffer, int64_t num_elements, const __half scale_factor) {
  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ > 530
  // Reinterpret the buffer as half2 and scale two elements per iteration.
  __half2* buffer_h2 = reinterpret_cast<__half2 *>(buffer);
  const __half2 scale_factor_h2 = __halves2half2(scale_factor, scale_factor);
  for (size_t i = idx; i < num_elements / 2; i += gridDim.x * blockDim.x) {
    buffer_h2[i] = __hmul2(scale_factor_h2, buffer_h2[i]);
  }
  // Deal with last element if num_elements is odd
  if (idx == 0 && num_elements % 2) {
    buffer[num_elements - 1] = __hmul(scale_factor, buffer[num_elements - 1]);
  }
#else
  // No native __half arithmetic on older architectures: compute in float.
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i]));
  }
#endif
}

// Specialization for architectures without __half compute
template<>
__global__ void scale_buffer_k(__half* buffer, int64_t num_elements, const __half scale_factor) {
  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ > 530
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] *= scale_factor;
  }
#else
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i]));
  }
#endif
}

#define NTHREADS_SCALE_BUFFER_KERNEL 512
void ScaleBufferCudaImpl(double scale_factor, void* buffer_data, const int64_t num_elements,
                         DataType dtype, cudaStream_t stream) {
  const int64_t blocks = (num_elements + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
  const int threads = NTHREADS_SCALE_BUFFER_KERNEL;
  switch (dtype) {
    case DataType::BLUEFOG_UINT8:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((uint8_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_INT8:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((int8_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_INT32:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((int32_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_INT64:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((int64_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_FLOAT16:
    {
      __half scale_factor_half = __float2half((float) scale_factor);
      if ((size_t) buffer_data % 4 == 0) {
        // If alignment allows, use half2 specialized kernel
        int64_t num_elements_h2 = (num_elements + 1) / 2;
        int64_t blocks_h2 = (num_elements_h2 + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
        scale_buffer_half2_k<<<blocks_h2, threads, 0, stream>>>((__half*) buffer_data, num_elements, scale_factor_half);
      } else {
        scale_buffer_k<<<blocks, threads, 0, stream>>>((__half*) buffer_data, num_elements, scale_factor_half);
      }
      break;
    }
    case DataType::BLUEFOG_FLOAT32:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((float*) buffer_data, num_elements, (float) scale_factor);
      break;
    case DataType::BLUEFOG_FLOAT64:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((double*) buffer_data, num_elements, scale_factor);
      break;
    default:
      throw std::logic_error("Type " + DataType_Name(dtype) +
                             " not supported by ScaleBufferCudaImpl.");
  }
}

}  // namespace common
}  // namespace bluefog
cuda_kernels.h (new file)
@@ -0,0 +1,33 @@
// Copyright (C) 2020 NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

#ifndef CUDA_KERNELS_H
#define CUDA_KERNELS_H

#include <cuda_runtime.h>

#include "../common.h"

namespace bluefog {
namespace common {

// Scales buffer by scalar
void ScaleBufferCudaImpl(double scale_factor, void* buffer_data, const int64_t num_elements,
                         DataType dtype, cudaStream_t stream);

}  // namespace common
}  // namespace bluefog

#endif  // CUDA_KERNELS_H
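One caveat when calling this API: kernel launches are asynchronous and ScaleBufferCudaImpl does not itself check for launch failures. A thin wrapper like the following (hypothetical, not part of the commit) can surface configuration errors early:

#include <stdexcept>
#include <string>
#include <cuda_runtime.h>
#include "cuda_kernels.h"

// Hypothetical convenience wrapper: run the scale kernel, then turn any
// pending launch error into a C++ exception.
void ScaleBufferOrThrow(double scale_factor, void* buffer_data, int64_t num_elements,
                        bluefog::common::DataType dtype, cudaStream_t stream) {
  bluefog::common::ScaleBufferCudaImpl(scale_factor, buffer_data, num_elements, dtype, stream);
  cudaError_t err = cudaGetLastError();  // catches invalid launch configurations
  if (err != cudaSuccess) {
    throw std::runtime_error(std::string("scale_buffer launch failed: ") +
                             cudaGetErrorString(err));
  }
}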