Skip to content

Commit

Permalink
misc: remove recommend omp threads (#59)
Browse files Browse the repository at this point in the history
  • Loading branch information
jasperzhong authored Jul 30, 2020
1 parent 378cc2f commit 7dc8d7f
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 26 deletions.
32 changes: 13 additions & 19 deletions byteps/common/cpu_reducer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,6 @@ CpuReducer::CpuReducer(std::shared_ptr<BytePSComm> comm) {
_num_threads = 4;
}

if (getenv("BYTEPS_OMP_THREAD_THRESHOLD")) {
_single_thread_threshold = atoi(getenv("BYTEPS_OMP_THREAD_THRESHOLD"));
} else {
_single_thread_threshold = (1 << 16);
}

return;
}

Expand Down Expand Up @@ -92,7 +86,7 @@ int CpuReducer::sum(void* dst, const void* src, size_t len, DataType dtype) {

template <typename T>
int CpuReducer::_sum(T* dst, const T* src, size_t len) {
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) {
dst[i] = dst[i] + src[i];
}
Expand All @@ -107,7 +101,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src, size_t len) {

#if __AVX__ && __F16C__
if (is_avx_and_f16c()) {
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) {
// convert in & inout to m256
__m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in + i)));
Expand All @@ -132,7 +126,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src, size_t len) {
Float2HalfBits(&inout_float, inout + i);
}
#else
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < (size_t)len; ++i) {
float in_float;
float inout_float;
Expand Down Expand Up @@ -183,7 +177,7 @@ int CpuReducer::sum(void* dst, const void* src1, const void* src2, size_t len,

template <typename T>
int CpuReducer::_sum(T* dst, const T* src1, const T* src2, size_t len) {
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) {
dst[i] = src1[i] + src2[i];
}
Expand All @@ -200,7 +194,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,

#if __AVX__ && __F16C__
if (is_avx_and_f16c()) {
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) {
// convert in1 & in2 to m256
__m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in1 + i)));
Expand All @@ -225,7 +219,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,
Float2HalfBits(&out_float, out + i);
}
#else
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < (size_t)len; ++i) {
float in1_float;
float in2_float;
Expand Down Expand Up @@ -270,7 +264,7 @@ int CpuReducer::sum(void* dst, const void* src, size_t len, DataType dtype,

template <typename T>
int CpuReducer::_sum(T* dst, const T* src, size_t len, float alpha) {
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) {
dst[i] = dst[i] + alpha * src[i];
}
Expand All @@ -290,7 +284,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src, size_t len,

if (is_avx_and_f16c()) {
__m256 __mm256_alpha = _mm256_loadu_ps(mm256_alpha);
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) {
// convert in & inout to m256
__m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in + i)));
Expand All @@ -316,7 +310,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src, size_t len,
Float2HalfBits(&inout_float, inout + i);
}
#else
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < (size_t)len; ++i) {
float in_float;
float inout_float;
Expand Down Expand Up @@ -368,7 +362,7 @@ int CpuReducer::sum(void* dst, const void* src1, const void* src2, size_t len,
template <typename T>
int CpuReducer::_sum(T* dst, const T* src1, const T* src2, size_t len,
float alpha) {
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) {
dst[i] = src1[i] + alpha * src2[i];
}
Expand All @@ -389,7 +383,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,

if (is_avx_and_f16c()) {
__m256 __mm256_alpha = _mm256_loadu_ps(mm256_alpha);
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) {
// convert in1 & in2 to m256
__m256 in1_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in1 + i)));
Expand All @@ -415,7 +409,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,
Float2HalfBits(&out_float, out + i);
}
#else
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < (size_t)len; ++i) {
float in1_float;
float in2_float;
Expand All @@ -432,7 +426,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,
int CpuReducer::copy(void* dst, const void* src, size_t len) {
auto in = (float*)src;
auto out = (float*)dst;
#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
#pragma omp parallel for simd num_threads(_num_threads)
for (size_t i = 0; i < len / 4; ++i) {
out[i] = in[i];
}
Expand Down
7 changes: 0 additions & 7 deletions byteps/common/cpu_reducer.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,6 @@ class CpuReducer {
// Converts a raw integer code to DataType with a plain static_cast; the value
// is not validated here.
DataType GetDataType(int dtype) { return static_cast<DataType>(dtype); }

private:
// Picks the thread count for an operation over `len`: returns 1 when `len` is
// below _single_thread_threshold, otherwise returns _num_threads.
// NOTE(review): `len` looks like a byte count (the reducer loops divide it by
// sizeof(T)) — confirm against callers.
size_t GetRecommendNumThreads(size_t len) const {
if (len < _single_thread_threshold) {
// Below the threshold: run on a single thread.
return 1;
} else {
// At or above the threshold: use _num_threads.
return _num_threads;
}
}

#if __AVX__ && __F16C__
// Query CPUID to determine AVX and F16C runtime support.
Expand Down

0 comments on commit 7dc8d7f

Please sign in to comment.