misc: remove recommend omp threads #59

Merged · 1 commit · Jul 30, 2020
32 changes: 13 additions & 19 deletions byteps/common/cpu_reducer.cc
@@ -44,12 +44,6 @@ CpuReducer::CpuReducer(std::shared_ptr<BytePSComm> comm) {
     _num_threads = 4;
   }
 
-  if (getenv("BYTEPS_OMP_THREAD_THRESHOLD")) {
-    _single_thread_threshold = atoi(getenv("BYTEPS_OMP_THREAD_THRESHOLD"));
-  } else {
-    _single_thread_threshold = (1 << 16);
-  }
-
   return;
 }

@@ -92,7 +86,7 @@ int CpuReducer::sum(void* dst, const void* src, size_t len, DataType dtype) {
 
 template <typename T>
 int CpuReducer::_sum(T* dst, const T* src, size_t len) {
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
   for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) {
     dst[i] = dst[i] + src[i];
   }
@@ -107,7 +101,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src, size_t len) {
 
 #if __AVX__ && __F16C__
   if (is_avx_and_f16c()) {
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
     for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) {
       // convert in & inout to m256
      __m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in + i)));
@@ -132,7 +126,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src, size_t len) {
       Float2HalfBits(&inout_float, inout + i);
     }
 #else
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
   for (size_t i = 0; i < (size_t)len; ++i) {
     float in_float;
     float inout_float;
@@ -183,7 +177,7 @@ int CpuReducer::sum(void* dst, const void* src1, const void* src2, size_t len,
 
 template <typename T>
 int CpuReducer::_sum(T* dst, const T* src1, const T* src2, size_t len) {
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
   for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) {
     dst[i] = src1[i] + src2[i];
   }
@@ -200,7 +194,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,
 
 #if __AVX__ && __F16C__
   if (is_avx_and_f16c()) {
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
     for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) {
       // convert in1 & in2 to m256
      __m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in1 + i)));
@@ -225,7 +219,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,
       Float2HalfBits(&out_float, out + i);
     }
 #else
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
   for (size_t i = 0; i < (size_t)len; ++i) {
     float in1_float;
     float in2_float;
@@ -270,7 +264,7 @@ int CpuReducer::sum(void* dst, const void* src, size_t len, DataType dtype,
 
 template <typename T>
 int CpuReducer::_sum(T* dst, const T* src, size_t len, float alpha) {
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
   for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) {
     dst[i] = dst[i] + alpha * src[i];
   }
@@ -290,7 +284,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src, size_t len,
 
   if (is_avx_and_f16c()) {
     __m256 __mm256_alpha = _mm256_loadu_ps(mm256_alpha);
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
     for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) {
       // convert in & inout to m256
      __m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in + i)));
@@ -316,7 +310,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src, size_t len,
       Float2HalfBits(&inout_float, inout + i);
     }
 #else
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
   for (size_t i = 0; i < (size_t)len; ++i) {
     float in_float;
     float inout_float;
@@ -368,7 +362,7 @@ int CpuReducer::sum(void* dst, const void* src1, const void* src2, size_t len,
 template <typename T>
 int CpuReducer::_sum(T* dst, const T* src1, const T* src2, size_t len,
                      float alpha) {
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
   for (size_t i = 0; i < len / (size_t)sizeof(T); ++i) {
     dst[i] = src1[i] + alpha * src2[i];
   }
@@ -389,7 +383,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,
 
   if (is_avx_and_f16c()) {
     __m256 __mm256_alpha = _mm256_loadu_ps(mm256_alpha);
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
     for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) {
       // convert in1 & in2 to m256
      __m256 in1_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in1 + i)));
@@ -415,7 +409,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,
       Float2HalfBits(&out_float, out + i);
     }
 #else
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
   for (size_t i = 0; i < (size_t)len; ++i) {
     float in1_float;
     float in2_float;
@@ -432,7 +426,7 @@ int CpuReducer::_sum_float16(void* dst, const void* src1, const void* src2,
 int CpuReducer::copy(void* dst, const void* src, size_t len) {
   auto in = (float*)src;
   auto out = (float*)dst;
-#pragma omp parallel for simd num_threads(GetRecommendNumThreads(len))
+#pragma omp parallel for simd num_threads(_num_threads)
   for (size_t i = 0; i < len / 4; ++i) {
     out[i] = in[i];
   }
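The change to cpu_reducer.cc is uniform: every OpenMP clause that previously asked GetRecommendNumThreads(len) for a length-dependent thread count now uses the fixed member _num_threads. A minimal standalone sketch of the resulting pattern follows; it is illustrative only, and the function sum_float, the constant kNumThreads, and the buffer sizes are assumptions, not BytePS code.

// sketch_omp_sum.cc — illustrative only; mirrors the post-change pattern of
// CpuReducer::_sum with a fixed thread count. Build: g++ -fopenmp -O2 sketch_omp_sum.cc
#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical fixed thread count standing in for CpuReducer::_num_threads.
static const int kNumThreads = 4;

// Element-wise in-place sum, parallelized the same way as _sum in the diff:
// the thread count no longer depends on the buffer length.
void sum_float(float* dst, const float* src, size_t n) {
#pragma omp parallel for simd num_threads(kNumThreads)
  for (size_t i = 0; i < n; ++i) {
    dst[i] = dst[i] + src[i];
  }
}

int main() {
  std::vector<float> a(1 << 20, 1.0f), b(1 << 20, 2.0f);
  sum_float(a.data(), b.data(), a.size());
  std::printf("a[0] = %f\n", a[0]);  // expect 3.0
  return 0;
}

Before this PR, the GetRecommendNumThreads helper (deleted from cpu_reducer.h below) returned 1 for buffers smaller than _single_thread_threshold, so small reductions ran single-threaded; after this change the clause always receives _num_threads regardless of buffer size.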
7 changes: 0 additions & 7 deletions byteps/common/cpu_reducer.h
@@ -63,13 +63,6 @@ class CpuReducer {
   DataType GetDataType(int dtype) { return static_cast<DataType>(dtype); }
 
  private:
-  size_t GetRecommendNumThreads(size_t len) const {
-    if (len < _single_thread_threshold) {
-      return 1;
-    } else {
-      return _num_threads;
-    }
-  }
 
 #if __AVX__ && __F16C__
   // Query CPUID to determine AVX and F16C runtime support.
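For reference, the deleted lines amount to the following behavior: the threshold defaulted to 1 << 16 bytes, could be overridden through the BYTEPS_OMP_THREAD_THRESHOLD environment variable, and any buffer below it was reduced with a single thread. The sketch below reconstructs that pre-change logic from the removed code; the ThreadPicker wrapper and the main driver are illustrative assumptions, not part of BytePS.

// sketch_threshold.cc — illustrative reconstruction of the removed logic,
// not a drop-in for BytePS. Build: g++ -O2 sketch_threshold.cc
#include <cstdio>
#include <cstdlib>

struct ThreadPicker {
  size_t _single_thread_threshold;
  size_t _num_threads;

  explicit ThreadPicker(size_t num_threads) : _num_threads(num_threads) {
    // Removed constructor block: threshold from env, default 64 KiB.
    if (getenv("BYTEPS_OMP_THREAD_THRESHOLD")) {
      _single_thread_threshold = atoi(getenv("BYTEPS_OMP_THREAD_THRESHOLD"));
    } else {
      _single_thread_threshold = (1 << 16);
    }
  }

  // Removed helper: small buffers ran single-threaded, larger ones used
  // the configured thread count.
  size_t GetRecommendNumThreads(size_t len) const {
    return (len < _single_thread_threshold) ? 1 : _num_threads;
  }
};

int main() {
  ThreadPicker picker(4);
  std::printf("4 KiB -> %zu thread(s)\n", picker.GetRecommendNumThreads(4 << 10));
  std::printf("1 MiB -> %zu thread(s)\n", picker.GetRecommendNumThreads(1 << 20));
  return 0;
}

With this PR merged, that branch is gone: the thread count passed to every OpenMP clause in CpuReducer is always _num_threads.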