From 2da083f4b27c66c0e443a0415dcfa9377ea238ce Mon Sep 17 00:00:00 2001
From: HexToString <506181616@qq.com>
Date: Mon, 19 Sep 2022 19:55:38 +0800
Subject: [PATCH 1/6] fix bug and codestyle

---
 .../faster_tokenizer/core/encoding.cc         |  62 ++++++++-
 .../faster_tokenizer/core/encoding.h          |  12 ++
 .../faster_tokenizer/core/tokenizer.cc        | 130 +++++++++++++++++-
 .../faster_tokenizer/core/tokenizer.h         |  20 +++
 4 files changed, 215 insertions(+), 9 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.cc b/faster_tokenizer/faster_tokenizer/core/encoding.cc
index c3a7fcbfd50b..d5d8843c0c59 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.cc
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.cc
@@ -17,11 +17,7 @@
 limitations under the License. */
 #include <algorithm>
 #include <cassert>
 #include <numeric>
-#include "glog/logging.h"
-#ifdef WITH_OMP
-#include <omp.h>
-#endif
 
 namespace paddlenlp {
 namespace faster_tokenizer {
 namespace core {
@@ -600,6 +596,23 @@ bool TruncateEncodings(Encoding* encoding,
   return true;
 }
 
+void MultiThreadPadEncodings(std::vector<Encoding>* encodings,
+                  const PadMethod& method,
+                  size_t pad_length,
+                  size_t start_index,
+                  size_t step_index) {
+  auto batch_size = encodings->size();
+  size_t end_index = start_index+step_index;
+  if(end_index>batch_size) end_index = batch_size;
+  for (size_t i = start_index; i < end_index; ++i) {
+      auto& encoding = (*encodings)[i];
+      encoding.Pad(pad_length,
+                  method.pad_id_,
+                  method.pad_token_type_id_,
+                  method.pad_token_,
+                  method.direction_);
+  }
+}
 void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
   if (encodings == nullptr || encodings->empty()) {
     return;
@@ -619,7 +632,6 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
   auto batch_size = encodings->size();
 #ifdef WITH_OMP
 #pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
-#endif
   for (int i = 0; i < batch_size; ++i) {
     auto& encoding = (*encodings)[i];
     encoding.Pad(pad_length,
@@ -628,6 +640,46 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
                  method.pad_token_,
                  method.direction_);
   }
+#else
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/thread_num);
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(&MultiThreadPadEncodings,
+                  encodings,
+                  std::ref(method),
+                  pad_length,
+                  start_index,
+                  step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread[thread_index].join();
+  }
+  vectorOfThread.clear();
+#endif
+
+}
+
+int GetThreadNum(size_t batch_size){
+  char* env_var = std::getenv("OMP_NUM_THREADS");
+  int thread_num = std::atoi(env_var);
+  if(batch_size <=0){
+    thread_num = 1;
+    VLOG(0) << "batch_size <=0, we set OMP_NUM_THREADS = 1";
+  }else{
+    int best_num = ceil(batch_size/4);
+    if(thread_num > best_num){
+      thread_num = best_num;
+      VLOG(0) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = batch_size/4";
+    }else if(thread_num == 0){
+      thread_num = best_num;
+      VLOG(0) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4";
+    }
+  }
+  return thread_num;
 }
 
 }  // namespace core
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.h b/faster_tokenizer/faster_tokenizer/core/encoding.h
index 34f5a93bdec2..756e23313da8 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.h
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.h
@@ -20,6 +20,17 @@
 limitations under the License. */
 #include <vector>
 #include "faster_tokenizer/core/base.h"
 #include "faster_tokenizer/utils/utils.h"
+#include "glog/logging.h"
+
+#ifdef WITH_OMP
+#include <omp.h>
+#else
+// Replace OMP with std::thread
+#include <math.h>
+#include <vector>
+#include <functional>
+using namespace std;
+#endif
 
 namespace paddlenlp {
@@ -122,6 +133,7 @@ bool FASTERTOKENIZER_DECL TruncateEncodings(Encoding* encoding,
 void FASTERTOKENIZER_DECL PadEncodings(std::vector<Encoding>* encoding,
                                        const PadMethod& method);
 
+int GetThreadNum(size_t batch_size);
 }  // namespace core
 }  // namespace faster_tokenizer
 }  // namespace paddlenlp
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index 626910584486..348ccea56571 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -26,8 +26,16 @@ limitations under the License. */
 #include "faster_tokenizer/postprocessors/postprocessors.h"
 #include "faster_tokenizer/pretokenizers/pretokenizers.h"
 
+
+
 #ifdef WITH_OMP
 #include <omp.h>
+#else
+// Replace OMP with std::thread
+#include <math.h>
+#include <thread>
+#include <functional>
+using namespace std;
 #endif
 
 namespace paddlenlp {
@@ -248,23 +256,61 @@ void Tokenizer::EncodePairStrings(const EncodeInput& encode_input,
   }
 }
 
+void Tokenizer::MultiThreadEncodeBatchStrings(
+    const std::vector<EncodeInput>& batch_encode_input,
+    std::vector<Encoding>* encodings,
+    bool add_special_tokens,
+    size_t start_index,
+    size_t step_index) const {
+
+    auto batch_size = batch_encode_input.size();
+    size_t end_index = start_index+step_index;
+    if(end_index>batch_size) end_index = batch_size;
+    for (size_t i = start_index; i < end_index; ++i) {
+      EncodePairStrings(
+          batch_encode_input[i], &(*encodings)[i], add_special_tokens);
+    }
+}
+
 void Tokenizer::EncodeBatchStrings(
     const std::vector<EncodeInput>& batch_encode_input,
     std::vector<Encoding>* encodings,
     bool add_special_tokens) const {
   auto batch_size = batch_encode_input.size();
   encodings->resize(batch_size);
+
 #ifdef WITH_OMP
 // (TODO:zhoushunjie): Simply use the batch size to estimate the workload of
 // tokenization.
 // Use workload to determine whether create omp threads. Need to optimize the
 // workload estimation.
 #pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
-#endif
   for (int i = 0; i < batch_size; ++i) {
     EncodePairStrings(
         batch_encode_input[i], &(*encodings)[i], add_special_tokens);
   }
+#else
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/thread_num);
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadEncodeBatchStrings,
+                  this,
+                  std::ref(batch_encode_input),
+                  encodings,
+                  add_special_tokens,
+                  start_index,
+                  step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread[thread_index].join();
+  }
+  vectorOfThread.clear();
+#endif
+
   if (use_padding_) {
     PadEncodings(encodings, pad_method_);
   }
@@ -289,6 +335,24 @@ void Tokenizer::EncodePairStringsCharOffsets(const EncodeInput& encode_input,
   PostProcess(&encoding, &pair_encoding, add_special_tokens, encodings);
 }
 
+void Tokenizer::MultiThreadEncodeBatchStringsCharOffsets(
+    const std::vector<EncodeInput>& batch_encode_input,
+    std::vector<Encoding>* encodings,
+    bool add_special_tokens,
+    size_t start_index,
+    size_t step_index) const {
+
+    auto batch_size = batch_encode_input.size();
+    size_t end_index = start_index+step_index;
+    if( end_index>batch_size ) end_index = batch_size;
+    for (size_t i = start_index; i < end_index; ++i) {
+      Encoding encoding;
+      EncodePairStringsCharOffsets(
+          batch_encode_input[i], &encoding, add_special_tokens);
+      (*encodings)[i] = std::move(encoding);
+    }
+}
+
 void Tokenizer::EncodeBatchStringsCharOffsets(
     const std::vector<EncodeInput>& batch_encode_input,
     std::vector<Encoding>* encodings,
@@ -301,13 +365,34 @@ void Tokenizer::EncodeBatchStringsCharOffsets(
 // Use workload to determine whether create omp threads. Need to optimize the
 // workload estimation.
 #pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
-#endif
   for (int i = 0; i < batch_size; ++i) {
     Encoding encoding;
     EncodePairStringsCharOffsets(
         batch_encode_input[i], &encoding, add_special_tokens);
     (*encodings)[i] = std::move(encoding);
   }
+#else
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/thread_num);
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadEncodeBatchStringsCharOffsets,
+                  this,
+                  std::ref(batch_encode_input),
+                  encodings,
+                  add_special_tokens,
+                  start_index,
+                  step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+    vectorOfThread[thread_index].join();
+  }
+  vectorOfThread.clear();
+#endif
+
   if (use_padding_) {
     PadEncodings(encodings, pad_method_);
   }
@@ -404,11 +489,28 @@ void Tokenizer::Decode(const std::vector<uint32_t>& token_ids,
   }
 }
 
+
+void Tokenizer::MultiThreadDecodeBatch(
+    const std::vector<std::vector<uint32_t>>& batch_token_ids,
+    std::vector<std::string>* results,
+    bool skip_special_tokens,
+    size_t start_index,
+    size_t step_index) const {
+
+    auto batch_size = batch_token_ids.size();
+    size_t end_index = start_index+step_index;
+    if( end_index>batch_size ) end_index = batch_size;
+    for (size_t i = start_index; i < end_index; ++i) {
+      Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens);
+    }
+}
+
 void Tokenizer::DecodeBatch(
     const std::vector<std::vector<uint32_t>>& batch_token_ids,
     std::vector<std::string>* results,
     bool skip_special_tokens) const {
-  results->resize(batch_token_ids.size());
+  auto batch_size = batch_token_ids.size();
+  results->resize(batch_size);
 #ifdef WITH_OMP
 // (TODO:zhoushunjie): Simply use the batch size to estimate the workload of
 // tokenization.
@@ -416,10 +518,30 @@ void Tokenizer::DecodeBatch(
 // Use workload to determine whether create omp threads. Need to optimize the
 // workload estimation.
 #pragma omp parallel for if (batch_token_ids.size() >= 4 && \
                              omp_get_num_threads() > 1)
-#endif
   for (int i = 0; i < batch_token_ids.size(); ++i) {
     Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens);
   }
+#else
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/thread_num);
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadDecodeBatch,
+                  this,
+                  std::ref(batch_token_ids),
+                  results,
+                  skip_special_tokens,
+                  start_index,
+                  step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+    vectorOfThread[thread_index].join();
+  }
+  vectorOfThread.clear();
+#endif
 }
 
 bool Tokenizer::GetUseTruncation() const { return use_truncation_; }
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.h b/faster_tokenizer/faster_tokenizer/core/tokenizer.h
index bf317efe1b98..a5df094bf2ae 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.h
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.h
@@ -160,10 +160,24 @@ class FASTERTOKENIZER_DECL Tokenizer {
                          bool add_special_tokens,
                          Encoding* result_encoding) const;
 
+  void MultiThreadEncodeBatchStrings(
+        const std::vector<EncodeInput>& batch_encode_input,
+        std::vector<Encoding>* encodings,
+        bool add_special_tokens,
+        size_t start_index,
+        size_t step_index) const;
+
   void EncodeBatchStrings(const std::vector<EncodeInput>& batch_encode_input,
                           std::vector<Encoding>* encodings,
                           bool add_special_tokens = true) const;
 
+  void MultiThreadEncodeBatchStringsCharOffsets(
+        const std::vector<EncodeInput>& batch_encode_input,
+        std::vector<Encoding>* encodings,
+        bool add_special_tokens,
+        size_t start_index,
+        size_t step_index) const;
+
   void EncodeBatchStringsCharOffsets(
       const std::vector<EncodeInput>& batch_encode_input,
       std::vector<Encoding>* encodings,
@@ -194,6 +208,12 @@ class FASTERTOKENIZER_DECL Tokenizer {
   void Decode(const std::vector<uint32_t>& token_ids,
               std::string* result,
               bool skip_special_tokens = true) const;
+  void MultiThreadDecodeBatch(
+        const std::vector<std::vector<uint32_t>>& batch_token_ids,
+        std::vector<std::string>* results,
+        bool skip_special_tokens,
+        size_t start_index,
+        size_t step_index) const;
   void DecodeBatch(const std::vector<std::vector<uint32_t>>& batch_token_ids,
                    std::vector<std::string>* results,
                    bool skip_special_tokens = true) const;

From 310b91b17c5d4e0c23a4fcac14dc8243186b81b5 Mon Sep 17 00:00:00 2001
From: HexToString <506181616@qq.com>
Date: Tue, 20 Sep 2022 10:38:11 +0800
Subject: [PATCH 2/6] save change

---
 .../faster_tokenizer/core/encoding.cc         | 48 ++++++-----
 .../faster_tokenizer/core/encoding.h          |  5 +-
 .../faster_tokenizer/core/tokenizer.cc        | 81 ++++++-------------
 3 files changed, 54 insertions(+), 80 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.cc b/faster_tokenizer/faster_tokenizer/core/encoding.cc
index d5d8843c0c59..95b90c88dae2 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.cc
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.cc
@@ -641,24 +641,13 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
                  method.direction_);
   }
 #else
-  int thread_num = GetThreadNum(batch_size);
-  std::vector<std::thread> vectorOfThread;
-  size_t start_index = 0;
-  size_t step_index = ceil(batch_size/thread_num);
-
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread.emplace_back(std::thread(&MultiThreadPadEncodings,
-                  encodings,
-                  std::ref(method),
-                  pad_length,
-                  start_index,
-                  step_index));
-    start_index = start_index + step_index;
-  }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread[thread_index].join();
-  }
-  vectorOfThread.clear();
+  auto func = std::bind(&MultiThreadPadEncodings,
+                  encodings,
+                  std::ref(method),
+                  pad_length,
+                  std::placeholders::_1,
+                  std::placeholders::_2);
+  RunMultiThread(func,batch_size);
 #endif
 
@@ -668,20 +657,35 @@ int GetThreadNum(size_t batch_size){
   char* env_var = std::getenv("OMP_NUM_THREADS");
   int thread_num = std::atoi(env_var);
   if(batch_size <=0){
     thread_num = 1;
-    VLOG(0) << "batch_size <=0, we set OMP_NUM_THREADS = 1";
+    VLOG(3) << "batch_size <=0, we set OMP_NUM_THREADS = 1";
   }else{
-    int best_num = ceil(batch_size/4);
+    int best_num = ceil(batch_size/4.0);
     if(thread_num > best_num){
       thread_num = best_num;
-      VLOG(0) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = batch_size/4";
+      VLOG(3) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = batch_size/4";
     }else if(thread_num == 0){
       thread_num = best_num;
-      VLOG(0) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4";
+      VLOG(3) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4";
     }
   }
   return thread_num;
 }
 
+void RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size){
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/float(thread_num));
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(func, start_index, step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+    vectorOfThread[thread_index].join();
+  }
+}
+
 }  // namespace core
 }  // namespace faster_tokenizer
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.h b/faster_tokenizer/faster_tokenizer/core/encoding.h
index 756e23313da8..b362732d106f 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.h
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.h
@@ -29,6 +29,7 @@ limitations under the License. */
 // Replace OMP with std::thread
 #include <math.h>
 #include <vector>
 #include <functional>
+#include <thread>
 using namespace std;
 #endif
@@ -133,7 +134,9 @@ bool FASTERTOKENIZER_DECL TruncateEncodings(Encoding* encoding,
 void FASTERTOKENIZER_DECL PadEncodings(std::vector<Encoding>* encoding,
                                        const PadMethod& method);
 
-int GetThreadNum(size_t batch_size);
+int FASTERTOKENIZER_DECL GetThreadNum(size_t batch_size);
+
+void FASTERTOKENIZER_DECL RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size);
 }  // namespace core
 }  // namespace faster_tokenizer
 }  // namespace paddlenlp
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index 348ccea56571..c2b05256aedf 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -290,25 +290,14 @@ void Tokenizer::EncodeBatchStrings(
         batch_encode_input[i], &(*encodings)[i], add_special_tokens);
   }
 #else
-  int thread_num = GetThreadNum(batch_size);
-  std::vector<std::thread> vectorOfThread;
-  size_t start_index = 0;
-  size_t step_index = ceil(batch_size/thread_num);
-
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadEncodeBatchStrings,
-                  this,
-                  std::ref(batch_encode_input),
-                  encodings,
-                  add_special_tokens,
-                  start_index,
-                  step_index));
-    start_index = start_index + step_index;
-  }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread[thread_index].join();
-  }
-  vectorOfThread.clear();
+  auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStrings,
+                  this,
+                  std::ref(batch_encode_input),
+                  encodings,
+                  add_special_tokens,
+                  std::placeholders::_1,
+                  std::placeholders::_2);
+  RunMultiThread(func,batch_size);
 #endif
 
   if (use_padding_) {
@@ -372,25 +361,14 @@ void Tokenizer::EncodeBatchStringsCharOffsets(
     (*encodings)[i] = std::move(encoding);
   }
 #else
-  int thread_num = GetThreadNum(batch_size);
-  std::vector<std::thread> vectorOfThread;
-  size_t start_index = 0;
-  size_t step_index = ceil(batch_size/thread_num);
-
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadEncodeBatchStringsCharOffsets,
-                  this,
-                  std::ref(batch_encode_input),
-                  encodings,
-                  add_special_tokens,
-                  start_index,
-                  step_index));
-    start_index = start_index + step_index;
-  }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
-    vectorOfThread[thread_index].join();
-  }
-  vectorOfThread.clear();
+  auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStringsCharOffsets,
+                        this,
+                        std::ref(batch_encode_input),
+                        encodings,
+                        add_special_tokens,
+                        std::placeholders::_1,
+                        std::placeholders::_2);
+  RunMultiThread(func,batch_size);
 #endif
 
   if (use_padding_) {
@@ -522,25 +500,14 @@ void Tokenizer::DecodeBatch(
     Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens);
   }
 #else
-  int thread_num = GetThreadNum(batch_size);
-  std::vector<std::thread> vectorOfThread;
-  size_t start_index = 0;
-  size_t step_index = ceil(batch_size/thread_num);
-
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadDecodeBatch,
-                  this,
-                  std::ref(batch_token_ids),
-                  results,
-                  skip_special_tokens,
-                  start_index,
-                  step_index));
-    start_index = start_index + step_index;
-  }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
-    vectorOfThread[thread_index].join();
-  }
-  vectorOfThread.clear();
+  auto func = std::bind(&Tokenizer::MultiThreadDecodeBatch,
+                  this,
+                  std::ref(batch_token_ids),
+                  results,
+                  skip_special_tokens,
+                  std::placeholders::_1,
+                  std::placeholders::_2);
+  RunMultiThread(func,batch_size);
 #endif
 }
 
 bool Tokenizer::GetUseTruncation() const { return use_truncation_; }

From 8062ed166043bbc405c48d6459db066950884604 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 20 Sep 2022 11:18:42 +0800
Subject: [PATCH 3/6] change code style

---
 faster_tokenizer/faster_tokenizer/core/base.h |  2 +-
 .../faster_tokenizer/core/encoding.cc         | 67 ++++++++-------
 .../faster_tokenizer/core/encoding.h          |  7 +-
 .../faster_tokenizer/core/tokenizer.cc        | 86 +++++++++----------
 .../faster_tokenizer/core/tokenizer.h         | 32 +++----
 5 files changed, 96 insertions(+), 98 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/base.h b/faster_tokenizer/faster_tokenizer/core/base.h
index cb4256ef3272..c8d0cddc7e4a 100644
--- a/faster_tokenizer/faster_tokenizer/core/base.h
+++ b/faster_tokenizer/faster_tokenizer/core/base.h
@@ -21,9 +21,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "faster_tokenizer/utils/utils.h"
 #include "glog/logging.h"
 #include "nlohmann/json.hpp"
-#include "faster_tokenizer/utils/utils.h"
 
 namespace std {
 template <>
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.cc b/faster_tokenizer/faster_tokenizer/core/encoding.cc
index 95b90c88dae2..3dac89c56806 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.cc
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.cc
@@ -597,20 +597,20 @@ bool TruncateEncodings(Encoding* encoding,
 }
 
 void MultiThreadPadEncodings(std::vector<Encoding>* encodings,
-                  const PadMethod& method,
-                  size_t pad_length,
-                  size_t start_index,
-                  size_t step_index) {
+                             const PadMethod& method,
+                             size_t pad_length,
+                             size_t start_index,
+                             size_t step_index) {
   auto batch_size = encodings->size();
-  size_t end_index = start_index+step_index;
-  if(end_index>batch_size) end_index = batch_size;
+  size_t end_index = start_index + step_index;
+  if (end_index > batch_size) end_index = batch_size;
   for (size_t i = start_index; i < end_index; ++i) {
-      auto& encoding = (*encodings)[i];
-      encoding.Pad(pad_length,
-                  method.pad_id_,
-                  method.pad_token_type_id_,
-                  method.pad_token_,
-                  method.direction_);
+    auto& encoding = (*encodings)[i];
+    encoding.Pad(pad_length,
+                 method.pad_id_,
+                 method.pad_token_type_id_,
+                 method.pad_token_,
+                 method.direction_);
   }
 }
 void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
   if (encodings == nullptr || encodings->empty()) {
     return;
@@ -641,48 +641,49 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
                  method.direction_);
   }
 #else
-  auto func = std::bind(&MultiThreadPadEncodings,
-                  encodings,
-                  std::ref(method),
-                  pad_length,
-                  std::placeholders::_1,
-                  std::placeholders::_2);
-  RunMultiThread(func,batch_size);
+  auto func = std::bind(&MultiThreadPadEncodings,
+                        encodings,
+                        std::ref(method),
+                        pad_length,
+                        std::placeholders::_1,
+                        std::placeholders::_2);
+  RunMultiThread(func, batch_size);
 #endif
-
 }
 
-int GetThreadNum(size_t batch_size){
+int GetThreadNum(size_t batch_size) {
   char* env_var = std::getenv("OMP_NUM_THREADS");
   int thread_num = std::atoi(env_var);
-  if(batch_size <=0){
+  if (batch_size <= 0) {
     thread_num = 1;
     VLOG(3) << "batch_size <=0, we set OMP_NUM_THREADS = 1";
-  }else{
-    int best_num = ceil(batch_size/4.0);
-    if(thread_num > best_num){
+  } else {
+    int best_num = ceil(batch_size / 4.0);
+    if (thread_num > best_num) {
       thread_num = best_num;
-      VLOG(3) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = batch_size/4";
-    }else if(thread_num == 0){
+      VLOG(3) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = "
+                 "batch_size/4";
+    } else if (thread_num == 0) {
       thread_num = best_num;
       VLOG(3) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4";
     }
   }
   return thread_num;
 }
 
-void RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size){
+void RunMultiThread(std::function<void(size_t, size_t)> func,
+                    size_t batch_size) {
   int thread_num = GetThreadNum(batch_size);
   std::vector<std::thread> vectorOfThread;
   size_t start_index = 0;
-  size_t step_index = ceil(batch_size/float(thread_num));
+  size_t step_index = ceil(batch_size / float(thread_num));
 
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+  for (size_t thread_index = 0; thread_index < thread_num; thread_index++) {
     vectorOfThread.emplace_back(std::thread(func, start_index, step_index));
     start_index = start_index + step_index;
   }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+  for (size_t thread_index = 0; thread_index < thread_num; thread_index++) {
     vectorOfThread[thread_index].join();
   }
 }
 
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.h b/faster_tokenizer/faster_tokenizer/core/encoding.h
index b362732d106f..197ccd9efe9c 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.h
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.h
@@ -26,10 +26,10 @@ limitations under the License. */
 #include <omp.h>
 #else
 // Replace OMP with std::thread
-#include <math.h>
-#include <vector>
 #include <functional>
+#include <math.h>
 #include <thread>
+#include <vector>
 using namespace std;
 #endif
@@ -136,7 +136,8 @@ void FASTERTOKENIZER_DECL PadEncodings(std::vector<Encoding>* encoding,
                                        const PadMethod& method);
 
 int FASTERTOKENIZER_DECL GetThreadNum(size_t batch_size);
 
-void FASTERTOKENIZER_DECL RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size);
+void FASTERTOKENIZER_DECL
+RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size);
 }  // namespace core
 }  // namespace faster_tokenizer
 }  // namespace paddlenlp
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index c2b05256aedf..cb92976ebbef 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -27,14 +27,13 @@ limitations under the License. */
*/ #include "faster_tokenizer/pretokenizers/pretokenizers.h" - #ifdef WITH_OMP #include #else // Replace OMP with std::thread +#include #include #include -#include using namespace std; #endif @@ -262,14 +261,13 @@ void Tokenizer::MultiThreadEncodeBatchStrings( bool add_special_tokens, size_t start_index, size_t step_index) const { - - auto batch_size = batch_encode_input.size(); - size_t end_index = start_index+step_index; - if(end_index>batch_size) end_index = batch_size; - for (size_t i = start_index; i < end_index; ++i) { - EncodePairStrings( - batch_encode_input[i], &(*encodings)[i], add_special_tokens); - } + auto batch_size = batch_encode_input.size(); + size_t end_index = start_index + step_index; + if (end_index > batch_size) end_index = batch_size; + for (size_t i = start_index; i < end_index; ++i) { + EncodePairStrings( + batch_encode_input[i], &(*encodings)[i], add_special_tokens); + } } void Tokenizer::EncodeBatchStrings( @@ -290,16 +288,16 @@ void Tokenizer::EncodeBatchStrings( batch_encode_input[i], &(*encodings)[i], add_special_tokens); } #else - auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStrings, - this, - std::ref(batch_encode_input), - encodings, - add_special_tokens, - std::placeholders::_1, - std::placeholders::_2); - RunMultiThread(func,batch_size); + auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStrings, + this, + std::ref(batch_encode_input), + encodings, + add_special_tokens, + std::placeholders::_1, + std::placeholders::_2); + RunMultiThread(func, batch_size); #endif - + if (use_padding_) { PadEncodings(encodings, pad_method_); } @@ -330,16 +328,15 @@ void Tokenizer::MultiThreadEncodeBatchStringsCharOffsets( bool add_special_tokens, size_t start_index, size_t step_index) const { - - auto batch_size = batch_encode_input.size(); - size_t end_index = start_index+step_index; - if( end_index>batch_size ) end_index = batch_size; - for (size_t i = start_index; i < end_index; ++i) { - Encoding encoding; - EncodePairStringsCharOffsets( - batch_encode_input[i], &encoding, add_special_tokens); - (*encodings)[i] = std::move(encoding); - } + auto batch_size = batch_encode_input.size(); + size_t end_index = start_index + step_index; + if (end_index > batch_size) end_index = batch_size; + for (size_t i = start_index; i < end_index; ++i) { + Encoding encoding; + EncodePairStringsCharOffsets( + batch_encode_input[i], &encoding, add_special_tokens); + (*encodings)[i] = std::move(encoding); + } } void Tokenizer::EncodeBatchStringsCharOffsets( @@ -368,7 +365,7 @@ void Tokenizer::EncodeBatchStringsCharOffsets( add_special_tokens, std::placeholders::_1, std::placeholders::_2); - RunMultiThread(func,batch_size); + RunMultiThread(func, batch_size); #endif if (use_padding_) { @@ -474,13 +471,12 @@ void Tokenizer::MultiThreadDecodeBatch( bool skip_special_tokens, size_t start_index, size_t step_index) const { - - auto batch_size = batch_token_ids.size(); - size_t end_index = start_index+step_index; - if( end_index>batch_size ) end_index = batch_size; - for (size_t i = start_index; i < end_index; ++i) { - Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens); - } + auto batch_size = batch_token_ids.size(); + size_t end_index = start_index + step_index; + if (end_index > batch_size) end_index = batch_size; + for (size_t i = start_index; i < end_index; ++i) { + Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens); + } } void Tokenizer::DecodeBatch( @@ -500,14 +496,14 @@ void Tokenizer::DecodeBatch( Decode(batch_token_ids[i], &(*results)[i], 
   }
 #else
-  auto func = std::bind(&Tokenizer::MultiThreadDecodeBatch,
-                  this,
-                  std::ref(batch_token_ids),
-                  results,
-                  skip_special_tokens,
-                  std::placeholders::_1,
-                  std::placeholders::_2);
-  RunMultiThread(func,batch_size);
+  auto func = std::bind(&Tokenizer::MultiThreadDecodeBatch,
+                        this,
+                        std::ref(batch_token_ids),
+                        results,
+                        skip_special_tokens,
+                        std::placeholders::_1,
+                        std::placeholders::_2);
+  RunMultiThread(func, batch_size);
 #endif
 }
 
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.h b/faster_tokenizer/faster_tokenizer/core/tokenizer.h
index a5df094bf2ae..d709cc5a5c6e 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.h
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.h
@@ -161,23 +161,23 @@ class FASTERTOKENIZER_DECL Tokenizer {
                          Encoding* result_encoding) const;
 
   void MultiThreadEncodeBatchStrings(
-        const std::vector<EncodeInput>& batch_encode_input,
-        std::vector<Encoding>* encodings,
-        bool add_special_tokens,
-        size_t start_index,
-        size_t step_index) const;
+      const std::vector<EncodeInput>& batch_encode_input,
+      std::vector<Encoding>* encodings,
+      bool add_special_tokens,
+      size_t start_index,
+      size_t step_index) const;
 
   void EncodeBatchStrings(const std::vector<EncodeInput>& batch_encode_input,
                           std::vector<Encoding>* encodings,
                           bool add_special_tokens = true) const;
 
   void MultiThreadEncodeBatchStringsCharOffsets(
-        const std::vector<EncodeInput>& batch_encode_input,
-        std::vector<Encoding>* encodings,
-        bool add_special_tokens,
-        size_t start_index,
-        size_t step_index) const;
-
+      const std::vector<EncodeInput>& batch_encode_input,
+      std::vector<Encoding>* encodings,
+      bool add_special_tokens,
+      size_t start_index,
+      size_t step_index) const;
+
   void EncodeBatchStringsCharOffsets(
       const std::vector<EncodeInput>& batch_encode_input,
       std::vector<Encoding>* encodings,
@@ -209,11 +209,11 @@ class FASTERTOKENIZER_DECL Tokenizer {
   void Decode(const std::vector<uint32_t>& token_ids,
               std::string* result,
               bool skip_special_tokens = true) const;
   void MultiThreadDecodeBatch(
-        const std::vector<std::vector<uint32_t>>& batch_token_ids,
-        std::vector<std::string>* results,
-        bool skip_special_tokens,
-        size_t start_index,
-        size_t step_index) const;
+      const std::vector<std::vector<uint32_t>>& batch_token_ids,
+      std::vector<std::string>* results,
+      bool skip_special_tokens,
+      size_t start_index,
+      size_t step_index) const;
   void DecodeBatch(const std::vector<std::vector<uint32_t>>& batch_token_ids,
                    std::vector<std::string>* results,
                    bool skip_special_tokens = true) const;

From 2696c53bd3d174e9271878f1a842e3a95d61ffae Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 20 Sep 2022 11:31:14 +0800
Subject: [PATCH 4/6] fix conflict

---
 faster_tokenizer/faster_tokenizer/core/base.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/base.h b/faster_tokenizer/faster_tokenizer/core/base.h
index c8d0cddc7e4a..cb4256ef3272 100644
--- a/faster_tokenizer/faster_tokenizer/core/base.h
+++ b/faster_tokenizer/faster_tokenizer/core/base.h
@@ -21,9 +21,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "faster_tokenizer/utils/utils.h"
 #include "glog/logging.h"
 #include "nlohmann/json.hpp"
+#include "faster_tokenizer/utils/utils.h"
 
 namespace std {
 template <>

From 504b1eef9db36cacda4ab101dff2e3536f20e109 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 20 Sep 2022 15:47:00 +0800
Subject: [PATCH 5/6] change h file

---
 faster_tokenizer/faster_tokenizer/core/encoding.cc  | 5 +++++
 faster_tokenizer/faster_tokenizer/core/encoding.h   | 6 ------
 faster_tokenizer/faster_tokenizer/core/tokenizer.cc | 6 ------
 3 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.cc b/faster_tokenizer/faster_tokenizer/core/encoding.cc
index 3dac89c56806..980e192abcbc 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.cc
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.cc
@@ -17,6 +17,11 @@
 limitations under the License. */
 #include <algorithm>
 #include <cassert>
 #include <numeric>
+#include "glog/logging.h"
+
+#ifdef WITH_OMP
+#include <omp.h>
+#endif
 
 namespace paddlenlp {
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.h b/faster_tokenizer/faster_tokenizer/core/encoding.h
index 197ccd9efe9c..12a4bb708635 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.h
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.h
@@ -20,18 +20,12 @@
 limitations under the License. */
 #include <vector>
 #include "faster_tokenizer/core/base.h"
 #include "faster_tokenizer/utils/utils.h"
-#include "glog/logging.h"
 
-#ifdef WITH_OMP
-#include <omp.h>
-#else
-// Replace OMP with std::thread
 #include <functional>
 #include <math.h>
 #include <thread>
 #include <vector>
 using namespace std;
-#endif
 
 namespace paddlenlp {
 namespace faster_tokenizer {
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index cb92976ebbef..5e1ba21f427a 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -30,12 +30,6 @@ limitations under the License. */
 #ifdef WITH_OMP
 #include <omp.h>
 #else
-// Replace OMP with std::thread
-#include <functional>
-#include <math.h>
-#include <thread>
-using namespace std;
-#endif
 
 namespace paddlenlp {
 namespace faster_tokenizer {

From 8924159ce3d4dcbe1b04cc2a3876bca5fb5211cd Mon Sep 17 00:00:00 2001
From: Thomas Young <35565423+HexToString@users.noreply.github.com>
Date: Tue, 20 Sep 2022 19:17:47 +0800
Subject: [PATCH 6/6] Update tokenizer.cc

---
 faster_tokenizer/faster_tokenizer/core/tokenizer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index 5e1ba21f427a..1b6399c4aedf 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -29,7 +29,7 @@ limitations under the License. */
 
 #ifdef WITH_OMP
 #include <omp.h>
-#else
+#endif
 
 namespace paddlenlp {
 namespace faster_tokenizer {
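
Note on the threading pattern this series converges on: GetThreadNum reads OMP_NUM_THREADS from the environment and caps it at ceil(batch_size / 4.0); RunMultiThread then launches one std::thread per contiguous [start_index, start_index + step_index) slice of the batch and joins them all before returning. The program below is a minimal, self-contained sketch of that pattern, not the patched files themselves: the SquareRange worker, the *Sketch names, and the null check on the getenv result are assumptions added for illustration (the patch itself passes the getenv result straight to std::atoi, i.e. it assumes OMP_NUM_THREADS is set).

// range_split_sketch.cc -- standalone illustration of the GetThreadNum /
// RunMultiThread pattern from encoding.cc. Build: g++ -std=c++11 -pthread
#include <cmath>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

// Pick a thread count from OMP_NUM_THREADS, capped at ceil(batch_size / 4.0).
// Unlike the patch, the getenv result is null-checked here (illustrative
// hardening, not part of the series).
static int GetThreadNumSketch(size_t batch_size) {
  const char* env_var = std::getenv("OMP_NUM_THREADS");
  int thread_num = (env_var != nullptr) ? std::atoi(env_var) : 0;
  if (batch_size == 0) return 1;
  int best_num = static_cast<int>(std::ceil(batch_size / 4.0));
  if (thread_num <= 0 || thread_num > best_num) thread_num = best_num;
  return thread_num;
}

// Launch one thread per contiguous [start, start + step) slice and join them
// all, the same shape as RunMultiThread(std::function<void(size_t, size_t)>,
// size_t) in encoding.cc.
static void RunMultiThreadSketch(std::function<void(size_t, size_t)> func,
                                 size_t batch_size) {
  int thread_num = GetThreadNumSketch(batch_size);
  size_t step = static_cast<size_t>(
      std::ceil(batch_size / static_cast<float>(thread_num)));
  std::vector<std::thread> threads;
  size_t start = 0;
  for (int t = 0; t < thread_num; ++t) {
    threads.emplace_back(func, start, step);
    start += step;
  }
  for (auto& th : threads) th.join();
}

// A toy range worker: each thread squares its slice of the batch. Every
// worker clamps its end index, exactly as MultiThreadPadEncodings does,
// because the last slice may run past the end of the batch.
static void SquareRange(std::vector<int>* data, size_t start, size_t step) {
  size_t end = start + step;
  if (end > data->size()) end = data->size();
  for (size_t i = start; i < end; ++i) (*data)[i] *= (*data)[i];
}

int main() {
  std::vector<int> batch = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  // Bind the shared argument up front; the two placeholders become the
  // (start_index, step_index) pair that RunMultiThreadSketch supplies.
  auto func = std::bind(&SquareRange, &batch,
                        std::placeholders::_1, std::placeholders::_2);
  RunMultiThreadSketch(func, batch.size());
  for (int v : batch) std::cout << v << " ";  // 1 4 9 ... 100
  std::cout << std::endl;
  return 0;
}

Because step_index is computed with a floating-point ceil, thread_num * step_index can overshoot the batch, which is why every range worker in the series clamps end_index to batch_size; the disjoint slices also mean no two threads write the same element, so no locking is needed.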
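
For the Tokenizer methods the bound callable additionally carries the object pointer: std::bind takes the pointer-to-member first, then this, then the shared arguments, leaving placeholders _1 and _2 for the (start_index, step_index) pair that RunMultiThread fills in, while std::ref keeps the input batch from being copied into the callable. A toy sketch of that member-function binding, with a made-up FakeTokenizer and MultiThreadMeasure standing in for the real Tokenizer and its MultiThread* workers:

// member_bind_sketch.cc -- how the Tokenizer::MultiThread* workers are bound.
// FakeTokenizer and MultiThreadMeasure are illustrative names only.
#include <functional>
#include <iostream>
#include <string>
#include <vector>

class FakeTokenizer {
 public:
  // Same shape as Tokenizer::MultiThreadEncodeBatchStrings: shared arguments
  // first, then the (start_index, step_index) range supplied per thread.
  void MultiThreadMeasure(const std::vector<std::string>& inputs,
                          std::vector<size_t>* lengths,
                          size_t start_index,
                          size_t step_index) const {
    size_t end_index = start_index + step_index;
    if (end_index > inputs.size()) end_index = inputs.size();
    for (size_t i = start_index; i < end_index; ++i)
      (*lengths)[i] = inputs[i].size();
  }
};

int main() {
  FakeTokenizer tokenizer;
  std::vector<std::string> inputs = {"a", "bb", "ccc"};
  std::vector<size_t> lengths(inputs.size());
  // Pointer-to-member + object pointer + shared args + the two range
  // placeholders; the result is compatible with
  // std::function<void(size_t, size_t)> and could be handed to
  // RunMultiThread (or called directly, as here).
  auto func = std::bind(&FakeTokenizer::MultiThreadMeasure,
                        &tokenizer,
                        std::ref(inputs),
                        &lengths,
                        std::placeholders::_1,
                        std::placeholders::_2);
  func(0, inputs.size());  // one slice covering the whole batch
  std::cout << lengths[0] << " " << lengths[1] << " " << lengths[2]
            << std::endl;  // prints: 1 2 3
  return 0;
}

Without std::ref, std::bind would copy the entire batch into the callable once per EncodeBatchStrings call; binding by reference is safe here because RunMultiThread joins every thread before the batch goes out of scope.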