From 2da083f4b27c66c0e443a0415dcfa9377ea238ce Mon Sep 17 00:00:00 2001
From: HexToString <506181616@qq.com>
Date: Mon, 19 Sep 2022 19:55:38 +0800
Subject: [PATCH 1/6] fix bug and codestyle

---
 .../faster_tokenizer/core/encoding.cc         |  62 ++++++++-
 .../faster_tokenizer/core/encoding.h          |  12 ++
 .../faster_tokenizer/core/tokenizer.cc        | 130 +++++++++++++++++-
 .../faster_tokenizer/core/tokenizer.h         |  20 +++
 4 files changed, 215 insertions(+), 9 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.cc b/faster_tokenizer/faster_tokenizer/core/encoding.cc
index c3a7fcbfd50b..d5d8843c0c59 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.cc
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.cc
@@ -17,11 +17,7 @@
 limitations under the License. */
 #include <algorithm>
 #include <cassert>
 #include <numeric>
-#include "glog/logging.h"
-#ifdef WITH_OMP
-#include <omp.h>
-#endif
 
 namespace paddlenlp {
 namespace faster_tokenizer {
 namespace core {
@@ -600,6 +596,23 @@ bool TruncateEncodings(Encoding* encoding,
   return true;
 }
 
+void MultiThreadPadEncodings(std::vector<Encoding>* encodings,
+                  const PadMethod& method,
+                  size_t pad_length,
+                  size_t start_index,
+                  size_t step_index) {
+  auto batch_size = encodings->size();
+  size_t end_index = start_index+step_index;
+  if(end_index>batch_size) end_index = batch_size;
+  for (size_t i = start_index; i < end_index; ++i) {
+      auto& encoding = (*encodings)[i];
+      encoding.Pad(pad_length,
+                  method.pad_id_,
+                  method.pad_token_type_id_,
+                  method.pad_token_,
+                  method.direction_);
+  }
+}
 void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
   if (encodings == nullptr || encodings->empty()) {
     return;
@@ -619,7 +632,6 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
   auto batch_size = encodings->size();
 #ifdef WITH_OMP
 #pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
-#endif
   for (int i = 0; i < batch_size; ++i) {
     auto& encoding = (*encodings)[i];
     encoding.Pad(pad_length,
@@ -628,6 +640,46 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
                  method.pad_token_,
                  method.direction_);
   }
+#else
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/thread_num);
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(&MultiThreadPadEncodings,
+                  encodings,
+                  std::ref(method),
+                  pad_length,
+                  start_index,
+                  step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread[thread_index].join();
+  }
+  vectorOfThread.clear();
+#endif
+
+}
+
+int GetThreadNum(size_t batch_size){
+  char* env_var = std::getenv("OMP_NUM_THREADS");
+  int thread_num = std::atoi(env_var);
+  if(batch_size <=0){
+    thread_num = 1;
+    VLOG(0) << "batch_size <=0, we set OMP_NUM_THREADS = 1";
+  }else{
+    int best_num = ceil(batch_size/4);
+    if(thread_num > best_num){
+      thread_num = best_num;
+      VLOG(0) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = batch_size/4";
+    }else if(thread_num == 0){
+      thread_num = best_num;
+      VLOG(0) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4";
+    }
+  }
+  return thread_num;
 }
 
 }  // namespace core
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.h b/faster_tokenizer/faster_tokenizer/core/encoding.h
index 34f5a93bdec2..756e23313da8 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.h
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.h
@@ -20,6 +20,17 @@
 limitations under the License. */
 #include <vector>
 #include "faster_tokenizer/core/base.h"
 #include "faster_tokenizer/utils/utils.h"
+#include "glog/logging.h"
+
+#ifdef WITH_OMP
+#include <omp.h>
+#else
+// Replace OMP with std::thread
+#include <math.h>
+#include <vector>
+#include <functional>
+using namespace std;
+#endif
 
 namespace paddlenlp {
@@ -122,6 +133,7 @@ bool FASTERTOKENIZER_DECL TruncateEncodings(Encoding* encoding,
 void FASTERTOKENIZER_DECL PadEncodings(std::vector<Encoding>* encoding,
                                        const PadMethod& method);
 
+int GetThreadNum(size_t batch_size);
 }  // namespace core
 }  // namespace faster_tokenizer
 }  // namespace paddlenlp
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index 626910584486..348ccea56571 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -26,8 +26,16 @@ limitations under the License. */
 #include "faster_tokenizer/postprocessors/postprocessors.h"
 #include "faster_tokenizer/pretokenizers/pretokenizers.h"
 
+
+
 #ifdef WITH_OMP
 #include <omp.h>
+#else
+// Replace OMP with std::thread
+#include <math.h>
+#include <thread>
+#include <functional>
+using namespace std;
 #endif
 
 namespace paddlenlp {
@@ -248,23 +256,61 @@ void Tokenizer::EncodePairStrings(const EncodeInput& encode_input,
   }
 }
 
+void Tokenizer::MultiThreadEncodeBatchStrings(
+    const std::vector<EncodeInput>& batch_encode_input,
+    std::vector<Encoding>* encodings,
+    bool add_special_tokens,
+    size_t start_index,
+    size_t step_index) const {
+
+    auto batch_size = batch_encode_input.size();
+    size_t end_index = start_index+step_index;
+    if(end_index>batch_size) end_index = batch_size;
+    for (size_t i = start_index; i < end_index; ++i) {
+      EncodePairStrings(
+          batch_encode_input[i], &(*encodings)[i], add_special_tokens);
+    }
+}
+
 void Tokenizer::EncodeBatchStrings(
     const std::vector<EncodeInput>& batch_encode_input,
     std::vector<Encoding>* encodings,
     bool add_special_tokens) const {
   auto batch_size = batch_encode_input.size();
   encodings->resize(batch_size);
+
 #ifdef WITH_OMP
 // (TODO:zhoushunjie): Simply use the batch size to estimate the workload of
 // tokenization.
 // Use workload to determine whether create omp threads. Need to optimize the
 // workload estimation.
 #pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
-#endif
   for (int i = 0; i < batch_size; ++i) {
     EncodePairStrings(
         batch_encode_input[i], &(*encodings)[i], add_special_tokens);
   }
+#else
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/thread_num);
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadEncodeBatchStrings,
+                  this,
+                  std::ref(batch_encode_input),
+                  encodings,
+                  add_special_tokens,
+                  start_index,
+                  step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread[thread_index].join();
+  }
+  vectorOfThread.clear();
+#endif
+
   if (use_padding_) {
     PadEncodings(encodings, pad_method_);
   }
@@ -289,6 +335,24 @@ void Tokenizer::EncodePairStringsCharOffsets(const EncodeInput& encode_input,
   PostProcess(&encoding, &pair_encoding, add_special_tokens, encodings);
 }
 
+void Tokenizer::MultiThreadEncodeBatchStringsCharOffsets(
+    const std::vector<EncodeInput>& batch_encode_input,
+    std::vector<Encoding>* encodings,
+    bool add_special_tokens,
+    size_t start_index,
+    size_t step_index) const {
+
+    auto batch_size = batch_encode_input.size();
+    size_t end_index = start_index+step_index;
+    if( end_index>batch_size ) end_index = batch_size;
+    for (size_t i = start_index; i < end_index; ++i) {
+      Encoding encoding;
+      EncodePairStringsCharOffsets(
+          batch_encode_input[i], &encoding, add_special_tokens);
+      (*encodings)[i] = std::move(encoding);
+    }
+}
+
 void Tokenizer::EncodeBatchStringsCharOffsets(
     const std::vector<EncodeInput>& batch_encode_input,
     std::vector<Encoding>* encodings,
@@ -301,13 +365,34 @@ void Tokenizer::EncodeBatchStringsCharOffsets(
 // Use workload to determine whether create omp threads. Need to optimize the
 // workload estimation.
 #pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
-#endif
   for (int i = 0; i < batch_size; ++i) {
     Encoding encoding;
     EncodePairStringsCharOffsets(
         batch_encode_input[i], &encoding, add_special_tokens);
     (*encodings)[i] = std::move(encoding);
   }
+#else
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/thread_num);
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadEncodeBatchStringsCharOffsets,
+                  this,
+                  std::ref(batch_encode_input),
+                  encodings,
+                  add_special_tokens,
+                  start_index,
+                  step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+    vectorOfThread[thread_index].join();
+  }
+  vectorOfThread.clear();
+#endif
+
   if (use_padding_) {
     PadEncodings(encodings, pad_method_);
   }
@@ -404,11 +489,28 @@ void Tokenizer::Decode(const std::vector<uint32_t>& token_ids,
   }
 }
 
+
+void Tokenizer::MultiThreadDecodeBatch(
+    const std::vector<std::vector<uint32_t>>& batch_token_ids,
+    std::vector<std::string>* results,
+    bool skip_special_tokens,
+    size_t start_index,
+    size_t step_index) const {
+
+    auto batch_size = batch_token_ids.size();
+    size_t end_index = start_index+step_index;
+    if( end_index>batch_size ) end_index = batch_size;
+    for (size_t i = start_index; i < end_index; ++i) {
+      Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens);
+    }
+}
+
 void Tokenizer::DecodeBatch(
     const std::vector<std::vector<uint32_t>>& batch_token_ids,
     std::vector<std::string>* results,
     bool skip_special_tokens) const {
-  results->resize(batch_token_ids.size());
+  auto batch_size = batch_token_ids.size();
+  results->resize(batch_size);
 #ifdef WITH_OMP
 // (TODO:zhoushunjie): Simply use the batch size to estimate the workload of
 // tokenization.
@@ -416,10 +518,30 @@ void Tokenizer::DecodeBatch(
 // Use workload to determine whether create omp threads. Need to optimize the
 // workload estimation.
 #pragma omp parallel for if (batch_token_ids.size() >= 4 && \
                              omp_get_num_threads() > 1)
-#endif
   for (int i = 0; i < batch_token_ids.size(); ++i) {
     Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens);
   }
+#else
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/thread_num);
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadDecodeBatch,
+                  this,
+                  std::ref(batch_token_ids),
+                  results,
+                  skip_special_tokens,
+                  start_index,
+                  step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+    vectorOfThread[thread_index].join();
+  }
+  vectorOfThread.clear();
+#endif
 }
 
 bool Tokenizer::GetUseTruncation() const { return use_truncation_; }
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.h b/faster_tokenizer/faster_tokenizer/core/tokenizer.h
index bf317efe1b98..a5df094bf2ae 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.h
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.h
@@ -160,10 +160,24 @@ class FASTERTOKENIZER_DECL Tokenizer {
                          bool add_special_tokens,
                          Encoding* result_encoding) const;
 
+  void MultiThreadEncodeBatchStrings(
+        const std::vector<EncodeInput>& batch_encode_input,
+        std::vector<Encoding>* encodings,
+        bool add_special_tokens,
+        size_t start_index,
+        size_t step_index) const;
+
   void EncodeBatchStrings(const std::vector<EncodeInput>& batch_encode_input,
                           std::vector<Encoding>* encodings,
                           bool add_special_tokens = true) const;
 
+  void MultiThreadEncodeBatchStringsCharOffsets(
+        const std::vector<EncodeInput>& batch_encode_input,
+        std::vector<Encoding>* encodings,
+        bool add_special_tokens,
+        size_t start_index,
+        size_t step_index) const;
+
   void EncodeBatchStringsCharOffsets(
       const std::vector<EncodeInput>& batch_encode_input,
       std::vector<Encoding>* encodings,
@@ -194,6 +208,12 @@ class FASTERTOKENIZER_DECL Tokenizer {
   void Decode(const std::vector<uint32_t>& token_ids,
               std::string* result,
               bool skip_special_tokens = true) const;
+  void MultiThreadDecodeBatch(
+        const std::vector<std::vector<uint32_t>>& batch_token_ids,
+        std::vector<std::string>* results,
+        bool skip_special_tokens,
+        size_t start_index,
+        size_t step_index) const;
   void DecodeBatch(const std::vector<std::vector<uint32_t>>& batch_token_ids,
                    std::vector<std::string>* results,
                    bool skip_special_tokens = true) const;

From 310b91b17c5d4e0c23a4fcac14dc8243186b81b5 Mon Sep 17 00:00:00 2001
From: HexToString <506181616@qq.com>
Date: Tue, 20 Sep 2022 10:38:11 +0800
Subject: [PATCH 2/6] save change

---
 .../faster_tokenizer/core/encoding.cc         | 48 ++++++-----
 .../faster_tokenizer/core/encoding.h          |  5 +-
 .../faster_tokenizer/core/tokenizer.cc        | 81 ++++++-------------
 3 files changed, 54 insertions(+), 80 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.cc b/faster_tokenizer/faster_tokenizer/core/encoding.cc
index d5d8843c0c59..95b90c88dae2 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.cc
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.cc
@@ -641,24 +641,13 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
                  method.direction_);
   }
 #else
-  int thread_num = GetThreadNum(batch_size);
-  std::vector<std::thread> vectorOfThread;
-  size_t start_index = 0;
-  size_t step_index = ceil(batch_size/thread_num);
-
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread.emplace_back(std::thread(&MultiThreadPadEncodings,
-                  encodings,
-                  std::ref(method),
-                  pad_length,
-                  start_index,
-                  step_index));
-    start_index = start_index + step_index;
-  }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread[thread_index].join();
-  }
-  vectorOfThread.clear();
+  auto func = std::bind(&MultiThreadPadEncodings,
+                  encodings,
+                  std::ref(method),
+                  pad_length,
+                  std::placeholders::_1,
+                  std::placeholders::_2);
+  RunMultiThread(func,batch_size);
 #endif
 
@@ -668,20 +657,35 @@ int GetThreadNum(size_t batch_size){
   char* env_var = std::getenv("OMP_NUM_THREADS");
   int thread_num = std::atoi(env_var);
   if(batch_size <=0){
     thread_num = 1;
-    VLOG(0) << "batch_size <=0, we set OMP_NUM_THREADS = 1";
+    VLOG(3) << "batch_size <=0, we set OMP_NUM_THREADS = 1";
   }else{
-    int best_num = ceil(batch_size/4);
+    int best_num = ceil(batch_size/4.0);
     if(thread_num > best_num){
       thread_num = best_num;
-      VLOG(0) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = batch_size/4";
+      VLOG(3) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = batch_size/4";
     }else if(thread_num == 0){
       thread_num = best_num;
-      VLOG(0) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4";
+      VLOG(3) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4";
     }
   }
   return thread_num;
 }
 
+void RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size){
+  int thread_num = GetThreadNum(batch_size);
+  std::vector<std::thread> vectorOfThread;
+  size_t start_index = 0;
+  size_t step_index = ceil(batch_size/float(thread_num));
+
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+    vectorOfThread.emplace_back(std::thread(func, start_index, step_index));
+    start_index = start_index + step_index;
+  }
+  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+    vectorOfThread[thread_index].join();
+  }
+}
+
 }  // namespace core
 }  // namespace faster_tokenizer
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.h b/faster_tokenizer/faster_tokenizer/core/encoding.h
index 756e23313da8..b362732d106f 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.h
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.h
@@ -29,6 +29,7 @@ limitations under the License. */
 // Replace OMP with std::thread
 #include <math.h>
 #include <vector>
 #include <functional>
+#include <thread>
 using namespace std;
 #endif
@@ -133,7 +134,9 @@ bool FASTERTOKENIZER_DECL TruncateEncodings(Encoding* encoding,
 void FASTERTOKENIZER_DECL PadEncodings(std::vector<Encoding>* encoding,
                                        const PadMethod& method);
 
-int GetThreadNum(size_t batch_size);
+int FASTERTOKENIZER_DECL GetThreadNum(size_t batch_size);
+
+void FASTERTOKENIZER_DECL RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size);
 }  // namespace core
 }  // namespace faster_tokenizer
 }  // namespace paddlenlp
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index 348ccea56571..c2b05256aedf 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -290,25 +290,14 @@ void Tokenizer::EncodeBatchStrings(
         batch_encode_input[i], &(*encodings)[i], add_special_tokens);
   }
 #else
-  int thread_num = GetThreadNum(batch_size);
-  std::vector<std::thread> vectorOfThread;
-  size_t start_index = 0;
-  size_t step_index = ceil(batch_size/thread_num);
-
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadEncodeBatchStrings,
-                  this,
-                  std::ref(batch_encode_input),
-                  encodings,
-                  add_special_tokens,
-                  start_index,
-                  step_index));
-    start_index = start_index + step_index;
-  }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread[thread_index].join();
-  }
-  vectorOfThread.clear();
+  auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStrings,
+                  this,
+                  std::ref(batch_encode_input),
+                  encodings,
+                  add_special_tokens,
+                  std::placeholders::_1,
+                  std::placeholders::_2);
+  RunMultiThread(func,batch_size);
 #endif
 
   if (use_padding_) {
@@ -372,25 +361,14 @@ void Tokenizer::EncodeBatchStringsCharOffsets(
     (*encodings)[i] = std::move(encoding);
   }
 #else
-  int thread_num = GetThreadNum(batch_size);
-  std::vector<std::thread> vectorOfThread;
-  size_t start_index = 0;
-  size_t step_index = ceil(batch_size/thread_num);
-
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadEncodeBatchStringsCharOffsets,
-                  this,
-                  std::ref(batch_encode_input),
-                  encodings,
-                  add_special_tokens,
-                  start_index,
-                  step_index));
-    start_index = start_index + step_index;
-  }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
-    vectorOfThread[thread_index].join();
-  }
-  vectorOfThread.clear();
+  auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStringsCharOffsets,
+                        this,
+                        std::ref(batch_encode_input),
+                        encodings,
+                        add_special_tokens,
+                        std::placeholders::_1,
+                        std::placeholders::_2);
+  RunMultiThread(func,batch_size);
 #endif
 
   if (use_padding_) {
@@ -522,25 +500,14 @@ void Tokenizer::DecodeBatch(
     Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens);
   }
 #else
-  int thread_num = GetThreadNum(batch_size);
-  std::vector<std::thread> vectorOfThread;
-  size_t start_index = 0;
-  size_t step_index = ceil(batch_size/thread_num);
-
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
-    vectorOfThread.emplace_back(std::thread(&Tokenizer::MultiThreadDecodeBatch,
-                  this,
-                  std::ref(batch_token_ids),
-                  results,
-                  skip_special_tokens,
-                  start_index,
-                  step_index));
-    start_index = start_index + step_index;
-  }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
-    vectorOfThread[thread_index].join();
-  }
-  vectorOfThread.clear();
+  auto func = std::bind(&Tokenizer::MultiThreadDecodeBatch,
+                  this,
+                  std::ref(batch_token_ids),
+                  results,
+                  skip_special_tokens,
+                  std::placeholders::_1,
+                  std::placeholders::_2);
+  RunMultiThread(func,batch_size);
 #endif
 }
 
 bool Tokenizer::GetUseTruncation() const { return use_truncation_; }

From 8062ed166043bbc405c48d6459db066950884604 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 20 Sep 2022 11:18:42 +0800
Subject: [PATCH 3/6] change code style

---
 faster_tokenizer/faster_tokenizer/core/base.h |  2 +-
 .../faster_tokenizer/core/encoding.cc         | 67 ++++++++-------
 .../faster_tokenizer/core/encoding.h          |  7 +-
 .../faster_tokenizer/core/tokenizer.cc        | 86 +++++++++----------
 .../faster_tokenizer/core/tokenizer.h         | 32 +++----
 5 files changed, 96 insertions(+), 98 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/base.h b/faster_tokenizer/faster_tokenizer/core/base.h
index cb4256ef3272..c8d0cddc7e4a 100644
--- a/faster_tokenizer/faster_tokenizer/core/base.h
+++ b/faster_tokenizer/faster_tokenizer/core/base.h
@@ -21,9 +21,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "faster_tokenizer/utils/utils.h"
 #include "glog/logging.h"
 #include "nlohmann/json.hpp"
-#include "faster_tokenizer/utils/utils.h"
 
 namespace std {
 template <>
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.cc b/faster_tokenizer/faster_tokenizer/core/encoding.cc
index 95b90c88dae2..3dac89c56806 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.cc
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.cc
@@ -597,20 +597,20 @@ bool TruncateEncodings(Encoding* encoding,
 }
 
 void MultiThreadPadEncodings(std::vector<Encoding>* encodings,
-                  const PadMethod& method,
-                  size_t pad_length,
-                  size_t start_index,
-                  size_t step_index) {
+                             const PadMethod& method,
+                             size_t pad_length,
+                             size_t start_index,
+                             size_t step_index) {
   auto batch_size = encodings->size();
-  size_t end_index = start_index+step_index;
-  if(end_index>batch_size) end_index = batch_size;
+  size_t end_index = start_index + step_index;
+  if (end_index > batch_size) end_index = batch_size;
   for (size_t i = start_index; i < end_index; ++i) {
-      auto& encoding = (*encodings)[i];
-      encoding.Pad(pad_length,
-                  method.pad_id_,
-                  method.pad_token_type_id_,
-                  method.pad_token_,
-                  method.direction_);
+    auto& encoding = (*encodings)[i];
+    encoding.Pad(pad_length,
+                 method.pad_id_,
+                 method.pad_token_type_id_,
+                 method.pad_token_,
+                 method.direction_);
   }
 }
 void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
   if (encodings == nullptr || encodings->empty()) {
     return;
@@ -641,48 +641,49 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
                  method.direction_);
   }
 #else
-  auto func = std::bind(&MultiThreadPadEncodings,
-                  encodings,
-                  std::ref(method),
-                  pad_length,
-                  std::placeholders::_1,
-                  std::placeholders::_2);
-  RunMultiThread(func,batch_size);
+  auto func = std::bind(&MultiThreadPadEncodings,
+                        encodings,
+                        std::ref(method),
+                        pad_length,
+                        std::placeholders::_1,
+                        std::placeholders::_2);
+  RunMultiThread(func, batch_size);
 #endif
-
 }
 
-int GetThreadNum(size_t batch_size){
+int GetThreadNum(size_t batch_size) {
   char* env_var = std::getenv("OMP_NUM_THREADS");
   int thread_num = std::atoi(env_var);
-  if(batch_size <=0){
+  if (batch_size <= 0) {
     thread_num = 1;
     VLOG(3) << "batch_size <=0, we set OMP_NUM_THREADS = 1";
-  }else{
-    int best_num = ceil(batch_size/4.0);
-    if(thread_num > best_num){
+  } else {
+    int best_num = ceil(batch_size / 4.0);
+    if (thread_num > best_num) {
       thread_num = best_num;
-      VLOG(3) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = batch_size/4";
-    }else if(thread_num == 0){
+      VLOG(3) << "OMP_NUM_THREADS > batch_size/4, we set OMP_NUM_THREADS = "
+                 "batch_size/4";
+    } else if (thread_num == 0) {
       thread_num = best_num;
       VLOG(3) << "OMP_NUM_THREADS == 0, we set OMP_NUM_THREADS = batch_size/4";
     }
   }
   return thread_num;
 }
 
-void RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size){
+void RunMultiThread(std::function<void(size_t, size_t)> func,
+                    size_t batch_size) {
   int thread_num = GetThreadNum(batch_size);
   std::vector<std::thread> vectorOfThread;
   size_t start_index = 0;
-  size_t step_index = ceil(batch_size/float(thread_num));
+  size_t step_index = ceil(batch_size / float(thread_num));
 
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++){
+  for (size_t thread_index = 0; thread_index < thread_num; thread_index++) {
     vectorOfThread.emplace_back(std::thread(func, start_index, step_index));
     start_index = start_index + step_index;
   }
-  for(size_t thread_index = 0; thread_index < thread_num; thread_index++) {
+  for (size_t thread_index = 0; thread_index < thread_num; thread_index++) {
     vectorOfThread[thread_index].join();
   }
 }
 
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.h b/faster_tokenizer/faster_tokenizer/core/encoding.h
index b362732d106f..197ccd9efe9c 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.h
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.h
@@ -26,10 +26,10 @@ limitations under the License. */
 #include <omp.h>
 #else
 // Replace OMP with std::thread
-#include <math.h>
-#include <vector>
 #include <functional>
+#include <math.h>
 #include <thread>
+#include <vector>
 using namespace std;
 #endif
@@ -136,7 +136,8 @@ void FASTERTOKENIZER_DECL PadEncodings(std::vector<Encoding>* encoding,
                                        const PadMethod& method);
 
 int FASTERTOKENIZER_DECL GetThreadNum(size_t batch_size);
 
-void FASTERTOKENIZER_DECL RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size);
+void FASTERTOKENIZER_DECL
+RunMultiThread(std::function<void(size_t, size_t)> func, size_t batch_size);
 }  // namespace core
 }  // namespace faster_tokenizer
 }  // namespace paddlenlp
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index c2b05256aedf..cb92976ebbef 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -27,14 +27,13 @@ limitations under the License. */
*/ #include "faster_tokenizer/pretokenizers/pretokenizers.h" - #ifdef WITH_OMP #include #else // Replace OMP with std::thread +#include #include #include -#include using namespace std; #endif @@ -262,14 +261,13 @@ void Tokenizer::MultiThreadEncodeBatchStrings( bool add_special_tokens, size_t start_index, size_t step_index) const { - - auto batch_size = batch_encode_input.size(); - size_t end_index = start_index+step_index; - if(end_index>batch_size) end_index = batch_size; - for (size_t i = start_index; i < end_index; ++i) { - EncodePairStrings( - batch_encode_input[i], &(*encodings)[i], add_special_tokens); - } + auto batch_size = batch_encode_input.size(); + size_t end_index = start_index + step_index; + if (end_index > batch_size) end_index = batch_size; + for (size_t i = start_index; i < end_index; ++i) { + EncodePairStrings( + batch_encode_input[i], &(*encodings)[i], add_special_tokens); + } } void Tokenizer::EncodeBatchStrings( @@ -290,16 +288,16 @@ void Tokenizer::EncodeBatchStrings( batch_encode_input[i], &(*encodings)[i], add_special_tokens); } #else - auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStrings, - this, - std::ref(batch_encode_input), - encodings, - add_special_tokens, - std::placeholders::_1, - std::placeholders::_2); - RunMultiThread(func,batch_size); + auto func = std::bind(&Tokenizer::MultiThreadEncodeBatchStrings, + this, + std::ref(batch_encode_input), + encodings, + add_special_tokens, + std::placeholders::_1, + std::placeholders::_2); + RunMultiThread(func, batch_size); #endif - + if (use_padding_) { PadEncodings(encodings, pad_method_); } @@ -330,16 +328,15 @@ void Tokenizer::MultiThreadEncodeBatchStringsCharOffsets( bool add_special_tokens, size_t start_index, size_t step_index) const { - - auto batch_size = batch_encode_input.size(); - size_t end_index = start_index+step_index; - if( end_index>batch_size ) end_index = batch_size; - for (size_t i = start_index; i < end_index; ++i) { - Encoding encoding; - EncodePairStringsCharOffsets( - batch_encode_input[i], &encoding, add_special_tokens); - (*encodings)[i] = std::move(encoding); - } + auto batch_size = batch_encode_input.size(); + size_t end_index = start_index + step_index; + if (end_index > batch_size) end_index = batch_size; + for (size_t i = start_index; i < end_index; ++i) { + Encoding encoding; + EncodePairStringsCharOffsets( + batch_encode_input[i], &encoding, add_special_tokens); + (*encodings)[i] = std::move(encoding); + } } void Tokenizer::EncodeBatchStringsCharOffsets( @@ -368,7 +365,7 @@ void Tokenizer::EncodeBatchStringsCharOffsets( add_special_tokens, std::placeholders::_1, std::placeholders::_2); - RunMultiThread(func,batch_size); + RunMultiThread(func, batch_size); #endif if (use_padding_) { @@ -474,13 +471,12 @@ void Tokenizer::MultiThreadDecodeBatch( bool skip_special_tokens, size_t start_index, size_t step_index) const { - - auto batch_size = batch_token_ids.size(); - size_t end_index = start_index+step_index; - if( end_index>batch_size ) end_index = batch_size; - for (size_t i = start_index; i < end_index; ++i) { - Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens); - } + auto batch_size = batch_token_ids.size(); + size_t end_index = start_index + step_index; + if (end_index > batch_size) end_index = batch_size; + for (size_t i = start_index; i < end_index; ++i) { + Decode(batch_token_ids[i], &(*results)[i], skip_special_tokens); + } } void Tokenizer::DecodeBatch( @@ -500,14 +496,14 @@ void Tokenizer::DecodeBatch( Decode(batch_token_ids[i], &(*results)[i], 
   }
 #else
-  auto func = std::bind(&Tokenizer::MultiThreadDecodeBatch,
-                  this,
-                  std::ref(batch_token_ids),
-                  results,
-                  skip_special_tokens,
-                  std::placeholders::_1,
-                  std::placeholders::_2);
-  RunMultiThread(func,batch_size);
+  auto func = std::bind(&Tokenizer::MultiThreadDecodeBatch,
+                        this,
+                        std::ref(batch_token_ids),
+                        results,
+                        skip_special_tokens,
+                        std::placeholders::_1,
+                        std::placeholders::_2);
+  RunMultiThread(func, batch_size);
 #endif
 }
 
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.h b/faster_tokenizer/faster_tokenizer/core/tokenizer.h
index a5df094bf2ae..d709cc5a5c6e 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.h
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.h
@@ -161,23 +161,23 @@ class FASTERTOKENIZER_DECL Tokenizer {
                          Encoding* result_encoding) const;
 
   void MultiThreadEncodeBatchStrings(
-        const std::vector<EncodeInput>& batch_encode_input,
-        std::vector<Encoding>* encodings,
-        bool add_special_tokens,
-        size_t start_index,
-        size_t step_index) const;
+      const std::vector<EncodeInput>& batch_encode_input,
+      std::vector<Encoding>* encodings,
+      bool add_special_tokens,
+      size_t start_index,
+      size_t step_index) const;
 
   void EncodeBatchStrings(const std::vector<EncodeInput>& batch_encode_input,
                           std::vector<Encoding>* encodings,
                           bool add_special_tokens = true) const;
 
   void MultiThreadEncodeBatchStringsCharOffsets(
-        const std::vector<EncodeInput>& batch_encode_input,
-        std::vector<Encoding>* encodings,
-        bool add_special_tokens,
-        size_t start_index,
-        size_t step_index) const;
-
+      const std::vector<EncodeInput>& batch_encode_input,
+      std::vector<Encoding>* encodings,
+      bool add_special_tokens,
+      size_t start_index,
+      size_t step_index) const;
+
   void EncodeBatchStringsCharOffsets(
       const std::vector<EncodeInput>& batch_encode_input,
       std::vector<Encoding>* encodings,
@@ -209,11 +209,11 @@ class FASTERTOKENIZER_DECL Tokenizer {
   void Decode(const std::vector<uint32_t>& token_ids,
               std::string* result,
               bool skip_special_tokens = true) const;
   void MultiThreadDecodeBatch(
-        const std::vector<std::vector<uint32_t>>& batch_token_ids,
-        std::vector<std::string>* results,
-        bool skip_special_tokens,
-        size_t start_index,
-        size_t step_index) const;
+      const std::vector<std::vector<uint32_t>>& batch_token_ids,
+      std::vector<std::string>* results,
+      bool skip_special_tokens,
+      size_t start_index,
+      size_t step_index) const;
   void DecodeBatch(const std::vector<std::vector<uint32_t>>& batch_token_ids,
                    std::vector<std::string>* results,
                    bool skip_special_tokens = true) const;

From 2696c53bd3d174e9271878f1a842e3a95d61ffae Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 20 Sep 2022 11:31:14 +0800
Subject: [PATCH 4/6] fix conflict

---
 faster_tokenizer/faster_tokenizer/core/base.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/base.h b/faster_tokenizer/faster_tokenizer/core/base.h
index c8d0cddc7e4a..cb4256ef3272 100644
--- a/faster_tokenizer/faster_tokenizer/core/base.h
+++ b/faster_tokenizer/faster_tokenizer/core/base.h
@@ -21,9 +21,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "faster_tokenizer/utils/utils.h"
 #include "glog/logging.h"
 #include "nlohmann/json.hpp"
+#include "faster_tokenizer/utils/utils.h"
 
 namespace std {
 template <>

From 504b1eef9db36cacda4ab101dff2e3536f20e109 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 20 Sep 2022 15:47:00 +0800
Subject: [PATCH 5/6] change h file

---
 faster_tokenizer/faster_tokenizer/core/encoding.cc  | 5 +++++
 faster_tokenizer/faster_tokenizer/core/encoding.h   | 6 ------
 faster_tokenizer/faster_tokenizer/core/tokenizer.cc | 6 ------
 3 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.cc b/faster_tokenizer/faster_tokenizer/core/encoding.cc
index 3dac89c56806..980e192abcbc 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.cc
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.cc
@@ -17,6 +17,11 @@
 limitations under the License. */
 #include <algorithm>
 #include <cassert>
 #include <numeric>
+#include "glog/logging.h"
+
+#ifdef WITH_OMP
+#include <omp.h>
+#endif
 
 namespace paddlenlp {
diff --git a/faster_tokenizer/faster_tokenizer/core/encoding.h b/faster_tokenizer/faster_tokenizer/core/encoding.h
index 197ccd9efe9c..12a4bb708635 100644
--- a/faster_tokenizer/faster_tokenizer/core/encoding.h
+++ b/faster_tokenizer/faster_tokenizer/core/encoding.h
@@ -20,18 +20,12 @@
 limitations under the License. */
 #include <vector>
 #include "faster_tokenizer/core/base.h"
 #include "faster_tokenizer/utils/utils.h"
-#include "glog/logging.h"
 
-#ifdef WITH_OMP
-#include <omp.h>
-#else
-// Replace OMP with std::thread
 #include <functional>
 #include <math.h>
 #include <thread>
 #include <vector>
 using namespace std;
-#endif
 
 namespace paddlenlp {
 namespace faster_tokenizer {
diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index cb92976ebbef..5e1ba21f427a 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -30,12 +30,6 @@ limitations under the License. */
 #ifdef WITH_OMP
 #include <omp.h>
 #else
-// Replace OMP with std::thread
-#include <functional>
-#include <math.h>
-#include <thread>
-using namespace std;
-#endif
 
 namespace paddlenlp {
 namespace faster_tokenizer {

From 8924159ce3d4dcbe1b04cc2a3876bca5fb5211cd Mon Sep 17 00:00:00 2001
From: Thomas Young <35565423+HexToString@users.noreply.github.com>
Date: Tue, 20 Sep 2022 19:17:47 +0800
Subject: [PATCH 6/6] Update tokenizer.cc

---
 faster_tokenizer/faster_tokenizer/core/tokenizer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
index 5e1ba21f427a..1b6399c4aedf 100644
--- a/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
+++ b/faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -29,7 +29,7 @@ limitations under the License. */
 
 #ifdef WITH_OMP
 #include <omp.h>
-#else
+#endif
 
 namespace paddlenlp {
 namespace faster_tokenizer {
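
Note on the threading pattern this series converges on: GetThreadNum reads OMP_NUM_THREADS from the environment and caps it at ceil(batch_size / 4.0); RunMultiThread then launches one std::thread per contiguous [start_index, start_index + step_index) slice of the batch and joins them all before returning. The program below is a minimal, self-contained sketch of that pattern, not the patched files themselves: the SquareRange worker, the *Sketch names, and the null check on the getenv result are assumptions added for illustration (the patch itself passes the getenv result straight to std::atoi, i.e. it assumes OMP_NUM_THREADS is set).

// range_split_sketch.cc -- standalone illustration of the GetThreadNum /
// RunMultiThread pattern from encoding.cc. Build: g++ -std=c++11 -pthread
#include <cmath>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

// Pick a thread count from OMP_NUM_THREADS, capped at ceil(batch_size / 4.0).
// Unlike the patch, the getenv result is null-checked here (illustrative
// hardening, not part of the series).
static int GetThreadNumSketch(size_t batch_size) {
  const char* env_var = std::getenv("OMP_NUM_THREADS");
  int thread_num = (env_var != nullptr) ? std::atoi(env_var) : 0;
  if (batch_size == 0) return 1;
  int best_num = static_cast<int>(std::ceil(batch_size / 4.0));
  if (thread_num <= 0 || thread_num > best_num) thread_num = best_num;
  return thread_num;
}

// Launch one thread per contiguous [start, start + step) slice and join them
// all, the same shape as RunMultiThread(std::function<void(size_t, size_t)>,
// size_t) in encoding.cc.
static void RunMultiThreadSketch(std::function<void(size_t, size_t)> func,
                                 size_t batch_size) {
  int thread_num = GetThreadNumSketch(batch_size);
  size_t step = static_cast<size_t>(
      std::ceil(batch_size / static_cast<float>(thread_num)));
  std::vector<std::thread> threads;
  size_t start = 0;
  for (int t = 0; t < thread_num; ++t) {
    threads.emplace_back(func, start, step);
    start += step;
  }
  for (auto& th : threads) th.join();
}

// A toy range worker: each thread squares its slice of the batch. Every
// worker clamps its end index, exactly as MultiThreadPadEncodings does,
// because the last slice may run past the end of the batch.
static void SquareRange(std::vector<int>* data, size_t start, size_t step) {
  size_t end = start + step;
  if (end > data->size()) end = data->size();
  for (size_t i = start; i < end; ++i) (*data)[i] *= (*data)[i];
}

int main() {
  std::vector<int> batch = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  // Bind the shared argument up front; the two placeholders become the
  // (start_index, step_index) pair that RunMultiThreadSketch supplies.
  auto func = std::bind(&SquareRange, &batch,
                        std::placeholders::_1, std::placeholders::_2);
  RunMultiThreadSketch(func, batch.size());
  for (int v : batch) std::cout << v << " ";  // 1 4 9 ... 100
  std::cout << std::endl;
  return 0;
}

Because step_index is computed with a floating-point ceil, thread_num * step_index can overshoot the batch, which is why every range worker in the series clamps end_index to batch_size; the disjoint slices also mean no two threads write the same element, so no locking is needed.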
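
For the Tokenizer methods the bound callable additionally carries the object pointer: std::bind takes the pointer-to-member first, then this, then the shared arguments, leaving placeholders _1 and _2 for the (start_index, step_index) pair that RunMultiThread fills in, while std::ref keeps the input batch from being copied into the callable. A toy sketch of that member-function binding, with a made-up FakeTokenizer and MultiThreadMeasure standing in for the real Tokenizer and its MultiThread* workers:

// member_bind_sketch.cc -- how the Tokenizer::MultiThread* workers are bound.
// FakeTokenizer and MultiThreadMeasure are illustrative names only.
#include <functional>
#include <iostream>
#include <string>
#include <vector>

class FakeTokenizer {
 public:
  // Same shape as Tokenizer::MultiThreadEncodeBatchStrings: shared arguments
  // first, then the (start_index, step_index) range supplied per thread.
  void MultiThreadMeasure(const std::vector<std::string>& inputs,
                          std::vector<size_t>* lengths,
                          size_t start_index,
                          size_t step_index) const {
    size_t end_index = start_index + step_index;
    if (end_index > inputs.size()) end_index = inputs.size();
    for (size_t i = start_index; i < end_index; ++i)
      (*lengths)[i] = inputs[i].size();
  }
};

int main() {
  FakeTokenizer tokenizer;
  std::vector<std::string> inputs = {"a", "bb", "ccc"};
  std::vector<size_t> lengths(inputs.size());
  // Pointer-to-member + object pointer + shared args + the two range
  // placeholders; the result is compatible with
  // std::function<void(size_t, size_t)> and could be handed to
  // RunMultiThread (or called directly, as here).
  auto func = std::bind(&FakeTokenizer::MultiThreadMeasure,
                        &tokenizer,
                        std::ref(inputs),
                        &lengths,
                        std::placeholders::_1,
                        std::placeholders::_2);
  func(0, inputs.size());  // one slice covering the whole batch
  std::cout << lengths[0] << " " << lengths[1] << " " << lengths[2]
            << std::endl;  // prints: 1 2 3
  return 0;
}

Without std::ref, std::bind would copy the entire batch into the callable once per EncodeBatchStrings call; binding by reference is safe here because RunMultiThread joins every thread before the batch goes out of scope.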