y-scope · LinZhihao-723 · Jul 11, 2024 · Jul 11, 2024 · Jul 11, 2024
diff --git a/components/core/src/clp/clp/CMakeLists.txt b/components/core/src/clp/clp/CMakeLists.txt
@@ -133,6 +133,8 @@ set(
         ../TimestampPattern.hpp
         ../TraceableException.hpp
         ../type_utils.hpp
+        ../utf8_utils.cpp
+        ../utf8_utils.hpp
         ../Utils.cpp
         ../Utils.hpp
         ../VariableDictionaryEntry.cpp

diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp
@@ -16,6 +16,7 @@
 #include "../LogSurgeonReader.hpp"
 #include "../Profiler.hpp"
 #include "../streaming_archive/writer/utils.hpp"
+#include "../utf8_utils.hpp"
 #include "utils.hpp"
 
 using clp::ir::eight_byte_encoded_variable_t;
@@ -145,8 +146,8 @@ bool FileCompressor::compress_file(
     size_t peek_size{0};
     m_file_reader.peek_buffered_data(utf8_validation_buf, peek_size);
     bool succeeded = true;
-    auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
-    if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) {
+    auto const utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
+    if (is_utf8_encoded({utf8_validation_buf, utf8_validation_buf_len})) {
         if (use_heuristic) {
             parse_and_encode_with_heuristic(
                     target_data_size_of_dicts,
@@ -359,8 +360,8 @@ bool FileCompressor::try_compressing_as_archive(
         size_t peek_size{0};
         m_libarchive_file_reader.peek_buffered_data(utf8_validation_buf, peek_size);
         string file_path{m_libarchive_reader.get_path()};
-        auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
-        if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) {
+        auto const utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
+        if (is_utf8_encoded({utf8_validation_buf, utf8_validation_buf_len})) {
             auto boost_path_for_compression = parent_boost_path / file_path;
             if (use_heuristic) {
                 parse_and_encode_with_heuristic(

diff --git a/components/core/src/clp/clp/utils.cpp b/components/core/src/clp/clp/utils.cpp
@@ -86,40 +86,6 @@ bool find_all_files_and_empty_directories(
     return true;
 }
 
-bool is_utf8_sequence(size_t sequence_length, char const* sequence) {
-    size_t num_utf8_bytes_to_read = 0;
-    for (size_t i = 0; i < sequence_length; ++i) {
-        auto byte = sequence[i];
-
-        if (num_utf8_bytes_to_read > 0) {
-            // Validate that byte matches 0b10xx_xxxx
-            if ((byte & 0xC0) != 0x80) {
-                return false;
-            }
-            --num_utf8_bytes_to_read;
-        } else {
-            if (byte & 0x80) {
-                // Check if byte is valid UTF-8 length-indicator
-                if ((byte & 0xF8) == 0xF0) {
-                    // Matches 0b1111_0xxx
-                    num_utf8_bytes_to_read = 3;
-                } else if ((byte & 0xF0) == 0xE0) {
-                    // Matches 0b1110_xxxx
-                    num_utf8_bytes_to_read = 2;
-                } else if ((byte & 0xE0) == 0xC0) {
-                    // Matches 0b110x_xxxx
-                    num_utf8_bytes_to_read = 1;
-                } else {
-                    // Invalid UTF-8 length-indicator
-                    return false;
-                }
-            }  // else byte is ASCII
-        }
-    }
-
-    return true;
-}
-
 bool read_input_paths(string const& list_path, vector<string>& paths) {
     ErrorCode error_code = read_list_of_paths(list_path, paths);
     if (ErrorCode_Success != error_code) {

diff --git a/components/core/src/clp/clp/utils.hpp b/components/core/src/clp/clp/utils.hpp
@@ -41,14 +41,6 @@ bool find_all_files_and_empty_directories(
         std::vector<std::string>& empty_directory_paths
 );
 
-/**
- * Checks if the given sequence is valid UTF-8
- * @param sequence_length
- * @param sequence
- * @return true if valid, false otherwise
- */
-bool is_utf8_sequence(size_t sequence_length, char const* sequence);
-
 /**
  * Reads a list of input paths
  * @param list_path