From 48c322ac41e6702f5b64cc8c94d239d8e2490e76 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 11 Jul 2024 14:40:10 -0400 Subject: [PATCH 1/2] Use the new utf8 --- .../core/src/clp/clp/FileCompressor.cpp | 9 ++--- components/core/src/clp/clp/utils.cpp | 34 ------------------- components/core/src/clp/clp/utils.hpp | 8 ----- 3 files changed, 5 insertions(+), 46 deletions(-) diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index 4100816f5..9898602cc 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -16,6 +16,7 @@ #include "../LogSurgeonReader.hpp" #include "../Profiler.hpp" #include "../streaming_archive/writer/utils.hpp" +#include "../utf8_utils.hpp" #include "utils.hpp" using clp::ir::eight_byte_encoded_variable_t; @@ -145,8 +146,8 @@ bool FileCompressor::compress_file( size_t peek_size{0}; m_file_reader.peek_buffered_data(utf8_validation_buf, peek_size); bool succeeded = true; - auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen); - if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) { + auto const utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen); + if (is_utf8_encoded({utf8_validation_buf, utf8_validation_buf_len})) { if (use_heuristic) { parse_and_encode_with_heuristic( target_data_size_of_dicts, @@ -359,8 +360,8 @@ bool FileCompressor::try_compressing_as_archive( size_t peek_size{0}; m_libarchive_file_reader.peek_buffered_data(utf8_validation_buf, peek_size); string file_path{m_libarchive_reader.get_path()}; - auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen); - if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) { + auto const utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen); + if (is_utf8_encoded({utf8_validation_buf, utf8_validation_buf_len})) { auto boost_path_for_compression = parent_boost_path / file_path; if (use_heuristic) { parse_and_encode_with_heuristic( diff --git a/components/core/src/clp/clp/utils.cpp b/components/core/src/clp/clp/utils.cpp index 5ba59e0d7..0f05d75ac 100644 --- a/components/core/src/clp/clp/utils.cpp +++ b/components/core/src/clp/clp/utils.cpp @@ -86,40 +86,6 @@ bool find_all_files_and_empty_directories( return true; } -bool is_utf8_sequence(size_t sequence_length, char const* sequence) { - size_t num_utf8_bytes_to_read = 0; - for (size_t i = 0; i < sequence_length; ++i) { - auto byte = sequence[i]; - - if (num_utf8_bytes_to_read > 0) { - // Validate that byte matches 0b10xx_xxxx - if ((byte & 0xC0) != 0x80) { - return false; - } - --num_utf8_bytes_to_read; - } else { - if (byte & 0x80) { - // Check if byte is valid UTF-8 length-indicator - if ((byte & 0xF8) == 0xF0) { - // Matches 0b1111_0xxx - num_utf8_bytes_to_read = 3; - } else if ((byte & 0xF0) == 0xE0) { - // Matches 0b1110_xxxx - num_utf8_bytes_to_read = 2; - } else if ((byte & 0xE0) == 0xC0) { - // Matches 0b110x_xxxx - num_utf8_bytes_to_read = 1; - } else { - // Invalid UTF-8 length-indicator - return false; - } - } // else byte is ASCII - } - } - - return true; -} - bool read_input_paths(string const& list_path, vector& paths) { ErrorCode error_code = read_list_of_paths(list_path, paths); if (ErrorCode_Success != error_code) { diff --git a/components/core/src/clp/clp/utils.hpp b/components/core/src/clp/clp/utils.hpp index 41e7a3694..0a6918445 100644 --- a/components/core/src/clp/clp/utils.hpp +++ b/components/core/src/clp/clp/utils.hpp @@ -41,14 +41,6 @@ bool find_all_files_and_empty_directories( std::vector& empty_directory_paths ); -/** - * Checks if the given sequence is valid UTF-8 - * @param sequence_length - * @param sequence - * @return true if valid, false otherwise - */ -bool is_utf8_sequence(size_t sequence_length, char const* sequence); - /** * Reads a list of input paths * @param list_path From 7c56a0b005b1089b262f8f97a4b8c98e949b7193 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 11 Jul 2024 15:00:47 -0400 Subject: [PATCH 2/2] Fix missing cmake --- components/core/src/clp/clp/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/components/core/src/clp/clp/CMakeLists.txt b/components/core/src/clp/clp/CMakeLists.txt index b8e073dd1..0f18777d9 100644 --- a/components/core/src/clp/clp/CMakeLists.txt +++ b/components/core/src/clp/clp/CMakeLists.txt @@ -133,6 +133,8 @@ set( ../TimestampPattern.hpp ../TraceableException.hpp ../type_utils.hpp + ../utf8_utils.cpp + ../utf8_utils.hpp ../Utils.cpp ../Utils.hpp ../VariableDictionaryEntry.cpp