Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clp-core: Replace calls to incomplete UTF-8 validation function with new and complete implementation. #477

Merged
merged 2 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions components/core/src/clp/clp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ set(
../TimestampPattern.hpp
../TraceableException.hpp
../type_utils.hpp
../utf8_utils.cpp
../utf8_utils.hpp
../Utils.cpp
../Utils.hpp
../VariableDictionaryEntry.cpp
Expand Down
9 changes: 5 additions & 4 deletions components/core/src/clp/clp/FileCompressor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "../LogSurgeonReader.hpp"
#include "../Profiler.hpp"
#include "../streaming_archive/writer/utils.hpp"
#include "../utf8_utils.hpp"
#include "utils.hpp"

using clp::ir::eight_byte_encoded_variable_t;
Expand Down Expand Up @@ -145,8 +146,8 @@ bool FileCompressor::compress_file(
size_t peek_size{0};
m_file_reader.peek_buffered_data(utf8_validation_buf, peek_size);
bool succeeded = true;
auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) {
auto const utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
if (is_utf8_encoded({utf8_validation_buf, utf8_validation_buf_len})) {
if (use_heuristic) {
parse_and_encode_with_heuristic(
target_data_size_of_dicts,
Expand Down Expand Up @@ -359,8 +360,8 @@ bool FileCompressor::try_compressing_as_archive(
size_t peek_size{0};
m_libarchive_file_reader.peek_buffered_data(utf8_validation_buf, peek_size);
string file_path{m_libarchive_reader.get_path()};
auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) {
auto const utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
if (is_utf8_encoded({utf8_validation_buf, utf8_validation_buf_len})) {
auto boost_path_for_compression = parent_boost_path / file_path;
if (use_heuristic) {
parse_and_encode_with_heuristic(
Expand Down
34 changes: 0 additions & 34 deletions components/core/src/clp/clp/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,40 +86,6 @@ bool find_all_files_and_empty_directories(
return true;
}

/**
 * Checks whether the given byte sequence is well-formed UTF-8.
 *
 * Beyond matching the lead/continuation bit patterns, this rejects byte sequences that the
 * Unicode standard declares ill-formed:
 * - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by 0x80-0x9F, 0xF0 followed by
 *   0x80-0x8F),
 * - UTF-16 surrogate code points U+D800-U+DFFF (0xED followed by 0xA0-0xBF),
 * - code points above U+10FFFF (0xF4 followed by 0x90-0xBF, and lead bytes 0xF5-0xFF).
 *
 * NOTE: A multi-byte character that is cut off by the end of the sequence is NOT treated as an
 * error, since callers may pass a length-capped prefix of a larger buffer that ends in the middle
 * of a character.
 *
 * @param sequence_length Number of bytes in `sequence` to check
 * @param sequence Pointer to the first byte to check; not dereferenced if `sequence_length` is 0
 * @return true if every complete character in the sequence is well-formed UTF-8 (an empty
 * sequence is considered valid), false otherwise
 */
bool is_utf8_sequence(size_t sequence_length, char const* sequence) {
    // Number of continuation bytes still expected for the character being validated
    size_t num_utf8_bytes_to_read = 0;
    // Inclusive range that the next continuation byte must fall in. Only the first continuation
    // byte after certain lead bytes has a range narrower than 0x80-0xBF.
    unsigned char min_continuation_byte = 0x80;
    unsigned char max_continuation_byte = 0xBF;

    for (size_t i = 0; i < sequence_length; ++i) {
        // Work on the unsigned value so that range comparisons behave regardless of whether
        // `char` is signed on this platform
        auto const byte = static_cast<unsigned char>(sequence[i]);

        if (num_utf8_bytes_to_read > 0) {
            if (byte < min_continuation_byte || byte > max_continuation_byte) {
                return false;
            }
            // Any remaining continuation bytes of this character use the full range
            min_continuation_byte = 0x80;
            max_continuation_byte = 0xBF;
            --num_utf8_bytes_to_read;
            continue;
        }

        if (byte < 0x80) {
            // ASCII
            continue;
        }

        if (byte >= 0xC2 && byte <= 0xDF) {
            // Two-byte character (0xC0 and 0xC1 could only encode overlong ASCII)
            num_utf8_bytes_to_read = 1;
        } else if (byte >= 0xE0 && byte <= 0xEF) {
            // Three-byte character
            num_utf8_bytes_to_read = 2;
            if (0xE0 == byte) {
                // Anything below 0xA0 would be an overlong encoding of U+0000-U+07FF
                min_continuation_byte = 0xA0;
            } else if (0xED == byte) {
                // Anything above 0x9F would encode a surrogate (U+D800-U+DFFF)
                max_continuation_byte = 0x9F;
            }
        } else if (byte >= 0xF0 && byte <= 0xF4) {
            // Four-byte character
            num_utf8_bytes_to_read = 3;
            if (0xF0 == byte) {
                // Anything below 0x90 would be an overlong encoding of U+0000-U+FFFF
                min_continuation_byte = 0x90;
            } else if (0xF4 == byte) {
                // Anything above 0x8F would encode a code point beyond U+10FFFF
                max_continuation_byte = 0x8F;
            }
        } else {
            // Stray continuation byte (0x80-0xBF), overlong lead byte (0xC0/0xC1), or a lead byte
            // that can only encode code points beyond U+10FFFF (0xF5-0xFF)
            return false;
        }
    }

    return true;
}

bool read_input_paths(string const& list_path, vector<string>& paths) {
ErrorCode error_code = read_list_of_paths(list_path, paths);
if (ErrorCode_Success != error_code) {
Expand Down
8 changes: 0 additions & 8 deletions components/core/src/clp/clp/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,6 @@ bool find_all_files_and_empty_directories(
std::vector<std::string>& empty_directory_paths
);

/**
 * Checks if the given sequence is valid UTF-8
 *
 * NOTE: A multi-byte character that is cut off by the end of the sequence is not reported as
 * invalid, and an empty sequence is considered valid.
 *
 * @param sequence_length Number of bytes in `sequence` to check
 * @param sequence Pointer to the first byte of the sequence to check
 * @return true if valid, false otherwise
 */
bool is_utf8_sequence(size_t sequence_length, char const* sequence);

/**
* Reads a list of input paths
* @param list_path
Expand Down
Loading