From cca53fbbc193178b792786567ca6238a6024b741 Mon Sep 17 00:00:00 2001 From: fktn Date: Tue, 21 Jan 2025 23:45:29 +0900 Subject: [PATCH 1/4] separate utf-8 validation based on char bytes by overloads --- .../fkYAML/detail/encodings/utf_encodings.hpp | 265 +++++++------ include/fkYAML/detail/input/input_adapter.hpp | 100 +++-- single_include/fkYAML/node.hpp | 369 +++++++++--------- tests/unit_test/test_utf_encodings.cpp | 292 ++++++-------- 4 files changed, 480 insertions(+), 546 deletions(-) diff --git a/include/fkYAML/detail/encodings/utf_encodings.hpp b/include/fkYAML/detail/encodings/utf_encodings.hpp index a73a12c7..eb581018 100644 --- a/include/fkYAML/detail/encodings/utf_encodings.hpp +++ b/include/fkYAML/detail/encodings/utf_encodings.hpp @@ -29,7 +29,7 @@ namespace utf8 { /// @return The number of UTF-8 character bytes. inline uint32_t get_num_bytes(uint8_t first_byte) { // The first byte starts with 0b0XXX'XXXX -> 1-byte character - if (first_byte < 0x80) { + if FK_YAML_LIKELY (first_byte < 0x80) { return 1; } // The first byte starts with 0b110X'XXXX -> 2-byte character @@ -49,140 +49,136 @@ inline uint32_t get_num_bytes(uint8_t first_byte) { throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first_byte}); } -/// @brief Validates the encoding of a given byte array whose length is 1. -/// @param[in] byte_array The byte array to be validated. -/// @return true if a given byte array is valid, false otherwise. -inline bool validate(const std::initializer_list& byte_array) noexcept { - switch (byte_array.size()) { - case 1: - // U+0000..U+007F - return *byte_array.begin() <= 0x7Fu; - case 2: { - const auto* itr = byte_array.begin(); - const uint8_t first = *itr++; - const uint8_t second = *itr; +/// @brief Checks if `byte` is a valid 1-byte UTF-8 character. +/// @param[in] byte The byte value. +/// @return true if `byte` is a valid 1-byte UTF-8 character, false otherwise. 
+inline bool validate(uint8_t byte) noexcept { + // U+0000..U+007F + return byte <= 0x7Fu; +} - // U+0080..U+07FF - // 1st Byte: 0xC2..0xDF - // 2nd Byte: 0x80..0xBF - if (0xC2u <= first && first <= 0xDFu) { - if (0x80u <= second && second <= 0xBFu) { +/// @brief Checks if the given bytes are a valid 2-byte UTF-8 character. +/// @param[in] byte0 The first byte value. +/// @param[in] byte1 The second byte value. +/// @return true if the given bytes are a valid 2-byte UTF-8 character, false otherwise. +inline bool validate(uint8_t byte0, uint8_t byte1) noexcept { + // U+0080..U+07FF + // 1st Byte: 0xC2..0xDF + // 2nd Byte: 0x80..0xBF + if FK_YAML_LIKELY (0xC2u <= byte0 && byte0 <= 0xDFu) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) { + return true; + } + } + + // The rest of byte combinations are invalid. + return false; +} + +/// @brief Checks if the given bytes are a valid 3-byte UTF-8 character. +/// @param[in] byte0 The first byte value. +/// @param[in] byte1 The second byte value. +/// @param[in] byte2 The third byte value. +/// @return true if the given bytes are a valid 3-byte UTF-8 character, false otherwise. +inline bool validate(uint8_t byte0, uint8_t byte1, uint8_t byte2) noexcept { + // U+1000..U+CFFF: + // 1st Byte: 0xE0..0xEC + // 2nd Byte: 0x80..0xBF + // 3rd Byte: 0x80..0xBF + if (0xE0u <= byte0 && byte0 <= 0xECu) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { return true; } } - - // The rest of byte combinations are invalid. 
return false; } - case 3: { - const auto* itr = byte_array.begin(); - const uint8_t first = *itr++; - const uint8_t second = *itr++; - const uint8_t third = *itr; - // U+1000..U+CFFF: - // 1st Byte: 0xE0..0xEC - // 2nd Byte: 0x80..0xBF - // 3rd Byte: 0x80..0xBF - if (0xE0u <= first && first <= 0xECu) { - if (0x80u <= second && second <= 0xBFu) { - if (0x80u <= third && third <= 0xBFu) { - return true; - } + // U+D000..U+D7FF: + // 1st Byte: 0xED + // 2nd Byte: 0x80..0x9F + // 3rd Byte: 0x80..0xBF + if (byte0 == 0xEDu) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0x9Fu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + return true; } - return false; } + return false; + } - // U+D000..U+D7FF: - // 1st Byte: 0xED - // 2nd Byte: 0x80..0x9F - // 3rd Byte: 0x80..0xBF - if (first == 0xEDu) { - if (0x80u <= second && second <= 0x9Fu) { - if (0x80u <= third && third <= 0xBFu) { - return true; - } + // U+E000..U+FFFF: + // 1st Byte: 0xEE..0xEF + // 2nd Byte: 0x80..0xBF + // 3rd Byte: 0x80..0xBF + if FK_YAML_LIKELY (byte0 == 0xEEu || byte0 == 0xEFu) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + return true; } - return false; } + } - // U+E000..U+FFFF: - // 1st Byte: 0xEE..0xEF - // 2nd Byte: 0x80..0xBF - // 3rd Byte: 0x80..0xBF - if (first == 0xEEu || first == 0xEFu) { - if (0x80u <= second && second <= 0xBFu) { - if (0x80u <= third && third <= 0xBFu) { + // The rest of byte combinations are invalid. + return false; +} + +/// @brief Checks if the given bytes are a valid 4-byte UTF-8 character. +/// @param[in] byte0 The first byte value. +/// @param[in] byte1 The second byte value. +/// @param[in] byte2 The third byte value. +/// @param[in] byte3 The fourth byte value. +/// @return true if the given bytes a valid 4-byte UTF-8 character, false otherwise. 
+inline bool validate(uint8_t byte0, uint8_t byte1, uint8_t byte2, uint8_t byte3) noexcept { + // U+10000..U+3FFFF: + // 1st Byte: 0xF0 + // 2nd Byte: 0x90..0xBF + // 3rd Byte: 0x80..0xBF + // 4th Byte: 0x80..0xBF + if (byte0 == 0xF0u) { + if FK_YAML_LIKELY (0x90u <= byte1 && byte1 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte3 && byte3 <= 0xBFu) { return true; } } - return false; } - - // The rest of byte combinations are invalid. return false; } - case 4: { - const auto* itr = byte_array.begin(); - const uint8_t first = *itr++; - const uint8_t second = *itr++; - const uint8_t third = *itr++; - const uint8_t fourth = *itr; - - // U+10000..U+3FFFF: - // 1st Byte: 0xF0 - // 2nd Byte: 0x90..0xBF - // 3rd Byte: 0x80..0xBF - // 4th Byte: 0x80..0xBF - if (first == 0xF0u) { - if (0x90u <= second && second <= 0xBFu) { - if (0x80u <= third && third <= 0xBFu) { - if (0x80u <= fourth && fourth <= 0xBFu) { - return true; - } - } - } - return false; - } - // U+40000..U+FFFFF: - // 1st Byte: 0xF1..0xF3 - // 2nd Byte: 0x80..0xBF - // 3rd Byte: 0x80..0xBF - // 4th Byte: 0x80..0xBF - if (0xF1u <= first && first <= 0xF3u) { - if (0x80u <= second && second <= 0xBFu) { - if (0x80u <= third && third <= 0xBFu) { - if (0x80u <= fourth && fourth <= 0xBFu) { - return true; - } + // U+40000..U+FFFFF: + // 1st Byte: 0xF1..0xF3 + // 2nd Byte: 0x80..0xBF + // 3rd Byte: 0x80..0xBF + // 4th Byte: 0x80..0xBF + if (0xF1u <= byte0 && byte0 <= 0xF3u) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte3 && byte3 <= 0xBFu) { + return true; } } - return false; } + return false; + } - // U+100000..U+10FFFF: - // 1st Byte: 0xF4 - // 2nd Byte: 0x80..0x8F - // 3rd Byte: 0x80..0xBF - // 4th Byte: 0x80..0xBF - if (first == 0xF4u) { - if (0x80u <= second && second <= 0x8Fu) { - if (0x80u <= third && third <= 0xBFu) { - if (0x80u <= fourth && fourth <= 
0xBFu) { - return true; - } + // U+100000..U+10FFFF: + // 1st Byte: 0xF4 + // 2nd Byte: 0x80..0x8F + // 3rd Byte: 0x80..0xBF + // 4th Byte: 0x80..0xBF + if FK_YAML_LIKELY (byte0 == 0xF4u) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0x8Fu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte3 && byte3 <= 0xBFu) { + return true; } } - return false; } - - // The rest of byte combinations are invalid. - return false; - } - default: // LCOV_EXCL_LINE - detail::unreachable(); // LCOV_EXCL_LINE } + + // The rest of byte combinations are invalid. + return false; } /// @brief Converts UTF-16 encoded characters to UTF-8 encoded bytes. @@ -192,37 +188,38 @@ inline bool validate(const std::initializer_list& byte_array) noexcept /// @param[out] encoded_size The size of UTF-encoded bytes. inline void from_utf16( std::array utf16, std::array& utf8, uint32_t& consumed_size, uint32_t& encoded_size) { - if (utf16[0] < 0x80u) { - utf8[0] = static_cast(utf16[0] & 0x7Fu); + const auto first = utf16[0]; + if (first < 0x80u) { + utf8[0] = static_cast(first & 0x7Fu); consumed_size = 1; encoded_size = 1; } - else if (utf16[0] <= 0x7FFu) { - const auto utf8_chunk = static_cast(0xC080u | ((utf16[0] & 0x07C0u) << 2) | (utf16[0] & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF00u) >> 8); - utf8[1] = static_cast(utf8_chunk & 0x00FFu); + else if (first <= 0x7FFu) { + const auto utf8_chunk = static_cast(0xC080u | ((first & 0x07C0u) << 2) | (first & 0x3Fu)); + utf8[0] = static_cast(utf8_chunk >> 8); + utf8[1] = static_cast(utf8_chunk); consumed_size = 1; encoded_size = 2; } - else if (utf16[0] < 0xD800u || 0xE000u <= utf16[0]) { - const auto utf8_chunk = static_cast( - 0xE08080u | ((utf16[0] & 0xF000u) << 4) | ((utf16[0] & 0x0FC0u) << 2) | (utf16[0] & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF0000u) >> 16); - utf8[1] = static_cast((utf8_chunk & 0x00FF00u) >> 8); - utf8[2] = static_cast(utf8_chunk & 0x0000FFu); + else if (first < 0xD800u 
|| 0xE000u <= first) { + const auto utf8_chunk = + static_cast(0xE08080u | ((first & 0xF000u) << 4) | ((first & 0x0FC0u) << 2) | (first & 0x3Fu)); + utf8[0] = static_cast(utf8_chunk >> 16); + utf8[1] = static_cast(utf8_chunk >> 8); + utf8[2] = static_cast(utf8_chunk); consumed_size = 1; encoded_size = 3; } - else if (utf16[0] <= 0xDBFFu && 0xDC00u <= utf16[1] && utf16[1] <= 0xDFFFu) { + else if (const auto second = utf16[1]; first <= 0xDBFFu && 0xDC00u <= second && second <= 0xDFFFu) { // surrogate pair - const uint32_t code_point = 0x10000u + ((utf16[0] & 0x03FFu) << 10) + (utf16[1] & 0x03FFu); + const uint32_t code_point = 0x10000u + ((first & 0x03FFu) << 10) + (second & 0x03FFu); const auto utf8_chunk = static_cast( 0xF0808080u | ((code_point & 0x1C0000u) << 6) | ((code_point & 0x03F000u) << 4) | ((code_point & 0x0FC0u) << 2) | (code_point & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF000000u) >> 24); - utf8[1] = static_cast((utf8_chunk & 0x00FF0000u) >> 16); - utf8[2] = static_cast((utf8_chunk & 0x0000FF00u) >> 8); - utf8[3] = static_cast(utf8_chunk & 0x000000FFu); + utf8[0] = static_cast(utf8_chunk >> 24); + utf8[1] = static_cast(utf8_chunk >> 16); + utf8[2] = static_cast(utf8_chunk >> 8); + utf8[3] = static_cast(utf8_chunk); consumed_size = 2; encoded_size = 4; } @@ -242,26 +239,26 @@ inline void from_utf32(const char32_t utf32, std::array& utf8, uint3 } else if (utf32 <= 0x7FFu) { const auto utf8_chunk = static_cast(0xC080u | ((utf32 & 0x07C0u) << 2) | (utf32 & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF00u) >> 8); - utf8[1] = static_cast(utf8_chunk & 0x00FFu); + utf8[0] = static_cast(utf8_chunk >> 8); + utf8[1] = static_cast(utf8_chunk); encoded_size = 2; } else if (utf32 <= 0xFFFFu) { const auto utf8_chunk = static_cast(0xE08080u | ((utf32 & 0xF000u) << 4) | ((utf32 & 0x0FC0u) << 2) | (utf32 & 0x3F)); - utf8[0] = static_cast((utf8_chunk & 0xFF0000u) >> 16); - utf8[1] = static_cast((utf8_chunk & 0x00FF00u) >> 8); - utf8[2] = 
static_cast(utf8_chunk & 0x0000FFu); + utf8[0] = static_cast(utf8_chunk >> 16); + utf8[1] = static_cast(utf8_chunk >> 8); + utf8[2] = static_cast(utf8_chunk); encoded_size = 3; } else if (utf32 <= 0x10FFFFu) { const auto utf8_chunk = static_cast( 0xF0808080u | ((utf32 & 0x1C0000u) << 6) | ((utf32 & 0x03F000u) << 4) | ((utf32 & 0x0FC0u) << 2) | (utf32 & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF000000u) >> 24); - utf8[1] = static_cast((utf8_chunk & 0x00FF0000u) >> 16); - utf8[2] = static_cast((utf8_chunk & 0x0000FF00u) >> 8); - utf8[3] = static_cast(utf8_chunk & 0x000000FFu); + utf8[0] = static_cast(utf8_chunk >> 24); + utf8[1] = static_cast(utf8_chunk >> 16); + utf8[2] = static_cast(utf8_chunk >> 8); + utf8[3] = static_cast(utf8_chunk); encoded_size = 4; } else { diff --git a/include/fkYAML/detail/input/input_adapter.hpp b/include/fkYAML/detail/input/input_adapter.hpp index f377e427..816d2602 100644 --- a/include/fkYAML/detail/input/input_adapter.hpp +++ b/include/fkYAML/detail/input/input_adapter.hpp @@ -94,31 +94,29 @@ class iterator_input_adapter bytes {first, static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const bool is_valid = utf8::validate(first, second); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second}); } break; } case 3: { - const std::initializer_list bytes { - first, static_cast(*current++), static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third}); } break; } case 4: { - const 
std::initializer_list bytes { - first, - static_cast(*current++), - static_cast(*current++), - static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const auto fourth = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third, fourth); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third, fourth}); } break; } @@ -296,27 +294,29 @@ class iterator_input_adapter bytes {first, uint8_t(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const bool is_valid = utf8::validate(first, second); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second}); } break; } case 3: { - const std::initializer_list bytes {first, uint8_t(*current++), uint8_t(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third}); } break; } case 4: { - const std::initializer_list bytes { - first, uint8_t(*current++), uint8_t(*current++), uint8_t(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const auto fourth = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third, fourth); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw 
fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third, fourth}); } break; } @@ -587,31 +587,29 @@ class file_input_adapter { switch (num_bytes) { case 2: { - const std::initializer_list bytes {first, static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const bool is_valid = utf8::validate(first, second); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second}); } break; } case 3: { - const std::initializer_list bytes { - first, static_cast(*current++), static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third}); } break; } case 4: { - const std::initializer_list bytes { - first, - static_cast(*current++), - static_cast(*current++), - static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const auto fourth = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third, fourth); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third, fourth}); } break; } @@ -791,31 +789,29 @@ class stream_input_adapter { switch (num_bytes) { case 2: { - const std::initializer_list bytes {first, static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const bool is_valid = utf8::validate(first, second); if 
FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second}); } break; } case 3: { - const std::initializer_list bytes { - first, static_cast(*current++), static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third}); } break; } case 4: { - const std::initializer_list bytes { - first, - static_cast(*current++), - static_cast(*current++), - static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const auto fourth = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third, fourth); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third, fourth}); } break; } diff --git a/single_include/fkYAML/node.hpp b/single_include/fkYAML/node.hpp index 784953f3..ad0c3960 100644 --- a/single_include/fkYAML/node.hpp +++ b/single_include/fkYAML/node.hpp @@ -1909,7 +1909,7 @@ namespace utf8 { /// @return The number of UTF-8 character bytes. 
inline uint32_t get_num_bytes(uint8_t first_byte) { // The first byte starts with 0b0XXX'XXXX -> 1-byte character - if (first_byte < 0x80) { + if FK_YAML_LIKELY (first_byte < 0x80) { return 1; } // The first byte starts with 0b110X'XXXX -> 2-byte character @@ -1929,140 +1929,136 @@ inline uint32_t get_num_bytes(uint8_t first_byte) { throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first_byte}); } -/// @brief Validates the encoding of a given byte array whose length is 1. -/// @param[in] byte_array The byte array to be validated. -/// @return true if a given byte array is valid, false otherwise. -inline bool validate(const std::initializer_list& byte_array) noexcept { - switch (byte_array.size()) { - case 1: - // U+0000..U+007F - return *byte_array.begin() <= 0x7Fu; - case 2: { - const auto* itr = byte_array.begin(); - const uint8_t first = *itr++; - const uint8_t second = *itr; - - // U+0080..U+07FF - // 1st Byte: 0xC2..0xDF - // 2nd Byte: 0x80..0xBF - if (0xC2u <= first && first <= 0xDFu) { - if (0x80u <= second && second <= 0xBFu) { +/// @brief Checks if `byte` is a valid 1-byte UTF-8 character. +/// @param[in] byte The byte value. +/// @return true if `byte` is a valid 1-byte UTF-8 character, false otherwise. +inline bool validate(uint8_t byte) noexcept { + // U+0000..U+007F + return byte <= 0x7Fu; +} + +/// @brief Checks if the given bytes are a valid 2-byte UTF-8 character. +/// @param[in] byte0 The first byte value. +/// @param[in] byte1 The second byte value. +/// @return true if the given bytes are a valid 2-byte UTF-8 character, false otherwise. +inline bool validate(uint8_t byte0, uint8_t byte1) noexcept { + // U+0080..U+07FF + // 1st Byte: 0xC2..0xDF + // 2nd Byte: 0x80..0xBF + if FK_YAML_LIKELY (0xC2u <= byte0 && byte0 <= 0xDFu) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) { + return true; + } + } + + // The rest of byte combinations are invalid. 
+ return false; +} + +/// @brief Checks if the given bytes are a valid 3-byte UTF-8 character. +/// @param[in] byte0 The first byte value. +/// @param[in] byte1 The second byte value. +/// @param[in] byte2 The third byte value. +/// @return true if the given bytes are a valid 3-byte UTF-8 character, false otherwise. +inline bool validate(uint8_t byte0, uint8_t byte1, uint8_t byte2) noexcept { + // U+1000..U+CFFF: + // 1st Byte: 0xE0..0xEC + // 2nd Byte: 0x80..0xBF + // 3rd Byte: 0x80..0xBF + if (0xE0u <= byte0 && byte0 <= 0xECu) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { return true; } } - - // The rest of byte combinations are invalid. return false; } - case 3: { - const auto* itr = byte_array.begin(); - const uint8_t first = *itr++; - const uint8_t second = *itr++; - const uint8_t third = *itr; - - // U+1000..U+CFFF: - // 1st Byte: 0xE0..0xEC - // 2nd Byte: 0x80..0xBF - // 3rd Byte: 0x80..0xBF - if (0xE0u <= first && first <= 0xECu) { - if (0x80u <= second && second <= 0xBFu) { - if (0x80u <= third && third <= 0xBFu) { - return true; - } + + // U+D000..U+D7FF: + // 1st Byte: 0xED + // 2nd Byte: 0x80..0x9F + // 3rd Byte: 0x80..0xBF + if (byte0 == 0xEDu) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0x9Fu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + return true; } - return false; } + return false; + } - // U+D000..U+D7FF: - // 1st Byte: 0xED - // 2nd Byte: 0x80..0x9F - // 3rd Byte: 0x80..0xBF - if (first == 0xEDu) { - if (0x80u <= second && second <= 0x9Fu) { - if (0x80u <= third && third <= 0xBFu) { - return true; - } + // U+E000..U+FFFF: + // 1st Byte: 0xEE..0xEF + // 2nd Byte: 0x80..0xBF + // 3rd Byte: 0x80..0xBF + if FK_YAML_LIKELY (byte0 == 0xEEu || byte0 == 0xEFu) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + return true; } - return false; } + } + + // The rest of byte combinations are invalid. 
+ return false; +} - // U+E000..U+FFFF: - // 1st Byte: 0xEE..0xEF - // 2nd Byte: 0x80..0xBF - // 3rd Byte: 0x80..0xBF - if (first == 0xEEu || first == 0xEFu) { - if (0x80u <= second && second <= 0xBFu) { - if (0x80u <= third && third <= 0xBFu) { +/// @brief Checks if the given bytes are a valid 4-byte UTF-8 character. +/// @param[in] byte0 The first byte value. +/// @param[in] byte1 The second byte value. +/// @param[in] byte2 The third byte value. +/// @param[in] byte3 The fourth byte value. +/// @return true if the given bytes a valid 4-byte UTF-8 character, false otherwise. +inline bool validate(uint8_t byte0, uint8_t byte1, uint8_t byte2, uint8_t byte3) noexcept { + // U+10000..U+3FFFF: + // 1st Byte: 0xF0 + // 2nd Byte: 0x90..0xBF + // 3rd Byte: 0x80..0xBF + // 4th Byte: 0x80..0xBF + if (byte0 == 0xF0u) { + if FK_YAML_LIKELY (0x90u <= byte1 && byte1 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte3 && byte3 <= 0xBFu) { return true; } } - return false; } - - // The rest of byte combinations are invalid. 
return false; } - case 4: { - const auto* itr = byte_array.begin(); - const uint8_t first = *itr++; - const uint8_t second = *itr++; - const uint8_t third = *itr++; - const uint8_t fourth = *itr; - - // U+10000..U+3FFFF: - // 1st Byte: 0xF0 - // 2nd Byte: 0x90..0xBF - // 3rd Byte: 0x80..0xBF - // 4th Byte: 0x80..0xBF - if (first == 0xF0u) { - if (0x90u <= second && second <= 0xBFu) { - if (0x80u <= third && third <= 0xBFu) { - if (0x80u <= fourth && fourth <= 0xBFu) { - return true; - } - } - } - return false; - } - // U+40000..U+FFFFF: - // 1st Byte: 0xF1..0xF3 - // 2nd Byte: 0x80..0xBF - // 3rd Byte: 0x80..0xBF - // 4th Byte: 0x80..0xBF - if (0xF1u <= first && first <= 0xF3u) { - if (0x80u <= second && second <= 0xBFu) { - if (0x80u <= third && third <= 0xBFu) { - if (0x80u <= fourth && fourth <= 0xBFu) { - return true; - } + // U+40000..U+FFFFF: + // 1st Byte: 0xF1..0xF3 + // 2nd Byte: 0x80..0xBF + // 3rd Byte: 0x80..0xBF + // 4th Byte: 0x80..0xBF + if (0xF1u <= byte0 && byte0 <= 0xF3u) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte3 && byte3 <= 0xBFu) { + return true; } } - return false; } + return false; + } - // U+100000..U+10FFFF: - // 1st Byte: 0xF4 - // 2nd Byte: 0x80..0x8F - // 3rd Byte: 0x80..0xBF - // 4th Byte: 0x80..0xBF - if (first == 0xF4u) { - if (0x80u <= second && second <= 0x8Fu) { - if (0x80u <= third && third <= 0xBFu) { - if (0x80u <= fourth && fourth <= 0xBFu) { - return true; - } + // U+100000..U+10FFFF: + // 1st Byte: 0xF4 + // 2nd Byte: 0x80..0x8F + // 3rd Byte: 0x80..0xBF + // 4th Byte: 0x80..0xBF + if FK_YAML_LIKELY (byte0 == 0xF4u) { + if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0x8Fu) { + if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) { + if FK_YAML_LIKELY (0x80u <= byte3 && byte3 <= 0xBFu) { + return true; } } - return false; } - - // The rest of byte combinations are invalid. 
- return false; - } - default: // LCOV_EXCL_LINE - detail::unreachable(); // LCOV_EXCL_LINE } + + // The rest of byte combinations are invalid. + return false; } /// @brief Converts UTF-16 encoded characters to UTF-8 encoded bytes. @@ -2072,37 +2068,38 @@ inline bool validate(const std::initializer_list& byte_array) noexcept /// @param[out] encoded_size The size of UTF-encoded bytes. inline void from_utf16( std::array utf16, std::array& utf8, uint32_t& consumed_size, uint32_t& encoded_size) { - if (utf16[0] < 0x80u) { - utf8[0] = static_cast(utf16[0] & 0x7Fu); + const auto first = utf16[0]; + if (first < 0x80u) { + utf8[0] = static_cast(first & 0x7Fu); consumed_size = 1; encoded_size = 1; } - else if (utf16[0] <= 0x7FFu) { - const auto utf8_chunk = static_cast(0xC080u | ((utf16[0] & 0x07C0u) << 2) | (utf16[0] & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF00u) >> 8); - utf8[1] = static_cast(utf8_chunk & 0x00FFu); + else if (first <= 0x7FFu) { + const auto utf8_chunk = static_cast(0xC080u | ((first & 0x07C0u) << 2) | (first & 0x3Fu)); + utf8[0] = static_cast(utf8_chunk >> 8); + utf8[1] = static_cast(utf8_chunk); consumed_size = 1; encoded_size = 2; } - else if (utf16[0] < 0xD800u || 0xE000u <= utf16[0]) { - const auto utf8_chunk = static_cast( - 0xE08080u | ((utf16[0] & 0xF000u) << 4) | ((utf16[0] & 0x0FC0u) << 2) | (utf16[0] & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF0000u) >> 16); - utf8[1] = static_cast((utf8_chunk & 0x00FF00u) >> 8); - utf8[2] = static_cast(utf8_chunk & 0x0000FFu); + else if (first < 0xD800u || 0xE000u <= first) { + const auto utf8_chunk = + static_cast(0xE08080u | ((first & 0xF000u) << 4) | ((first & 0x0FC0u) << 2) | (first & 0x3Fu)); + utf8[0] = static_cast(utf8_chunk >> 16); + utf8[1] = static_cast(utf8_chunk >> 8); + utf8[2] = static_cast(utf8_chunk); consumed_size = 1; encoded_size = 3; } - else if (utf16[0] <= 0xDBFFu && 0xDC00u <= utf16[1] && utf16[1] <= 0xDFFFu) { + else if (const auto second = utf16[1]; first <= 0xDBFFu 
&& 0xDC00u <= second && second <= 0xDFFFu) { // surrogate pair - const uint32_t code_point = 0x10000u + ((utf16[0] & 0x03FFu) << 10) + (utf16[1] & 0x03FFu); + const uint32_t code_point = 0x10000u + ((first & 0x03FFu) << 10) + (second & 0x03FFu); const auto utf8_chunk = static_cast( 0xF0808080u | ((code_point & 0x1C0000u) << 6) | ((code_point & 0x03F000u) << 4) | ((code_point & 0x0FC0u) << 2) | (code_point & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF000000u) >> 24); - utf8[1] = static_cast((utf8_chunk & 0x00FF0000u) >> 16); - utf8[2] = static_cast((utf8_chunk & 0x0000FF00u) >> 8); - utf8[3] = static_cast(utf8_chunk & 0x000000FFu); + utf8[0] = static_cast(utf8_chunk >> 24); + utf8[1] = static_cast(utf8_chunk >> 16); + utf8[2] = static_cast(utf8_chunk >> 8); + utf8[3] = static_cast(utf8_chunk); consumed_size = 2; encoded_size = 4; } @@ -2122,26 +2119,26 @@ inline void from_utf32(const char32_t utf32, std::array& utf8, uint3 } else if (utf32 <= 0x7FFu) { const auto utf8_chunk = static_cast(0xC080u | ((utf32 & 0x07C0u) << 2) | (utf32 & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF00u) >> 8); - utf8[1] = static_cast(utf8_chunk & 0x00FFu); + utf8[0] = static_cast(utf8_chunk >> 8); + utf8[1] = static_cast(utf8_chunk); encoded_size = 2; } else if (utf32 <= 0xFFFFu) { const auto utf8_chunk = static_cast(0xE08080u | ((utf32 & 0xF000u) << 4) | ((utf32 & 0x0FC0u) << 2) | (utf32 & 0x3F)); - utf8[0] = static_cast((utf8_chunk & 0xFF0000u) >> 16); - utf8[1] = static_cast((utf8_chunk & 0x00FF00u) >> 8); - utf8[2] = static_cast(utf8_chunk & 0x0000FFu); + utf8[0] = static_cast(utf8_chunk >> 16); + utf8[1] = static_cast(utf8_chunk >> 8); + utf8[2] = static_cast(utf8_chunk); encoded_size = 3; } else if (utf32 <= 0x10FFFFu) { const auto utf8_chunk = static_cast( 0xF0808080u | ((utf32 & 0x1C0000u) << 6) | ((utf32 & 0x03F000u) << 4) | ((utf32 & 0x0FC0u) << 2) | (utf32 & 0x3Fu)); - utf8[0] = static_cast((utf8_chunk & 0xFF000000u) >> 24); - utf8[1] = static_cast((utf8_chunk & 
0x00FF0000u) >> 16); - utf8[2] = static_cast((utf8_chunk & 0x0000FF00u) >> 8); - utf8[3] = static_cast(utf8_chunk & 0x000000FFu); + utf8[0] = static_cast(utf8_chunk >> 24); + utf8[1] = static_cast(utf8_chunk >> 16); + utf8[2] = static_cast(utf8_chunk >> 8); + utf8[3] = static_cast(utf8_chunk); encoded_size = 4; } else { @@ -8985,31 +8982,29 @@ class iterator_input_adapter bytes {first, static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const bool is_valid = utf8::validate(first, second); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second}); } break; } case 3: { - const std::initializer_list bytes { - first, static_cast(*current++), static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third}); } break; } case 4: { - const std::initializer_list bytes { - first, - static_cast(*current++), - static_cast(*current++), - static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const auto fourth = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third, fourth); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third, fourth}); } break; } @@ -9187,27 +9182,29 @@ class iterator_input_adapter bytes {first, uint8_t(*current++)}; - const bool is_valid = 
utf8::validate(bytes); + const auto second = static_cast(*current++); + const bool is_valid = utf8::validate(first, second); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second}); } break; } case 3: { - const std::initializer_list bytes {first, uint8_t(*current++), uint8_t(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third}); } break; } case 4: { - const std::initializer_list bytes { - first, uint8_t(*current++), uint8_t(*current++), uint8_t(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const auto fourth = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third, fourth); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third, fourth}); } break; } @@ -9478,31 +9475,29 @@ class file_input_adapter { switch (num_bytes) { case 2: { - const std::initializer_list bytes {first, static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const bool is_valid = utf8::validate(first, second); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second}); } break; } case 3: { - const std::initializer_list bytes { - first, static_cast(*current++), static_cast(*current++)}; - 
const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third}); } break; } case 4: { - const std::initializer_list bytes { - first, - static_cast(*current++), - static_cast(*current++), - static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const auto fourth = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third, fourth); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third, fourth}); } break; } @@ -9682,31 +9677,29 @@ class stream_input_adapter { switch (num_bytes) { case 2: { - const std::initializer_list bytes {first, static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const bool is_valid = utf8::validate(first, second); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second}); } break; } case 3: { - const std::initializer_list bytes { - first, static_cast(*current++), static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third}); } break; 
} case 4: { - const std::initializer_list bytes { - first, - static_cast(*current++), - static_cast(*current++), - static_cast(*current++)}; - const bool is_valid = utf8::validate(bytes); + const auto second = static_cast(*current++); + const auto third = static_cast(*current++); + const auto fourth = static_cast(*current++); + const bool is_valid = utf8::validate(first, second, third, fourth); if FK_YAML_UNLIKELY (!is_valid) { - throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", bytes); + throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first, second, third, fourth}); } break; } diff --git a/tests/unit_test/test_utf_encodings.cpp b/tests/unit_test/test_utf_encodings.cpp index 14607a9f..80688552 100644 --- a/tests/unit_test/test_utf_encodings.cpp +++ b/tests/unit_test/test_utf_encodings.cpp @@ -40,201 +40,149 @@ TEST_CASE("UTF8_GetNumBytes") { TEST_CASE("UTF8_Validate") { SECTION("1 byte character encoded in UTF-8") { - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0x00u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0x01u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0x02u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0x7Du)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0x7Eu)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0x7Fu)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0x81u)}) == false); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0x00u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0x01u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0x02u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0x7Du))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0x7Eu))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0x7Fu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0x81u))); } SECTION("2 
byte characters encoded in UTF-8") { - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xC0u), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xC1u), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xC2u), uint8_t(0x7Eu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xC2u), uint8_t(0x7Fu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xC2u), uint8_t(0x80u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xC3u), uint8_t(0x81u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xD0u), uint8_t(0xA0u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xDEu), uint8_t(0xBEu)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xDFu), uint8_t(0xBFu)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xDFu), uint8_t(0xC0u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xDFu), uint8_t(0xC1u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0xBFu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xE1u), uint8_t(0xBFu)}) == false); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xC0u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xC1u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xC2u), uint8_t(0x7Eu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xC2u), uint8_t(0x7Fu))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xC2u), uint8_t(0x80u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xC3u), uint8_t(0x81u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xD0u), uint8_t(0xA0u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xDEu), uint8_t(0xBEu))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xDFu), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xDFu), uint8_t(0xC0u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xDFu), 
uint8_t(0xC1u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE1u), uint8_t(0xBFu))); } SECTION("3 byte characters encoded in UTF-8") { - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xDEu), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xDFu), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x7Eu), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x7Fu), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x80u), uint8_t(0x7Eu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x80u), uint8_t(0x7Fu)}) == false); - - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x80u), uint8_t(0x80u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xE6u), uint8_t(0xA0u), uint8_t(0xA0u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xECu), uint8_t(0xBFu), uint8_t(0xBFu)}) == true); - - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xECu), uint8_t(0xC0u), uint8_t(0xBFu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xECu), uint8_t(0xC1u), uint8_t(0xBFu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xECu), uint8_t(0xBFu), uint8_t(0xC0u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xECu), uint8_t(0xBFu), uint8_t(0xC1u)}) == false); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xDEu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xDFu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x7Eu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x7Fu), uint8_t(0x80u))); + 
REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x80u), uint8_t(0x7Eu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x80u), uint8_t(0x7Fu))); + + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xE6u), uint8_t(0xA0u), uint8_t(0xA0u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xECu), uint8_t(0xBFu), uint8_t(0xBFu))); + + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xECu), uint8_t(0xC0u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xECu), uint8_t(0xC1u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xECu), uint8_t(0xBFu), uint8_t(0xC0u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xECu), uint8_t(0xBFu), uint8_t(0xC1u))); ////////////////////////////////////////////// - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0x7Eu), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0x7Fu), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0x80u), uint8_t(0x7Eu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0x80u), uint8_t(0x7Fu)}) == false); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0x7Eu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0x7Fu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0x80u), uint8_t(0x7Eu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0x80u), uint8_t(0x7Fu))); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0x80u), uint8_t(0x80u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0x90u), uint8_t(0xA0u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0x9Fu), uint8_t(0xBFu)}) == 
true); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0x90u), uint8_t(0xA0u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0x9Fu), uint8_t(0xBFu))); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0xA0u), uint8_t(0xBFu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0xA1u), uint8_t(0xBFu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0x9Fu), uint8_t(0xC0u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEDu), uint8_t(0x9Fu), uint8_t(0xC1u)}) == false); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0xA0u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0xA1u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0x9Fu), uint8_t(0xC0u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEDu), uint8_t(0x9Fu), uint8_t(0xC1u))); ////////////////////////////////////////////// - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEEu), uint8_t(0x7Eu), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEEu), uint8_t(0x7Fu), uint8_t(0x80u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEEu), uint8_t(0x80u), uint8_t(0x7Eu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEEu), uint8_t(0x80u), uint8_t(0x7Fu)}) == false); - - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEEu), uint8_t(0x80u), uint8_t(0x80u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEEu), uint8_t(0xA0u), uint8_t(0xA0u)}) == true); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEFu), uint8_t(0xBFu), uint8_t(0xBFu)}) == true); - - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEFu), uint8_t(0xC0u), uint8_t(0xBFu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEFu), 
uint8_t(0xC1u), uint8_t(0xBFu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEFu), uint8_t(0xBFu), uint8_t(0xC0u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xEFu), uint8_t(0xBFu), uint8_t(0xC1u)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xBFu)}) == false); - REQUIRE(fkyaml::detail::utf8::validate({uint8_t(0xF1u), uint8_t(0xBFu), uint8_t(0xBFu)}) == false); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEEu), uint8_t(0x7Eu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEEu), uint8_t(0x7Fu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEEu), uint8_t(0x80u), uint8_t(0x7Eu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEEu), uint8_t(0x80u), uint8_t(0x7Fu))); + + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xEEu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xEEu), uint8_t(0xA0u), uint8_t(0xA0u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xEFu), uint8_t(0xBFu), uint8_t(0xBFu))); + + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEFu), uint8_t(0xC0u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEFu), uint8_t(0xC1u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEFu), uint8_t(0xBFu), uint8_t(0xC0u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xEFu), uint8_t(0xBFu), uint8_t(0xC1u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF1u), uint8_t(0xBFu), uint8_t(0xBFu))); } SECTION("4 byte characters encoded in UTF-8") { - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xDEu), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xDFu), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x80u)}) == false); - 
REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x8Eu), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x8Fu), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x90u), uint8_t(0x7Eu), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x90u), uint8_t(0x7Fu), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x7Eu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xE0u), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x7Fu)}) == false); - - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x80u)}) == true); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xA8u), uint8_t(0xA0u), uint8_t(0xA0u)}) == true); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xBFu)}) == true); - - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0x8Fu), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xC0u), uint8_t(0xBFu), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xC1u), uint8_t(0xBFu), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0x7Fu), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xC0u), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xC1u), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0x7Fu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), 
uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xC0u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xC1u)}) == false); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xDEu), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xDFu), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x8Eu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x8Fu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x90u), uint8_t(0x7Eu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x90u), uint8_t(0x7Fu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x7Eu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xE0u), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x7Fu))); + + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0x90u), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xA8u), uint8_t(0xA0u), uint8_t(0xA0u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xBFu))); + + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0x8Fu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xC0u), uint8_t(0xBFu), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xC1u), uint8_t(0xBFu), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0x7Fu), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xC0u), uint8_t(0xBFu))); + 
REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xC1u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0x7Fu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xC0u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF0u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xC1u))); //////////////////////////////////////////////////// - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF1u), uint8_t(0x7Eu), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF1u), uint8_t(0x7Fu), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x7Eu), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x7Fu), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x7Eu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x7Fu)}) == false); - - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x80u)}) == true); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF2u), uint8_t(0xA0u), uint8_t(0xA0u), uint8_t(0xA0u)}) == true); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF3u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xBFu)}) == true); - - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF3u), uint8_t(0xC0u), uint8_t(0xBFu), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF3u), uint8_t(0xC1u), uint8_t(0xBFu), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF3u), uint8_t(0xBFu), uint8_t(0xC0u), uint8_t(0xBFu)}) == false); - REQUIRE( - 
fkyaml::detail::utf8::validate({uint8_t(0xF3u), uint8_t(0xBFu), uint8_t(0xC1u), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF3u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xC0u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF3u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xC1u)}) == false); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF1u), uint8_t(0x7Eu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF1u), uint8_t(0x7Fu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x7Eu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x7Fu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x7Eu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x7Fu))); + + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xF1u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xF2u), uint8_t(0xA0u), uint8_t(0xA0u), uint8_t(0xA0u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xF3u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xBFu))); + + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF3u), uint8_t(0xC0u), uint8_t(0xBFu), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF3u), uint8_t(0xC1u), uint8_t(0xBFu), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF3u), uint8_t(0xBFu), uint8_t(0xC0u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF3u), uint8_t(0xBFu), uint8_t(0xC1u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF3u), uint8_t(0xBFu), uint8_t(0xBFu), uint8_t(0xC0u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF3u), uint8_t(0xBFu), 
uint8_t(0xBFu), uint8_t(0xC1u))); //////////////////////////////////////////////////// - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x7Eu), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x7Fu), uint8_t(0x80u), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x7Eu), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x7Fu), uint8_t(0x80u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x7Eu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x7Fu)}) == false); - - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x80u)}) == true); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x88u), uint8_t(0xA0u), uint8_t(0x80u)}) == true); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xBFu), uint8_t(0xBFu)}) == true); - - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x90u), uint8_t(0xBFu), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x91u), uint8_t(0xBFu), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xC0u), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xC1u), uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xBFu), uint8_t(0xC0u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xBFu), uint8_t(0xC1u)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF5u), uint8_t(0x8Fu), uint8_t(0xBFu), 
uint8_t(0xBFu)}) == false); - REQUIRE( - fkyaml::detail::utf8::validate({uint8_t(0xF6u), uint8_t(0x8Fu), uint8_t(0xBFu), uint8_t(0xBFu)}) == false); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x7Eu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x7Fu), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x7Eu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x7Fu), uint8_t(0x80u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x7Eu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x7Fu))); + + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x80u), uint8_t(0x80u), uint8_t(0x80u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x88u), uint8_t(0xA0u), uint8_t(0x80u))); + REQUIRE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xBFu), uint8_t(0xBFu))); + + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x90u), uint8_t(0xBFu), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x91u), uint8_t(0xBFu), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xC0u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xC1u), uint8_t(0xBFu))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xBFu), uint8_t(0xC0u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF4u), uint8_t(0x8Fu), uint8_t(0xBFu), uint8_t(0xC1u))); + REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF5u), uint8_t(0x8Fu), uint8_t(0xBFu), uint8_t(0xBFu))); + 
REQUIRE_FALSE(fkyaml::detail::utf8::validate(uint8_t(0xF6u), uint8_t(0x8Fu), uint8_t(0xBFu), uint8_t(0xBFu))); } } From 9415a8115c995fe8d9594aa103070a6c5ee6a630 Mon Sep 17 00:00:00 2001 From: fktn Date: Wed, 22 Jan 2025 00:09:21 +0900 Subject: [PATCH 2/4] refactored line break normalization in iterator_input_adapter for char/char8_t --- .../fkYAML/detail/encodings/utf_encodings.hpp | 3 +- include/fkYAML/detail/input/input_adapter.hpp | 56 ++++++++++-------- single_include/fkYAML/node.hpp | 59 +++++++++++-------- 3 files changed, 66 insertions(+), 52 deletions(-) diff --git a/include/fkYAML/detail/encodings/utf_encodings.hpp b/include/fkYAML/detail/encodings/utf_encodings.hpp index eb581018..618e6fae 100644 --- a/include/fkYAML/detail/encodings/utf_encodings.hpp +++ b/include/fkYAML/detail/encodings/utf_encodings.hpp @@ -189,6 +189,7 @@ inline bool validate(uint8_t byte0, uint8_t byte1, uint8_t byte2, uint8_t byte3) inline void from_utf16( std::array utf16, std::array& utf8, uint32_t& consumed_size, uint32_t& encoded_size) { const auto first = utf16[0]; + const auto second = utf16[1]; if (first < 0x80u) { utf8[0] = static_cast(first & 0x7Fu); consumed_size = 1; @@ -210,7 +211,7 @@ inline void from_utf16( consumed_size = 1; encoded_size = 3; } - else if (const auto second = utf16[1]; first <= 0xDBFFu && 0xDC00u <= second && second <= 0xDFFFu) { + else if (first <= 0xDBFFu && 0xDC00u <= second && second <= 0xDFFFu) { // surrogate pair const uint32_t code_point = 0x10000u + ((first & 0x03FFu) << 10) + (second & 0x03FFu); const auto utf8_chunk = static_cast( diff --git a/include/fkYAML/detail/input/input_adapter.hpp b/include/fkYAML/detail/input/input_adapter.hpp index 816d2602..81a5f090 100644 --- a/include/fkYAML/detail/input/input_adapter.hpp +++ b/include/fkYAML/detail/input/input_adapter.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -88,11 +89,17 @@ class iterator_input_adapter cr_itrs {}; while (current != m_end) { const 
auto first = static_cast(*current++); const uint32_t num_bytes = utf8::get_num_bytes(first); switch (num_bytes) { + case 1: + if FK_YAML_UNLIKELY (first == 0x0D /*CR*/) { + cr_itrs.emplace_back(std::prev(current)); + } + break; case 2: { const auto second = static_cast(*current++); const bool is_valid = utf8::validate(first, second); @@ -120,30 +127,25 @@ class iterator_input_adapter cr_itrs {}; while (current != m_end) { const auto first = static_cast(*current++); const uint32_t num_bytes = utf8::get_num_bytes(first); switch (num_bytes) { + case 1: + if FK_YAML_UNLIKELY (first == 0x0D /*CR*/) { + cr_itrs.emplace_back(std::prev(current)); + } + break; case 2: { const auto second = static_cast(*current++); const bool is_valid = utf8::validate(first, second); @@ -320,21 +328,19 @@ class iterator_input_adapter(*current++); - if FK_YAML_LIKELY (c != '\r') { - m_buffer.push_back(c); - } + for (const auto& cr_itr : cr_itrs) { + std::transform( + current, cr_itr, std::back_inserter(m_buffer), [](char8_t c) { return static_cast(c); }); + current = std::next(cr_itr); } + std::transform(current, m_end, std::back_inserter(m_buffer), [](char8_t c) { return static_cast(c); }); return str_view {m_buffer.begin(), m_buffer.end()}; } diff --git a/single_include/fkYAML/node.hpp b/single_include/fkYAML/node.hpp index ad0c3960..eea65533 100644 --- a/single_include/fkYAML/node.hpp +++ b/single_include/fkYAML/node.hpp @@ -2069,6 +2069,7 @@ inline bool validate(uint8_t byte0, uint8_t byte1, uint8_t byte2, uint8_t byte3) inline void from_utf16( std::array utf16, std::array& utf8, uint32_t& consumed_size, uint32_t& encoded_size) { const auto first = utf16[0]; + const auto second = utf16[1]; if (first < 0x80u) { utf8[0] = static_cast(first & 0x7Fu); consumed_size = 1; @@ -2090,7 +2091,7 @@ inline void from_utf16( consumed_size = 1; encoded_size = 3; } - else if (const auto second = utf16[1]; first <= 0xDBFFu && 0xDC00u <= second && second <= 0xDFFFu) { + else if (first <= 0xDBFFu && 0xDC00u 
<= second && second <= 0xDFFFu) { // surrogate pair const uint32_t code_point = 0x10000u + ((first & 0x03FFu) << 10) + (second & 0x03FFu); const auto utf8_chunk = static_cast( @@ -8511,6 +8512,7 @@ FK_YAML_DETAIL_NAMESPACE_END #include #include #include +#include #include #include #include @@ -8976,11 +8978,17 @@ class iterator_input_adapter cr_itrs {}; while (current != m_end) { const auto first = static_cast(*current++); const uint32_t num_bytes = utf8::get_num_bytes(first); switch (num_bytes) { + case 1: + if FK_YAML_UNLIKELY (first == 0x0D /*CR*/) { + cr_itrs.emplace_back(std::prev(current)); + } + break; case 2: { const auto second = static_cast(*current++); const bool is_valid = utf8::validate(first, second); @@ -9008,30 +9016,25 @@ class iterator_input_adapter cr_itrs {}; while (current != m_end) { const auto first = static_cast(*current++); const uint32_t num_bytes = utf8::get_num_bytes(first); switch (num_bytes) { + case 1: + if FK_YAML_UNLIKELY (first == 0x0D /*CR*/) { + cr_itrs.emplace_back(std::prev(current)); + } + break; case 2: { const auto second = static_cast(*current++); const bool is_valid = utf8::validate(first, second); @@ -9208,21 +9217,19 @@ class iterator_input_adapter(*current++); - if FK_YAML_LIKELY (c != '\r') { - m_buffer.push_back(c); - } + for (const auto& cr_itr : cr_itrs) { + std::transform( + current, cr_itr, std::back_inserter(m_buffer), [](char8_t c) { return static_cast(c); }); + current = std::next(cr_itr); } + std::transform(current, m_end, std::back_inserter(m_buffer), [](char8_t c) { return static_cast(c); }); return str_view {m_buffer.begin(), m_buffer.end()}; } From 713cc8911854e039ac9227b2237c4bc0aac87ef8 Mon Sep 17 00:00:00 2001 From: fktn Date: Wed, 22 Jan 2025 01:17:14 +0900 Subject: [PATCH 3/4] refactored line break normalization in file/stream_input_adapter for UTF-8 --- include/fkYAML/detail/input/input_adapter.hpp | 56 +++++++++---------- single_include/fkYAML/node.hpp | 56 +++++++++---------- 2 files changed, 56 
insertions(+), 56 deletions(-) diff --git a/include/fkYAML/detail/input/input_adapter.hpp b/include/fkYAML/detail/input/input_adapter.hpp index 81a5f090..053b618c 100644 --- a/include/fkYAML/detail/input/input_adapter.hpp +++ b/include/fkYAML/detail/input/input_adapter.hpp @@ -570,19 +570,18 @@ class file_input_adapter { while ((read_size = std::fread(&tmp_buf[0], sizeof(char), buf_size, m_file)) > 0) { char* p_current = &tmp_buf[0]; char* p_end = p_current + read_size; + + // copy tmp_buf to m_buffer, dropping CRs. + char* p_cr = p_current; do { - // find CR in `tmp_buf`. - char* p_cr_or_end = p_current; - while (p_cr_or_end != p_end) { - if (*p_cr_or_end == '\r') { - break; - } - ++p_cr_or_end; + if FK_YAML_UNLIKELY (*p_cr == '\r') { + m_buffer.append(p_current, p_cr); + p_current = p_cr + 1; } + ++p_cr; + } while (p_cr != p_end); - m_buffer.append(p_current, p_cr_or_end); - p_current = (p_cr_or_end == p_end) ? p_end : p_cr_or_end + 1; - } while (p_current != p_end); + m_buffer.append(p_current, p_end); } auto current = m_buffer.begin(); @@ -592,6 +591,8 @@ class file_input_adapter { const uint32_t num_bytes = utf8::get_num_bytes(first); switch (num_bytes) { + case 1: + break; case 2: { const auto second = static_cast(*current++); const bool is_valid = utf8::validate(first, second); @@ -619,9 +620,8 @@ class file_input_adapter { } break; } - case 1: - default: - break; + default: // LCOV_EXCL_LINE + unreachable(); // LCOV_EXCL_LINE } } @@ -772,19 +772,18 @@ class stream_input_adapter { char* p_current = &tmp_buf[0]; char* p_end = p_current + read_size; + + // copy tmp_buf to m_buffer, dropping CRs. + char* p_cr = p_current; do { - // find CR in `tmp_buf`. 
- char* p_cr_or_end = p_current; - while (p_cr_or_end != p_end) { - if (*p_cr_or_end == '\r') { - break; - } - ++p_cr_or_end; + if FK_YAML_UNLIKELY (*p_cr == '\r') { + m_buffer.append(p_current, p_cr); + p_current = p_cr + 1; } + ++p_cr; + } while (p_cr != p_end); - m_buffer.append(p_current, p_cr_or_end); - p_current = (p_cr_or_end == p_end) ? p_end : p_cr_or_end + 1; - } while (p_current != p_end); + m_buffer.append(p_current, p_end); } while (!m_istream->eof()); auto current = m_buffer.begin(); @@ -794,6 +793,8 @@ class stream_input_adapter { const uint32_t num_bytes = utf8::get_num_bytes(first); switch (num_bytes) { + case 1: + break; case 2: { const auto second = static_cast(*current++); const bool is_valid = utf8::validate(first, second); @@ -821,9 +822,8 @@ class stream_input_adapter { } break; } - case 1: - default: - break; + default: // LCOV_EXCL_LINE + unreachable(); // LCOV_EXCL_LINE } } @@ -853,7 +853,7 @@ class stream_input_adapter { while (encoded_buf_size < 2) { m_istream->read(&chars[0], 2); const std::streamsize size = m_istream->gcount(); - if (size != 2) { + if FK_YAML_UNLIKELY (size != 2) { break; } @@ -905,7 +905,7 @@ class stream_input_adapter { do { m_istream->read(&chars[0], 4); const std::streamsize size = m_istream->gcount(); - if (size != 4) { + if FK_YAML_UNLIKELY (size != 4) { break; } diff --git a/single_include/fkYAML/node.hpp b/single_include/fkYAML/node.hpp index eea65533..13f38c6e 100644 --- a/single_include/fkYAML/node.hpp +++ b/single_include/fkYAML/node.hpp @@ -9459,19 +9459,18 @@ class file_input_adapter { while ((read_size = std::fread(&tmp_buf[0], sizeof(char), buf_size, m_file)) > 0) { char* p_current = &tmp_buf[0]; char* p_end = p_current + read_size; + + // copy tmp_buf to m_buffer, dropping CRs. + char* p_cr = p_current; do { - // find CR in `tmp_buf`. 
- char* p_cr_or_end = p_current; - while (p_cr_or_end != p_end) { - if (*p_cr_or_end == '\r') { - break; - } - ++p_cr_or_end; + if FK_YAML_UNLIKELY (*p_cr == '\r') { + m_buffer.append(p_current, p_cr); + p_current = p_cr + 1; } + ++p_cr; + } while (p_cr != p_end); - m_buffer.append(p_current, p_cr_or_end); - p_current = (p_cr_or_end == p_end) ? p_end : p_cr_or_end + 1; - } while (p_current != p_end); + m_buffer.append(p_current, p_end); } auto current = m_buffer.begin(); @@ -9481,6 +9480,8 @@ class file_input_adapter { const uint32_t num_bytes = utf8::get_num_bytes(first); switch (num_bytes) { + case 1: + break; case 2: { const auto second = static_cast(*current++); const bool is_valid = utf8::validate(first, second); @@ -9508,9 +9509,8 @@ class file_input_adapter { } break; } - case 1: - default: - break; + default: // LCOV_EXCL_LINE + unreachable(); // LCOV_EXCL_LINE } } @@ -9661,19 +9661,18 @@ class stream_input_adapter { char* p_current = &tmp_buf[0]; char* p_end = p_current + read_size; + + // copy tmp_buf to m_buffer, dropping CRs. + char* p_cr = p_current; do { - // find CR in `tmp_buf`. - char* p_cr_or_end = p_current; - while (p_cr_or_end != p_end) { - if (*p_cr_or_end == '\r') { - break; - } - ++p_cr_or_end; + if FK_YAML_UNLIKELY (*p_cr == '\r') { + m_buffer.append(p_current, p_cr); + p_current = p_cr + 1; } + ++p_cr; + } while (p_cr != p_end); - m_buffer.append(p_current, p_cr_or_end); - p_current = (p_cr_or_end == p_end) ? 
p_end : p_cr_or_end + 1; - } while (p_current != p_end); + m_buffer.append(p_current, p_end); } while (!m_istream->eof()); auto current = m_buffer.begin(); @@ -9683,6 +9682,8 @@ class stream_input_adapter { const uint32_t num_bytes = utf8::get_num_bytes(first); switch (num_bytes) { + case 1: + break; case 2: { const auto second = static_cast(*current++); const bool is_valid = utf8::validate(first, second); @@ -9710,9 +9711,8 @@ class stream_input_adapter { } break; } - case 1: - default: - break; + default: // LCOV_EXCL_LINE + unreachable(); // LCOV_EXCL_LINE } } @@ -9742,7 +9742,7 @@ class stream_input_adapter { while (encoded_buf_size < 2) { m_istream->read(&chars[0], 2); const std::streamsize size = m_istream->gcount(); - if (size != 2) { + if FK_YAML_UNLIKELY (size != 2) { break; } @@ -9794,7 +9794,7 @@ class stream_input_adapter { do { m_istream->read(&chars[0], 4); const std::streamsize size = m_istream->gcount(); - if (size != 4) { + if FK_YAML_UNLIKELY (size != 4) { break; } From a3590f87593df8f16bffb50f834817e7015c6c57 Mon Sep 17 00:00:00 2001 From: fktn Date: Wed, 22 Jan 2025 01:42:09 +0900 Subject: [PATCH 4/4] updated benchmark results --- README.md | 32 +++++++++---------- .../result_debug_citm_catalog_json.txt | 18 +++++------ .../results/result_debug_citm_catalog_yml.txt | 18 +++++------ .../results/result_debug_ubuntu_yml.txt | 18 +++++------ .../result_release_citm_catalog_json.txt | 18 +++++------ .../result_release_citm_catalog_yml.txt | 18 +++++------ .../results/result_release_ubuntu_yml.txt | 18 +++++------ 7 files changed, 70 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index 6f41cf6d..d38eed26 100644 --- a/README.md +++ b/README.md @@ -151,33 +151,33 @@ The following tables are created from the benchmarking results in the following | Benchmark | processed bytes per second (Release) | | ---------------------------------- | ------------------------------------ | -| fkYAML | 60.1886Mi/s | -| libfyaml | 34.9689Mi/s | -| 
rapidyaml
(with mutable buff) | 19.4415Gi/s | -| rapidyaml
(with immutable buff) | 139.381Mi/s | -| yaml-cpp | 8.8139Mi/s | +| fkYAML | 62.5049Mi/s | +| libfyaml | 39.235Mi/s | +| rapidyaml
(with mutable buff) | 22.007Gi/s | +| rapidyaml
(with immutable buff) | 133.311Mi/s | +| yaml-cpp | 9.07876Mi/s | ### Parsing [citm_catalog.json](https://github.com/fktn-k/fkYAML/blob/develop/tool/benchmark/cases/citm_catalog.json) | Benchmark | processed bytes per second (Release) | | ---------------------------------- | ------------------------------------ | -| fkYAML | 91.1523Mi/s | -| libfyaml | 51.8156Mi/s | -| rapidyaml
(with mutable buff) | 29.7284Gi/s | -| rapidyaml
(with immutable buff) | 140.764Mi/s | -| yaml-cpp | 14.8521Mi/s | +| fkYAML | 97.216Mi/s | +| libfyaml | 57.3021Mi/s | +| rapidyaml
(with mutable buff) | 37.9026Gi/s | +| rapidyaml
(with immutable buff) | 140.375Mi/s | +| yaml-cpp | 14.3192Mi/s | ### Parsing [citm_catalog.yml](https://github.com/fktn-k/fkYAML/blob/develop/tool/benchmark/cases/citm_catalog.yml) | Benchmark | processed bytes per second (Release) | | ---------------------------------- | ------------------------------------ | -| fkYAML | 36.0492Mi/s | -| libfyaml | 21.547Mi/s | -| rapidyaml
(with mutable buff) | 22.6904Gi/s | -| rapidyaml
(with immutable buff) | 64.3972Mi/s | -| yaml-cpp | 6.2337Mi/s | +| fkYAML | 38.7563Mi/s | +| libfyaml | 24.7526Mi/s | +| rapidyaml
(with mutable buff) | 37.9676Gi/s | +| rapidyaml
(with immutable buff) | 68.4245Mi/s | +| yaml-cpp | 6.47003Mi/s | -Although [rapidyaml](https://github.com/biojppm/rapidyaml) is about 2x faster with immutable buffers and far faster with mutable buffers than fkYAML as it focuses on high performance, fkYAML is in general 70% faster than [libfyaml](https://github.com/pantoniou/libfyaml) and also about 6x faster than [yaml-cpp](https://github.com/jbeder/yaml-cpp). +Although [rapidyaml](https://github.com/biojppm/rapidyaml) is about 2x faster with immutable buffers and far faster with mutable buffers than fkYAML as it focuses on high performance, fkYAML is in general 70% faster than [libfyaml](https://github.com/pantoniou/libfyaml) and also about 6.5x faster than [yaml-cpp](https://github.com/jbeder/yaml-cpp). Note that, since fkYAML deserializes scalars into native booleans or integers during the parsing, the performance could be more faster in some use cases since there is no need for string manipulations upon data queries. ## License diff --git a/tools/benchmark/results/result_debug_citm_catalog_json.txt b/tools/benchmark/results/result_debug_citm_catalog_json.txt index d04b8699..5f432465 100644 --- a/tools/benchmark/results/result_debug_citm_catalog_json.txt +++ b/tools/benchmark/results/result_debug_citm_catalog_json.txt @@ -1,18 +1,18 @@ -2024-12-22T01:28:01+09:00 -Running ./build_bm_debug/tool/benchmark/benchmarker -Run on (16 X 3193.88 MHz CPU s) +2025-01-22T01:24:56+09:00 +Running ./build_bm_debug/tools/benchmark/benchmarker +Run on (16 X 3193.92 MHz CPU s) CPU Caches: L1 Data 32 KiB (x8) L1 Instruction 32 KiB (x8) L2 Unified 512 KiB (x8) L3 Unified 16384 KiB (x1) -Load Average: 1.00, 0.39, 0.14 +Load Average: 0.41, 0.48, 0.37 ***WARNING*** Library was built as DEBUG. Timings may be affected. ------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
------------------------------------------------------------------------------------- -bm_fkyaml_parse 20140087 ns 20219603 ns 36 bytes_per_second=81.465Mi/s items_per_second=49.457/s -bm_yamlcpp_parse 824976395 ns 827427500 ns 1 bytes_per_second=1.99074Mi/s items_per_second=1.20857/s -bm_libfyaml_parse 119860560 ns 120087717 ns 6 bytes_per_second=13.7166Mi/s items_per_second=8.32725/s -bm_rapidyaml_parse_inplace 52605 ns 52662 ns 13548 bytes_per_second=30.5454Gi/s items_per_second=18.989k/s -bm_rapidyaml_parse_arena 41897531 ns 41887041 ns 17 bytes_per_second=39.3246Mi/s items_per_second=23.8737/s +bm_fkyaml_parse 19655126 ns 19654672 ns 36 bytes_per_second=83.8065Mi/s items_per_second=50.8785/s +bm_yamlcpp_parse 785616717 ns 785615000 ns 1 bytes_per_second=2.09669Mi/s items_per_second=1.27289/s +bm_libfyaml_parse 116035673 ns 116034750 ns 6 bytes_per_second=14.1957Mi/s items_per_second=8.61811/s +bm_rapidyaml_parse_inplace 46239 ns 46239 ns 16289 bytes_per_second=34.7886Gi/s items_per_second=21.6268k/s +bm_rapidyaml_parse_arena 40129454 ns 40128400 ns 17 bytes_per_second=41.048Mi/s items_per_second=24.92/s diff --git a/tools/benchmark/results/result_debug_citm_catalog_yml.txt b/tools/benchmark/results/result_debug_citm_catalog_yml.txt index 281ed073..53faca69 100644 --- a/tools/benchmark/results/result_debug_citm_catalog_yml.txt +++ b/tools/benchmark/results/result_debug_citm_catalog_yml.txt @@ -1,18 +1,18 @@ -2024-12-22T01:28:06+09:00 -Running ./build_bm_debug/tool/benchmark/benchmarker -Run on (16 X 3193.88 MHz CPU s) +2025-01-22T01:25:01+09:00 +Running ./build_bm_debug/tools/benchmark/benchmarker +Run on (16 X 3193.92 MHz CPU s) CPU Caches: L1 Data 32 KiB (x8) L1 Instruction 32 KiB (x8) L2 Unified 512 KiB (x8) L3 Unified 16384 KiB (x1) -Load Average: 1.00, 0.40, 0.14 +Load Average: 0.45, 0.49, 0.38 ***WARNING*** Library was built as DEBUG. Timings may be affected. 
------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------- -bm_fkyaml_parse 20816812 ns 20751359 ns 34 bytes_per_second=32.971Mi/s items_per_second=48.1896/s -bm_yamlcpp_parse 849217073 ns 847757700 ns 1 bytes_per_second=826.432Ki/s items_per_second=1.17958/s -bm_libfyaml_parse 112104197 ns 111836333 ns 6 bytes_per_second=6.11781Mi/s items_per_second=8.94164/s -bm_rapidyaml_parse_inplace 21570 ns 21506 ns 33456 bytes_per_second=31.0691Gi/s items_per_second=46.4996k/s -bm_rapidyaml_parse_arena 37021323 ns 36891053 ns 19 bytes_per_second=18.5463Mi/s items_per_second=27.1068/s +bm_fkyaml_parse 19937878 ns 19937994 ns 36 bytes_per_second=34.3161Mi/s items_per_second=50.1555/s +bm_yamlcpp_parse 797608026 ns 797605100 ns 1 bytes_per_second=878.397Ki/s items_per_second=1.25375/s +bm_libfyaml_parse 109612610 ns 109613300 ns 7 bytes_per_second=6.24188Mi/s items_per_second=9.12298/s +bm_rapidyaml_parse_inplace 20804 ns 20804 ns 33092 bytes_per_second=32.1165Gi/s items_per_second=48.0672k/s +bm_rapidyaml_parse_arena 35513217 ns 35513440 ns 20 bytes_per_second=19.2658Mi/s items_per_second=28.1584/s diff --git a/tools/benchmark/results/result_debug_ubuntu_yml.txt b/tools/benchmark/results/result_debug_ubuntu_yml.txt index 8ac3cd2a..0cbd95b7 100644 --- a/tools/benchmark/results/result_debug_ubuntu_yml.txt +++ b/tools/benchmark/results/result_debug_ubuntu_yml.txt @@ -1,18 +1,18 @@ -2024-12-22T01:27:57+09:00 -Running ./build_bm_debug/tool/benchmark/benchmarker -Run on (16 X 3193.88 MHz CPU s) +2025-01-22T01:24:52+09:00 +Running ./build_bm_debug/tools/benchmark/benchmarker +Run on (16 X 3193.92 MHz CPU s) CPU Caches: L1 Data 32 KiB (x8) L1 Instruction 32 KiB (x8) L2 Unified 512 KiB (x8) L3 Unified 16384 KiB (x1) -Load Average: 1.00, 0.38, 0.13 +Load Average: 0.35, 0.47, 0.37 ***WARNING*** Library was built as DEBUG. 
Timings may be affected. ------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------- -bm_fkyaml_parse 153251 ns 154543 ns 4429 bytes_per_second=54.4092Mi/s items_per_second=6.4707k/s -bm_yamlcpp_parse 8297369 ns 8354844 ns 84 bytes_per_second=1.00643Mi/s items_per_second=119.691/s -bm_libfyaml_parse 1056772 ns 1065795 ns 661 bytes_per_second=7.88946Mi/s items_per_second=938.266/s -bm_rapidyaml_parse_inplace 975 ns 982 ns 706183 bytes_per_second=8.36368Gi/s items_per_second=1.01854M/s -bm_rapidyaml_parse_arena 293219 ns 294795 ns 2331 bytes_per_second=28.5234Mi/s items_per_second=3.39219k/s +bm_fkyaml_parse 154761 ns 154758 ns 4522 bytes_per_second=54.3336Mi/s items_per_second=6.46171k/s +bm_yamlcpp_parse 7625752 ns 7625710 ns 89 bytes_per_second=1.10266Mi/s items_per_second=131.135/s +bm_libfyaml_parse 994492 ns 994476 ns 687 bytes_per_second=8.45525Mi/s items_per_second=1.00555k/s +bm_rapidyaml_parse_inplace 925 ns 925 ns 755134 bytes_per_second=8.88146Gi/s items_per_second=1.08159M/s +bm_rapidyaml_parse_arena 282340 ns 282339 ns 2503 bytes_per_second=29.7818Mi/s items_per_second=3.54185k/s diff --git a/tools/benchmark/results/result_release_citm_catalog_json.txt b/tools/benchmark/results/result_release_citm_catalog_json.txt index 89f532b6..e108edd8 100644 --- a/tools/benchmark/results/result_release_citm_catalog_json.txt +++ b/tools/benchmark/results/result_release_citm_catalog_json.txt @@ -1,17 +1,17 @@ -2024-12-22T01:25:42+09:00 -Running ./build_bm_release/tool/benchmark/benchmarker -Run on (16 X 3193.88 MHz CPU s) +2025-01-22T01:18:38+09:00 +Running ./build_bm_release/tools/benchmark/benchmarker +Run on (16 X 3193.92 MHz CPU s) CPU Caches: L1 Data 32 KiB (x8) L1 Instruction 32 KiB (x8) L2 Unified 512 KiB (x8) L3 Unified 16384 KiB (x1) -Load Average: 0.14, 0.06, 0.01 +Load Average: 0.49, 0.32, 0.30 
------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------- -bm_fkyaml_parse 18183785 ns 18070739 ns 38 bytes_per_second=91.1523Mi/s items_per_second=55.3381/s -bm_yamlcpp_parse 111608816 ns 110906333 ns 6 bytes_per_second=14.8521Mi/s items_per_second=9.01662/s -bm_libfyaml_parse 31745123 ns 31789463 ns 24 bytes_per_second=51.8156Mi/s items_per_second=31.457/s -bm_rapidyaml_parse_inplace 53190 ns 54109 ns 12182 bytes_per_second=29.7284Gi/s items_per_second=18.4811k/s -bm_rapidyaml_parse_arena 11537854 ns 11701766 ns 59 bytes_per_second=140.764Mi/s items_per_second=85.4572/s +bm_fkyaml_parse 16943687 ns 16943618 ns 40 bytes_per_second=97.216Mi/s items_per_second=59.0193/s +bm_yamlcpp_parse 115034348 ns 115033614 ns 7 bytes_per_second=14.3192Mi/s items_per_second=8.69311/s +bm_libfyaml_parse 28745831 ns 28745712 ns 25 bytes_per_second=57.3021Mi/s items_per_second=34.7878/s +bm_rapidyaml_parse_inplace 42440 ns 42440 ns 16161 bytes_per_second=37.9026Gi/s items_per_second=23.5627k/s +bm_rapidyaml_parse_arena 11734386 ns 11734217 ns 58 bytes_per_second=140.375Mi/s items_per_second=85.2209/s diff --git a/tools/benchmark/results/result_release_citm_catalog_yml.txt b/tools/benchmark/results/result_release_citm_catalog_yml.txt index 9a168163..a3c1d67e 100644 --- a/tools/benchmark/results/result_release_citm_catalog_yml.txt +++ b/tools/benchmark/results/result_release_citm_catalog_yml.txt @@ -1,17 +1,17 @@ -2024-12-22T01:35:07+09:00 -Running ./build_bm_release/tool/benchmark/benchmarker -Run on (16 X 3193.88 MHz CPU s) +2025-01-22T01:18:43+09:00 +Running ./build_bm_release/tools/benchmark/benchmarker +Run on (16 X 3193.92 MHz CPU s) CPU Caches: L1 Data 32 KiB (x8) L1 Instruction 32 KiB (x8) L2 Unified 512 KiB (x8) L3 Unified 16384 KiB (x1) -Load Average: 0.20, 0.21, 0.14 +Load Average: 0.53, 0.33, 0.30 
------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------- -bm_fkyaml_parse 19005246 ns 18979447 ns 36 bytes_per_second=36.0492Mi/s items_per_second=52.6886/s -bm_yamlcpp_parse 109966009 ns 109757200 ns 6 bytes_per_second=6.2337Mi/s items_per_second=9.11102/s -bm_libfyaml_parse 31836258 ns 31753588 ns 24 bytes_per_second=21.547Mi/s items_per_second=31.4925/s -bm_rapidyaml_parse_inplace 29559 ns 29447 ns 24350 bytes_per_second=22.6904Gi/s items_per_second=33.9597k/s -bm_rapidyaml_parse_arena 10671326 ns 10624578 ns 64 bytes_per_second=64.3972Mi/s items_per_second=94.1214/s +bm_fkyaml_parse 17653677 ns 17653735 ns 40 bytes_per_second=38.7563Mi/s items_per_second=56.6452/s +bm_yamlcpp_parse 105749049 ns 105748129 ns 7 bytes_per_second=6.47003Mi/s items_per_second=9.45643/s +bm_libfyaml_parse 27641204 ns 27641304 ns 26 bytes_per_second=24.7526Mi/s items_per_second=36.1777/s +bm_rapidyaml_parse_inplace 17598 ns 17598 ns 39483 bytes_per_second=37.9676Gi/s items_per_second=56.8242k/s +bm_rapidyaml_parse_arena 9999307 ns 9999242 ns 67 bytes_per_second=68.4245Mi/s items_per_second=100.008/s diff --git a/tools/benchmark/results/result_release_ubuntu_yml.txt b/tools/benchmark/results/result_release_ubuntu_yml.txt index 040ff895..5e883e81 100644 --- a/tools/benchmark/results/result_release_ubuntu_yml.txt +++ b/tools/benchmark/results/result_release_ubuntu_yml.txt @@ -1,17 +1,17 @@ -2024-12-22T01:25:37+09:00 -Running ./build_bm_release/tool/benchmark/benchmarker -Run on (16 X 3193.88 MHz CPU s) +2025-01-22T01:18:33+09:00 +Running ./build_bm_release/tools/benchmark/benchmarker +Run on (16 X 3193.92 MHz CPU s) CPU Caches: L1 Data 32 KiB (x8) L1 Instruction 32 KiB (x8) L2 Unified 512 KiB (x8) L3 Unified 16384 KiB (x1) -Load Average: 0.06, 0.04, 0.01 +Load Average: 0.44, 0.31, 0.30 
------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------- -bm_fkyaml_parse 140515 ns 139703 ns 4969 bytes_per_second=60.1886Mi/s items_per_second=7.15803k/s -bm_yamlcpp_parse 959662 ns 954009 ns 727 bytes_per_second=8.8139Mi/s items_per_second=1.04821k/s -bm_libfyaml_parse 241916 ns 240458 ns 2733 bytes_per_second=34.9689Mi/s items_per_second=4.15873k/s -bm_rapidyaml_parse_inplace 425 ns 422 ns 1672964 bytes_per_second=19.4415Gi/s items_per_second=2.3676M/s -bm_rapidyaml_parse_arena 60700 ns 60328 ns 11452 bytes_per_second=139.381Mi/s items_per_second=16.5761k/s +bm_fkyaml_parse 134527 ns 134526 ns 5131 bytes_per_second=62.5049Mi/s items_per_second=7.4335k/s +bm_yamlcpp_parse 926173 ns 926178 ns 744 bytes_per_second=9.07876Mi/s items_per_second=1.07971k/s +bm_libfyaml_parse 214311 ns 214312 ns 3261 bytes_per_second=39.235Mi/s items_per_second=4.66609k/s +bm_rapidyaml_parse_inplace 373 ns 373 ns 1879323 bytes_per_second=22.007Gi/s items_per_second=2.68003M/s +bm_rapidyaml_parse_arena 63075 ns 63074 ns 11031 bytes_per_second=133.311Mi/s items_per_second=15.8543k/s