Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor UTF encode validation & line break normalization #470

Merged
merged 4 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 16 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,33 +151,33 @@ The following tables are created from the benchmarking results in the following

| Benchmark | processed bytes per second (Release) |
| ---------------------------------- | ------------------------------------ |
| fkYAML | 60.1886Mi/s |
| libfyaml | 34.9689Mi/s |
| rapidyaml<br>(with mutable buff) | 19.4415Gi/s |
| rapidyaml<br>(with immutable buff) | 139.381Mi/s |
| yaml-cpp | 8.8139Mi/s |
| fkYAML | 62.5049Mi/s |
| libfyaml | 39.235Mi/s |
| rapidyaml<br>(with mutable buff) | 22.007Gi/s |
| rapidyaml<br>(with immutable buff) | 133.311Mi/s |
| yaml-cpp | 9.07876Mi/s |

### Parsing [citm_catalog.json](https://github.com/fktn-k/fkYAML/blob/develop/tool/benchmark/cases/citm_catalog.json)

| Benchmark | processed bytes per second (Release) |
| ---------------------------------- | ------------------------------------ |
| fkYAML | 91.1523Mi/s |
| libfyaml | 51.8156Mi/s |
| rapidyaml<br>(with mutable buff) | 29.7284Gi/s |
| rapidyaml<br>(with immutable buff) | 140.764Mi/s |
| yaml-cpp | 14.8521Mi/s |
| fkYAML | 97.216Mi/s |
| libfyaml | 57.3021Mi/s |
| rapidyaml<br>(with mutable buff) | 37.9026Gi/s |
| rapidyaml<br>(with immutable buff) | 140.375Mi/s |
| yaml-cpp | 14.3192Mi/s |

### Parsing [citm_catalog.yml](https://github.com/fktn-k/fkYAML/blob/develop/tool/benchmark/cases/citm_catalog.yml)

| Benchmark | processed bytes per second (Release) |
| ---------------------------------- | ------------------------------------ |
| fkYAML | 36.0492Mi/s |
| libfyaml | 21.547Mi/s |
| rapidyaml<br>(with mutable buff) | 22.6904Gi/s |
| rapidyaml<br>(with immutable buff) | 64.3972Mi/s |
| yaml-cpp | 6.2337Mi/s |
| fkYAML | 38.7563Mi/s |
| libfyaml | 24.7526Mi/s |
| rapidyaml<br>(with mutable buff) | 37.9676Gi/s |
| rapidyaml<br>(with immutable buff) | 68.4245Mi/s |
| yaml-cpp | 6.47003Mi/s |

Although [rapidyaml](https://github.com/biojppm/rapidyaml) is about 2x faster with immutable buffers and far faster with mutable buffers than fkYAML as it focuses on high performance, fkYAML is in general 70% faster than [libfyaml](https://github.com/pantoniou/libfyaml) and also about 6x faster than [yaml-cpp](https://github.com/jbeder/yaml-cpp).
Although [rapidyaml](https://github.com/biojppm/rapidyaml) is about 2x faster with immutable buffers and far faster with mutable buffers than fkYAML as it focuses on high performance, fkYAML is in general 70% faster than [libfyaml](https://github.com/pantoniou/libfyaml) and also about 6.5x faster than [yaml-cpp](https://github.com/jbeder/yaml-cpp).
Note that, since fkYAML deserializes scalars into native booleans or integers during parsing, performance could be even better in some use cases, because no string manipulation is needed when querying data.

## License
Expand Down
266 changes: 132 additions & 134 deletions include/fkYAML/detail/encodings/utf_encodings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ namespace utf8 {
/// @return The number of UTF-8 character bytes.
inline uint32_t get_num_bytes(uint8_t first_byte) {
// The first byte starts with 0b0XXX'XXXX -> 1-byte character
if (first_byte < 0x80) {
if FK_YAML_LIKELY (first_byte < 0x80) {
return 1;
}
// The first byte starts with 0b110X'XXXX -> 2-byte character
Expand All @@ -49,140 +49,136 @@ inline uint32_t get_num_bytes(uint8_t first_byte) {
throw fkyaml::invalid_encoding("Invalid UTF-8 encoding.", {first_byte});
}

/// @brief Validates the encoding of a given byte array whose length is 1.
/// @param[in] byte_array The byte array to be validated.
/// @return true if a given byte array is valid, false otherwise.
inline bool validate(const std::initializer_list<uint8_t>& byte_array) noexcept {
switch (byte_array.size()) {
case 1:
// U+0000..U+007F
return *byte_array.begin() <= 0x7Fu;
case 2: {
const auto* itr = byte_array.begin();
const uint8_t first = *itr++;
const uint8_t second = *itr;
/// @brief Tests whether a single byte forms a complete 1-byte UTF-8 character.
/// @param[in] byte The byte to examine.
/// @return true if `byte` is in the range U+0000..U+007F, false otherwise.
inline bool validate(uint8_t byte) noexcept {
    // A 1-byte character has the form 0b0XXX'XXXX, i.e. its most significant bit is clear.
    return (byte & 0x80u) == 0u;
}

// U+0080..U+07FF
// 1st Byte: 0xC2..0xDF
// 2nd Byte: 0x80..0xBF
if (0xC2u <= first && first <= 0xDFu) {
if (0x80u <= second && second <= 0xBFu) {
/// @brief Tests whether two bytes form a valid 2-byte UTF-8 character (U+0080..U+07FF).
/// @param[in] byte0 The leading byte.
/// @param[in] byte1 The continuation byte.
/// @return true if the given bytes are a valid 2-byte UTF-8 character, false otherwise.
inline bool validate(uint8_t byte0, uint8_t byte1) noexcept {
    // Leading byte must be in 0xC2..0xDF.
    const bool lead_ok = 0xC2u <= byte0 && byte0 <= 0xDFu;
    // Continuation byte must be in 0x80..0xBF.
    const bool tail_ok = 0x80u <= byte1 && byte1 <= 0xBFu;

    if FK_YAML_LIKELY (lead_ok && tail_ok) {
        return true;
    }

    // Every other byte combination is invalid.
    return false;
}

/// @brief Checks if the given bytes are a valid 3-byte UTF-8 character.
/// @note A leading 0xE0 followed by 0x80..0x9F (an overlong encoding of a code point below U+0800)
///       and a leading 0xED followed by 0xA0..0xBF (U+D800..U+DFFF) are both rejected.
/// @param[in] byte0 The first byte value.
/// @param[in] byte1 The second byte value.
/// @param[in] byte2 The third byte value.
/// @return true if the given bytes are a valid 3-byte UTF-8 character, false otherwise.
inline bool validate(uint8_t byte0, uint8_t byte1, uint8_t byte2) noexcept {
    // U+0800..U+0FFF:
    //   1st Byte: 0xE0
    //   2nd Byte: 0xA0..0xBF (0x80..0x9F would re-encode a code point that fits in fewer bytes)
    //   3rd Byte: 0x80..0xBF
    if (byte0 == 0xE0u) {
        if FK_YAML_LIKELY (0xA0u <= byte1 && byte1 <= 0xBFu) {
            if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) {
                return true;
            }
        }
        return false;
    }

    // U+1000..U+CFFF:
    //   1st Byte: 0xE1..0xEC
    //   2nd Byte: 0x80..0xBF
    //   3rd Byte: 0x80..0xBF
    if (0xE1u <= byte0 && byte0 <= 0xECu) {
        if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) {
            if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) {
                return true;
            }
        }
        return false;
    }

    // U+D000..U+D7FF:
    //   1st Byte: 0xED
    //   2nd Byte: 0x80..0x9F
    //   3rd Byte: 0x80..0xBF
    if (byte0 == 0xEDu) {
        if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0x9Fu) {
            if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) {
                return true;
            }
        }
        return false;
    }

    // U+E000..U+FFFF:
    //   1st Byte: 0xEE..0xEF
    //   2nd Byte: 0x80..0xBF
    //   3rd Byte: 0x80..0xBF
    if FK_YAML_LIKELY (byte0 == 0xEEu || byte0 == 0xEFu) {
        if FK_YAML_LIKELY (0x80u <= byte1 && byte1 <= 0xBFu) {
            if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) {
                return true;
            }
        }
    }

    // The rest of byte combinations are invalid.
    return false;
}

/// @brief Tests whether four bytes form a valid 4-byte UTF-8 character (U+10000..U+10FFFF).
/// @param[in] byte0 The first byte value.
/// @param[in] byte1 The second byte value.
/// @param[in] byte2 The third byte value.
/// @param[in] byte3 The fourth byte value.
/// @return true if the given bytes are a valid 4-byte UTF-8 character, false otherwise.
inline bool validate(uint8_t byte0, uint8_t byte1, uint8_t byte2, uint8_t byte3) noexcept {
    // The accepted range of the second byte depends on the first byte:
    //   0xF0       -> 0x90..0xBF (U+10000..U+3FFFF)
    //   0xF1..0xF3 -> 0x80..0xBF (U+40000..U+FFFFF)
    //   0xF4       -> 0x80..0x8F (U+100000..U+10FFFF)
    uint8_t second_min = 0x80u;
    uint8_t second_max = 0xBFu;
    if (byte0 == 0xF0u) {
        second_min = 0x90u;
    }
    else if (byte0 == 0xF4u) {
        second_max = 0x8Fu;
    }
    else if (byte0 < 0xF1u || 0xF3u < byte0) {
        // Not a leading byte of any valid 4-byte character.
        return false;
    }

    // The third and fourth bytes must be in 0x80..0xBF in every case.
    if FK_YAML_LIKELY (second_min <= byte1 && byte1 <= second_max) {
        if FK_YAML_LIKELY (0x80u <= byte2 && byte2 <= 0xBFu) {
            if FK_YAML_LIKELY (0x80u <= byte3 && byte3 <= 0xBFu) {
                return true;
            }
        }
    }

    // Every other byte combination is invalid.
    return false;
}

/// @brief Converts UTF-16 encoded characters to UTF-8 encoded bytes.
Expand All @@ -192,37 +188,39 @@ inline bool validate(const std::initializer_list<uint8_t>& byte_array) noexcept
/// @param[out] encoded_size The size of UTF-encoded bytes.
inline void from_utf16(
std::array<char16_t, 2> utf16, std::array<uint8_t, 4>& utf8, uint32_t& consumed_size, uint32_t& encoded_size) {
if (utf16[0] < 0x80u) {
utf8[0] = static_cast<uint8_t>(utf16[0] & 0x7Fu);
const auto first = utf16[0];
const auto second = utf16[1];
if (first < 0x80u) {
utf8[0] = static_cast<uint8_t>(first & 0x7Fu);
consumed_size = 1;
encoded_size = 1;
}
else if (utf16[0] <= 0x7FFu) {
const auto utf8_chunk = static_cast<uint16_t>(0xC080u | ((utf16[0] & 0x07C0u) << 2) | (utf16[0] & 0x3Fu));
utf8[0] = static_cast<uint8_t>((utf8_chunk & 0xFF00u) >> 8);
utf8[1] = static_cast<uint8_t>(utf8_chunk & 0x00FFu);
else if (first <= 0x7FFu) {
const auto utf8_chunk = static_cast<uint16_t>(0xC080u | ((first & 0x07C0u) << 2) | (first & 0x3Fu));
utf8[0] = static_cast<uint8_t>(utf8_chunk >> 8);
utf8[1] = static_cast<uint8_t>(utf8_chunk);
consumed_size = 1;
encoded_size = 2;
}
else if (utf16[0] < 0xD800u || 0xE000u <= utf16[0]) {
const auto utf8_chunk = static_cast<uint32_t>(
0xE08080u | ((utf16[0] & 0xF000u) << 4) | ((utf16[0] & 0x0FC0u) << 2) | (utf16[0] & 0x3Fu));
utf8[0] = static_cast<uint8_t>((utf8_chunk & 0xFF0000u) >> 16);
utf8[1] = static_cast<uint8_t>((utf8_chunk & 0x00FF00u) >> 8);
utf8[2] = static_cast<uint8_t>(utf8_chunk & 0x0000FFu);
else if (first < 0xD800u || 0xE000u <= first) {
const auto utf8_chunk =
static_cast<uint32_t>(0xE08080u | ((first & 0xF000u) << 4) | ((first & 0x0FC0u) << 2) | (first & 0x3Fu));
utf8[0] = static_cast<uint8_t>(utf8_chunk >> 16);
utf8[1] = static_cast<uint8_t>(utf8_chunk >> 8);
utf8[2] = static_cast<uint8_t>(utf8_chunk);
consumed_size = 1;
encoded_size = 3;
}
else if (utf16[0] <= 0xDBFFu && 0xDC00u <= utf16[1] && utf16[1] <= 0xDFFFu) {
else if (first <= 0xDBFFu && 0xDC00u <= second && second <= 0xDFFFu) {
// surrogate pair
const uint32_t code_point = 0x10000u + ((utf16[0] & 0x03FFu) << 10) + (utf16[1] & 0x03FFu);
const uint32_t code_point = 0x10000u + ((first & 0x03FFu) << 10) + (second & 0x03FFu);
const auto utf8_chunk = static_cast<uint32_t>(
0xF0808080u | ((code_point & 0x1C0000u) << 6) | ((code_point & 0x03F000u) << 4) |
((code_point & 0x0FC0u) << 2) | (code_point & 0x3Fu));
utf8[0] = static_cast<uint8_t>((utf8_chunk & 0xFF000000u) >> 24);
utf8[1] = static_cast<uint8_t>((utf8_chunk & 0x00FF0000u) >> 16);
utf8[2] = static_cast<uint8_t>((utf8_chunk & 0x0000FF00u) >> 8);
utf8[3] = static_cast<uint8_t>(utf8_chunk & 0x000000FFu);
utf8[0] = static_cast<uint8_t>(utf8_chunk >> 24);
utf8[1] = static_cast<uint8_t>(utf8_chunk >> 16);
utf8[2] = static_cast<uint8_t>(utf8_chunk >> 8);
utf8[3] = static_cast<uint8_t>(utf8_chunk);
consumed_size = 2;
encoded_size = 4;
}
Expand All @@ -242,26 +240,26 @@ inline void from_utf32(const char32_t utf32, std::array<uint8_t, 4>& utf8, uint3
}
else if (utf32 <= 0x7FFu) {
const auto utf8_chunk = static_cast<uint16_t>(0xC080u | ((utf32 & 0x07C0u) << 2) | (utf32 & 0x3Fu));
utf8[0] = static_cast<uint8_t>((utf8_chunk & 0xFF00u) >> 8);
utf8[1] = static_cast<uint8_t>(utf8_chunk & 0x00FFu);
utf8[0] = static_cast<uint8_t>(utf8_chunk >> 8);
utf8[1] = static_cast<uint8_t>(utf8_chunk);
encoded_size = 2;
}
else if (utf32 <= 0xFFFFu) {
const auto utf8_chunk =
static_cast<uint32_t>(0xE08080u | ((utf32 & 0xF000u) << 4) | ((utf32 & 0x0FC0u) << 2) | (utf32 & 0x3F));
utf8[0] = static_cast<uint8_t>((utf8_chunk & 0xFF0000u) >> 16);
utf8[1] = static_cast<uint8_t>((utf8_chunk & 0x00FF00u) >> 8);
utf8[2] = static_cast<uint8_t>(utf8_chunk & 0x0000FFu);
utf8[0] = static_cast<uint8_t>(utf8_chunk >> 16);
utf8[1] = static_cast<uint8_t>(utf8_chunk >> 8);
utf8[2] = static_cast<uint8_t>(utf8_chunk);
encoded_size = 3;
}
else if (utf32 <= 0x10FFFFu) {
const auto utf8_chunk = static_cast<uint32_t>(
0xF0808080u | ((utf32 & 0x1C0000u) << 6) | ((utf32 & 0x03F000u) << 4) | ((utf32 & 0x0FC0u) << 2) |
(utf32 & 0x3Fu));
utf8[0] = static_cast<uint8_t>((utf8_chunk & 0xFF000000u) >> 24);
utf8[1] = static_cast<uint8_t>((utf8_chunk & 0x00FF0000u) >> 16);
utf8[2] = static_cast<uint8_t>((utf8_chunk & 0x0000FF00u) >> 8);
utf8[3] = static_cast<uint8_t>(utf8_chunk & 0x000000FFu);
utf8[0] = static_cast<uint8_t>(utf8_chunk >> 24);
utf8[1] = static_cast<uint8_t>(utf8_chunk >> 16);
utf8[2] = static_cast<uint8_t>(utf8_chunk >> 8);
utf8[3] = static_cast<uint8_t>(utf8_chunk);
encoded_size = 4;
}
else {
Expand Down
Loading
Loading