Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for validating and escaping UTF-8 strings. #453

Merged
merged 13 commits into from
Jun 27, 2024
5 changes: 5 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,8 @@ set(SOURCE_FILES_unitTest
src/clp/ffi/search/Subquery.hpp
src/clp/ffi/search/WildcardToken.cpp
src/clp/ffi/search/WildcardToken.hpp
src/clp/ffi/utils.cpp
src/clp/ffi/utils.hpp
src/clp/FileDescriptor.cpp
src/clp/FileDescriptor.hpp
src/clp/FileReader.cpp
Expand Down Expand Up @@ -436,6 +438,8 @@ set(SOURCE_FILES_unitTest
src/clp/TraceableException.hpp
src/clp/time_types.hpp
src/clp/type_utils.hpp
src/clp/utf8_utils.cpp
src/clp/utf8_utils.hpp
src/clp/Utils.cpp
src/clp/Utils.hpp
src/clp/VariableDictionaryEntry.cpp
Expand Down Expand Up @@ -470,6 +474,7 @@ set(SOURCE_FILES_unitTest
tests/test-StreamingCompression.cpp
tests/test-string_utils.cpp
tests/test-TimestampPattern.cpp
tests/test-utf8_utils.cpp
tests/test-Utils.cpp
)
add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})
Expand Down
89 changes: 89 additions & 0 deletions components/core/src/clp/ffi/utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#include "utils.hpp"

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <string>
#include <string_view>
#include <tuple>

#include "../utf8_utils.hpp"

using std::string;
using std::string_view;

namespace clp::ffi {
/**
 * Builds a JSON-escaped copy of `raw`, provided that `raw` is valid UTF-8.
 * @param raw The string to validate and escape.
 * @return The escaped copy if `raw` passed validation, std::nullopt otherwise.
 */
auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
    // Pre-size the output to 1.5x the input length to leave room for some escape sequences
    string escaped_copy;
    escaped_copy.reserve(raw.size() + (raw.size() / 2));

    if (validate_and_append_escaped_utf8_string(raw, escaped_copy)) {
        return escaped_copy;
    }
    return std::nullopt;
}

/**
 * Validates that `src` is UTF-8 encoded and appends a JSON-escaped copy of it to `dst`.
 *
 * Bytes are rewritten as follows; every other byte is copied verbatim:
 * - backspace, tab, newline, form feed, and carriage return become their two-character escapes;
 * - backslash and double quote are prefixed with a backslash;
 * - any other byte in [0x00, 0x1F] becomes a six-character Unicode escape (e.g. "\u001f").
 *
 * @param src The string to validate and escape.
 * @param dst The string that the escaped output is appended to.
 * @return true if `src` is valid UTF-8. Otherwise false, in which case `dst` is truncated back to
 * the length it had on entry so that callers never observe partially escaped output.
 */
auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool {
    // Remember where the appended output begins so it can be discarded if validation fails
    auto const dst_size_on_entry{dst.size()};

    // Start of the run of bytes that have been visited but not yet copied into `dst`. Unescaped
    // bytes are copied lazily, in bulk, whenever an escape is emitted (and once more at the end).
    string_view::const_iterator next_char_to_copy_it{src.cbegin()};

    auto escape_handler = [&](string_view::const_iterator it) -> void {
        // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the last byte
        // used by `std::snprintf` to append '\0'
        constexpr std::size_t cControlCharacterBufSize{7};
        std::array<char, cControlCharacterBufSize> buf{};
        std::string_view escaped_char;
        bool escape_required{true};
        switch (*it) {
            case '\b':
                escaped_char = "\\b";
                break;
            case '\t':
                escaped_char = "\\t";
                break;
            case '\n':
                escaped_char = "\\n";
                break;
            case '\f':
                escaped_char = "\\f";
                break;
            case '\r':
                escaped_char = "\\r";
                break;
            case '\\':
                escaped_char = "\\\\";
                break;
            case '"':
                escaped_char = "\\\"";
                break;
            default: {
                constexpr std::uint8_t cLargestControlCharacter{0x1F};
                auto const byte{static_cast<std::uint8_t>(*it)};
                if (cLargestControlCharacter >= byte) {
                    // `<cstdio>` only guarantees the `std::`-qualified name
                    std::ignore = std::snprintf(buf.data(), buf.size(), "\\u00%02x", byte);
                    // Exclude the trailing '\0' written by `std::snprintf`
                    escaped_char = {buf.data(), buf.size() - 1};
                } else {
                    escape_required = false;
                }
                break;
            }
        }
        if (escape_required) {
            // Flush the pending unescaped run, then emit the escape in place of `*it`
            dst.append(next_char_to_copy_it, it);
            dst += escaped_char;
            next_char_to_copy_it = it + 1;
        }
    };

    if (false == validate_utf8_string(src, escape_handler)) {
        // Drop whatever was appended before the invalid sequence was found
        dst.resize(dst_size_on_entry);
        return false;
    }

    // Copy the trailing run of bytes that needed no escaping
    if (src.cend() != next_char_to_copy_it) {
        dst.append(next_char_to_copy_it, src.cend());
    }

    return true;
}
} // namespace clp::ffi
31 changes: 31 additions & 0 deletions components/core/src/clp/ffi/utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef CLP_FFI_UTILS_HPP
#define CLP_FFI_UTILS_HPP

#include <optional>
#include <string>
#include <string_view>

namespace clp::ffi {
/**
 * Validates whether the given string is UTF-8 encoded, and escapes any characters to make the
 * string compatible with the JSON specification.
 *
 * Escaping rules: backspace, tab, newline, form feed, and carriage return are replaced by their
 * two-character escapes; backslash and double quote are prefixed with a backslash; any other byte
 * in the range [0x00, 0x1F] is replaced by a six-character Unicode escape with lowercase hex
 * digits (e.g. "\u001f"). All other bytes are copied unchanged.
 * @param raw The raw string to escape.
 * @return The escaped string on success.
 * @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences.
 */
[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
) -> std::optional<std::string>;

/**
 * Validates whether `src` is UTF-8 encoded, and appends `src` to `dst` while escaping any
 * characters to make the appended string compatible with the JSON specification.
 *
 * The escaping rules are the same as for `validate_and_escape_utf8_string`. Existing content of
 * `dst` is preserved; output is only ever appended after it.
 * @param src The string to validate and escape.
 * @param dst Returns `dst` with an escaped version of `src` appended.
 * @return Whether `src` is a valid UTF-8-encoded string. NOTE: Even if `src` is not UTF-8 encoded,
 * `dst` may be modified.
 */
[[nodiscard]] auto
validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool;
} // namespace clp::ffi

#endif // CLP_FFI_UTILS_HPP
55 changes: 55 additions & 0 deletions components/core/src/clp/utf8_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include "utf8_utils.hpp"

#include <cstddef>
#include <cstdint>
#include <string_view>

namespace clp {
/**
 * Checks whether `str` is valid UTF-8 without transforming it in any way.
 * @param str
 * @return Whether `str` passed UTF-8 validation.
 */
auto is_utf8_encoded(std::string_view str) -> bool {
    // Only the validation result matters here, so ASCII characters get a do-nothing handler
    return validate_utf8_string(str, [](std::string_view::const_iterator /*it*/) -> void {});
}

namespace utf8_utils_internal {
/**
 * Classifies `byte` as the lead byte of a two-, three-, or four-byte UTF-8 character and, on a
 * match, fills in the number of continuation bytes to expect, the code-point bits carried by the
 * lead byte, and the inclusive code-point range allowed for a character of that length.
 *
 * The three header patterns are mutually exclusive, so the order of the checks does not matter.
 * @return true if `byte` matched one of the patterns; false otherwise, in which case none of the
 * output parameters are written.
 */
auto parse_and_validate_lead_byte(
        uint8_t byte,
        size_t& num_continuation_bytes,
        uint32_t& code_point,
        uint32_t& code_point_lower_bound,
        uint32_t& code_point_upper_bound
) -> bool {
    if (cTwoByteUtf8CharHeader == (byte & cTwoByteUtf8CharHeaderMask)) {
        num_continuation_bytes = 1;
        code_point = (byte & ~cTwoByteUtf8CharHeaderMask);
        code_point_lower_bound = cTwoByteUtf8CharCodePointLowerBound;
        code_point_upper_bound = cTwoByteUtf8CharCodePointUpperBound;
        return true;
    }

    if (cThreeByteUtf8CharHeader == (byte & cThreeByteUtf8CharHeaderMask)) {
        num_continuation_bytes = 2;
        code_point = (byte & ~cThreeByteUtf8CharHeaderMask);
        code_point_lower_bound = cThreeByteUtf8CharCodePointLowerBound;
        code_point_upper_bound = cThreeByteUtf8CharCodePointUpperBound;
        return true;
    }

    if (cFourByteUtf8CharHeader == (byte & cFourByteUtf8CharHeaderMask)) {
        num_continuation_bytes = 3;
        code_point = (byte & ~cFourByteUtf8CharHeaderMask);
        code_point_lower_bound = cFourByteUtf8CharCodePointLowerBound;
        code_point_upper_bound = cFourByteUtf8CharCodePointUpperBound;
        return true;
    }

    // Not the lead byte of any multi-byte character
    return false;
}

/**
 * @param byte
 * @return Whether `byte` does not exceed the largest code point of a one-byte UTF-8 character.
 */
auto is_ascii_char(uint8_t byte) -> bool {
    return byte <= cOneByteUtf8CharCodePointUpperBound;
}

/**
 * @param byte
 * @return Whether the header bits of `byte` (selected by the continuation-byte mask) equal the
 * continuation-byte header.
 */
auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
    auto const header_bits{byte & cUtf8ContinuationByteMask};
    return cUtf8ContinuationByteHeader == header_bits;
}

/**
 * Folds the code-point bits of `continuation_byte` into `code_point`.
 * @param code_point The code-point bits accumulated so far.
 * @param continuation_byte
 * @return `code_point` shifted left to make room for, and combined with, the code-point bits of
 * `continuation_byte`.
 */
auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
    auto const shifted_code_point{code_point << cUtf8NumContinuationByteCodePointBits};
    auto const continuation_bits{continuation_byte & cUtf8ContinuationByteCodePointMask};
    // The shift clears exactly the bits that the mask keeps, so OR-ing is equivalent to adding
    return shifted_code_point | continuation_bits;
}
} // namespace utf8_utils_internal
} // namespace clp
144 changes: 144 additions & 0 deletions components/core/src/clp/utf8_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#ifndef CLP_UTF8_UTILS_HPP
#define CLP_UTF8_UTILS_HPP

#include <cstddef>
#include <cstdint>
#include <string_view>
#include <type_traits>

namespace clp {
// Constants
// Lead-byte signatures: a byte `b` is the lead byte of an N-byte character when
// `(b & mask) == header` for the N-byte mask/header pair.
constexpr uint8_t cTwoByteUtf8CharHeaderMask{0b1110'0000};
constexpr uint8_t cTwoByteUtf8CharHeader{0b1100'0000};
constexpr uint8_t cThreeByteUtf8CharHeaderMask{0b1111'0000};
constexpr uint8_t cThreeByteUtf8CharHeader{0b1110'0000};
constexpr uint8_t cFourByteUtf8CharHeaderMask{0b1111'1000};
constexpr uint8_t cFourByteUtf8CharHeader{0b1111'0000};

// Inclusive code-point range that a character of each encoded length may represent
constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0x00};
constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F};
constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x0080};
constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x07FF};
constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x0800};
constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF};
constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x01'0000};
constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF};

// Continuation bytes: the header occupies the top two bits and the remaining six bits carry
// code-point data
constexpr uint32_t cUtf8ContinuationByteMask{0b1100'0000};
constexpr uint32_t cUtf8ContinuationByteHeader{0b1000'0000};
constexpr uint32_t cUtf8ContinuationByteCodePointMask{0b0011'1111};
constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6};

/**
 * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using
 * the given handler.
 *
 * The string is scanned once from front to back. Validation fails on a byte that is neither an
 * ASCII character nor a valid lead byte where a character should start, on a missing or malformed
 * continuation byte, on a decoded code point outside the range allowed for its encoded length, or
 * on a multi-byte character that is cut off by the end of the string.
 * @tparam EscapeHandler Method to optionally escape any ASCII character in the string.
 * @param src
 * @param escape_handler Invoked, in order, with an iterator to each ASCII character that starts a
 * character in `src`. NOTE: Since the handler runs during the scan, it may already have been
 * invoked for a prefix of `src` when validation fails.
 * @return Whether the input is a valid UTF-8 encoded string.
 */
template <typename EscapeHandler>
requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool;

/**
 * Convenience wrapper around `validate_utf8_string` that uses a no-op escape handler.
 * @param str
 * @return Whether the input is a valid UTF-8 encoded string.
 */
[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool;

namespace utf8_utils_internal {
/**
 * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses
 * the byte, and returns the parsed properties as well as associated properties.
 *
 * NOTE: The output parameters are only written when the byte is a valid lead byte.
 * @param byte Byte to validate.
 * @param num_continuation_bytes Returns the number of continuation bytes expected.
 * @param code_point Returns the code point bits parsed from the lead byte.
 * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
 * character.
 * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
 * character.
 * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
 */
[[nodiscard]] auto parse_and_validate_lead_byte(
uint8_t byte,
size_t& num_continuation_bytes,
uint32_t& code_point,
uint32_t& code_point_lower_bound,
uint32_t& code_point_upper_bound
) -> bool;

/**
 * @param byte
 * @return Whether the given byte is a valid ASCII character (i.e., its value is at most 0x7F).
 */
[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool;

/**
 * @param byte
 * @return Whether the input byte is a valid UTF-8 continuation byte (i.e., its two most
 * significant bits are `10`).
 */
[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;

/**
 * Parses the code-point bits from the given continuation byte and combines them with the given
 * code point.
 * @param code_point The code-point bits accumulated so far.
 * @param continuation_byte The byte whose six least significant bits are appended.
 * @return The updated code point.
 */
[[nodiscard]] auto
parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
} // namespace utf8_utils_internal

/**
 * Validates `src` as UTF-8 in a single front-to-back scan.
 *
 * `escape_handler` is invoked, in order, with an iterator to every ASCII character that starts a
 * character in `src`; it may therefore already have run for a prefix of `src` when validation
 * fails. A multi-byte character is rejected if its lead byte or any continuation byte is
 * malformed, if its code point lies outside the range allowed for its encoded length (overlong
 * encodings and values above U+10FFFF), if its code point is a UTF-16 surrogate (U+D800 through
 * U+DFFF), or if the string ends before the character is complete.
 * @return Whether `src` is a valid UTF-8 encoded string.
 */
template <typename EscapeHandler>
requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool {
    // Code points reserved for UTF-16 surrogate pairs; RFC 3629 prohibits encoding them in UTF-8
    constexpr uint32_t cSurrogateCodePointLowerBound{0xD800};
    constexpr uint32_t cSurrogateCodePointUpperBound{0xDFFF};

    // State of the multi-byte character currently being decoded (if any)
    size_t num_continuation_bytes_to_validate{0};
    uint32_t code_point{};
    uint32_t code_point_lower_bound{};
    uint32_t code_point_upper_bound{};

    // NOLINTNEXTLINE(readability-qualified-auto)
    for (auto it{src.cbegin()}; it != src.cend(); ++it) {
        auto const byte{static_cast<uint8_t>(*it)};
        if (0 == num_continuation_bytes_to_validate) {
            // At a character boundary: expect either an ASCII character or a lead byte
            if (utf8_utils_internal::is_ascii_char(byte)) {
                escape_handler(it);
            } else if (false
                       == utf8_utils_internal::parse_and_validate_lead_byte(
                               byte,
                               num_continuation_bytes_to_validate,
                               code_point,
                               code_point_lower_bound,
                               code_point_upper_bound
                       ))
            {
                return false;
            }
        } else {
            // Inside a multi-byte character: expect a continuation byte
            if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) {
                return false;
            }
            code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte);
            --num_continuation_bytes_to_validate;
            if (0 != num_continuation_bytes_to_validate) {
                continue;
            }

            // The character is complete, so its code point can now be checked
            if (code_point < code_point_lower_bound || code_point_upper_bound < code_point) {
                // Overlong encoding, or beyond the range encodable with this many bytes
                return false;
            }
            if (cSurrogateCodePointLowerBound <= code_point
                && code_point <= cSurrogateCodePointUpperBound)
            {
                // Surrogates are not valid Unicode scalar values
                return false;
            }
        }
    }

    if (0 != num_continuation_bytes_to_validate) {
        // Incomplete UTF-8 character
        return false;
    }

    return true;
}
} // namespace clp

#endif // CLP_UTF8_UTILS_HPP
Loading
Loading