Skip to content

Commit

Permalink
Add support for validating and escaping UTF-8 strings. (y-scope#453)
Browse files Browse the repository at this point in the history
Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com>
  • Loading branch information
2 people authored and Jack Luo committed Dec 4, 2024
1 parent d17df24 commit 242bb37
Show file tree
Hide file tree
Showing 6 changed files with 533 additions and 0 deletions.
5 changes: 5 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,8 @@ set(SOURCE_FILES_unitTest
src/clp/ffi/search/Subquery.hpp
src/clp/ffi/search/WildcardToken.cpp
src/clp/ffi/search/WildcardToken.hpp
src/clp/ffi/utils.cpp
src/clp/ffi/utils.hpp
src/clp/FileDescriptor.cpp
src/clp/FileDescriptor.hpp
src/clp/FileReader.cpp
Expand Down Expand Up @@ -455,6 +457,8 @@ set(SOURCE_FILES_unitTest
src/clp/TraceableException.hpp
src/clp/time_types.hpp
src/clp/type_utils.hpp
src/clp/utf8_utils.cpp
src/clp/utf8_utils.hpp
src/clp/Utils.cpp
src/clp/Utils.hpp
src/clp/VariableDictionaryEntry.cpp
Expand Down Expand Up @@ -489,6 +493,7 @@ set(SOURCE_FILES_unitTest
tests/test-StreamingCompression.cpp
tests/test-string_utils.cpp
tests/test-TimestampPattern.cpp
tests/test-utf8_utils.cpp
tests/test-Utils.cpp
)
add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})
Expand Down
89 changes: 89 additions & 0 deletions components/core/src/clp/ffi/utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#include "utils.hpp"

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <string>
#include <string_view>
#include <tuple>

#include "../utf8_utils.hpp"

using std::string;
using std::string_view;

namespace clp::ffi {
auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
    // Build the escaped string directly inside the optional that will be returned so that the
    // success path doesn't need to copy or move the string into a new optional.
    std::optional<string> result;
    result.emplace();

    // Most inputs need few escapes; reserving 1.5x the input length up front is a heuristic to
    // avoid repeated reallocation while appending.
    result->reserve(raw.size() + (raw.size() / 2));

    bool const is_valid_utf8{validate_and_append_escaped_utf8_string(raw, *result)};
    if (is_valid_utf8) {
        return result;
    }
    // Whatever was appended before the invalid byte sequence was found is discarded.
    return std::nullopt;
}

auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool {
string_view::const_iterator next_char_to_copy_it{src.cbegin()};

auto escape_handler = [&](string_view::const_iterator it) -> void {
// Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the last byte
// used by `snprintf` to append '\0'
constexpr size_t cControlCharacterBufSize{7};
std::array<char, cControlCharacterBufSize> buf{};
std::string_view escaped_char;
bool escape_required{true};
switch (*it) {
case '\b':
escaped_char = "\\b";
break;
case '\t':
escaped_char = "\\t";
break;
case '\n':
escaped_char = "\\n";
break;
case '\f':
escaped_char = "\\f";
break;
case '\r':
escaped_char = "\\r";
break;
case '\\':
escaped_char = "\\\\";
break;
case '"':
escaped_char = "\\\"";
break;
default: {
constexpr uint8_t cLargestControlCharacter{0x1F};
auto const byte{static_cast<uint8_t>(*it)};
if (cLargestControlCharacter >= byte) {
std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", byte);
escaped_char = {buf.data(), buf.size() - 1};
} else {
escape_required = false;
}
break;
}
}
if (escape_required) {
dst.append(next_char_to_copy_it, it);
dst += escaped_char;
next_char_to_copy_it = it + 1;
}
};

if (false == validate_utf8_string(src, escape_handler)) {
return false;
}

if (src.cend() != next_char_to_copy_it) {
dst.append(next_char_to_copy_it, src.cend());
}

return true;
}
} // namespace clp::ffi
31 changes: 31 additions & 0 deletions components/core/src/clp/ffi/utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef CLP_FFI_UTILS_HPP
#define CLP_FFI_UTILS_HPP

#include <optional>
#include <string>
#include <string_view>

namespace clp::ffi {
/**
 * Validates whether the given string is UTF-8 encoded, and returns a copy of it in which every
 * character that can't appear verbatim inside a JSON string has been escaped:
 * - backspace, tab, newline, form feed, carriage return, double quote, and backslash are replaced
 *   with their two-character JSON escape sequences;
 * - every other byte at or below 0x1F is replaced with a six-character unicode escape sequence
 *   (four hex digits).
 * All other bytes are copied unchanged. The returned string is not wrapped in quotes.
 * @param raw The raw string to validate and escape.
 * @return The escaped string on success.
 * @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences.
 */
[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
) -> std::optional<std::string>;

/**
 * Validates whether `src` is UTF-8 encoded, and appends `src` to `dst` while escaping any
 * characters to make the appended string compatible with the JSON specification. The escaping
 * rules are the same as for `validate_and_escape_utf8_string`. Existing content of `dst` is never
 * removed or rewritten; new content is only appended.
 * @param src The string to validate and escape.
 * @param dst Returns `dst` with an escaped version of `src` appended.
 * @return Whether `src` is a valid UTF-8-encoded string. NOTE: Even if `src` is not UTF-8 encoded,
 * `dst` may be modified: escaped content that precedes the invalid byte sequence may already have
 * been appended, so callers that need the original `dst` on failure must restore it themselves.
 */
[[nodiscard]] auto
validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool;
} // namespace clp::ffi

#endif // CLP_FFI_UTILS_HPP
55 changes: 55 additions & 0 deletions components/core/src/clp/utf8_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include "utf8_utils.hpp"

#include <cstddef>
#include <cstdint>
#include <string_view>

namespace clp {
auto is_utf8_encoded(std::string_view str) -> bool {
    // Only validation is wanted here, so the per-ASCII-character callback does nothing.
    return validate_utf8_string(str, [](std::string_view::const_iterator /*it*/) -> void {});
}

namespace utf8_utils_internal {
auto parse_and_validate_lead_byte(
        uint8_t byte,
        size_t& num_continuation_bytes,
        uint32_t& code_point,
        uint32_t& code_point_lower_bound,
        uint32_t& code_point_upper_bound
) -> bool {
    // The three lead-byte signatures are mutually exclusive, so each is handled as an independent
    // guard. For a match, the header bits are masked off and the remaining low bits become the
    // most-significant bits of the code point.
    if (cTwoByteUtf8CharHeader == (byte & cTwoByteUtf8CharHeaderMask)) {
        num_continuation_bytes = 1;
        code_point = byte & ~cTwoByteUtf8CharHeaderMask;
        code_point_lower_bound = cTwoByteUtf8CharCodePointLowerBound;
        code_point_upper_bound = cTwoByteUtf8CharCodePointUpperBound;
        return true;
    }

    if (cThreeByteUtf8CharHeader == (byte & cThreeByteUtf8CharHeaderMask)) {
        num_continuation_bytes = 2;
        code_point = byte & ~cThreeByteUtf8CharHeaderMask;
        code_point_lower_bound = cThreeByteUtf8CharCodePointLowerBound;
        code_point_upper_bound = cThreeByteUtf8CharCodePointUpperBound;
        return true;
    }

    if (cFourByteUtf8CharHeader == (byte & cFourByteUtf8CharHeaderMask)) {
        num_continuation_bytes = 3;
        code_point = byte & ~cFourByteUtf8CharHeaderMask;
        code_point_lower_bound = cFourByteUtf8CharCodePointLowerBound;
        code_point_upper_bound = cFourByteUtf8CharCodePointUpperBound;
        return true;
    }

    // Not a multi-byte lead byte; the output parameters are left untouched.
    return false;
}

auto is_ascii_char(uint8_t byte) -> bool {
    return byte <= cOneByteUtf8CharCodePointUpperBound;
}

auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
    return cUtf8ContinuationByteHeader == (byte & cUtf8ContinuationByteMask);
}

auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
    // Make room for the continuation byte's payload bits, then add them in.
    auto const shifted_code_point{code_point << cUtf8NumContinuationByteCodePointBits};
    auto const payload_bits{continuation_byte & cUtf8ContinuationByteCodePointMask};
    return shifted_code_point + payload_bits;
}
} // namespace utf8_utils_internal
} // namespace clp
144 changes: 144 additions & 0 deletions components/core/src/clp/utf8_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#ifndef CLP_UTF8_UTILS_HPP
#define CLP_UTF8_UTILS_HPP

#include <cstddef>
#include <cstdint>
#include <string_view>
#include <type_traits>

namespace clp {
// Constants
// NOTE: These are declared `inline` so that every translation unit that includes this header
// refers to a single definition of each constant rather than getting its own internal copy.

// Lead byte signature
// For each multi-byte character length, `(byte & <Mask>) == <Header>` holds iff `byte` is a lead
// byte of that length. The bits outside the mask carry the code point's most-significant bits.
inline constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0};  // 0b111x_xxxx
inline constexpr uint8_t cTwoByteUtf8CharHeader{0xC0};  // 0b110x_xxxx
inline constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0};  // 0b1111_xxxx
inline constexpr uint8_t cThreeByteUtf8CharHeader{0xE0};  // 0b1110_xxxx
inline constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8};  // 0b1111_1xxx
inline constexpr uint8_t cFourByteUtf8CharHeader{0xF0};  // 0b1111_0xxx

// Code point ranges (inclusive)
// A decoded code point that falls outside the range for its encoded length is invalid (e.g., an
// overlong encoding or a value beyond the largest code point).
inline constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0};
inline constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F};
inline constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x80};
inline constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x7FF};
inline constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x800};
inline constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF};
inline constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000};
inline constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF};

// Continuation byte
// `(byte & Mask) == Header` holds iff `byte` is a continuation byte (0b10xx_xxxx); its low six
// bits are the code point bits it contributes.
inline constexpr uint32_t cUtf8ContinuationByteMask{0xC0};
inline constexpr uint32_t cUtf8ContinuationByteHeader{0x80};
inline constexpr uint32_t cUtf8ContinuationByteCodePointMask{0x3F};
inline constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6};

/**
 * Validates whether the given string is UTF-8 encoded, invoking the given handler on every ASCII
 * (single-byte) character so that the caller can escape it if necessary.
 *
 * The handler is invoked while the string is being scanned, in order of appearance. This means it
 * may already have been invoked for ASCII characters preceding an invalid byte sequence by the
 * time this method returns false.
 * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. It must be
 * invocable with a `std::string_view::const_iterator`.
 * @param src The string to validate.
 * @param escape_handler Invoked with an iterator (into `src`) pointing to each ASCII character.
 * @return Whether the input is a valid UTF-8 encoded string.
 */
template <typename EscapeHandler>
requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool;

/**
 * Checks whether the given string is UTF-8 encoded, without escaping or modifying anything. This
 * is `validate_utf8_string` with a handler that does nothing.
 * @param str The string to validate.
 * @return Whether the input is a valid UTF-8 encoded string.
 */
[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool;

namespace utf8_utils_internal {
/**
 * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses
 * the byte, and returns the parsed properties as well as associated properties.
 *
 * NOTE: ASCII bytes and continuation bytes are not multi-byte lead bytes, so this method returns
 * false for them. When it returns false, none of the output parameters are modified.
 * @param byte Byte to validate.
 * @param num_continuation_bytes Returns the number of continuation bytes expected.
 * @param code_point Returns the code point bits parsed from the lead byte.
 * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
 * character.
 * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
 * character.
 * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
 */
[[nodiscard]] auto parse_and_validate_lead_byte(
uint8_t byte,
size_t& num_continuation_bytes,
uint32_t& code_point,
uint32_t& code_point_lower_bound,
uint32_t& code_point_upper_bound
) -> bool;

/**
 * @param byte
 * @return Whether the given byte is a valid ASCII character (i.e., its value is at most
 * `cOneByteUtf8CharCodePointUpperBound`).
 */
[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool;

/**
 * @param byte
 * @return Whether the input byte is a valid UTF-8 continuation byte.
 */
[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;

/**
 * Parses the code-point bits from the given continuation byte and combines them with the given
 * code point: the given code point is shifted left to make room for the new bits, which become
 * its least-significant bits. The continuation byte's header bits are ignored, so the caller is
 * expected to have validated the byte beforehand.
 * @param code_point The code point bits parsed so far.
 * @param continuation_byte
 * @return The updated code point.
 */
[[nodiscard]] auto
parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
} // namespace utf8_utils_internal

// Scans `src` one byte at a time as a small state machine: either a new character is expected (an
// ASCII byte or a multi-byte lead byte), or a known number of continuation bytes is still
// outstanding for the character being decoded. `escape_handler` is invoked, in order, with an
// iterator to every ASCII byte; it may therefore already have run for a prefix of `src` when this
// method returns false.
//
// Returns false if `src` contains an invalid lead byte, a missing or invalid continuation byte, a
// code point outside the range allowed for its encoded length (e.g., an overlong encoding), an
// encoded UTF-16 surrogate, or a character truncated by the end of the string.
template <typename EscapeHandler>
requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool {
    // Code points reserved for UTF-16 surrogates. They fall inside the three-byte code point
    // range, but encoding them in UTF-8 is ill-formed, so they need an explicit check.
    constexpr uint32_t cSurrogateCodePointLowerBound{0xD800};
    constexpr uint32_t cSurrogateCodePointUpperBound{0xDFFF};

    size_t num_continuation_bytes_to_validate{0};
    uint32_t code_point{};
    uint32_t code_point_lower_bound{};
    uint32_t code_point_upper_bound{};

    // NOLINTNEXTLINE(readability-qualified-auto)
    for (auto it{src.cbegin()}; it != src.cend(); ++it) {
        auto const byte{static_cast<uint8_t>(*it)};

        if (0 == num_continuation_bytes_to_validate) {
            // Expecting the start of a new character
            if (utf8_utils_internal::is_ascii_char(byte)) {
                escape_handler(it);
                continue;
            }
            if (false
                == utf8_utils_internal::parse_and_validate_lead_byte(
                        byte,
                        num_continuation_bytes_to_validate,
                        code_point,
                        code_point_lower_bound,
                        code_point_upper_bound
                ))
            {
                return false;
            }
            continue;
        }

        // In the middle of a multi-byte character
        if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) {
            return false;
        }
        code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte);
        --num_continuation_bytes_to_validate;
        if (0 != num_continuation_bytes_to_validate) {
            continue;
        }

        // The character is complete, so its code point can now be validated.
        if (code_point < code_point_lower_bound || code_point_upper_bound < code_point) {
            return false;
        }
        if (cSurrogateCodePointLowerBound <= code_point
            && code_point <= cSurrogateCodePointUpperBound)
        {
            return false;
        }
    }

    // A non-zero count here means the string ended in the middle of a multi-byte character.
    return 0 == num_continuation_bytes_to_validate;
}
} // namespace clp

#endif // CLP_UTF8_UTILS_HPP
Loading

0 comments on commit 242bb37

Please sign in to comment.