From 25ff0f888259ac173280cbed72a333b2a0fd977a Mon Sep 17 00:00:00 2001 From: "Node.js GitHub Bot" Date: Wed, 18 Jan 2023 20:50:08 +0000 Subject: [PATCH 1/2] deps: update simdutf to 3.1.0 --- deps/simdutf/simdutf.cpp | 470 +++++++++++++++++++++++------------- deps/simdutf/simdutf.h | 507 ++++++++++++++++++++++++++++++--------- 2 files changed, 707 insertions(+), 270 deletions(-) diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp index f9c0a649dc1b26..c987f9378bec66 100644 --- a/deps/simdutf/simdutf.cpp +++ b/deps/simdutf/simdutf.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2023-01-02 15:43:33 -0500. Do not edit! */ +/* auto-generated on 2023-01-18 12:43:26 -0500. Do not edit! */ // dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf.cpp /* begin file src/simdutf.cpp */ #include "simdutf.h" @@ -509,7 +509,7 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const { uint16x8_t first = vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))); uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value)); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -4386,7 +4386,6 @@ class detect_best_supported_implementation_on_first_use final : public implement const implementation *set_best() const noexcept; }; -const detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; const std::initializer_list available_implementation_pointers { #if SIMDUTF_IMPLEMENTATION_ICELAKE @@ -4662,187 +4661,322 @@ const implementation *detect_best_supported_implementation_on_first_use::set_bes SIMDUTF_POP_DISABLE_WARNINGS if (force_implementation_name) { - auto force_implementation = available_implementations[force_implementation_name]; + auto 
force_implementation = get_available_implementations()[force_implementation_name]; if (force_implementation) { - return active_implementation = force_implementation; + return get_active_implementation() = force_implementation; } else { // Note: abort() and stderr usage within the library is forbidden. - return active_implementation = &unsupported_singleton; + return get_active_implementation() = &unsupported_singleton; } } - return active_implementation = available_implementations.detect_best_supported(); + return get_active_implementation() = get_available_implementations().detect_best_supported(); } } // namespace internal -SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations{}; -SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton}; + + +/** + * The list of available implementations compiled into simdutf. + */ +SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() { + static const internal::available_implementation_list available_implementations{}; + return available_implementations; +} + +/** + * The active implementation. 
+ */ +SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr& get_active_implementation() { + static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; + static internal::atomic_ptr active_implementation{&detect_best_supported_implementation_on_first_use_singleton}; + return active_implementation; +} simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { - return active_implementation->validate_utf8(buf, len); + return get_active_implementation()->validate_utf8(buf, len); } simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept { - return active_implementation->validate_utf8_with_errors(buf, len); + return get_active_implementation()->validate_utf8_with_errors(buf, len); } simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept { - return active_implementation->validate_ascii(buf, len); + return get_active_implementation()->validate_ascii(buf, len); } simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept { - return active_implementation->validate_ascii_with_errors(buf, len); + return get_active_implementation()->validate_ascii_with_errors(buf, len); +} +simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf8_to_utf16be(input, length, utf16_output); + #else + return convert_utf8_to_utf16le(input, length, utf16_output); + #endif } simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept { - return active_implementation->convert_utf8_to_utf16le(input, length, utf16_output); + return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output); } simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept { - return 
active_implementation->convert_utf8_to_utf16be(input, length, utf16_output); + return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output); +} +simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf8_to_utf16be_with_errors(input, length, utf16_output); + #else + return convert_utf8_to_utf16le_with_errors(input, length, utf16_output); + #endif } simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept { - return active_implementation->convert_utf8_to_utf16le_with_errors(input, length, utf16_output); + return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output); } simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept { - return active_implementation->convert_utf8_to_utf16be_with_errors(input, length, utf16_output); + return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output); } simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept { - return active_implementation->convert_utf8_to_utf32(input, length, utf32_output); + return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output); } simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept { - return active_implementation->convert_utf8_to_utf32_with_errors(input, length, utf32_output); + return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output); +} +simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return validate_utf16be(buf, len); + #else + return validate_utf16le(buf, len); + 
#endif } simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept { - return active_implementation->validate_utf16le(buf, len); + return get_active_implementation()->validate_utf16le(buf, len); } simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept { - return active_implementation->validate_utf16be(buf, len); + return get_active_implementation()->validate_utf16be(buf, len); +} +simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size_t len) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return validate_utf16be_with_errors(buf, len); + #else + return validate_utf16le_with_errors(buf, len); + #endif } simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept { - return active_implementation->validate_utf16le_with_errors(buf, len); + return get_active_implementation()->validate_utf16le_with_errors(buf, len); } simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept { - return active_implementation->validate_utf16be_with_errors(buf, len); + return get_active_implementation()->validate_utf16be_with_errors(buf, len); } simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept { - return active_implementation->validate_utf32(buf, len); + return get_active_implementation()->validate_utf32(buf, len); } simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept { - return active_implementation->validate_utf32_with_errors(buf, len); + return get_active_implementation()->validate_utf32_with_errors(buf, len); +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf8_to_utf16be(input, length, utf16_buffer); + #else + return convert_valid_utf8_to_utf16le(input, length, utf16_buffer); + #endif } simdutf_warn_unused size_t 
convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept { - return active_implementation->convert_valid_utf8_to_utf16le(input, length, utf16_buffer); + return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer); } simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept { - return active_implementation->convert_valid_utf8_to_utf16be(input, length, utf16_buffer); + return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer); } simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept { - return active_implementation->convert_valid_utf8_to_utf32(input, length, utf32_buffer); + return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer); +} +simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf8(buf, len, utf8_buffer); + #else + return convert_utf16le_to_utf8(buf, len, utf8_buffer); + #endif } simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return active_implementation->convert_utf16le_to_utf8(buf, len, utf8_buffer); + return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer); } simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return active_implementation->convert_utf16be_to_utf8(buf, len, utf8_buffer); + return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); + #else + 
return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); + #endif } simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return active_implementation->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); + return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); } simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return active_implementation->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); + return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); + #else + return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); + #endif } simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return active_implementation->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); + return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); } simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { - return active_implementation->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); + return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); } simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept { - return active_implementation->convert_utf32_to_utf8(buf, len, utf8_buffer); + return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer); } simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer)
noexcept { - return active_implementation->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer); + return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer); } simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept { - return active_implementation->convert_valid_utf32_to_utf8(buf, len, utf8_buffer); + return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf32_to_utf16be(buf, len, utf16_buffer); + #else + return convert_utf32_to_utf16le(buf, len, utf16_buffer); + #endif } simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return active_implementation->convert_utf32_to_utf16le(buf, len, utf16_buffer); + return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer); } simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return active_implementation->convert_utf32_to_utf16be(buf, len, utf16_buffer); + return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer); +} +simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); + #else + return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer); + #endif } simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return active_implementation->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer); + return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, 
utf16_buffer); } simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return active_implementation->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); + return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); +} +simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); + #else + return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); + #endif } simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return active_implementation->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); + return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); } simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { - return active_implementation->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); + return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); +} +simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf32(buf, len, utf32_buffer); + #else + return convert_utf16le_to_utf32(buf, len, utf32_buffer); + #endif } simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return active_implementation->convert_utf16le_to_utf32(buf, len, utf32_buffer); + return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer); } simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return 
active_implementation->convert_utf16be_to_utf32(buf, len, utf32_buffer); + return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer); +} +simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); + #else + return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); + #endif } simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return active_implementation->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); + return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); } simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return active_implementation->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); + return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); + #else + return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); + #endif } simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return active_implementation->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); + return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); } simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { - return active_implementation->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); + return 
get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); } void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept { - active_implementation->change_endianness_utf16(input, length, output); + get_active_implementation()->change_endianness_utf16(input, length, output); +} +simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return count_utf16be(input, length); + #else + return count_utf16le(input, length); + #endif } simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept { - return active_implementation->count_utf16le(input, length); + return get_active_implementation()->count_utf16le(input, length); } simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept { - return active_implementation->count_utf16be(input, length); + return get_active_implementation()->count_utf16be(input, length); } simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept { - return active_implementation->count_utf8(input, length); + return get_active_implementation()->count_utf8(input, length); +} +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return utf8_length_from_utf16be(input, length); + #else + return utf8_length_from_utf16le(input, length); + #endif } simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept { - return active_implementation->utf8_length_from_utf16le(input, length); + return get_active_implementation()->utf8_length_from_utf16le(input, length); } simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept { - return active_implementation->utf8_length_from_utf16be(input, length); + return get_active_implementation()->utf8_length_from_utf16be(input, length); +} +simdutf_warn_unused 
size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept { + #if SIMDUTF_IS_BIG_ENDIAN + return utf32_length_from_utf16be(input, length); + #else + return utf32_length_from_utf16le(input, length); + #endif } simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept { - return active_implementation->utf32_length_from_utf16le(input, length); + return get_active_implementation()->utf32_length_from_utf16le(input, length); } simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept { - return active_implementation->utf32_length_from_utf16be(input, length); + return get_active_implementation()->utf32_length_from_utf16be(input, length); } simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept { - return active_implementation->utf16_length_from_utf8(input, length); + return get_active_implementation()->utf16_length_from_utf8(input, length); } simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept { - return active_implementation->utf8_length_from_utf32(input, length); + return get_active_implementation()->utf8_length_from_utf32(input, length); } simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept { - return active_implementation->utf16_length_from_utf32(input, length); + return get_active_implementation()->utf16_length_from_utf32(input, length); } simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept { - return active_implementation->utf32_length_from_utf8(input, length); + return get_active_implementation()->utf32_length_from_utf8(input, length); } simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept { - return active_implementation->autodetect_encoding(buf, length); + return get_active_implementation()->autodetect_encoding(buf, length); } 
simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept { - return active_implementation->detect_encodings(buf, length); + return get_active_implementation()->detect_encodings(buf, length); } const implementation * builtin_implementation() { - static const implementation * builtin_impl = available_implementations[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)]; + static const implementation * builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)]; return builtin_impl; } @@ -4854,6 +4988,14 @@ const implementation * builtin_implementation() { /* begin file src/encoding_types.cpp */ namespace simdutf { +bool match_system(endianness e) { +#if SIMDUTF_IS_BIG_ENDIAN + return e == endianness::BIG; +#else + return e == endianness::LITTLE; +#endif +} + std::string to_string(encoding_type bom) { switch (bom) { case UTF16_LE: return "UTF16 little-endian"; @@ -10071,12 +10213,12 @@ inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexce const uint16_t *data = reinterpret_cast(buf); uint64_t pos = 0; while (pos < len) { - uint16_t word = big_endian ? swap_bytes(data[pos]) : data[pos]; + uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos]; if((word &0xF800) == 0xD800) { if(pos + 1 >= len) { return false; } uint16_t diff = uint16_t(word - 0xD800); if(diff > 0x3FF) { return false; } - uint16_t next_word = big_endian ? uint16_t((data[pos + 1] >> 8) | (data[pos + 1] << 8)) : data[pos + 1]; + uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1]; uint16_t diff2 = uint16_t(next_word - 0xDC00); if(diff2 > 0x3FF) { return false; } pos += 2; @@ -10092,12 +10234,12 @@ inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, size const uint16_t *data = reinterpret_cast(buf); size_t pos = 0; while (pos < len) { - uint16_t word = big_endian ? 
swap_bytes(data[pos]) : data[pos]; + uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos]; if((word & 0xF800) == 0xD800) { if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } uint16_t diff = uint16_t(word - 0xD800); if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } - uint16_t next_word = big_endian ? uint16_t((data[pos + 1] >> 8) | (data[pos + 1] << 8)) : data[pos + 1]; + uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1]; uint16_t diff2 = uint16_t(next_word - 0xDC00); if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } pos += 2; @@ -10114,7 +10256,7 @@ inline size_t count_code_points(const char16_t* buf, size_t len) { const uint16_t * p = reinterpret_cast(buf); size_t counter{0}; for(size_t i = 0; i < len; i++) { - uint16_t word = big_endian ? swap_bytes(p[i]) : p[i]; + uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; counter += ((word & 0xFC00) != 0xDC00); } return counter; @@ -10126,7 +10268,7 @@ inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) { const uint16_t * p = reinterpret_cast(buf); size_t counter{0}; for(size_t i = 0; i < len; i++) { - uint16_t word = big_endian ? swap_bytes(p[i]) : p[i]; + uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; /** ASCII **/ if(word <= 0x7F) { counter++; } /** two-byte **/ @@ -10145,7 +10287,7 @@ inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) { const uint16_t * p = reinterpret_cast(buf); size_t counter{0}; for(size_t i = 0; i < len; i++) { - uint16_t word = big_endian ? swap_bytes(p[i]) : p[i]; + uint16_t word = !match_system(big_endian) ? 
swap_bytes(p[i]) : p[i]; counter += ((word & 0xFC00) != 0xDC00); } return counter; @@ -10440,14 +10582,14 @@ inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_out uint32_t word = data[pos]; if((word & 0xFFFF0000)==0) { // will not generate a surrogate pair - *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); + *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); pos++; } else { // will generate a surrogate pair word -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { high_surrogate = utf16::swap_bytes(high_surrogate); low_surrogate = utf16::swap_bytes(low_surrogate); } @@ -10486,14 +10628,14 @@ inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output) { if((word & 0xFFFF0000)==0) { if (word >= 0xD800 && word <= 0xDFFF) { return 0; } // will not generate a surrogate pair - *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); + *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); } else { // will generate a surrogate pair if (word > 0x10FFFF) { return 0; } word -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { high_surrogate = utf16::swap_bytes(high_surrogate); low_surrogate = utf16::swap_bytes(low_surrogate); } @@ -10515,14 +10657,14 @@ inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf if((word & 0xFFFF0000)==0) { if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); } // will not generate a surrogate pair - *utf16_output++ = big_endian ? 
char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); + *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); } else { // will generate a surrogate pair if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); } word -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { high_surrogate = utf16::swap_bytes(high_surrogate); low_surrogate = utf16::swap_bytes(low_surrogate); } @@ -10562,17 +10704,18 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii uint64_t v; ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (big_endian) v = (v >> 8) | (v << (64 - 8)); + if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8)); if ((v & 0xFF80FF80FF80FF80) == 0) { size_t final_pos = pos + 4; while(pos < final_pos) { - *utf8_output++ = big_endian ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); + *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); pos++; } continue; } } - uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + + uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; if((word & 0xFF80)==0) { // will generate one UTF-8 bytes *utf8_output++ = char(word); @@ -10594,7 +10737,7 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); if(pos + 1 >= len) { return 0; } // minimal bound checking - uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t next_word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; uint16_t diff2 = uint16_t(next_word - 0xDC00); uint32_t value = (diff << 10) + diff2 + 0x10000; // will generate four UTF-8 bytes @@ -10636,17 +10779,17 @@ inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) { if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii uint64_t v; ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (big_endian) v = (v >> 8) | (v << (64 - 8)); + if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8)); if ((v & 0xFF80FF80FF80FF80) == 0) { size_t final_pos = pos + 4; while(pos < final_pos) { - *utf8_output++ = big_endian ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); + *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); pos++; } continue; } } - uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; if((word & 0xFF80)==0) { // will generate one UTF-8 bytes *utf8_output++ = char(word); @@ -10669,7 +10812,7 @@ inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) { if(pos + 1 >= len) { return 0; } uint16_t diff = uint16_t(word - 0xD800); if(diff > 0x3FF) { return 0; } - uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t next_word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; uint16_t diff2 = uint16_t(next_word - 0xDC00); if(diff2 > 0x3FF) { return 0; } uint32_t value = (diff << 10) + diff2 + 0x10000; @@ -10695,17 +10838,17 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_ou if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii uint64_t v; ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (big_endian) v = (v >> 8) | (v << (64 - 8)); + if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8)); if ((v & 0xFF80FF80FF80FF80) == 0) { size_t final_pos = pos + 4; while(pos < final_pos) { - *utf8_output++ = big_endian ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); + *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); pos++; } continue; } } - uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; if((word & 0xFF80)==0) { // will generate one UTF-8 bytes *utf8_output++ = char(word); @@ -10728,7 +10871,7 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_ou if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } uint16_t diff = uint16_t(word - 0xD800); if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } - uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; uint16_t diff2 = uint16_t(next_word - 0xDC00); if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } uint32_t value = (diff << 10) + diff2 + 0x10000; @@ -10768,7 +10911,7 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_out size_t pos = 0; char32_t* start{utf32_output}; while (pos < len) { - uint16_t word = big_endian ? 
utf16::swap_bytes(data[pos]) : data[pos]; + uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; if((word &0xF800 ) != 0xD800) { // No surrogate pair, extend 16-bit word to 32-bit word *utf32_output++ = char32_t(word); @@ -10777,7 +10920,7 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_out // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); if(pos + 1 >= len) { return 0; } // minimal bound checking - uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; uint16_t diff2 = uint16_t(next_word - 0xDC00); uint32_t value = (diff << 10) + diff2 + 0x10000; *utf32_output++ = char32_t(value); @@ -10810,7 +10953,7 @@ inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) { size_t pos = 0; char32_t* start{utf32_output}; while (pos < len) { - uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; if((word &0xF800 ) != 0xD800) { // No surrogate pair, extend 16-bit word to 32-bit word *utf32_output++ = char32_t(word); @@ -10820,7 +10963,7 @@ inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) { uint16_t diff = uint16_t(word - 0xD800); if(diff > 0x3FF) { return 0; } if(pos + 1 >= len) { return 0; } // minimal bound checking - uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t next_word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; uint16_t diff2 = uint16_t(next_word - 0xDC00); if(diff2 > 0x3FF) { return 0; } uint32_t value = (diff << 10) + diff2 + 0x10000; @@ -10837,7 +10980,7 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf size_t pos = 0; char32_t* start{utf32_output}; while (pos < len) { - uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; if((word &0xF800 ) != 0xD800) { // No surrogate pair, extend 16-bit word to 32-bit word *utf32_output++ = char32_t(word); @@ -10847,7 +10990,7 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf uint16_t diff = uint16_t(word - 0xD800); if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // minimal bound checking - uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; uint16_t diff2 = uint16_t(next_word - 0xDC00); if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } uint32_t value = (diff << 10) + diff2 + 0x10000; @@ -10889,7 +11032,7 @@ inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) if ((v & 0x8080808080808080) == 0) { size_t final_pos = pos + 8; while(pos < final_pos) { - *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); + *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); pos++; } continue; @@ -10898,14 +11041,14 @@ inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) uint8_t leading_byte = data[pos]; // leading byte if (leading_byte < 0b10000000) { // converting one ASCII byte !!! - *utf16_output++ = big_endian ? 
char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte); + *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte); pos++; } else if ((leading_byte & 0b11100000) == 0b11000000) { // We have a two-byte UTF-8, it should become // a single UTF-16 word. if(pos + 1 >= len) { break; } // minimal bound checking uint16_t code_point = uint16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111)); - if (big_endian) { + if (!match_system(big_endian)) { code_point = utf16::swap_bytes(uint16_t(code_point)); } *utf16_output++ = char16_t(code_point); @@ -10915,7 +11058,7 @@ inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) // a single UTF-16 word. if(pos + 2 >= len) { break; } // minimal bound checking uint16_t code_point = uint16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111)); - if (big_endian) { + if (!match_system(big_endian)) { code_point = utf16::swap_bytes(uint16_t(code_point)); } *utf16_output++ = char16_t(code_point); @@ -10928,7 +11071,7 @@ inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) code_point -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { high_surrogate = utf16::swap_bytes(high_surrogate); low_surrogate = utf16::swap_bytes(low_surrogate); } @@ -10977,16 +11120,17 @@ inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) { if ((v & 0x8080808080808080) == 0) { size_t final_pos = pos + 16; while(pos < final_pos) { - *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); + *utf16_output++ = !match_system(big_endian) ? 
char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); pos++; } continue; } } + uint8_t leading_byte = data[pos]; // leading byte if (leading_byte < 0b10000000) { // converting one ASCII byte !!! - *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte); + *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte); pos++; } else if ((leading_byte & 0b11100000) == 0b11000000) { // We have a two-byte UTF-8, it should become @@ -10996,7 +11140,7 @@ inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) { // range check uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); if (code_point < 0x80 || 0x7ff < code_point) { return 0; } - if (big_endian) { + if (!match_system(big_endian)) { code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); } *utf16_output++ = char16_t(code_point); @@ -11016,7 +11160,7 @@ inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) { (0xd7ff < code_point && code_point < 0xe000)) { return 0; } - if (big_endian) { + if (!match_system(big_endian)) { code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); } *utf16_output++ = char16_t(code_point); @@ -11036,7 +11180,7 @@ inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) { code_point -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { high_surrogate = utf16::swap_bytes(high_surrogate); low_surrogate = utf16::swap_bytes(low_surrogate); } @@ -11066,7 +11210,7 @@ inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_o if ((v & 0x8080808080808080) == 0) { size_t final_pos = pos + 16; while(pos < final_pos) { - *utf16_output++ = big_endian ? 
char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); + *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); pos++; } continue; @@ -11075,7 +11219,7 @@ inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_o uint8_t leading_byte = data[pos]; // leading byte if (leading_byte < 0b10000000) { // converting one ASCII byte !!! - *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte); + *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte); pos++; } else if ((leading_byte & 0b11100000) == 0b11000000) { // We have a two-byte UTF-8, it should become @@ -11085,7 +11229,7 @@ inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_o // range check uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); } - if (big_endian) { + if (!match_system(big_endian)) { code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); } *utf16_output++ = char16_t(code_point); @@ -11103,7 +11247,7 @@ inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_o (data[pos + 2] & 0b00111111); if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);} if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); } - if (big_endian) { + if (!match_system(big_endian)) { code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); } *utf16_output++ = char16_t(code_point); @@ -11124,7 +11268,7 @@ inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_o code_point -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { 
high_surrogate = utf16::swap_bytes(high_surrogate); low_surrogate = utf16::swap_bytes(low_surrogate); } @@ -11510,8 +11654,8 @@ int arm_detect_encodings(const char * buf, size_t len) { if (surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) { // Cannot be UTF8 is_utf8 = false; - // Can still be either UTF-16LE or UTF-32LE depending on the positions of the surrogates - // To be valid UTF-32LE, a surrogate cannot be in the two most significant bytes of any 32-bit word. + // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates + // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant // bytes of a 32-bit word since they always come in pairs in UTF-16LE. // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words. @@ -11582,7 +11726,7 @@ int arm_detect_encodings(const char * buf, size_t len) { } } else { is_utf16 = false; - // Check for UTF-32LE + // Check for UTF-32 if (len % 4 == 0) { const char32_t * input = reinterpret_cast(buf); const char32_t* end32 = reinterpret_cast(start) + len/4; @@ -11626,7 +11770,7 @@ int arm_detect_encodings(const char * buf, size_t len) { } // If no surrogate, validate under other encodings as well - // UTF-32LE validation + // UTF-32 validation currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax); currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax); currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax); @@ -11686,7 +11830,7 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) { // consists only the higher bytes. 
auto in0 = simd16(input); auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -11762,7 +11906,7 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) auto in0 = simd16(input); auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -11918,7 +12062,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, // We process in chunks of 16 bytes uint16x8_t ascii_first = vmovl_u8(vget_low_u8 (in)); uint16x8_t ascii_second = vmovl_high_u8(in); - if (big_endian) { + if (!match_system(big_endian)) { ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap)); ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap)); } @@ -11934,7 +12078,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); - if (big_endian) composed = vqtbl1q_u8(composed, swap); + if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap); vst1q_u8(reinterpret_cast(utf16_output), composed); utf16_output += 8; // We wrote 16 bytes, 8 code points. 
return 16; @@ -11959,7 +12103,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed); - if (big_endian) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap)); + if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap)); vst1q_u16(reinterpret_cast(utf16_output), composed_repacked); utf16_output += 4; return 12; @@ -11982,7 +12126,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); - if (big_endian) composed = vqtbl1q_u8(composed, swap); + if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap); vst1q_u8(reinterpret_cast(utf16_output), composed); utf16_output += 6; // We wrote 12 bytes, 6 code points. 
} else if (idx < 145) { @@ -12000,7 +12144,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed); - if (big_endian) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap)); + if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap)); vst1q_u16(reinterpret_cast(utf16_output), composed_repacked); utf16_output += 4; } else if (idx < 209) { @@ -12035,7 +12179,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, vorrq_u32(hightenbitsadd, lowtenbitsaddshifted); uint32_t basic_buffer[4]; uint32_t basic_buffer_swap[4]; - if (big_endian) { + if (!match_system(big_endian)) { vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap))); surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap)); } @@ -12044,7 +12188,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, vst1q_u32(surrogate_buffer, surrogates); for (size_t i = 0; i < 3; i++) { if (basic_buffer[i] < 65536) { - utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]); + utf16_output[0] = !match_system(big_endian) ? 
uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]); utf16_output++; } else { utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); @@ -12260,7 +12404,7 @@ std::pair arm_convert_utf16_to_utf8(const char16_t* buf, while (buf + 16 <= end) { uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -12271,7 +12415,7 @@ std::pair arm_convert_utf16_to_utf8(const char16_t* buf, if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! // It is common enough that we have sequences of 16 consecutive ASCII characters. uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -12477,7 +12621,7 @@ std::pair arm_convert_utf16_to_utf8(const char16_t* buf, size_t k = 0; if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; if((word & 0xFF80)==0) { *utf8_output++ = char(word); } else if((word & 0xF800)==0) { @@ -12490,7 +12634,7 @@ std::pair arm_convert_utf16_to_utf8(const char16_t* buf, } else { // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + uint16_t next_word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } @@ -12527,7 +12671,7 @@ std::pair arm_convert_utf16_to_utf8_with_errors(const char16_t* b while (buf + 16 <= end) { uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -12538,7 +12682,7 @@ std::pair arm_convert_utf16_to_utf8_with_errors(const char16_t* b if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! // It is common enough that we have sequences of 16 consecutive ASCII characters. uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -12744,7 +12888,7 @@ std::pair arm_convert_utf16_to_utf8_with_errors(const char16_t* b size_t k = 0; if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; if((word & 0xFF80)==0) { *utf8_output++ = char(word); } else if((word & 0xF800)==0) { @@ -12757,7 +12901,7 @@ std::pair arm_convert_utf16_to_utf8_with_errors(const char16_t* b } else { // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + uint16_t next_word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf8_output)); } @@ -12839,7 +12983,7 @@ std::pair arm_convert_utf16_to_utf32(const char16_t* while (buf + 16 <= end) { uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -12866,13 +13010,13 @@ std::pair arm_convert_utf16_to_utf32(const char16_t* size_t k = 0; if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; if((word &0xF800 ) != 0xD800) { *utf32_output++ = char32_t(word); } else { // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + uint16_t next_word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf32_output)); } @@ -12904,7 +13048,7 @@ std::pair arm_convert_utf16_to_utf32_with_errors(const char16 while (buf + 16 <= end) { uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -12931,13 +13075,13 @@ std::pair arm_convert_utf16_to_utf32_with_errors(const char16 size_t k = 0; if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} for(; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; if((word &0xF800 ) != 0xD800) { *utf32_output++ = char32_t(word); } else { // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + uint16_t next_word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf32_output)); } @@ -13445,7 +13589,7 @@ std::pair arm_convert_utf32_to_utf16(const char32_t* const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask); - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6); #else @@ -13465,14 +13609,14 @@ std::pair arm_convert_utf32_to_utf16(const char32_t* if((word & 0xFFFF0000)==0) { // will not generate a surrogate pair if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } - *utf16_output++ = big_endian ? char16_t(word >> 8 | word << 8) : char16_t(word); + *utf16_output++ = !match_system(big_endian) ? 
char16_t(word >> 8 | word << 8) : char16_t(word); } else { // will generate a surrogate pair if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } word -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); } @@ -13513,7 +13657,7 @@ std::pair arm_convert_utf32_to_utf16_with_errors(const char32 return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf16_output)); } - if (big_endian) { + if (!match_system(big_endian)) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6); #else @@ -13533,14 +13677,14 @@ std::pair arm_convert_utf32_to_utf16_with_errors(const char32 if((word & 0xFFFF0000)==0) { // will not generate a surrogate pair if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast(utf16_output)); } - *utf16_output++ = big_endian ? char16_t(word >> 8 | word << 8) : char16_t(word); + *utf16_output++ = !match_system(big_endian) ? 
char16_t(word >> 8 | word << 8) : char16_t(word); } else { // will generate a surrogate pair if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast(utf16_output)); } word -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); } @@ -14705,7 +14849,7 @@ simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) size_t count = 0; for(;pos + 32 <= size; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (big_endian) input.swap_bytes(); + if (!match_system(big_endian)) input.swap_bytes(); uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); count += count_ones(not_pair) / 2; } @@ -14719,7 +14863,7 @@ simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t s // This algorithm could no doubt be improved! for(;pos + 32 <= size; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (big_endian) input.swap_bytes(); + if (!match_system(big_endian)) input.swap_bytes(); uint64_t ascii_mask = input.lteq(0x7F); uint64_t twobyte_mask = input.lteq(0x7FF); uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); @@ -17902,8 +18046,8 @@ implementation::detect_encodings(const char *input, if (surrogates) { is_utf8 = false; - // Can still be either UTF-16LE or UTF-32LE depending on the positions - // of the surrogates To be valid UTF-32LE, a surrogate cannot be in the + // Can still be either UTF-16LE or UTF-32 depending on the positions + // of the surrogates To be valid UTF-32, a surrogate cannot be in the // two most significant bytes of any 32-bit word. 
On the other hand, to // be valid UTF-16LE, at least one surrogate must be in the two most // significant bytes of a 32-bit word since they always come in pairs in @@ -17940,7 +18084,7 @@ implementation::detect_encodings(const char *input, } else { is_utf16 = false; - // Check for UTF-32LE + // Check for UTF-32 if (length % 4 == 0) { const char32_t *input32 = reinterpret_cast(buf); const char32_t *end32 = @@ -17955,7 +18099,7 @@ implementation::detect_encodings(const char *input, } // If no surrogate, validate under other encodings as well - // UTF-32LE validation + // UTF-32 validation currentmax = _mm512_max_epu32(in, currentmax); // UTF-8 validation @@ -19081,8 +19225,8 @@ int avx2_detect_encodings(const char * buf, size_t len) { if (surrogates_bitmask0 != 0x0) { // Cannot be UTF8 is_utf8 = false; - // Can still be either UTF-16LE or UTF-32LE depending on the positions of the surrogates - // To be valid UTF-32LE, a surrogate cannot be in the two most significant bytes of any 32-bit word. + // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates + // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant // bytes of a 32-bit word since they always come in pairs in UTF-16LE. // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words. 
@@ -19153,7 +19297,7 @@ int avx2_detect_encodings(const char * buf, size_t len) { } } else { is_utf16 = false; - // Check for UTF-32LE + // Check for UTF-32 if (len % 4 == 0) { const char32_t * input = reinterpret_cast(buf); const char32_t* end32 = reinterpret_cast(start) + len/4; @@ -19188,7 +19332,7 @@ int avx2_detect_encodings(const char * buf, size_t len) { } // If no surrogate, validate under other encodings as well - // UTF-32LE validation + // UTF-32 validation currentmax = _mm256_max_epu32(in, currentmax); currentmax = _mm256_max_epu32(nextin, currentmax); @@ -22278,7 +22422,7 @@ simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) size_t count = 0; for(;pos + 32 <= size; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (big_endian) input.swap_bytes(); + if (!match_system(big_endian)) input.swap_bytes(); uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); count += count_ones(not_pair) / 2; } @@ -22292,7 +22436,7 @@ simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t s // This algorithm could no doubt be improved! for(;pos + 32 <= size; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (big_endian) input.swap_bytes(); + if (!match_system(big_endian)) input.swap_bytes(); uint64_t ascii_mask = input.lteq(0x7F); uint64_t twobyte_mask = input.lteq(0x7FF); uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); @@ -24001,7 +24145,7 @@ simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) size_t count = 0; for(;pos + 32 <= size; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (big_endian) input.swap_bytes(); + if (!match_system(big_endian)) input.swap_bytes(); uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); count += count_ones(not_pair) / 2; } @@ -24015,7 +24159,7 @@ simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t s // This algorithm could no doubt be improved! 
for(;pos + 32 <= size; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (big_endian) input.swap_bytes(); + if (!match_system(big_endian)) input.swap_bytes(); uint64_t ascii_mask = input.lteq(0x7F); uint64_t twobyte_mask = input.lteq(0x7FF); uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); @@ -24382,8 +24526,8 @@ int sse_detect_encodings(const char * buf, size_t len) { if (surrogates_bitmask0 != 0x0 || surrogates_bitmask1 != 0x0) { // Cannot be UTF8 is_utf8 = false; - // Can still be either UTF-16LE or UTF-32LE depending on the positions of the surrogates - // To be valid UTF-32LE, a surrogate cannot be in the two most significant bytes of any 32-bit word. + // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates + // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant // bytes of a 32-bit word since they always come in pairs in UTF-16LE. // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words. 
@@ -24459,7 +24603,7 @@ int sse_detect_encodings(const char * buf, size_t len) { } } else { is_utf16 = false; - // Check for UTF-32LE + // Check for UTF-32 if (len % 4 == 0) { const char32_t * input = reinterpret_cast(buf); const char32_t* end32 = reinterpret_cast(start) + len/4; @@ -24498,7 +24642,7 @@ int sse_detect_encodings(const char * buf, size_t len) { } // If no surrogate, validate under other encodings as well - // UTF-32LE validation + // UTF-32 validation currentmax = _mm_max_epu32(in, currentmax); currentmax = _mm_max_epu32(secondin, currentmax); currentmax = _mm_max_epu32(thirdin, currentmax); @@ -27596,7 +27740,7 @@ simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) size_t count = 0; for(;pos + 32 <= size; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (big_endian) input.swap_bytes(); + if (!match_system(big_endian)) input.swap_bytes(); uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); count += count_ones(not_pair) / 2; } @@ -27610,7 +27754,7 @@ simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t s // This algorithm could no doubt be improved! for(;pos + 32 <= size; pos += 32) { simd16x32 input(reinterpret_cast(in + pos)); - if (big_endian) input.swap_bytes(); + if (!match_system(big_endian)) input.swap_bytes(); uint64_t ascii_mask = input.lteq(0x7F); uint64_t twobyte_mask = input.lteq(0x7FF); uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h index f3f63e78d94d7b..cf236fe3d1c8a9 100644 --- a/deps/simdutf/simdutf.h +++ b/deps/simdutf/simdutf.h @@ -1,4 +1,4 @@ -/* auto-generated on 2023-01-02 15:43:33 -0500. Do not edit! */ +/* auto-generated on 2023-01-18 12:43:26 -0500. Do not edit! 
*/ // dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf.h /* begin file include/simdutf.h */ #ifndef SIMDUTF_H @@ -64,6 +64,48 @@ #include #endif +/** + * We want to check that it is actually a little endian system at + * compile-time. + */ + +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) +#define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#elif defined(_WIN32) +#define SIMDUTF_IS_BIG_ENDIAN 0 +#else +#if defined(__APPLE__) || defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ +#include +#elif defined(sun) || defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__) +#include +#else // defined(__APPLE__) || defined(__FreeBSD__) + +#ifdef __has_include +#if __has_include() +#include +#endif //__has_include() +#endif //__has_include + +#endif // defined(__APPLE__) || defined(__FreeBSD__) + + +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) +// Byte order still unknown after the includes above: assume little endian. +#define SIMDUTF_IS_BIG_ENDIAN 0 +#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define SIMDUTF_IS_BIG_ENDIAN 0 +#else // __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +#define SIMDUTF_IS_BIG_ENDIAN 1 +#endif // !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) + + +#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ + + +/** + * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined. + */ + #ifdef _MSC_VER #define SIMDUTF_VISUAL_STUDIO 1 /** @@ -98,8 +140,10 @@ #define SIMDUTF_IS_ARM64 1 #elif defined(__PPC64__) || defined(_M_PPC64) //#define SIMDUTF_IS_PPC64 1 -#pragma message("The simdutf library does yet support SIMD acceleration under\ -POWER processors. Please see https://github.com/lemire/simdutf/issues/51") +// The simdutf library does not yet support SIMD acceleration under +// POWER processors. Please see https://github.com/lemire/simdutf/issues/51 +#elif defined(__s390__) +// s390 IBM system. Big endian.
#else // The simdutf library is designed // for 64-bit processors and it seems that you are not @@ -456,6 +500,8 @@ enum endianness { BIG }; +bool match_system(endianness e); + std::string to_string(encoding_type bom); // Note that BOM for UTF8 is discouraged. @@ -526,14 +572,14 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS #define SIMDUTF_SIMDUTF_VERSION_H /** The version of simdutf being used (major.minor.revision) */ -#define SIMDUTF_VERSION "2.1.0" +#define SIMDUTF_VERSION "3.1.0" namespace simdutf { enum { /** * The major version (MAJOR.minor.revision) of simdutf being used. */ - SIMDUTF_VERSION_MAJOR = 2, + SIMDUTF_VERSION_MAJOR = 3, /** * The minor version (major.MINOR.revision) of simdutf being used. */ @@ -872,6 +918,21 @@ simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept; */ simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept; +/** + * Using native endianness; Validate the UTF-16 string. + * This function may be best when you expect the input to be almost always valid. + * Otherwise, consider using validate_utf16_with_errors. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16 string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return true if and only if the string is valid UTF-16. + */ +simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept; + /** * Validate the UTF-16LE string. This function may be best when you expect * the input to be almost always valid. Otherwise, consider using @@ -902,6 +963,20 @@ simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexc */ simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept; +/** + * Using native endianness; Validate the UTF-16 string and stop on error. + * It might be faster than validate_utf16 when an error is expected to occur early. 
+ * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16 string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ +simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept; + /** * Validate the UTF-16LE string and stop on error. It might be faster than * validate_utf16le when an error is expected to occur early. @@ -931,7 +1006,7 @@ simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, siz simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept; /** - * Validate the UTF-32LE string. This function may be best when you expect + * Validate the UTF-32 string. This function may be best when you expect * the input to be almost always valid. Otherwise, consider using * validate_utf32_with_errors. * @@ -939,26 +1014,39 @@ simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, siz * * This function is not BOM-aware. * - * @param buf the UTF-32LE string to validate. + * @param buf the UTF-32 string to validate. * @param len the length of the string in number of 4-byte words (char32_t). - * @return true if and only if the string is valid UTF-32LE. + * @return true if and only if the string is valid UTF-32. */ simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept; /** - * Validate the UTF-32LE string and stop on error. It might be faster than + * Validate the UTF-32 string and stop on error. It might be faster than * validate_utf32 when an error is expected to occur early. * * Overridden by each implementation. * * This function is not BOM-aware. * - * @param buf the UTF-32LE string to validate. + * @param buf the UTF-32 string to validate. 
* @param len the length of the string in number of 4-byte words (char32_t). * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. */ simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept; +/** + * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + */ +simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept; + /** * Convert possibly broken UTF-8 string into UTF-16LE string. * @@ -985,6 +1073,20 @@ simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t le */ simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; +/** + * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 + * string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. 
+ */ +simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; + /** * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error. * @@ -1012,7 +1114,7 @@ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * inpu simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; /** - * Convert possibly broken UTF-8 string into UTF-32LE string. + * Convert possibly broken UTF-8 string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1025,7 +1127,7 @@ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * inpu simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept; /** - * Convert possibly broken UTF-8 string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-8 string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1037,6 +1139,18 @@ simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t leng */ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept; +/** + * Using native endianness; Convert valid UTF-8 string into UTF-16 string. + * + * This function assumes that the input string is valid UTF-8. 
+ * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t + */ +simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept; + /** * Convert valid UTF-8 string into UTF-16LE string. * @@ -1062,7 +1176,7 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, siz simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert valid UTF-8 string into UTF-32LE string. + * Convert valid UTF-8 string into UTF-32 string. * * This function assumes that the input string is valid UTF-8. * @@ -1087,7 +1201,7 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_ simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept; /** - * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32LE format. + * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format. * * This function is equivalent to count_utf8 * @@ -1097,10 +1211,25 @@ simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t len * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char32_t words required to encode the UTF-8 string as UTF-32LE + * @return the number of char32_t words required to encode the UTF-8 string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept; +/** + * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. 
+ * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16 string + */ +simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + /** * Convert possibly broken UTF-16LE string into UTF-8 string. * @@ -1131,6 +1260,21 @@ simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_ */ simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; +/** + * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + */ +simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + /** * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error. * @@ -1161,6 +1305,20 @@ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * */ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; +/** + * Using native endianness; Convert valid UTF-16 string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16 (native endianness). 
+ * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + /** * Convert valid UTF-16LE string into UTF-8 string. * @@ -1190,7 +1348,22 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Convert possibly broken UTF-16LE string into UTF-32LE string. + * Using native endianness; Convert possibly broken UTF-16 string into UTF-32 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16 string + */ +simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert possibly broken UTF-16LE string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1205,7 +1378,7 @@ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert possibly broken UTF-16BE string into UTF-32LE string. 
+ * Convert possibly broken UTF-16BE string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1220,7 +1393,23 @@ simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert possibly broken UTF-16LE string into UTF-32LE string and stop on error. + * Using native endianness; Convert possibly broken UTF-16 string into + * UTF-32 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + */ +simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1235,7 +1424,7 @@ simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert possibly broken UTF-16BE string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error. 
* * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1250,7 +1439,21 @@ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert valid UTF-16LE string into UTF-32LE string. + * Using native endianness; Convert valid UTF-16 string into UTF-32 string. + * + * This function assumes that the input string is valid UTF-16 (native endianness). + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert valid UTF-16LE string into UTF-32 string. * * This function assumes that the input string is valid UTF-16LE. * @@ -1264,7 +1467,7 @@ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert valid UTF-16BE string into UTF-32LE string. + * Convert valid UTF-16BE string into UTF-32 string. * * This function assumes that the input string is valid UTF-16LE. * @@ -1277,6 +1480,18 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input */ simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; +/** + * Using native endianness; Compute the number of bytes that this UTF-16 + * string would require in UTF-8 format. 
+ * + * This function does not validate the input. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16 string as UTF-8 + */ +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept; + /** * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. * @@ -1300,29 +1515,29 @@ simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-8 string. + * Convert possibly broken UTF-32 string into UTF-8 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-8 string and stop on error. + * Convert possibly broken UTF-32 string into UTF-8 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. @@ -1330,13 +1545,13 @@ simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Convert valid UTF-32LE string into UTF-8 string. + * Convert valid UTF-32 string into UTF-8 string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -1344,44 +1559,75 @@ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * in simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-16LE string. + * Using native endianness; Convert possibly broken UTF-32 string into UTF-16 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string + */ +simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert possibly broken UTF-32 string into UTF-16LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-16BE string. + * Convert possibly broken UTF-32 string into UTF-16BE string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-16LE string and stop on error. + * Using native endianness; Convert possibly broken UTF-32 string into UTF-16 + * string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + */ +simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. 
+ * + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. @@ -1389,14 +1635,14 @@ simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-16BE string and stop on error. + * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. @@ -1404,13 +1650,27 @@ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert valid UTF-32LE string into UTF-16LE string. + * Using native endianness; Convert valid UTF-32 string into UTF-16 string. + * + * This function assumes that the input string is valid UTF-32. + * + * This function is not BOM-aware. 
+ * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert valid UTF-32 string into UTF-16LE string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -1418,13 +1678,13 @@ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert valid UTF-32LE string into UTF-16BE string. + * Convert valid UTF-32 string into UTF-16BE string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -1446,29 +1706,45 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept; /** - * Compute the number of bytes that this UTF-32LE string would require in UTF-8 format. + * Compute the number of bytes that this UTF-32 string would require in UTF-8 format. * * This function does not validate the input. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) - * @return the number of bytes required to encode the UTF-32LE string as UTF-8 + * @return the number of bytes required to encode the UTF-32 string as UTF-8 */ simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept; /** - * Compute the number of two-byte words that this UTF-32LE string would require in UTF-16 format. + * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format. * * This function does not validate the input. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) - * @return the number of bytes required to encode the UTF-32LE string as UTF-16 + * @return the number of two-byte words required to encode the UTF-32 string as UTF-16 */ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept; /** - * Compute the number of bytes that this UTF-16LE string would require in UTF-32LE format. 
+ * Using native endianness; Compute the number of bytes that this UTF-16 + * string would require in UTF-32 format. + * + * This function is equivalent to count_utf16. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16 string as UTF-32 + */ +simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept; + +/** + * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format. * * This function is equivalent to count_utf16le. * @@ -1478,12 +1754,12 @@ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_ * * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte words (char16_t) - * @return the number of bytes required to encode the UTF-16LE string as UTF-32LE + * @return the number of bytes required to encode the UTF-16LE string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept; /** - * Compute the number of bytes that this UTF-16BE string would require in UTF-32LE format. + * Compute the number of bytes that this UTF-16BE string would require in UTF-32 format. * * This function is equivalent to count_utf16be. 
* @@ -1493,10 +1769,24 @@ simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, siz * * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte words (char16_t) - * @return the number of bytes required to encode the UTF-16BE string as UTF-32LE + * @return the number of bytes required to encode the UTF-16BE string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept; +/** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-16 (native endianness). + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ +simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept; + /** * Count the number of code points (characters) in the string assuming that * it is valid. @@ -1705,26 +1995,26 @@ class implementation { simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0; /** - * Validate the UTF-32LE string. + * Validate the UTF-32 string. * * Overridden by each implementation. * * This function is not BOM-aware. * - * @param buf the UTF-32LE string to validate. + * @param buf the UTF-32 string to validate. * @param len the length of the string in number of 4-byte words (char32_t). - * @return true if and only if the string is valid UTF-32LE. + * @return true if and only if the string is valid UTF-32. */ simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0; /** - * Validate the UTF-32LE string and stop on error. + * Validate the UTF-32 string and stop on error. * * Overridden by each implementation. * * This function is not BOM-aware. 
* - * @param buf the UTF-32LE string to validate. + * @param buf the UTF-32 string to validate. * @param len the length of the string in number of 4-byte words (char32_t). * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. */ @@ -1783,7 +2073,7 @@ class implementation { simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; /** - * Convert possibly broken UTF-8 string into UTF-32LE string. + * Convert possibly broken UTF-8 string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1796,7 +2086,7 @@ class implementation { simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0; /** - * Convert possibly broken UTF-8 string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-8 string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1833,7 +2123,7 @@ class implementation { simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert valid UTF-8 string into UTF-32LE string. + * Convert valid UTF-8 string into UTF-32 string. * * This function assumes that the input string is valid UTF-8. * @@ -1856,7 +2146,7 @@ class implementation { simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0; /** - * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32LE format. + * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format. 
* * This function is equivalent to count_utf8. * @@ -1864,7 +2154,7 @@ class implementation { * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char32_t words required to encode the UTF-8 string as UTF-32LE + * @return the number of char32_t words required to encode the UTF-8 string as UTF-32 */ simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0; @@ -1957,7 +2247,7 @@ class implementation { simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16LE string into UTF-32LE string. + * Convert possibly broken UTF-16LE string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1972,7 +2262,7 @@ class implementation { simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16BE string into UTF-32LE string. + * Convert possibly broken UTF-16BE string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1987,7 +2277,7 @@ class implementation { simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16LE string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. 
@@ -2002,7 +2292,7 @@ class implementation { simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16BE string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -2017,7 +2307,7 @@ class implementation { simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; /** - * Convert valid UTF-16LE string into UTF-32LE string. + * Convert valid UTF-16LE string into UTF-32 string. * * This function assumes that the input string is valid UTF-16LE. * @@ -2071,29 +2361,29 @@ class implementation { simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-8 string. + * Convert possibly broken UTF-32 string into UTF-8 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-8 string and stop on error. 
+ * Convert possibly broken UTF-32 string into UTF-8 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. @@ -2101,13 +2391,13 @@ class implementation { simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; /** - * Convert valid UTF-32LE string into UTF-8 string. + * Convert valid UTF-32 string into UTF-8 string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -2115,44 +2405,44 @@ class implementation { simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-16LE string. + * Convert possibly broken UTF-32 string into UTF-16LE string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-16BE string. + * Convert possibly broken UTF-32 string into UTF-16BE string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-16LE string and stop on error. + * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. @@ -2160,14 +2450,14 @@ class implementation { simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-16BE string and stop on error. + * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. @@ -2175,13 +2465,13 @@ class implementation { simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert valid UTF-32LE string into UTF-16LE string. + * Convert valid UTF-32 string into UTF-16LE string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -2189,13 +2479,13 @@ class implementation { simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert valid UTF-32LE string into UTF-16BE string. + * Convert valid UTF-32 string into UTF-16BE string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -2217,29 +2507,29 @@ class implementation { virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0; /** - * Compute the number of bytes that this UTF-32LE string would require in UTF-8 format. + * Compute the number of bytes that this UTF-32 string would require in UTF-8 format. * * This function does not validate the input. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) - * @return the number of bytes required to encode the UTF-32LE string as UTF-8 + * @return the number of bytes required to encode the UTF-32 string as UTF-8 */ simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; /** - * Compute the number of two-byte words that this UTF-32LE string would require in UTF-16 format. + * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format. * * This function does not validate the input. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) - * @return the number of bytes required to encode the UTF-32LE string as UTF-16 + * @return the number of bytes required to encode the UTF-32 string as UTF-16 */ simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; /* - * Compute the number of bytes that this UTF-16LE string would require in UTF-32LE format. + * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format. * * This function is equivalent to count_utf16le. * @@ -2249,12 +2539,12 @@ class implementation { * * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte words (char16_t) - * @return the number of bytes required to encode the UTF-16LE string as UTF-32LE + * @return the number of bytes required to encode the UTF-16LE string as UTF-32 */ simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; /* - * Compute the number of bytes that this UTF-16BE string would require in UTF-32LE format. 
+ * Compute the number of bytes that this UTF-16BE string would require in UTF-32 format. * * This function is equivalent to count_utf16be. * @@ -2264,7 +2554,7 @@ class implementation { * * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte words (char16_t) - * @return the number of bytes required to encode the UTF-16BE string as UTF-32LE + * @return the number of bytes required to encode the UTF-16BE string as UTF-32 */ simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; @@ -2429,19 +2719,22 @@ class atomic_ptr { #endif }; +class detect_best_supported_implementation_on_first_use; + } // namespace internal /** - * The list of available implementations compiled into simdutf. + * The list of available implementations compiled into simdutf. */ -extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations; +extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations(); /** * The active implementation. * * Automatically initialized on first use to the most advanced implementation supported by this hardware. 
*/ -extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr active_implementation; +extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr& get_active_implementation(); + } // namespace simdutf From c85f1df74e100a1ee533b5caed18ed5571097f7c Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Wed, 18 Jan 2023 15:59:20 -0500 Subject: [PATCH 2/2] src: fix endianness of simdutf --- src/node_builtins.cc | 8 ++++---- test/cctest/test_util.cc | 14 -------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/src/node_builtins.cc b/src/node_builtins.cc index d6b5114aa3c4aa..3ec010d5de66f2 100644 --- a/src/node_builtins.cc +++ b/src/node_builtins.cc @@ -256,10 +256,10 @@ bool BuiltinLoader::Add(const char* id, std::string_view utf8source) { size_t expected_u16_length = simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length()); auto out = std::make_shared>(expected_u16_length); - size_t u16_length = simdutf::convert_utf8_to_utf16le( - utf8source.data(), - utf8source.length(), - reinterpret_cast(out->data())); + size_t u16_length = + simdutf::convert_utf8_to_utf16(utf8source.data(), + utf8source.length(), + reinterpret_cast(out->data())); out->resize(u16_length); return Add(id, UnionBytes(out)); } diff --git a/test/cctest/test_util.cc b/test/cctest/test_util.cc index 443a03117c09fc..5b83e07db3b7c2 100644 --- a/test/cctest/test_util.cc +++ b/test/cctest/test_util.cc @@ -299,17 +299,3 @@ TEST(UtilTest, SPrintF) { const std::string with_zero = std::string("a") + '\0' + 'b'; EXPECT_EQ(SPrintF("%s", with_zero), with_zero); } - -TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) { - // In simdutf, "LE" does *not* refer to Little Endian, it refers - // to 16-byte code units that are stored using *host* endianness. 
- // This is weird and confusing naming, and so we add this assertion - // here to verify that this is actually the case (so that CI tells - // us if it changed, because for most people Little Endian is - // host endianness, so locally everything would work fine). - const char utf8source[] = "\xe7\x8c\xab"; - char16_t u16output; - size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output); - EXPECT_EQ(u16len, 1u); - EXPECT_EQ(u16output, 0x732B); -}