From b245464f938c7309ae14a7b0dd5567ffa272cb0b Mon Sep 17 00:00:00 2001 From: Paul Bowen-Huggett Date: Sun, 14 Jan 2024 09:55:12 +0100 Subject: [PATCH] Fix 32->16 surrogate handling. Fix range adaptor partial input. There was a bug in the handling of surrogate values in the UTF-32->UTF-16 conversion. The code dealt with the <= 0xFFFF case *before* considering the surrogate case (the high/low surrogate values are <= 0xFFFF)! The range adaptor had a bug in the handling of an incomplete code-unit sequence at the end of the input this could result in a code-unit of zero being produced. We now consider the end-of-input case after consuming the input bytes to avoid this problem. Added some additional assertions. Added new, more thorough, testing for the UTF-32 to 8/16/32 cases. --- include/icubaby/icubaby.hpp | 33 ++--- unittests/CMakeLists.txt | 1 + unittests/test_u32.cpp | 252 ++++++++++++++++++++++++++++++++++++ unittests/test_u32_8.cpp | 10 ++ 4 files changed, 280 insertions(+), 16 deletions(-) create mode 100644 unittests/test_u32.cpp diff --git a/include/icubaby/icubaby.hpp b/include/icubaby/icubaby.hpp index 521ee5e8..43399e73 100644 --- a/include/icubaby/icubaby.hpp +++ b/include/icubaby/icubaby.hpp @@ -697,11 +697,11 @@ template <> class transcoder { template ICUBABY_REQUIRES ((std::output_iterator)) OutputIterator operator() (input_type code_point, OutputIterator dest) { - if (code_point <= 0xFFFF) { - *(dest++) = static_cast (code_point); - } else if (is_surrogate (code_point) || code_point > max_code_point) { + if (is_surrogate (code_point) || code_point > max_code_point) { dest = (*this) (replacement_char, dest); well_formed_ = false; + } else if (code_point <= 0xFFFF) { + *(dest++) = static_cast (code_point); } else { *(dest++) = static_cast (0xD7C0U + (code_point >> 10U)); *(dest++) = static_cast (first_low_surrogate + (code_point & 0x3FFU)); @@ -1034,7 +1034,7 @@ class transcode_view::iterator { using difference_type = std::ranges::range_difference_t; iterator () requires std::default_initializable> = default; - constexpr iterator (transcode_view const& parent, std::ranges::iterator_t current) + constexpr iterator (transcode_view const& parent, std::ranges::iterator_t const& current) : current_{current}, parent_{&parent}, state_{current} { assert (state_.empty ()); // Prime the input state so that a dereference of the iterator will yield the first of the @@ -1045,9 +1045,7 @@ class transcode_view::iterator { constexpr std::ranges::iterator_t const& base () const& noexcept { return current_; } constexpr std::ranges::iterator_t base () && { return std::move (current_); } - constexpr value_type const& operator* () const { - return state_.empty () ? replacement : state_.front (); - } + constexpr value_type const& operator* () const { return state_.front (); } constexpr std::ranges::iterator_t operator->() const { return state_.front (); } constexpr iterator& operator++ () { @@ -1095,7 +1093,10 @@ class transcode_view::iterator { constexpr state () : state{std::ranges::iterator_t{}} {} [[nodiscard]] constexpr bool empty () const noexcept { return valid_.empty (); } - [[nodiscard]] constexpr auto& front () const noexcept { return valid_.front (); } + [[nodiscard]] constexpr auto& front () const noexcept { + assert (!valid_.empty ()); + return valid_.front (); + } constexpr void advance () noexcept { valid_.advance (1); } /// Consumes enough code-units from the base iterator to form a single code-point. The resulting @@ -1134,16 +1135,16 @@ constexpr std::ranges::iterator_t transcode_viewbase_); next_ == input_end) { + auto const input_end = std::ranges::end (parent->base_); + // Loop until we've produced a code-point's worth of code-units in the out + // container or we've run out of input. + while (it == out_.begin () && next_ != input_end) { + it = transcoder_ (*next_, it); + ++next_; + } + if (next_ == input_end) { // We've consumed the entire input so tell the transcoder and get any final output. it = transcoder_.end_cp (it); - } else { - // Loop until we've produced a code-point's worth of code-units in the out - // container or we've run out of input. - while (it == out_.begin () && next_ != input_end) { - it = transcoder_ (*next_, it); - ++next_; - } } assert (it >= out_.begin () && it <= out_.end ()); parent->well_formed_ = transcoder_.well_formed (); diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index be940177..89a2b7a4 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -25,6 +25,7 @@ add_executable (icubaby-unittests harness.cpp test_u8_32.cpp test_u16.cpp + test_u32.cpp test_u32_8.cpp test_utility.cpp typed_test.hpp diff --git a/unittests/test_u32.cpp b/unittests/test_u32.cpp new file mode 100644 index 00000000..4d15d0d0 --- /dev/null +++ b/unittests/test_u32.cpp @@ -0,0 +1,252 @@ +// MIT License +// +// Copyright (c) 2022-2024 Paul Bowen-Huggett +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include + +// icubaby itself. +#include "icubaby/icubaby.hpp" + +// Google Test/Mock +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +// Local includes +#include "encoded_char.hpp" +#include "typed_test.hpp" + +static_assert (std::is_same_v && + std::is_same_v); +static_assert (std::is_same_v && + std::is_same_v); +// NOLINTNEXTLINE(misc-redundant-expression) +static_assert (std::is_same_v && + std::is_same_v); + +using namespace std::string_literals; +using testing::ElementsAreArray; + +// NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers) + +namespace { + +template class Utf32 : public testing::Test { +protected: + using output_type = T; + std::vector output_; + icubaby::transcoder transcoder_; +}; + +} // end anonymous namespace + +TYPED_TEST_SUITE (Utf32, OutputTypes, OutputTypeNames); +// NOLINTNEXTLINE +TYPED_TEST (Utf32, GoodDollarSign) { + auto& transcoder = this->transcoder_; + auto& output = this->output_; + EXPECT_TRUE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + auto it = transcoder (static_cast (code_point::dollar_sign), std::back_inserter (output)); + EXPECT_TRUE (transcoder.well_formed ()) << "input should be well formed"; + EXPECT_FALSE (transcoder.partial ()) << "there were no surrogate code units"; + transcoder.end_cp (it); + EXPECT_TRUE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + EXPECT_THAT (output, ElementsAreArray (encoded_char_v)); +} +// NOLINTNEXTLINE +TYPED_TEST (Utf32, StartOfHeadingAndText) { + auto& transcoder = this->transcoder_; + auto& output = this->output_; + auto it = transcoder (static_cast (code_point::start_of_heading), std::back_inserter (output)); + EXPECT_TRUE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + it = transcoder (static_cast (code_point::start_of_text), it); + EXPECT_TRUE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + transcoder.end_cp (it); + EXPECT_TRUE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + + std::vector expected; + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + EXPECT_THAT (output, ElementsAreArray (expected)); +} +// NOLINTNEXTLINE +TYPED_TEST (Utf32, CharFFFF) { + auto& transcoder = this->transcoder_; + auto& output = this->output_; + auto it = transcoder (static_cast (code_point::code_point_ffff), std::back_inserter (output)); + EXPECT_TRUE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + transcoder.end_cp (it); + EXPECT_TRUE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + EXPECT_THAT (output, ElementsAreArray (encoded_char_v)); +} +// NOLINTNEXTLINE +TYPED_TEST (Utf32, FirstHighSurrogate) { + auto& transcoder = this->transcoder_; + auto& output = this->output_; + + auto it = transcoder (icubaby::first_high_surrogate, std::back_inserter (output)); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + transcoder.end_cp (it); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + EXPECT_THAT (output, ElementsAreArray (encoded_char_v)); +} +// NOLINTNEXTLINE +TYPED_TEST (Utf32, LastHighSurrogate) { + auto& transcoder = this->transcoder_; + auto& output = this->output_; + + auto it = transcoder (icubaby::last_high_surrogate, std::back_inserter (output)); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + transcoder.end_cp (it); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + EXPECT_THAT (output, ElementsAreArray (encoded_char_v)); +} +// NOLINTNEXTLINE +TYPED_TEST (Utf32, FirstLowSurrogate) { + auto& transcoder = this->transcoder_; + auto& output = this->output_; + + auto it = transcoder (icubaby::first_low_surrogate, std::back_inserter (output)); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + transcoder.end_cp (it); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + EXPECT_THAT (output, ElementsAreArray (encoded_char_v)); +} +// NOLINTNEXTLINE +TYPED_TEST (Utf32, LastLowSurrogate) { + auto& transcoder = this->transcoder_; + auto& output = this->output_; + + auto it = transcoder (icubaby::last_low_surrogate, std::back_inserter (output)); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + transcoder.end_cp (it); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + EXPECT_THAT (output, ElementsAreArray (encoded_char_v)); +} +// NOLINTNEXTLINE +TYPED_TEST (Utf32, MaxCodePoint) { + auto& transcoder = this->transcoder_; + auto& output = this->output_; + + auto it = transcoder (icubaby::max_code_point, std::back_inserter (output)); + EXPECT_TRUE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + transcoder.end_cp (it); + EXPECT_TRUE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + + EXPECT_THAT (output, ElementsAreArray (encoded_char_v)); +} +// NOLINTNEXTLINE +TYPED_TEST (Utf32, BeyondMaxCodePoint) { + auto& transcoder = this->transcoder_; + auto& output = this->output_; + + auto it = transcoder (static_cast (static_cast (icubaby::max_code_point) + 1U), + std::back_inserter (output)); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + transcoder.end_cp (it); + EXPECT_FALSE (transcoder.well_formed ()); + EXPECT_FALSE (transcoder.partial ()); + + EXPECT_THAT (output, ElementsAreArray (encoded_char_v)); +} + +#if ICUBABY_HAVE_RANGES && ICUBABY_HAVE_CONCEPTS + +// NOLINTNEXTLINE +TYPED_TEST (Utf32, RangesCopy) { + auto& output = this->output_; + + std::vector const in{ + static_cast (code_point::cjk_unified_ideograph_2070e), + static_cast (code_point::code_point_ffff), + static_cast (code_point::cuneiform_sign_uru_times_ki), + static_cast (code_point::dollar_sign), + static_cast (code_point::hiragana_letter_go), + static_cast (code_point::hiragana_letter_ha), + static_cast (code_point::hiragana_letter_i), + static_cast (code_point::hiragana_letter_ma), + static_cast (code_point::hiragana_letter_o), + static_cast (code_point::hiragana_letter_su), + static_cast (code_point::hiragana_letter_u), + static_cast (code_point::hiragana_letter_yo), + static_cast (code_point::hiragana_letter_za), + static_cast (code_point::linear_b_syllable_b008_a), + static_cast (code_point::start_of_heading), + static_cast (code_point::start_of_text), + }; + + auto r = in | icubaby::ranges::transcode; + std::ranges::copy (r, std::back_inserter (output)); + + std::vector expected; + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + append (std::back_inserter (expected)); + EXPECT_THAT (output, ElementsAreArray (expected)); + EXPECT_TRUE (r.well_formed ()); +} +// NOLINTNEXTLINE +TYPED_TEST (Utf32, RangesBadInput) { + auto& output = this->output_; + std::vector const in{char32_t{0xFFFFFFFF}}; + auto const r = in | icubaby::ranges::transcode; + std::ranges::copy (r, std::back_inserter (output)); + EXPECT_THAT (output, ElementsAreArray (encoded_char_v)); + EXPECT_FALSE (r.well_formed ()); +} + +#endif // ICUBABY_HAVE_RANGES && ICUBABY_HAVE_CONCEPTS + +// NOLINTEND(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers) diff --git a/unittests/test_u32_8.cpp b/unittests/test_u32_8.cpp index a919d048..7c84d494 100644 --- a/unittests/test_u32_8.cpp +++ b/unittests/test_u32_8.cpp @@ -95,4 +95,14 @@ TEST (Utf32To8, RangesCopy) { )); // clang-format on } +// NOLINTNEXTLINE +TEST (Utf32To8, RangesBadInput) { + std::vector const in{char32_t{0xFFFFFFFF}}; + std::vector out8; + auto const r = in | icubaby::ranges::transcode; + std::ranges::copy (r, std::back_inserter (out8)); + EXPECT_THAT (out8, testing::ElementsAre (char8_t{0xEF}, char8_t{0xBF}, char8_t{0xBD})); + EXPECT_FALSE (r.well_formed ()); +} + #endif // __cpp_lib_ranges