Skip to content

Commit

Permalink
Fix 32->16 surrogate handling. Fix range adaptor partial input.
Browse files Browse the repository at this point in the history
There was a bug in the handling of surrogate values in the
UTF-32->UTF-16 conversion. The code dealt with the <= 0xFFFF
case *before* considering the surrogate case (the high/low surrogate
values are <= 0xFFFF)!

The range adaptor had a bug in the handling of an incomplete code-unit
sequence at the end of the input: this could result in a code-unit of
zero being produced. We now consider the end-of-input case after
consuming the input bytes to avoid this problem. Some additional
assertions were also added.

Added new, more thorough, testing for the UTF-32 to 8/16/32 cases.
  • Loading branch information
paulhuggett committed Jan 14, 2024
1 parent a3ee81c commit b245464
Show file tree
Hide file tree
Showing 4 changed files with 280 additions and 16 deletions.
33 changes: 17 additions & 16 deletions include/icubaby/icubaby.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -697,11 +697,11 @@ template <> class transcoder<char32_t, char16_t> {
template <typename OutputIterator>
ICUBABY_REQUIRES ((std::output_iterator<OutputIterator, output_type>))
OutputIterator operator() (input_type code_point, OutputIterator dest) {
if (code_point <= 0xFFFF) {
*(dest++) = static_cast<output_type> (code_point);
} else if (is_surrogate (code_point) || code_point > max_code_point) {
if (is_surrogate (code_point) || code_point > max_code_point) {
dest = (*this) (replacement_char, dest);
well_formed_ = false;
} else if (code_point <= 0xFFFF) {
*(dest++) = static_cast<output_type> (code_point);
} else {
*(dest++) = static_cast<output_type> (0xD7C0U + (code_point >> 10U));
*(dest++) = static_cast<output_type> (first_low_surrogate + (code_point & 0x3FFU));
Expand Down Expand Up @@ -1034,7 +1034,7 @@ class transcode_view<FromEncoding, ToEncoding, View>::iterator {
using difference_type = std::ranges::range_difference_t<View>;

iterator () requires std::default_initializable<std::ranges::iterator_t<View>> = default;
constexpr iterator (transcode_view const& parent, std::ranges::iterator_t<View> current)
constexpr iterator (transcode_view const& parent, std::ranges::iterator_t<View> const& current)
: current_{current}, parent_{&parent}, state_{current} {
assert (state_.empty ());
// Prime the input state so that a dereference of the iterator will yield the first of the
Expand All @@ -1045,9 +1045,7 @@ class transcode_view<FromEncoding, ToEncoding, View>::iterator {
constexpr std::ranges::iterator_t<View> const& base () const& noexcept { return current_; }
constexpr std::ranges::iterator_t<View> base () && { return std::move (current_); }

constexpr value_type const& operator* () const {
return state_.empty () ? replacement : state_.front ();
}
constexpr value_type const& operator* () const { return state_.front (); }
constexpr std::ranges::iterator_t<View> operator->() const { return state_.front (); }

constexpr iterator& operator++ () {
Expand Down Expand Up @@ -1095,7 +1093,10 @@ class transcode_view<FromEncoding, ToEncoding, View>::iterator {
constexpr state () : state{std::ranges::iterator_t<View>{}} {}

[[nodiscard]] constexpr bool empty () const noexcept { return valid_.empty (); }
[[nodiscard]] constexpr auto& front () const noexcept { return valid_.front (); }
[[nodiscard]] constexpr auto& front () const noexcept {
assert (!valid_.empty ());
return valid_.front ();
}
constexpr void advance () noexcept { valid_.advance (1); }

/// Consumes enough code-units from the base iterator to form a single code-point. The resulting
Expand Down Expand Up @@ -1134,16 +1135,16 @@ constexpr std::ranges::iterator_t<View> transcode_view<FromEncoding, ToEncoding,

auto it = out_.begin ();

if (auto const input_end = std::ranges::end (parent->base_); next_ == input_end) {
auto const input_end = std::ranges::end (parent->base_);
// Loop until we've produced a code-point's worth of code-units in the out
// container or we've run out of input.
while (it == out_.begin () && next_ != input_end) {
it = transcoder_ (*next_, it);
++next_;
}
if (next_ == input_end) {
// We've consumed the entire input so tell the transcoder and get any final output.
it = transcoder_.end_cp (it);
} else {
// Loop until we've produced a code-point's worth of code-units in the out
// container or we've run out of input.
while (it == out_.begin () && next_ != input_end) {
it = transcoder_ (*next_, it);
++next_;
}
}
assert (it >= out_.begin () && it <= out_.end ());
parent->well_formed_ = transcoder_.well_formed ();
Expand Down
1 change: 1 addition & 0 deletions unittests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ add_executable (icubaby-unittests
harness.cpp
test_u8_32.cpp
test_u16.cpp
test_u32.cpp
test_u32_8.cpp
test_utility.cpp
typed_test.hpp
Expand Down
252 changes: 252 additions & 0 deletions unittests/test_u32.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
// MIT License
//
// Copyright (c) 2022-2024 Paul Bowen-Huggett
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include <algorithm>
#include <array>
#include <cstdint>
#include <iterator>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

// icubaby itself.
#include "icubaby/icubaby.hpp"

// Google Test/Mock
#include "gmock/gmock.h"
#include "gtest/gtest.h"

// Local includes
#include "encoded_char.hpp"
#include "typed_test.hpp"

// Compile-time checks on the three transcoder aliases exercised by this file:
// each must take char32_t as its input type and produce the output code-unit
// type asserted below.

// t32_8: char32_t in, icubaby::char8 out.
static_assert (std::is_same_v<icubaby::t32_8 ::input_type, char32_t> &&
std::is_same_v<icubaby::t32_8 ::output_type, icubaby::char8>);
// t32_16: char32_t in, char16_t out.
static_assert (std::is_same_v<icubaby::t32_16::input_type, char32_t> &&
std::is_same_v<icubaby::t32_16::output_type, char16_t>);
// t32_32: char32_t in, char32_t out. Both operands compare against char32_t,
// which is presumably why the redundant-expression check is suppressed here.
// NOLINTNEXTLINE(misc-redundant-expression)
static_assert (std::is_same_v<icubaby::t32_32::input_type, char32_t> &&
std::is_same_v<icubaby::t32_32::output_type, char32_t>);

using namespace std::string_literals;
using testing::ElementsAreArray;

// NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)

namespace {

/// Fixture shared by every test in the Utf32 suite. It is instantiated once per
/// output code-unit type T and supplies a char32_t-to-T transcoder together
/// with a container that collects the code units the transcoder emits.
template <typename T> struct Utf32 : testing::Test {
protected:
  using output_type = T;

  /// Collects the code units written by the transcoder under test.
  std::vector<output_type> output_;
  /// The transcoder (char32_t input, output_type output) being exercised.
  icubaby::transcoder<char32_t, output_type> transcoder_;
};

}  // end anonymous namespace

// Instantiate the suite for each type in OutputTypes, using OutputTypeNames to
// label the instantiations (both presumably come from typed_test.hpp).
TYPED_TEST_SUITE (Utf32, OutputTypes, OutputTypeNames);
// A single dollar-sign code point must be accepted as well-formed input, must
// never leave the transcoder in a partial state, and must yield exactly the
// dollar sign's encoding in the output type.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, GoodDollarSign) {
  auto& output = this->output_;
  auto& transcoder = this->transcoder_;
  auto const dollar = static_cast<char32_t> (code_point::dollar_sign);

  // State before any input has been supplied.
  EXPECT_TRUE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // State after the one and only code point.
  auto out_it = transcoder (dollar, std::back_inserter (output));
  EXPECT_TRUE (transcoder.well_formed ()) << "input should be well formed";
  EXPECT_FALSE (transcoder.partial ()) << "there were no surrogate code units";

  // State after signalling the end of the input.
  transcoder.end_cp (out_it);
  EXPECT_TRUE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  EXPECT_THAT (output, ElementsAreArray (encoded_char_v<code_point::dollar_sign, TypeParam>));
}
// Two code points fed one after the other (start-of-heading, then
// start-of-text) must each be accepted as well-formed, complete input. The
// output must equal the first encoding followed by the second.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, StartOfHeadingAndText) {
  auto& output = this->output_;
  auto& transcoder = this->transcoder_;
  auto const heading = static_cast<char32_t> (code_point::start_of_heading);
  auto const text = static_cast<char32_t> (code_point::start_of_text);

  // First code point.
  auto out_it = transcoder (heading, std::back_inserter (output));
  EXPECT_TRUE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // Second code point, continuing from where the first one finished writing.
  out_it = transcoder (text, out_it);
  EXPECT_TRUE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // End of input.
  transcoder.end_cp (out_it);
  EXPECT_TRUE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // Build the expected sequence by appending the two encodings in input order.
  std::vector<TypeParam> expected;
  append<code_point::start_of_heading, TypeParam> (std::back_inserter (expected));
  append<code_point::start_of_text, TypeParam> (std::back_inserter (expected));
  EXPECT_THAT (output, ElementsAreArray (expected));
}
// The code point named code_point_ffff must be transcoded as ordinary
// well-formed input: no error is flagged, no partial state is left behind, and
// the output is that code point's encoding in the output type.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, CharFFFF) {
  auto& output = this->output_;
  auto& transcoder = this->transcoder_;
  auto const ffff = static_cast<char32_t> (code_point::code_point_ffff);

  auto out_it = transcoder (ffff, std::back_inserter (output));
  EXPECT_TRUE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // Ending the input must leave the state as it was.
  transcoder.end_cp (out_it);
  EXPECT_TRUE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  EXPECT_THAT (output, ElementsAreArray (encoded_char_v<code_point::code_point_ffff, TypeParam>));
}
// Supplying icubaby::first_high_surrogate as a UTF-32 code point is an error:
// the transcoder must report ill-formed input (without becoming partial) and
// emit the encoding of the replacement character in its place.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, FirstHighSurrogate) {
  auto& output = this->output_;
  auto& transcoder = this->transcoder_;
  auto const surrogate = icubaby::first_high_surrogate;

  auto out_it = transcoder (surrogate, std::back_inserter (output));
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // The error must still be reported once the input has ended.
  transcoder.end_cp (out_it);
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  EXPECT_THAT (output, ElementsAreArray (encoded_char_v<code_point::replacement_char, TypeParam>));
}
// Supplying icubaby::last_high_surrogate as a UTF-32 code point is an error:
// the transcoder must report ill-formed input (without becoming partial) and
// emit the encoding of the replacement character in its place.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, LastHighSurrogate) {
  auto& output = this->output_;
  auto& transcoder = this->transcoder_;
  auto const surrogate = icubaby::last_high_surrogate;

  auto out_it = transcoder (surrogate, std::back_inserter (output));
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // The error must still be reported once the input has ended.
  transcoder.end_cp (out_it);
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  EXPECT_THAT (output, ElementsAreArray (encoded_char_v<code_point::replacement_char, TypeParam>));
}
// Supplying icubaby::first_low_surrogate as a UTF-32 code point is an error:
// the transcoder must report ill-formed input (without becoming partial) and
// emit the encoding of the replacement character in its place.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, FirstLowSurrogate) {
  auto& output = this->output_;
  auto& transcoder = this->transcoder_;
  auto const surrogate = icubaby::first_low_surrogate;

  auto out_it = transcoder (surrogate, std::back_inserter (output));
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // The error must still be reported once the input has ended.
  transcoder.end_cp (out_it);
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  EXPECT_THAT (output, ElementsAreArray (encoded_char_v<code_point::replacement_char, TypeParam>));
}
// Supplying icubaby::last_low_surrogate as a UTF-32 code point is an error:
// the transcoder must report ill-formed input (without becoming partial) and
// emit the encoding of the replacement character in its place.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, LastLowSurrogate) {
  auto& output = this->output_;
  auto& transcoder = this->transcoder_;
  auto const surrogate = icubaby::last_low_surrogate;

  auto out_it = transcoder (surrogate, std::back_inserter (output));
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // The error must still be reported once the input has ended.
  transcoder.end_cp (out_it);
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  EXPECT_THAT (output, ElementsAreArray (encoded_char_v<code_point::replacement_char, TypeParam>));
}
// icubaby::max_code_point is still valid input: the transcoder must stay
// well-formed and non-partial, and must emit the encoding of
// code_point::last_valid_code_point.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, MaxCodePoint) {
  auto& output = this->output_;
  auto& transcoder = this->transcoder_;
  auto const largest = icubaby::max_code_point;

  auto out_it = transcoder (largest, std::back_inserter (output));
  EXPECT_TRUE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // Ending the input must leave the state as it was.
  transcoder.end_cp (out_it);
  EXPECT_TRUE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  EXPECT_THAT (output, ElementsAreArray (encoded_char_v<code_point::last_valid_code_point, TypeParam>));
}
// The value one greater than icubaby::max_code_point is not valid input: the
// transcoder must report ill-formed input (without becoming partial) and emit
// the encoding of the replacement character in its place.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, BeyondMaxCodePoint) {
  auto& output = this->output_;
  auto& transcoder = this->transcoder_;

  // Do the increment in an unsigned integer type, then convert back to char32_t.
  auto const largest = static_cast<std::uint_least32_t> (icubaby::max_code_point);
  auto const too_large = static_cast<char32_t> (largest + 1U);

  auto out_it = transcoder (too_large, std::back_inserter (output));
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  // The error must still be reported once the input has ended.
  transcoder.end_cp (out_it);
  EXPECT_FALSE (transcoder.well_formed ());
  EXPECT_FALSE (transcoder.partial ());

  EXPECT_THAT (output, ElementsAreArray (encoded_char_v<code_point::replacement_char, TypeParam>));
}

#if ICUBABY_HAVE_RANGES && ICUBABY_HAVE_CONCEPTS

// Pipes a sequence of sixteen code points through the icubaby::ranges::transcode
// adaptor and copies the result into the fixture's output container. The output
// must equal the concatenation of each code point's encoding, in input order,
// and the view must report that the input was well formed.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, RangesCopy) {
  auto& output = this->output_;

  // The expected code units: one encoded code point after another.
  std::vector<TypeParam> expected;
  append<code_point::cjk_unified_ideograph_2070e, TypeParam> (std::back_inserter (expected));
  append<code_point::code_point_ffff, TypeParam> (std::back_inserter (expected));
  append<code_point::cuneiform_sign_uru_times_ki, TypeParam> (std::back_inserter (expected));
  append<code_point::dollar_sign, TypeParam> (std::back_inserter (expected));
  append<code_point::hiragana_letter_go, TypeParam> (std::back_inserter (expected));
  append<code_point::hiragana_letter_ha, TypeParam> (std::back_inserter (expected));
  append<code_point::hiragana_letter_i, TypeParam> (std::back_inserter (expected));
  append<code_point::hiragana_letter_ma, TypeParam> (std::back_inserter (expected));
  append<code_point::hiragana_letter_o, TypeParam> (std::back_inserter (expected));
  append<code_point::hiragana_letter_su, TypeParam> (std::back_inserter (expected));
  append<code_point::hiragana_letter_u, TypeParam> (std::back_inserter (expected));
  append<code_point::hiragana_letter_yo, TypeParam> (std::back_inserter (expected));
  append<code_point::hiragana_letter_za, TypeParam> (std::back_inserter (expected));
  append<code_point::linear_b_syllable_b008_a, TypeParam> (std::back_inserter (expected));
  append<code_point::start_of_heading, TypeParam> (std::back_inserter (expected));
  append<code_point::start_of_text, TypeParam> (std::back_inserter (expected));

  // The same code points, in the same order, as UTF-32 input.
  std::vector<char32_t> const input{
      static_cast<char32_t> (code_point::cjk_unified_ideograph_2070e),
      static_cast<char32_t> (code_point::code_point_ffff),
      static_cast<char32_t> (code_point::cuneiform_sign_uru_times_ki),
      static_cast<char32_t> (code_point::dollar_sign),
      static_cast<char32_t> (code_point::hiragana_letter_go),
      static_cast<char32_t> (code_point::hiragana_letter_ha),
      static_cast<char32_t> (code_point::hiragana_letter_i),
      static_cast<char32_t> (code_point::hiragana_letter_ma),
      static_cast<char32_t> (code_point::hiragana_letter_o),
      static_cast<char32_t> (code_point::hiragana_letter_su),
      static_cast<char32_t> (code_point::hiragana_letter_u),
      static_cast<char32_t> (code_point::hiragana_letter_yo),
      static_cast<char32_t> (code_point::hiragana_letter_za),
      static_cast<char32_t> (code_point::linear_b_syllable_b008_a),
      static_cast<char32_t> (code_point::start_of_heading),
      static_cast<char32_t> (code_point::start_of_text),
  };

  auto r = input | icubaby::ranges::transcode<char32_t, TypeParam>;
  std::ranges::copy (r, std::back_inserter (output));

  EXPECT_THAT (output, ElementsAreArray (expected));
  EXPECT_TRUE (r.well_formed ());
}
// Pipes the single invalid value 0xFFFFFFFF through the ranges adaptor. The
// output must be the encoding of the replacement character, and the view must
// report that the input was not well formed.
// NOLINTNEXTLINE
TYPED_TEST (Utf32, RangesBadInput) {
  auto& output = this->output_;

  std::vector<char32_t> const input{char32_t{0xFFFFFFFF}};
  auto const r = input | icubaby::ranges::transcode<char32_t, TypeParam>;
  std::ranges::copy (r, std::back_inserter (output));

  EXPECT_THAT (output, ElementsAreArray (encoded_char_v<code_point::replacement_char, TypeParam>));
  EXPECT_FALSE (r.well_formed ());
}

#endif // ICUBABY_HAVE_RANGES && ICUBABY_HAVE_CONCEPTS

// NOLINTEND(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
10 changes: 10 additions & 0 deletions unittests/test_u32_8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,14 @@ TEST (Utf32To8, RangesCopy) {
));
// clang-format on
}
// Pipes the single invalid value 0xFFFFFFFF through the char32_t-to-char8_t
// ranges adaptor. The output must be the three code units 0xEF 0xBF 0xBD, and
// the view must report that the input was not well formed.
// NOLINTNEXTLINE
TEST (Utf32To8, RangesBadInput) {
  std::vector<char8_t> out8;

  std::vector<char32_t> const input{char32_t{0xFFFFFFFF}};
  auto const r = input | icubaby::ranges::transcode<char32_t, char8_t>;
  std::ranges::copy (r, std::back_inserter (out8));

  EXPECT_THAT (out8, testing::ElementsAre (char8_t{0xEF}, char8_t{0xBF}, char8_t{0xBD}));
  EXPECT_FALSE (r.well_formed ());
}

#endif // __cpp_lib_ranges

0 comments on commit b245464

Please sign in to comment.