Skip to content

Commit

Permalink
template param: code point type
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Dec 26, 2024
1 parent 6851e8d commit 64ea110
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 30 deletions.
44 changes: 22 additions & 22 deletions icu4c/source/common/unicode/utf16cppiter.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,30 +29,26 @@ namespace U_HEADER_ONLY_NAMESPACE {
//
// TODO: A possible alternative to an enum might be some kind of function template
// which would be fully customizable.
// The operator*() return value might then want to be a template parameter as well.
// For example, for a well-formed sequence, the return value could be
// a tuple of (code point, well-formed), or a string view, or...
// (And then the caller could choose between UChar32 and char32_t.)
// However, all of that would make the API more complex and daunting.
enum U16IllFormedBehavior {
U16_BEHAVIOR_NEGATIVE,
U16_BEHAVIOR_FFFD,
U16_BEHAVIOR_SURROGATE
};

// TODO: Consider a template parameter for UChar32 vs. char32_t vs. uint32_t.

/**
* A code unit sequence for one code point returned by U16Iterator.
* TODO: Share with UTF-8?
*
* TODO: check doxygen syntax for template parameters
* @param Unit16 char16_t or uint16_t or (on Windows) wchar_t
* @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
* @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
* should be signed if U16_BEHAVIOR_NEGATIVE
* @draft ICU 77
*/
template<typename Unit16>
template<typename Unit16, typename CP32>
struct U16OneSeq {
// Order of fields with padding and access frequency in mind.
UChar32 codePoint = 0;
CP32 codePoint = 0;
uint8_t length = 0;
bool isWellFormed = false;
const Unit16 *data;
Expand All @@ -61,18 +57,20 @@ struct U16OneSeq {
return std::basic_string_view<Unit16>(data, length);
}

// TODO: std::optional<UChar32> maybeCodePoint() const ? (nullopt if !isWellFormed)
// TODO: std::optional<CP32> maybeCodePoint() const ? (nullopt if !isWellFormed)
};

/**
* Validating iterator over the code points in a Unicode 16-bit string.
*
* TODO: check doxygen syntax for template parameters
* @param Unit16 char16_t or uint16_t or (on Windows) wchar_t
* @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
* @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
* should be signed if U16_BEHAVIOR_NEGATIVE
* @param U16IllFormedBehavior TODO
* @draft ICU 77
*/
template<typename Unit16, U16IllFormedBehavior behavior>
template<typename Unit16, typename CP32, U16IllFormedBehavior behavior>
class U16Iterator {
public:
// TODO: make private, make friends
Expand All @@ -86,10 +84,10 @@ class U16Iterator {
bool operator==(const U16Iterator &other) const { return p == other.p; }
bool operator!=(const U16Iterator &other) const { return !operator==(other); }

U16OneSeq<Unit16> operator*() const {
U16OneSeq<Unit16, CP32> operator*() const {
// TODO: assert p != limit -- more precisely: start <= p < limit
// Similar to U16_NEXT_OR_FFFD().
UChar32 c = *p;
CP32 c = *p;
if (!U16_IS_SURROGATE(c)) {
return {c, 1, true, p};
} else {
Expand Down Expand Up @@ -118,17 +116,21 @@ class U16Iterator {
// More similar to U16_NEXT_OR_FFFD() than U16_FWD_1() to try to help the compiler
// amortize work between operator*() and operator++(int) in typical *it++ usage.
// Otherwise this is slightly less efficient because it tests a lead surrogate twice.
UChar32 c = *p++;
CP32 c = *p++;
if (U16_IS_SURROGATE(c) &&
U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
++p;
}
return result;
}

// TODO: operator--()
// TODO: maybe fused readAndInc()?
// TODO: maybe fused decAndRead()?

private:
// Handle ill-formed UTF-16: One unpaired surrogate.
UChar32 sub(UChar32 surrogate) const {
CP32 sub(CP32 surrogate) const {
switch (behavior) {
case U16_BEHAVIOR_NEGATIVE: return U_SENTINEL;
case U16_BEHAVIOR_FFFD: return 0xfffd;
Expand All @@ -149,7 +151,7 @@ class U16Iterator {
* @return a code point iterator.
* @draft ICU 77
*/
template<typename Unit16, U16IllFormedBehavior behavior>
template<typename Unit16, typename CP32, U16IllFormedBehavior behavior>
class U16StringCodePoints {
public:
/**
Expand All @@ -162,12 +164,12 @@ class U16StringCodePoints {
U16StringCodePoints(const U16StringCodePoints &other) = default;

/** @draft ICU 77 */
U16Iterator<Unit16, behavior> begin() const {
U16Iterator<Unit16, CP32, behavior> begin() const {
return {s.data(), s.data(), s.data() + s.length()};
}

/** @draft ICU 77 */
U16Iterator<Unit16, behavior> end() const {
U16Iterator<Unit16, CP32, behavior> end() const {
const Unit16 *limit = s.data() + s.length();
return {s.data(), limit, limit};
}
Expand All @@ -183,8 +185,6 @@ class U16StringCodePoints {
// template<typename Unit16>
// class U16UnsafeIterator
// TODO: only p, no start, no limit
// TODO: can/should we read the code point only in operator*()?
// if we read it in the constructor, then we would still need start/limit...

} // namespace U_HEADER_ONLY_NAMESPACE

Expand Down
16 changes: 8 additions & 8 deletions icu4c/source/test/intltest/utfcppitertest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam
void U16IteratorTest::testGood() {
IcuTestErrorCode errorCode(*this, "testGood");
std::u16string_view good(u"abçカ🚴"sv);
U16StringCodePoints<char16_t, U16_BEHAVIOR_NEGATIVE> range(good);
U16StringCodePoints<char16_t, UChar32, U16_BEHAVIOR_NEGATIVE> range(good);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
++iter; // pre-increment
U16OneSeq<char16_t> seq = *iter;
auto seq = *iter;
assertEquals("iter[1] * codePoint", u'b', seq.codePoint);
assertEquals("iter[1] * length", 1, seq.length);
assertTrue("iter[1] * isWellFormed", seq.isWellFormed);
Expand All @@ -83,11 +83,11 @@ void U16IteratorTest::testNegative() {
IcuTestErrorCode errorCode(*this, "testNegative");
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
std::u16string_view bad(badChars, 5);
U16StringCodePoints<char16_t, U16_BEHAVIOR_NEGATIVE> range(bad);
U16StringCodePoints<char16_t, UChar32, U16_BEHAVIOR_NEGATIVE> range(bad);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
++iter; // pre-increment
U16OneSeq<char16_t> seq = *iter;
auto seq = *iter;
assertEquals("iter[1] * codePoint", -1, seq.codePoint);
assertEquals("iter[1] * length", 1, seq.length);
assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
Expand All @@ -107,11 +107,11 @@ void U16IteratorTest::testFFFD() {
IcuTestErrorCode errorCode(*this, "testFFFD");
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
std::u16string_view bad(badChars, 5);
U16StringCodePoints<char16_t, U16_BEHAVIOR_FFFD> range(bad);
U16StringCodePoints<char16_t, char32_t, U16_BEHAVIOR_FFFD> range(bad);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
++iter; // pre-increment
U16OneSeq<char16_t> seq = *iter;
auto seq = *iter;
assertEquals("iter[1] * codePoint", 0xfffd, seq.codePoint);
assertEquals("iter[1] * length", 1, seq.length);
assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
Expand All @@ -131,11 +131,11 @@ void U16IteratorTest::testSurrogate() {
IcuTestErrorCode errorCode(*this, "testSurrogate");
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
std::u16string_view bad(badChars, 5);
U16StringCodePoints<char16_t, U16_BEHAVIOR_SURROGATE> range(bad);
U16StringCodePoints<char16_t, uint32_t, U16_BEHAVIOR_SURROGATE> range(bad);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
++iter; // pre-increment
U16OneSeq<char16_t> seq = *iter;
auto seq = *iter;
assertEquals("iter[1] * codePoint", 0xd900, seq.codePoint);
assertEquals("iter[1] * length", 1, seq.length);
assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
Expand Down

0 comments on commit 64ea110

Please sign in to comment.