Skip to content

Commit

Permalink
ICU-22433 uppercase decomposed greek to decomposed greek and precompo…
Browse files Browse the repository at this point in the history
…sed greek to precomposed greek.
  • Loading branch information
eggrobin committed Aug 8, 2023
1 parent 720e574 commit 47e47ec
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 30 deletions.
15 changes: 10 additions & 5 deletions icu4c/source/common/ucasemap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -679,14 +679,18 @@ void toUpper(uint32_t options,
// Adding one only to the final vowel in a longer sequence
// (which does not occur in normal writing) would require lookahead.
// Set the same flag as for preserving an existing dialytika.
if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= HAS_DIALYTIKA;
if ((data & HAS_VOWEL) != 0 &&
(state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
: HAS_COMBINING_DIALYTIKA;
}
int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
if ((data & HAS_YPOGEGRAMMENI) != 0) {
numYpogegrammeni = 1;
}
const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
// Skip combining diacritics after this Greek letter.
int32_t nextNextIndex = nextIndex;
while (nextIndex < srcLength) {
Expand All @@ -704,7 +708,8 @@ void toUpper(uint32_t options,
}
}
if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
nextState |= AFTER_VOWEL_WITH_ACCENT;
nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
: AFTER_VOWEL_WITH_COMBINING_ACCENT;
}
// Map according to Greek rules.
UBool addTonos = false;
Expand All @@ -715,7 +720,7 @@ void toUpper(uint32_t options,
!isFollowedByCasedLetter(src, nextIndex, srcLength)) {
// Keep disjunctive "or" with (only) a tonos.
// We use the same "word boundary" conditions as for the Final_Sigma test.
if (i == nextIndex) {
if (hasPrecomposedAccent) {
upper = 0x389; // Preserve the precomposed form.
} else {
addTonos = true;
Expand Down
3 changes: 2 additions & 1 deletion icu4c/source/common/ucasemap_imp.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,8 @@ static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALY

// State bits.
static const uint32_t AFTER_CASED = 1;
static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
static const uint32_t AFTER_VOWEL_WITH_COMBINING_ACCENT = 2;
static const uint32_t AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT = 4;

uint32_t getLetterData(UChar32 c);

Expand Down
15 changes: 10 additions & 5 deletions icu4c/source/common/ustrcase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1130,14 +1130,18 @@ int32_t toUpper(uint32_t options,
// Adding one only to the final vowel in a longer sequence
// (which does not occur in normal writing) would require lookahead.
// Set the same flag as for preserving an existing dialytika.
if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= HAS_DIALYTIKA;
if ((data & HAS_VOWEL) != 0 &&
(state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) ? HAS_DIALYTIKA
: HAS_COMBINING_DIALYTIKA;
}
int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
if ((data & HAS_YPOGEGRAMMENI) != 0) {
numYpogegrammeni = 1;
}
const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
// Skip combining diacritics after this Greek letter.
while (nextIndex < srcLength) {
uint32_t diacriticData = getDiacriticData(src[nextIndex]);
Expand All @@ -1152,7 +1156,8 @@ int32_t toUpper(uint32_t options,
}
}
if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
nextState |= AFTER_VOWEL_WITH_ACCENT;
nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
: AFTER_VOWEL_WITH_COMBINING_ACCENT;
}
// Map according to Greek rules.
UBool addTonos = false;
Expand All @@ -1163,7 +1168,7 @@ int32_t toUpper(uint32_t options,
!isFollowedByCasedLetter(src, nextIndex, srcLength)) {
// Keep disjunctive "or" with (only) a tonos.
// We use the same "word boundary" conditions as for the Final_Sigma test.
if (i == nextIndex) {
if (hasPrecomposedAccent) {
upper = 0x389; // Preserve the precomposed form.
} else {
addTonos = true;
Expand Down
43 changes: 31 additions & 12 deletions icu4c/source/test/intltest/strcase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "unicode/ures.h"
#include "unicode/uloc.h"
#include "unicode/locid.h"
#include "unicode/normalizer2.h"
#include "unicode/ubrk.h"
#include "unicode/unistr.h"
#include "unicode/ucasemap.h"
Expand Down Expand Up @@ -74,6 +75,8 @@ class StringCaseTest: public IntlTest {

private:
void assertGreekUpper(const char16_t *s, const char16_t *expected);
void assertGreekUpperNormalized(const UnicodeString &s16, const UnicodeString &expected16,
const char *form);

Locale GREEK_LOCALE_;
};
Expand Down Expand Up @@ -806,16 +809,27 @@ StringCaseTest::TestFullCaseFoldingIterator() {
}
}

void
StringCaseTest::assertGreekUpper(const char16_t *s, const char16_t *expected) {
UnicodeString s16(s);
UnicodeString expected16(expected);
UnicodeString msg = UnicodeString("UnicodeString::toUpper/Greek(\"") + s16 + "\")";
void StringCaseTest::assertGreekUpper(const char16_t *s, const char16_t *expected) {
UErrorCode errorCode = U_ZERO_ERROR;
#if UCONFIG_NO_NORMALIZATION
assertGreekUpperNormalized(s, expected, "No normalization");
#else
const Normalizer2 &nfc = *Normalizer2::getNFCInstance(errorCode);
const Normalizer2 &nfd = *Normalizer2::getNFDInstance(errorCode);
assertGreekUpperNormalized(nfc.normalize(s, errorCode), nfc.normalize(expected, errorCode), "NFC");
assertGreekUpperNormalized(nfd.normalize(s, errorCode), nfd.normalize(expected, errorCode), "NFD");
#endif
}

void StringCaseTest::assertGreekUpperNormalized(const UnicodeString &s16,
const UnicodeString &expected16,
const char *form) {
UnicodeString msg = UnicodeString("UnicodeString::toUpper/Greek(\"") + s16 + "\" [" + form + "])";
UnicodeString result16(s16);
result16.toUpper(GREEK_LOCALE_);
assertEquals(msg, expected16, result16);

msg = UnicodeString("u_strToUpper/Greek(\"") + s16 + "\") cap=";
msg = UnicodeString("u_strToUpper/Greek(\"") + s16 + "\" [" + form + "]) cap=";
int32_t length = expected16.length();
int32_t capacities[] = {
// Keep in sync with the UTF-8 capacities near the bottom of this function.
Expand Down Expand Up @@ -849,7 +863,7 @@ StringCaseTest::assertGreekUpper(const char16_t *s, const char16_t *expected) {
assertSuccess("ucasemap_open", errorCode);
std::string s8;
s16.toUTF8String(s8);
msg = UnicodeString("ucasemap_utf8ToUpper/Greek(\"") + s16 + "\")";
msg = UnicodeString("ucasemap_utf8ToUpper/Greek(\"") + s16 + "\" [" + form + "])";
char dest8[1000];
length = ucasemap_utf8ToUpper(csm.getAlias(), dest8, UPRV_LENGTHOF(dest8),
s8.data(), static_cast<int32_t>(s8.length()), &errorCode);
Expand Down Expand Up @@ -901,22 +915,27 @@ StringCaseTest::TestGreekUpper() {
assertGreekUpper(u"ΰ, Τηρώ, Μάιος", u"Ϋ, ΤΗΡΩ, ΜΑΪΟΣ");
assertGreekUpper(u"άυλος", u"ΑΫΛΟΣ");
assertGreekUpper(u"ΑΫΛΟΣ", u"ΑΫΛΟΣ");
assertGreekUpper(u"Άκλιτα ρήματα ή άκλιτες μετοχές", u"ΑΚΛΙΤΑ ΡΗΜΑΤΑ Ή ΑΚΛΙΤΕΣ ΜΕΤΟΧΕΣ");
assertGreekUpper(u"Άκλιτα ρήματα ή άκλιτες μετοχές", u"ΑΚΛΙΤΑ ΡΗΜΑΤΑ Ή ΑΚΛΙΤΕΣ ΜΕΤΟΧΕΣ");
// http://www.unicode.org/udhr/d/udhr_ell_monotonic.html
assertGreekUpper(u"Επειδή η αναγνώριση της αξιοπρέπειας", u"ΕΠΕΙΔΗ Η ΑΝΑΓΝΩΡΙΣΗ ΤΗΣ ΑΞΙΟΠΡΕΠΕΙΑΣ");
assertGreekUpper(u"νομικού ή διεθνούς", u"ΝΟΜΙΚΟΥ Ή ΔΙΕΘΝΟΥΣ");
assertGreekUpper(u"νομικού ή διεθνούς", u"ΝΟΜΙΚΟΥ Ή ΔΙΕΘΝΟΥΣ");
// http://unicode.org/udhr/d/udhr_ell_polytonic.html
assertGreekUpper(u"Ἐπειδὴ ἡ ἀναγνώριση", u"ΕΠΕΙΔΗ Η ΑΝΑΓΝΩΡΙΣΗ");
assertGreekUpper(u"νομικοῦ ἢ διεθνοῦς", u"ΝΟΜΙΚΟΥ Ή ΔΙΕΘΝΟΥΣ");
assertGreekUpper(u"νομικοῦ ἢ διεθνοῦς", u"ΝΟΜΙΚΟΥ Ή ΔΙΕΘΝΟΥΣ");
// From Google bug report
assertGreekUpper(u"Νέο, Δημιουργία", u"ΝΕΟ, ΔΗΜΙΟΥΡΓΙΑ");
// http://crbug.com/234797
assertGreekUpper(u"Ελάτε να φάτε τα καλύτερα παϊδάκια!", u"ΕΛΑΤΕ ΝΑ ΦΑΤΕ ΤΑ ΚΑΛΥΤΕΡΑ ΠΑΪΔΑΚΙΑ!");
assertGreekUpper(u"Μαΐου, τρόλεϊ", u"ΜΑΪΟΥ, ΤΡΟΛΕΪ");
assertGreekUpper(u"Το ένα ή το άλλο.", u"ΤΟ ΕΝΑ Ή ΤΟ ΑΛΛΟ.");
assertGreekUpper(u"Το ένα ή το άλλο.", u"ΤΟ ΕΝΑ Ή ΤΟ ΑΛΛΟ.");
// http://multilingualtypesetting.co.uk/blog/greek-typesetting-tips/
assertGreekUpper(u"ρωμέικα", u"ΡΩΜΕΪΚΑ");
assertGreekUpper(u"ή.", u"Ή.");
assertGreekUpper(u"ή.", u"Ή.");

// The ὑπογεγραμμέναι become Ι as in default case conversion, but they are
// specially handled by the implementation.
assertGreekUpper(u"ᾠδή, -ήν, -ῆς, -ῇ", u"ΩΙΔΗ, -ΗΝ, -ΗΣ, -ΗΙ");
assertGreekUpper(u"ᾍδης", u"ΑΙΔΗΣ");
}

void StringCaseTest::TestArmenian() {
Expand Down
17 changes: 11 additions & 6 deletions icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -906,7 +906,8 @@ private static final class GreekUpper {

// State bits.
private static final int AFTER_CASED = 1;
private static final int AFTER_VOWEL_WITH_ACCENT = 2;
private static final int AFTER_VOWEL_WITH_COMBINING_ACCENT = 2;
private static final int AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT = 4;

// Data generated by prototype code, see
// https://icu.unicode.org/design/case/greek-upper
Expand Down Expand Up @@ -1416,14 +1417,17 @@ private static <A extends Appendable> A toUpper(int options,
// Adding one only to the final vowel in a longer sequence
// (which does not occur in normal writing) would require lookahead.
// Set the same flag as for preserving an existing dialytika.
if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
(upper == 'Ι' || upper == 'Υ')) {
data |= HAS_DIALYTIKA;
if ((data & HAS_VOWEL) != 0
&& (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) != 0
&& (upper == 'Ι' || upper == 'Υ')) {
data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
: HAS_COMBINING_DIALYTIKA;
}
int numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
if ((data & HAS_YPOGEGRAMMENI) != 0) {
numYpogegrammeni = 1;
}
final boolean hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
// Skip combining diacritics after this Greek letter.
while (nextIndex < src.length()) {
int diacriticData = getDiacriticData(src.charAt(nextIndex));
Expand All @@ -1438,7 +1442,8 @@ private static <A extends Appendable> A toUpper(int options,
}
}
if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
nextState |= AFTER_VOWEL_WITH_ACCENT;
nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
: AFTER_VOWEL_WITH_COMBINING_ACCENT;
}
// Map according to Greek rules.
boolean addTonos = false;
Expand All @@ -1449,7 +1454,7 @@ private static <A extends Appendable> A toUpper(int options,
!isFollowedByCasedLetter(src, nextIndex)) {
// Keep disjunctive "or" with (only) a tonos.
// We use the same "word boundary" conditions as for the Final_Sigma test.
if (i == nextIndex) {
if (hasPrecomposedAccent) {
upper = 'Ή'; // Preserve the precomposed form.
} else {
addTonos = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.Edits;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.ULocale;
Expand Down Expand Up @@ -837,7 +838,12 @@ else if (lowerTest.charAt(i)
}

/**
 * Asserts that uppercasing {@code s} under GREEK_LOCALE_ yields {@code expected}.
 *
 * The NFC and NFD checks normalize both the input and the expected string with the same
 * normalization form before comparing, so each form of input is compared against output
 * in that same form.
 *
 * @param s        the string to uppercase
 * @param expected the expected uppercased result
 */
private void assertGreekUpper(String s, String expected) {
assertEquals("toUpper/Greek(" + s + ')', expected, UCharacter.toUpperCase(GREEK_LOCALE_, s));
Normalizer2 nfc = Normalizer2.getNFCInstance();
Normalizer2 nfd = Normalizer2.getNFDInstance();
// NFC-normalized input must uppercase to the NFC-normalized expectation.
assertEquals("toUpper/Greek(" + nfc.normalize(s) + " [NFC])", nfc.normalize(expected),
UCharacter.toUpperCase(GREEK_LOCALE_, nfc.normalize(s)));
// NFD-normalized input must uppercase to the NFD-normalized expectation.
assertEquals("toUpper/Greek(" + nfd.normalize(s) + " [NFD])", nfd.normalize(expected),
UCharacter.toUpperCase(GREEK_LOCALE_, nfd.normalize(s)));
}

@Test
Expand Down Expand Up @@ -868,6 +874,11 @@ public void TestGreekUpper() {
// http://multilingualtypesetting.co.uk/blog/greek-typesetting-tips/
assertGreekUpper("ρωμέικα", "ΡΩΜΕΪΚΑ");
assertGreekUpper("ή.", "Ή.");

// The ὑπογεγραμμέναι become Ι as in default case conversion, but they are
// specially handled by the implementation.
assertGreekUpper("ᾠδή, -ήν, -ῆς, -ῇ", "ΩΙΔΗ, -ΗΝ, -ΗΣ, -ΗΙ");
assertGreekUpper("ᾍδης", "ΑΙΔΗΣ");
}

@Test
Expand Down

0 comments on commit 47e47ec

Please sign in to comment.