unicode-org · srl295 · Jun 7, 2024 · Jun 6, 2024
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java
@@ -154,6 +154,9 @@ public String codePointToEscaped() {
 
     /** Returns a code point from the escaped form <b>of a single code point</b> */
     public static int escapedToCodePoint(String value) {
+        if (value == null || value.isEmpty()) {
+            return 0xFFFD;
+        }
         if (value.codePointAt(0) != CodePointEscaper.ESCAPE_START
                 || value.codePointAt(value.length() - 1) != CodePointEscaper.ESCAPE_END) {
             throw new IllegalArgumentException(
@@ -177,6 +180,9 @@ public static String toEscaped(String unescaped) {
 
     /** Returns the escaped form from a string */
     public static String toEscaped(String unescaped, UnicodeSet toEscape) {
+        if (unescaped == null) {
+            return null;
+        }
         StringBuilder result = new StringBuilder();
         unescaped
                 .codePoints()
@@ -191,25 +197,30 @@ public static String toEscaped(String unescaped, UnicodeSet toEscape) {
         return result.toString();
     }
     /** Return unescaped string */
-    public static String toUnescaped(String value) {
+    public static String toUnescaped(String escaped) {
+        if (escaped == null) {
+            return null;
+        }
         StringBuilder result = null;
         int donePart = 0;
-        int found = value.indexOf(ESCAPE_START);
+        int found = escaped.indexOf(ESCAPE_START);
         while (found >= 0) {
-            int foundEnd = value.indexOf(ESCAPE_END, found);
+            int foundEnd = escaped.indexOf(ESCAPE_END, found);
             if (foundEnd < 0) {
                 throw new IllegalArgumentException(
                         "Malformed escaped string, missing: " + ESCAPE_END);
             }
             if (result == null) {
                 result = new StringBuilder();
             }
-            result.append(value, donePart, found);
+            result.append(escaped, donePart, found);
             donePart = ++foundEnd;
-            result.appendCodePoint(escapedToCodePoint(value.substring(found, foundEnd)));
-            found = value.indexOf(ESCAPE_START, foundEnd);
+            result.appendCodePoint(escapedToCodePoint(escaped.substring(found, foundEnd)));
+            found = escaped.indexOf(ESCAPE_START, foundEnd);
         }
-        return donePart == 0 ? value : result.append(value, donePart, value.length()).toString();
+        return donePart == 0
+                ? escaped
+                : result.append(escaped, donePart, escaped.length()).toString();
     }
 
     private static final String HAS_NAME = " ≡ ";
@@ -232,6 +243,9 @@ public static String toExample(int codePoint) {
      * brackets</b>
      */
     public static int rawEscapedToCodePoint(CharSequence value) {
+        if (value == null || value.length() == 0) {
+            return 0xFFFD;
+        }
         try {
             return valueOf(value.toString().toUpperCase(Locale.ROOT)).codePoint;
         } catch (Exception e) {

diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SimpleUnicodeSetFormatter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SimpleUnicodeSetFormatter.java
@@ -4,7 +4,6 @@
 import com.ibm.icu.lang.CharSequences;
 import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.Normalizer2;
-import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.util.ULocale;
 import java.util.ArrayList;
@@ -41,15 +40,25 @@ public class SimpleUnicodeSetFormatter implements FormatterParser<UnicodeSet> {
     public static Normalizer2 nfc = Normalizer2.getNFCInstance();
 
     public static final Comparator<String> BASIC_COLLATOR =
-            (Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL);
+            (Comparator) CLDRConfig.getInstance().getCollator();
 
-    private static final int DEFAULT_MAX_DISALLOW_RANGES = 199;
+    public static final int DEFAULT_RANGES_ABOVE = 1024;
 
     private final Comparator<String> comparator;
     private final UnicodeSet forceHex;
-    private final int maxDisallowRanges;
-    private final UTF16.StringComparator codepointComparator =
-            new UTF16.StringComparator(true, false, 0);
+    private final int useRangesAbove;
+
+    public Comparator<String> getComparator() {
+        return comparator;
+    }
+
+    public UnicodeSet getToEscape() {
+        return forceHex;
+    }
+
+    public int getUseRangesAbove() {
+        return useRangesAbove;
+    }
 
     /**
      * Create a simple formatter, with a comparator for the ordering and a UnicodeSet of characters
@@ -58,38 +67,43 @@ public class SimpleUnicodeSetFormatter implements FormatterParser<UnicodeSet> {
      * @param col — collator. The default is BASIC_COLLATOR, which is the root collator.
      * @param forceHex - UnicodeSet to force to be hex. It will be frozen if not already. Warning:
      *     may not round-trip unless it includes all of CodePointEscaper.getNamedEscapes()
-     * @param maxDisallowRanges — under this number, there will be no ranges; at or above there may
-     *     be ranges, and the collator will be disregarded.
+     * @param useRangesAbove — at or below this set size, there will be no ranges; above it there
+     *     may be ranges, and the collator will be disregarded. Negative means the default.
      */
     public SimpleUnicodeSetFormatter(
-            Comparator<String> col, UnicodeSet forceHex, int maxDisallowRanges) {
+            Comparator<String> col, UnicodeSet forceHex, int useRangesAbove) {
         // collate, but preserve non-equivalents
-        this.comparator = col == null ? COLLATOR : ComparatorUtilities.wrapForCodePoints(col);
+        this.comparator = col == null ? BASIC_COLLATOR : ComparatorUtilities.wrapForCodePoints(col);
         this.forceHex = forceHex == null ? CodePointEscaper.FORCE_ESCAPE : forceHex.freeze();
-        this.maxDisallowRanges = maxDisallowRanges;
+        this.useRangesAbove = useRangesAbove < 0 ? DEFAULT_RANGES_ABOVE : useRangesAbove;
     }
 
-    static final int DEFAULT_MAX = 1024;
-    public static final Comparator<String> COLLATOR =
-            (Comparator) CLDRConfig.getInstance().getCollator();
-
-    public static SimpleUnicodeSetFormatter fromIcuLocale(String localeId) {
-        return new SimpleUnicodeSetFormatter(COLLATOR, null, DEFAULT_MAX);
+    public static Comparator<String> getComparatorForLocale(String localeId) {
+        Comparator<String> collator = BASIC_COLLATOR;
+        try {
+            if (localeId != null) {
+                ICUServiceBuilder isb =
+                        ICUServiceBuilder.forLocale(CLDRLocale.getInstance(localeId));
+                collator = (Comparator) isb.getRuleBasedCollator();
+            }
+        } catch (Exception e) { // deliberate best effort: on any failure keep BASIC_COLLATOR
+        }
+        return collator;
     }
 
     public SimpleUnicodeSetFormatter(Comparator<String> col, UnicodeSet forceHex) {
-        this(col, forceHex, DEFAULT_MAX_DISALLOW_RANGES);
+        this(col, forceHex, DEFAULT_RANGES_ABOVE);
     }
 
     public SimpleUnicodeSetFormatter(Comparator<String> col) {
-        this(col, null, DEFAULT_MAX);
+        this(col, null, DEFAULT_RANGES_ABOVE);
     }
 
     public SimpleUnicodeSetFormatter() {
         this(
                 (Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL),
                 null,
-                DEFAULT_MAX);
+                DEFAULT_RANGES_ABOVE);
     }
 
     static class Lazy {
@@ -115,7 +129,7 @@ public static UnicodeSet parseLenient(String source) {
 
     @Override
     public String format(UnicodeSet input) {
-        final boolean allowRanges = input.size() > maxDisallowRanges;
+        final boolean allowRanges = input.size() > useRangesAbove;
         StringBuilder result = new StringBuilder();
         Collection<String> sorted =
                 input.addAllTo(allowRanges ? new ArrayList<>() : new TreeSet<>(comparator));

diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/UnicodeSetPrettyPrinterTest.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/UnicodeSetPrettyPrinterTest.java
@@ -14,6 +14,7 @@
 import com.ibm.icu.util.ULocale;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.regex.Matcher;
@@ -85,14 +86,18 @@ public void testSimpleUnicodeSetFormatter() {
                 "[A Á B C {CS} D {DZ} {DZS} E É F G {GY} H I Í J K L {LY} M N {NY} O Ó Ö Ő P Q R S {SZ} T {TY} U Ú Ü Ű V W X Y Z {ZS}]",
                 "A Á B C CS D DZ DZS E É F G GY H I Í J K L LY M N NY O Ó Ö Ő P Q R S SZ T TY U Ú Ü Ű V W X Y Z ZS"
             },
+            {
+                "[:block=Hangul_Jamo:]",
+                "ᄀ ᄁ ᄂ ᄃ ᄄ ᄅ ᄆ ᄇ ᄈ ᄉ ᄊ ᄋ ᄌ ᄍ ᄎ ᄏ ᄐ ᄑ ᄒ ᄓ ᄔ ᄕ ᄖ ᄗ ᄘ ᄙ ᄚ ᄛ ᄜ ᄝ ᄞ ᄟ ᄠ ᄡ ᄢ ᄣ ᄤ ᄥ ᄦ ᄧ ᄨ ᄩ ᄪ ᄫ ᄬ ᄭ ᄮ ᄯ ᄰ ᄱ ᄲ ᄳ ᄴ ᄵ ᄶ ᄷ ᄸ ᄹ ᄺ ᄻ ᄼ ᄽ ᄾ ᄿ ᅀ ᅁ ᅂ ᅃ ᅄ ᅅ ᅆ ᅇ ᅈ ᅉ ᅊ ᅋ ᅌ ᅍ ᅎ ᅏ ᅐ ᅑ ᅒ ᅓ ᅔ ᅕ ᅖ ᅗ ᅘ ᅙ ᅚ ᅛ ᅜ ᅝ ᅞ ❰115F❱ ❰1160❱ ᅡ ᅢ ᅣ ᅤ ᅥ ᅦ ᅧ ᅨ ᅩ ᅪ ᅫ ᅬ ᅭ ᅮ ᅯ ᅰ ᅱ ᅲ ᅳ ᅴ ᅵ ᅶ ᅷ ᅸ ᅹ ᅺ ᅻ ᅼ ᅽ ᅾ ᅿ ᆀ ᆁ ᆂ ᆃ ᆄ ᆅ ᆆ ᆇ ᆈ ᆉ ᆊ ᆋ ᆌ ᆍ ᆎ ᆏ ᆐ ᆑ ᆒ ᆓ ᆔ ᆕ ᆖ ᆗ ᆘ ᆙ ᆚ ᆛ ᆜ ᆝ ᆞ ᆟ ᆠ ᆡ ᆢ ᆣ ᆤ ᆥ ᆦ ᆧ ᆨ ᆩ ᆪ ᆫ ᆬ ᆭ ᆮ ᆯ ᆰ ᆱ ᆲ ᆳ ᆴ ᆵ ᆶ ᆷ ᆸ ᆹ ᆺ ᆻ ᆼ ᆽ ᆾ ᆿ ᇀ ᇁ ᇂ ᇃ ᇄ ᇅ ᇆ ᇇ ᇈ ᇉ ᇊ ᇋ ᇌ ᇍ ᇎ ᇏ ᇐ ᇑ ᇒ ᇓ ᇔ ᇕ ᇖ ᇗ ᇘ ᇙ ᇚ ᇛ ᇜ ᇝ ᇞ ᇟ ᇠ ᇡ ᇢ ᇣ ᇤ ᇥ ᇦ ᇧ ᇨ ᇩ ᇪ ᇫ ᇬ ᇭ ᇮ ᇯ ᇰ ᇱ ᇲ ᇳ ᇴ ᇵ ᇶ ᇷ ᇸ ᇹ ᇺ ᇻ ᇼ ᇽ ᇾ ᇿ"
+            },
+            {"USE_RANGES_ABOVE", "100"},
             {"[:block=Hangul_Jamo:]", "ᄀ➖ᇿ"},
-            {"[:block=CJK_Unified_Ideographs:]", "一➖鿿"},
-            {"LOCALE", "no"},
-            {"[ĂÅ z]", "Ă z Å"}, // Ensure that order is according to the locale
+            {"USE_RANGES_ABOVE", null},
             {
-                "[ÅÅ]", "Å Å"
-            }, // Ensure it doesn't merge two different characters with same NFC, even though a
-            // collator is used
+                "[:block=Hangul_Jamo:]",
+                "ᄀ ᄁ ᄂ ᄃ ᄄ ᄅ ᄆ ᄇ ᄈ ᄉ ᄊ ᄋ ᄌ ᄍ ᄎ ᄏ ᄐ ᄑ ᄒ ᄓ ᄔ ᄕ ᄖ ᄗ ᄘ ᄙ ᄚ ᄛ ᄜ ᄝ ᄞ ᄟ ᄠ ᄡ ᄢ ᄣ ᄤ ᄥ ᄦ ᄧ ᄨ ᄩ ᄪ ᄫ ᄬ ᄭ ᄮ ᄯ ᄰ ᄱ ᄲ ᄳ ᄴ ᄵ ᄶ ᄷ ᄸ ᄹ ᄺ ᄻ ᄼ ᄽ ᄾ ᄿ ᅀ ᅁ ᅂ ᅃ ᅄ ᅅ ᅆ ᅇ ᅈ ᅉ ᅊ ᅋ ᅌ ᅍ ᅎ ᅏ ᅐ ᅑ ᅒ ᅓ ᅔ ᅕ ᅖ ᅗ ᅘ ᅙ ᅚ ᅛ ᅜ ᅝ ᅞ ❰115F❱ ❰1160❱ ᅡ ᅢ ᅣ ᅤ ᅥ ᅦ ᅧ ᅨ ᅩ ᅪ ᅫ ᅬ ᅭ ᅮ ᅯ ᅰ ᅱ ᅲ ᅳ ᅴ ᅵ ᅶ ᅷ ᅸ ᅹ ᅺ ᅻ ᅼ ᅽ ᅾ ᅿ ᆀ ᆁ ᆂ ᆃ ᆄ ᆅ ᆆ ᆇ ᆈ ᆉ ᆊ ᆋ ᆌ ᆍ ᆎ ᆏ ᆐ ᆑ ᆒ ᆓ ᆔ ᆕ ᆖ ᆗ ᆘ ᆙ ᆚ ᆛ ᆜ ᆝ ᆞ ᆟ ᆠ ᆡ ᆢ ᆣ ᆤ ᆥ ᆦ ᆧ ᆨ ᆩ ᆪ ᆫ ᆬ ᆭ ᆮ ᆯ ᆰ ᆱ ᆲ ᆳ ᆴ ᆵ ᆶ ᆷ ᆸ ᆹ ᆺ ᆻ ᆼ ᆽ ᆾ ᆿ ᇀ ᇁ ᇂ ᇃ ᇄ ᇅ ᇆ ᇇ ᇈ ᇉ ᇊ ᇋ ᇌ ᇍ ᇎ ᇏ ᇐ ᇑ ᇒ ᇓ ᇔ ᇕ ᇖ ᇗ ᇘ ᇙ ᇚ ᇛ ᇜ ᇝ ᇞ ᇟ ᇠ ᇡ ᇢ ᇣ ᇤ ᇥ ᇦ ᇧ ᇨ ᇩ ᇪ ᇫ ᇬ ᇭ ᇮ ᇯ ᇰ ᇱ ᇲ ᇳ ᇴ ᇵ ᇶ ᇷ ᇸ ᇹ ᇺ ᇻ ᇼ ᇽ ᇾ ᇿ"
+            },
+            {"[:block=CJK_Unified_Ideographs:]", "一➖鿿"},
             {"[\\u001E-!]", "❰1E❱ ❰1F❱ ❰SP❱ !"},
             {"[a\\u0020]", "❰SP❱ a"},
             {"[abcq]", "a b c q"},
@@ -105,14 +110,31 @@ public void testSimpleUnicodeSetFormatter() {
             // UnicodeSets
             {"[{\\u0020\u0FFF}]", "❰SP❱❰FFF❱"},
             {"[{a\\u0020b\\u0FFFc}]", "a❰SP❱b❰FFF❱c"},
+            {"[ĂÅ z]", "Ă Å z"}, // Check plain ordering
+            {"LOCALE", "no"},
+            {"[ĂÅ z]", "Ă z Å"}, // Ensure that order is according to the locale
+            {"[ÅÅ]", "Å Å"}, // Ensure it doesn't merge two different characters
+            // with same NFC, even though a collator is used
+            {"LOCALE", null},
+            {"[ĂÅ z]", "Ă Å z"}, // Check plain ordering
         };
 
         SimpleUnicodeSetFormatter susf = new SimpleUnicodeSetFormatter();
 
+        Comparator<String> collator = susf.getComparator();
+        UnicodeSet toEscape = susf.getToEscape();
+        int maxRange = susf.getUseRangesAbove();
+
         int count = 0;
         for (String[] test : unicodeToDisplay) {
             if ("LOCALE".equals(test[0])) {
-                susf = SimpleUnicodeSetFormatter.fromIcuLocale(test[1]);
+                collator = SimpleUnicodeSetFormatter.getComparatorForLocale(test[1]);
+                susf = new SimpleUnicodeSetFormatter(collator, toEscape, maxRange);
+                continue;
+            }
+            if ("USE_RANGES_ABOVE".equals(test[0])) {
+                maxRange = test[1] == null ? -1 : Integer.parseInt(test[1]);
+                susf = new SimpleUnicodeSetFormatter(collator, toEscape, maxRange);
                 continue;
             }
             final UnicodeSet source = new UnicodeSet(test[0]);
@@ -381,6 +403,23 @@ public void TestCodePointEscaper() {
         }
     }
 
+    public void TestEdgeCases() {
+        // null/empty inputs must not throw; they yield U+FFFD, null, or "" as asserted below
+        assertEquals("null", '\uFFFd', CodePointEscaper.escapedToCodePoint(null));
+        assertEquals("empty", '\uFFFd', CodePointEscaper.escapedToCodePoint(""));
+        assertEquals("null", '\uFFFd', CodePointEscaper.rawEscapedToCodePoint(null));
+        assertEquals("empty", '\uFFFd', CodePointEscaper.rawEscapedToCodePoint(""));
+        assertEquals("null", null, CodePointEscaper.toEscaped(null));
+        assertEquals("empty", "", CodePointEscaper.toEscaped(""));
+        assertEquals("null", null, CodePointEscaper.toEscaped(null, UnicodeSet.EMPTY));
+        assertEquals("empty", "", CodePointEscaper.toEscaped("", UnicodeSet.EMPTY));
+        assertEquals("null", null, CodePointEscaper.toUnescaped(null));
+        assertEquals("empty", "", CodePointEscaper.toUnescaped(""));
+
+        assertEquals(
+                "as-is", "a\u0001bc", CodePointEscaper.toEscaped("a\u0001bc", UnicodeSet.EMPTY));
+    }
+
     public void TestStringEscaper() {
         String[][] tests = {
             {"xyz", "xyz"},