Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLDR-17703 Fix NPE in CodePointEscaper #3789

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ public String codePointToEscaped() {

/** Returns a code point from the escaped form <b>of a single code point</b> */
public static int escapedToCodePoint(String value) {
if (value == null || value.isEmpty()) {
return 0xFFFD;
}
if (value.codePointAt(0) != CodePointEscaper.ESCAPE_START
|| value.codePointAt(value.length() - 1) != CodePointEscaper.ESCAPE_END) {
throw new IllegalArgumentException(
Expand All @@ -177,6 +180,9 @@ public static String toEscaped(String unescaped) {

/** Returns the escaped form from a string */
public static String toEscaped(String unescaped, UnicodeSet toEscape) {
if (unescaped == null) {
return null;
}
StringBuilder result = new StringBuilder();
unescaped
.codePoints()
Expand All @@ -191,25 +197,30 @@ public static String toEscaped(String unescaped, UnicodeSet toEscape) {
return result.toString();
}
/** Return unescaped string */
public static String toUnescaped(String value) {
public static String toUnescaped(String escaped) {
if (escaped == null) {
return null;
}
StringBuilder result = null;
int donePart = 0;
int found = value.indexOf(ESCAPE_START);
int found = escaped.indexOf(ESCAPE_START);
while (found >= 0) {
int foundEnd = value.indexOf(ESCAPE_END, found);
int foundEnd = escaped.indexOf(ESCAPE_END, found);
if (foundEnd < 0) {
throw new IllegalArgumentException(
"Malformed escaped string, missing: " + ESCAPE_END);
}
if (result == null) {
result = new StringBuilder();
}
result.append(value, donePart, found);
result.append(escaped, donePart, found);
donePart = ++foundEnd;
result.appendCodePoint(escapedToCodePoint(value.substring(found, foundEnd)));
found = value.indexOf(ESCAPE_START, foundEnd);
result.appendCodePoint(escapedToCodePoint(escaped.substring(found, foundEnd)));
found = escaped.indexOf(ESCAPE_START, foundEnd);
}
return donePart == 0 ? value : result.append(value, donePart, value.length()).toString();
return donePart == 0
? escaped
: result.append(escaped, donePart, escaped.length()).toString();
}

private static final String HAS_NAME = " ≡ ";
Expand All @@ -232,6 +243,9 @@ public static String toExample(int codePoint) {
* brackets</b>
*/
public static int rawEscapedToCodePoint(CharSequence value) {
if (value == null || value.length() == 0) {
return 0xFFFD;
}
try {
return valueOf(value.toString().toUpperCase(Locale.ROOT)).codePoint;
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
import java.util.ArrayList;
Expand Down Expand Up @@ -41,15 +40,25 @@ public class SimpleUnicodeSetFormatter implements FormatterParser<UnicodeSet> {
public static Normalizer2 nfc = Normalizer2.getNFCInstance();

public static final Comparator<String> BASIC_COLLATOR =
(Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL);
(Comparator) CLDRConfig.getInstance().getCollator();

private static final int DEFAULT_MAX_DISALLOW_RANGES = 199;
public static final int DEFAULT_RANGES_ABOVE = 1024;

private final Comparator<String> comparator;
private final UnicodeSet forceHex;
private final int maxDisallowRanges;
private final UTF16.StringComparator codepointComparator =
new UTF16.StringComparator(true, false, 0);
private final int useRangesAbove;

/**
 * Returns the comparator held by this formatter, i.e. the value the constructor stored in
 * {@code comparator}.
 *
 * @return the comparator stored at construction time
 */
public Comparator<String> getComparator() {
return comparator;
}

/**
 * Returns the set of characters this formatter forces to hex (the {@code forceHex} field set by
 * the constructor).
 *
 * @return the {@code forceHex} set stored at construction time
 */
public UnicodeSet getToEscape() {
return forceHex;
}

/**
 * Returns the size threshold stored in {@code useRangesAbove}; {@code format} allows ranges only
 * when the input set's size is greater than this value.
 *
 * @return the stored range threshold
 */
public int getUseRangesAbove() {
return useRangesAbove;
}

/**
* Create a simple formatter, with a comparator for the ordering and a UnicodeSet of characters
Expand All @@ -58,38 +67,43 @@ public class SimpleUnicodeSetFormatter implements FormatterParser<UnicodeSet> {
* @param col — collator. The default is BASIC_COLLATOR, which is the root collator.
* @param forceHex - UnicodeSet to force to be hex. It will be frozen if not already. Warning:
* may not round-trip unless it includes all of CodePointEscaper.getNamedEscapes()
* @param maxDisallowRanges — under this number, there will be no ranges; at or above there may
* be ranges, and the collator will be disregarded.
* @param useRangesAbove — under this number, there will be no ranges; at or above there may be
* ranges, and the collator will be disregarded.
*/
public SimpleUnicodeSetFormatter(
Comparator<String> col, UnicodeSet forceHex, int maxDisallowRanges) {
Comparator<String> col, UnicodeSet forceHex, int useRangesAbove) {
// collate, but preserve non-equivalents
this.comparator = col == null ? COLLATOR : ComparatorUtilities.wrapForCodePoints(col);
this.comparator = col == null ? BASIC_COLLATOR : ComparatorUtilities.wrapForCodePoints(col);
this.forceHex = forceHex == null ? CodePointEscaper.FORCE_ESCAPE : forceHex.freeze();
this.maxDisallowRanges = maxDisallowRanges;
this.useRangesAbove = useRangesAbove < 0 ? DEFAULT_RANGES_ABOVE : useRangesAbove;
}

static final int DEFAULT_MAX = 1024;
public static final Comparator<String> COLLATOR =
(Comparator) CLDRConfig.getInstance().getCollator();

public static SimpleUnicodeSetFormatter fromIcuLocale(String localeId) {
return new SimpleUnicodeSetFormatter(COLLATOR, null, DEFAULT_MAX);
/**
 * Looks up the comparator to use for a locale.
 *
 * @param localeId locale identifier; may be {@code null}
 * @return {@link #BASIC_COLLATOR} when {@code localeId} is {@code null} or when obtaining the
 *     locale's collator throws; otherwise the rule-based collator that {@code
 *     ICUServiceBuilder} supplies for that locale
 */
public static Comparator<String> getComparatorForLocale(String localeId) {
    if (localeId == null) {
        return BASIC_COLLATOR;
    }
    try {
        final CLDRLocale locale = CLDRLocale.getInstance(localeId);
        final ICUServiceBuilder builder = ICUServiceBuilder.forLocale(locale);
        // Raw cast, in the same way BASIC_COLLATOR is declared in this class.
        return (Comparator) builder.getRuleBasedCollator();
    } catch (Exception ignored) {
        // for our purposes, better to fall back to the default
        return BASIC_COLLATOR;
    }
}

public SimpleUnicodeSetFormatter(Comparator<String> col, UnicodeSet forceHex) {
this(col, forceHex, DEFAULT_MAX_DISALLOW_RANGES);
this(col, forceHex, DEFAULT_RANGES_ABOVE);
}

public SimpleUnicodeSetFormatter(Comparator<String> col) {
this(col, null, DEFAULT_MAX);
this(col, null, DEFAULT_RANGES_ABOVE);
}

public SimpleUnicodeSetFormatter() {
this(
(Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL),
null,
DEFAULT_MAX);
DEFAULT_RANGES_ABOVE);
}

static class Lazy {
Expand All @@ -115,7 +129,7 @@ public static UnicodeSet parseLenient(String source) {

@Override
public String format(UnicodeSet input) {
final boolean allowRanges = input.size() > maxDisallowRanges;
final boolean allowRanges = input.size() > useRangesAbove;
StringBuilder result = new StringBuilder();
Collection<String> sorted =
input.addAllTo(allowRanges ? new ArrayList<>() : new TreeSet<>(comparator));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import com.ibm.icu.util.ULocale;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
Expand Down Expand Up @@ -85,14 +86,18 @@ public void testSimpleUnicodeSetFormatter() {
"[A Á B C {CS} D {DZ} {DZS} E É F G {GY} H I Í J K L {LY} M N {NY} O Ó Ö Ő P Q R S {SZ} T {TY} U Ú Ü Ű V W X Y Z {ZS}]",
"A Á B C CS D DZ DZS E É F G GY H I Í J K L LY M N NY O Ó Ö Ő P Q R S SZ T TY U Ú Ü Ű V W X Y Z ZS"
},
{
"[:block=Hangul_Jamo:]",
"ᄀ ᄁ ᄂ ᄃ ᄄ ᄅ ᄆ ᄇ ᄈ ᄉ ᄊ ᄋ ᄌ ᄍ ᄎ ᄏ ᄐ ᄑ ᄒ ᄓ ᄔ ᄕ ᄖ ᄗ ᄘ ᄙ ᄚ ᄛ ᄜ ᄝ ᄞ ᄟ ᄠ ᄡ ᄢ ᄣ ᄤ ᄥ ᄦ ᄧ ᄨ ᄩ ᄪ ᄫ ᄬ ᄭ ᄮ ᄯ ᄰ ᄱ ᄲ ᄳ ᄴ ᄵ ᄶ ᄷ ᄸ ᄹ ᄺ ᄻ ᄼ ᄽ ᄾ ᄿ ᅀ ᅁ ᅂ ᅃ ᅄ ᅅ ᅆ ᅇ ᅈ ᅉ ᅊ ᅋ ᅌ ᅍ ᅎ ᅏ ᅐ ᅑ ᅒ ᅓ ᅔ ᅕ ᅖ ᅗ ᅘ ᅙ ᅚ ᅛ ᅜ ᅝ ᅞ ❰115F❱ ❰1160❱ ᅡ ᅢ ᅣ ᅤ ᅥ ᅦ ᅧ ᅨ ᅩ ᅪ ᅫ ᅬ ᅭ ᅮ ᅯ ᅰ ᅱ ᅲ ᅳ ᅴ ᅵ ᅶ ᅷ ᅸ ᅹ ᅺ ᅻ ᅼ ᅽ ᅾ ᅿ ᆀ ᆁ ᆂ ᆃ ᆄ ᆅ ᆆ ᆇ ᆈ ᆉ ᆊ ᆋ ᆌ ᆍ ᆎ ᆏ ᆐ ᆑ ᆒ ᆓ ᆔ ᆕ ᆖ ᆗ ᆘ ᆙ ᆚ ᆛ ᆜ ᆝ ᆞ ᆟ ᆠ ᆡ ᆢ ᆣ ᆤ ᆥ ᆦ ᆧ ᆨ ᆩ ᆪ ᆫ ᆬ ᆭ ᆮ ᆯ ᆰ ᆱ ᆲ ᆳ ᆴ ᆵ ᆶ ᆷ ᆸ ᆹ ᆺ ᆻ ᆼ ᆽ ᆾ ᆿ ᇀ ᇁ ᇂ ᇃ ᇄ ᇅ ᇆ ᇇ ᇈ ᇉ ᇊ ᇋ ᇌ ᇍ ᇎ ᇏ ᇐ ᇑ ᇒ ᇓ ᇔ ᇕ ᇖ ᇗ ᇘ ᇙ ᇚ ᇛ ᇜ ᇝ ᇞ ᇟ ᇠ ᇡ ᇢ ᇣ ᇤ ᇥ ᇦ ᇧ ᇨ ᇩ ᇪ ᇫ ᇬ ᇭ ᇮ ᇯ ᇰ ᇱ ᇲ ᇳ ᇴ ᇵ ᇶ ᇷ ᇸ ᇹ ᇺ ᇻ ᇼ ᇽ ᇾ ᇿ"
},
{"USE_RANGES_ABOVE", "100"},
{"[:block=Hangul_Jamo:]", "ᄀ➖ᇿ"},
{"[:block=CJK_Unified_Ideographs:]", "一➖鿿"},
{"LOCALE", "no"},
{"[ĂÅ z]", "Ă z Å"}, // Ensure that order is according to the locale
{"USE_RANGES_ABOVE", null},
{
"[ÅÅ]", "Å Å"
}, // Ensure it doesn't merge two different characters with same NFC, even though a
// collator is used
"[:block=Hangul_Jamo:]",
"ᄀ ᄁ ᄂ ᄃ ᄄ ᄅ ᄆ ᄇ ᄈ ᄉ ᄊ ᄋ ᄌ ᄍ ᄎ ᄏ ᄐ ᄑ ᄒ ᄓ ᄔ ᄕ ᄖ ᄗ ᄘ ᄙ ᄚ ᄛ ᄜ ᄝ ᄞ ᄟ ᄠ ᄡ ᄢ ᄣ ᄤ ᄥ ᄦ ᄧ ᄨ ᄩ ᄪ ᄫ ᄬ ᄭ ᄮ ᄯ ᄰ ᄱ ᄲ ᄳ ᄴ ᄵ ᄶ ᄷ ᄸ ᄹ ᄺ ᄻ ᄼ ᄽ ᄾ ᄿ ᅀ ᅁ ᅂ ᅃ ᅄ ᅅ ᅆ ᅇ ᅈ ᅉ ᅊ ᅋ ᅌ ᅍ ᅎ ᅏ ᅐ ᅑ ᅒ ᅓ ᅔ ᅕ ᅖ ᅗ ᅘ ᅙ ᅚ ᅛ ᅜ ᅝ ᅞ ❰115F❱ ❰1160❱ ᅡ ᅢ ᅣ ᅤ ᅥ ᅦ ᅧ ᅨ ᅩ ᅪ ᅫ ᅬ ᅭ ᅮ ᅯ ᅰ ᅱ ᅲ ᅳ ᅴ ᅵ ᅶ ᅷ ᅸ ᅹ ᅺ ᅻ ᅼ ᅽ ᅾ ᅿ ᆀ ᆁ ᆂ ᆃ ᆄ ᆅ ᆆ ᆇ ᆈ ᆉ ᆊ ᆋ ᆌ ᆍ ᆎ ᆏ ᆐ ᆑ ᆒ ᆓ ᆔ ᆕ ᆖ ᆗ ᆘ ᆙ ᆚ ᆛ ᆜ ᆝ ᆞ ᆟ ᆠ ᆡ ᆢ ᆣ ᆤ ᆥ ᆦ ᆧ ᆨ ᆩ ᆪ ᆫ ᆬ ᆭ ᆮ ᆯ ᆰ ᆱ ᆲ ᆳ ᆴ ᆵ ᆶ ᆷ ᆸ ᆹ ᆺ ᆻ ᆼ ᆽ ᆾ ᆿ ᇀ ᇁ ᇂ ᇃ ᇄ ᇅ ᇆ ᇇ ᇈ ᇉ ᇊ ᇋ ᇌ ᇍ ᇎ ᇏ ᇐ ᇑ ᇒ ᇓ ᇔ ᇕ ᇖ ᇗ ᇘ ᇙ ᇚ ᇛ ᇜ ᇝ ᇞ ᇟ ᇠ ᇡ ᇢ ᇣ ᇤ ᇥ ᇦ ᇧ ᇨ ᇩ ᇪ ᇫ ᇬ ᇭ ᇮ ᇯ ᇰ ᇱ ᇲ ᇳ ᇴ ᇵ ᇶ ᇷ ᇸ ᇹ ᇺ ᇻ ᇼ ᇽ ᇾ ᇿ"
},
{"[:block=CJK_Unified_Ideographs:]", "一➖鿿"},
{"[\\u001E-!]", "❰1E❱ ❰1F❱ ❰SP❱ !"},
{"[a\\u0020]", "❰SP❱ a"},
{"[abcq]", "a b c q"},
Expand All @@ -105,14 +110,31 @@ public void testSimpleUnicodeSetFormatter() {
// UnicodeSets
{"[{\\u0020\u0FFF}]", "❰SP❱❰FFF❱"},
{"[{a\\u0020b\\u0FFFc}]", "a❰SP❱b❰FFF❱c"},
{"[ĂÅ z]", "Ă Å z"}, // Check plain ordering
{"LOCALE", "no"},
{"[ĂÅ z]", "Ă z Å"}, // Ensure that order is according to the locale
{"[ÅÅ]", "Å Å"}, // Ensure it doesn't merge two different characters
// with same NFC, even though a collator is used
{"LOCALE", null},
{"[ĂÅ z]", "Ă Å z"}, // Check plain ordering
};

SimpleUnicodeSetFormatter susf = new SimpleUnicodeSetFormatter();

Comparator<String> collator = susf.getComparator();
UnicodeSet toEscape = susf.getToEscape();
int maxRange = susf.getUseRangesAbove();

int count = 0;
for (String[] test : unicodeToDisplay) {
if ("LOCALE".equals(test[0])) {
susf = SimpleUnicodeSetFormatter.fromIcuLocale(test[1]);
collator = SimpleUnicodeSetFormatter.getComparatorForLocale(test[1]);
susf = new SimpleUnicodeSetFormatter(collator, toEscape, maxRange);
continue;
}
if ("USE_RANGES_ABOVE".equals(test[0])) {
maxRange = test[1] == null ? -1 : Integer.parseInt(test[1]);
susf = new SimpleUnicodeSetFormatter(collator, toEscape, maxRange);
continue;
}
final UnicodeSet source = new UnicodeSet(test[0]);
Expand Down Expand Up @@ -381,6 +403,23 @@ public void TestCodePointEscaper() {
}
}

/**
 * Checks that the CodePointEscaper entry points accept {@code null} and empty input without
 * throwing, and that escaping with an empty escape set leaves the string unchanged.
 *
 * <p>Each assertion label names the call under test so a failure identifies the exact case.
 */
public void TestEdgeCases() {
    // Code point conversions: null or empty input is expected to yield U+FFFD.
    assertEquals(
            "escapedToCodePoint(null)", '\uFFFd', CodePointEscaper.escapedToCodePoint(null));
    assertEquals("escapedToCodePoint(\"\")", '\uFFFd', CodePointEscaper.escapedToCodePoint(""));
    assertEquals(
            "rawEscapedToCodePoint(null)",
            '\uFFFd',
            CodePointEscaper.rawEscapedToCodePoint(null));
    assertEquals(
            "rawEscapedToCodePoint(\"\")", '\uFFFd', CodePointEscaper.rawEscapedToCodePoint(""));

    // String conversions: null is expected to map to null and empty to empty.
    assertEquals("toEscaped(null)", null, CodePointEscaper.toEscaped(null));
    assertEquals("toEscaped(\"\")", "", CodePointEscaper.toEscaped(""));
    assertEquals(
            "toEscaped(null, EMPTY)", null, CodePointEscaper.toEscaped(null, UnicodeSet.EMPTY));
    assertEquals("toEscaped(\"\", EMPTY)", "", CodePointEscaper.toEscaped("", UnicodeSet.EMPTY));
    assertEquals("toUnescaped(null)", null, CodePointEscaper.toUnescaped(null));
    assertEquals("toUnescaped(\"\")", "", CodePointEscaper.toUnescaped(""));

    // With an empty escape set nothing is escaped, not even a control character.
    assertEquals(
            "toEscaped with empty escape set",
            "a\u0001bc",
            CodePointEscaper.toEscaped("a\u0001bc", UnicodeSet.EMPTY));
}

public void TestStringEscaper() {
String[][] tests = {
{"xyz", "xyz"},
Expand Down
Loading