From 595dde363d36cecd07b135d8ec8cef83c1774eb4 Mon Sep 17 00:00:00 2001 From: macchiati Date: Thu, 1 Aug 2024 06:43:41 -0700 Subject: [PATCH] CLDR-17844 cleanup --- .../java/org/unicode/cldr/util/BidiUtils.java | 162 ++++++++++++++++++ .../unicode/cldr/util/CodePointEscaper.java | 55 +++++- .../unicode/cldr/util/DateTimeFormats.java | 97 ++++------- 3 files changed, 249 insertions(+), 65 deletions(-) create mode 100644 tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java new file mode 100644 index 00000000000..dc7e9f2761b --- /dev/null +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java @@ -0,0 +1,162 @@ +package org.unicode.cldr.util; + +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import com.google.common.collect.Sets.SetView; +import com.ibm.icu.text.Bidi; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.SpanCondition; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +/** + * A set of utilities for handling BIDI, especially in charts and examples but not restricted to + * that. + */ +public class BidiUtils { + public static final String ALERT = "⚠️"; + static final String LRM = CodePointEscaper.LRM.getString(); + + // These are intended to be classes of characters that "stick together in order" + // The initial focus is dates, so this will probably need to be expanded for numbers; might need + // more syntax + + private enum SpanClass { + NUMBERS("\\p{N}"), + LETTERS_MARKS("[\\p{L}\\p{M}]"), + DATE_PUNCT("[+]"), + SPACES("\\p{Z}"), + OTHERS("\\p{any}") // must be last, to pick up remainder. + ; + final UnicodeSet uset; + + private SpanClass(String unicodeSetSource) { + uset = new UnicodeSet(unicodeSetSource); + } + + static { + // clean up by removing previous values + UnicodeSet soFar = new UnicodeSet(); + for (SpanClass sc : SpanClass.values()) { + sc.uset.removeAll(soFar).freeze(); + soFar.addAll(sc.uset); + } + } + } + /** + * Checks the ordering of the example, under the specified bidiDirectionOptions; + * + * @param example Source text, not HTMLified + * @param outputReorderedResults One string for each specified bidiDirectionOption + * @param bidiDirectionOptions an array of BIDI directions from com.ibm.icu.text.Bidi. if there + * are no items, the default is DIRECTION_DEFAULT_LEFT_TO_RIGHT (dir="auto"), + * DIRECTION_RIGHT_TO_LEFT (dir="rtl"). + * @return true unless two or more of the resulting strings are different. + */ + public static boolean isOrderingUnchanged( + String example, List outputReorderedResults, int... bidiDirectionOptions) { + boolean hasList = outputReorderedResults != null; + if (!hasList) { + outputReorderedResults = new ArrayList<>(); + } else { + outputReorderedResults.clear(); + } + boolean result = true; + for (int count = 0; count < bidiDirectionOptions.length; ++count) { + String reordered = new Bidi(example, bidiDirectionOptions[count]).writeReordered(0); + outputReorderedResults.add(reordered); + if (result && count != 0 && !reordered.equals(outputReorderedResults.get(0))) { + result = false; + if (!hasList) { + break; // if the output results are not needed, then stop. + } + } + } + return result; + } + + /** + * Return a list of the , where each span is a sequence of: + * + * @param orderedLTR + * @return + */ + /** + * Gets the 'fields' in a formatted string, used to test whether bidi reordering causes the + * original fields to merge when reordered. Each field is the longest contiguous span of + * characters with the same properties: * + * + * + * + * @param ordered + * @return a set of fields, in the same order as found in the text but duplicates removed (ike + * LinkedHashSeet). + */ + public static Set getFields(String reordred, Set result) { + int start = 0; + while (start < reordred.length()) { + for (SpanClass sc : SpanClass.values()) { + int end = sc.uset.span(reordred, start, SpanCondition.CONTAINED); + if (end != start) { + result.add(reordred.substring(start, end)); + start = end; + break; + } + } + } + return ImmutableSet.copyOf(result); + } + + /** + * Show when the fields in strings are different + * + * @param bidiReordereds + * @return + */ + public static String getAlert(List bidiReordereds) { + Set> results = new LinkedHashSet<>(); + for (String bidiReordered : bidiReordereds) { + Set fieldsLTR = BidiUtils.getFields(bidiReordered, new TreeSet<>()); + results.add(fieldsLTR); + } + if (results.size() < 2) { + return ""; + } + // there can still be differences within a field of OTHERS, that we ignore. + // EG ⚠️ 20,28,2B; 2B,28,20 " (+" vs " (+" + + // show just the difference in the first 2, for now. + Iterator> it = results.iterator(); + Set first = it.next(); + Set second = it.next(); + SetView uniqueFirst = Sets.difference(first, second); + SetView uniqueSecond = Sets.difference(second, first); + return ALERT + " " + escape(uniqueFirst) + "; " + escape(uniqueSecond); + } + + public static String escape(Set uniqueFirst) { + return uniqueFirst.stream() + .map(x -> CodePointEscaper.toEscaped(x)) + .collect(Collectors.joining(LRM + ", " + LRM, LRM, LRM)); + } + + public static String alphagram(String string) { + return string.codePoints() + .sorted() + .collect( + StringBuilder::new, // Supplier supplier + StringBuilder::appendCodePoint, // ObjIntConsumer accumulator + StringBuilder::append // BiConsumer combiner + ) + .toString(); + } +} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java index 9b6c304024a..04d030b7a19 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java @@ -1,6 +1,7 @@ package org.unicode.cldr.util; import com.ibm.icu.impl.UnicodeMap; +import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; @@ -114,9 +115,7 @@ public enum CodePointEscaper { private final String description; private CodePointEscaper(int codePoint, String shortName) { - this.codePoint = codePoint; - this.shortName = shortName; - this.description = ""; + this(codePoint, shortName, ""); } private CodePointEscaper(int codePoint, String shortName, String description) { @@ -291,4 +290,54 @@ public static String rawCodePointToEscaped(int codePoint) { ? Integer.toString(codePoint, 16).toUpperCase(Locale.ROOT) : result.toString(); } + + public static final String getHtmlRows( + UnicodeSet escapesToShow, String tableOptions, String cellOptions) { + if (!escapesToShow.strings().isEmpty()) { + throw new IllegalArgumentException("No strings allowed in the unicode set."); + } + StringBuilder result = new StringBuilder(""); + UnicodeSet remaining = new UnicodeSet(escapesToShow); + String tdPlus = ""; + for (CodePointEscaper cpe : CodePointEscaper.values()) { + int cp = cpe.getCodePoint(); + remaining.remove(cp); + if (escapesToShow.contains(cpe.getCodePoint())) { + final String id = cpe.name(); + final String shortName = cpe.getShortName(); + final String description = cpe.getDescription(); + addREsult(result, tdPlus, id, shortName, description); + } + } + for (String cps : remaining) { + int cp = cps.codePointAt(0); + final String extendedName = UCharacter.getExtendedName(cp); + addREsult( + result, + tdPlus, + Utility.hex(cp, 2), + "", + extendedName == null ? "" : extendedName.toLowerCase()); + } + return result.append("").toString(); + } + + public static void addREsult( + StringBuilder result, + String tdPlus, + final String id, + final String shortName, + final String description) { + result.append("") + .append(tdPlus) + .append(ESCAPE_START) + .append(id) + .append(ESCAPE_END + "") + .append(tdPlus) + .append(shortName) + .append("") + .append(tdPlus) + .append(description) + .append(""); + } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java index 60d38bbdac8..87b368f9bfd 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java @@ -1,7 +1,6 @@ package org.unicode.cldr.util; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; import com.ibm.icu.impl.Row.R3; import com.ibm.icu.text.Bidi; import com.ibm.icu.text.DateFormat; @@ -15,7 +14,6 @@ import com.ibm.icu.text.MessageFormat; import com.ibm.icu.text.SimpleDateFormat; import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.SpanCondition; import com.ibm.icu.util.Calendar; import com.ibm.icu.util.DateInterval; import com.ibm.icu.util.ICUUncheckedIOException; @@ -25,6 +23,7 @@ import java.io.File; import java.io.IOException; import java.io.PrintWriter; +import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.EnumSet; @@ -51,9 +50,6 @@ public class DateTimeFormats { private static final UnicodeSet TO_ESCAPE = new UnicodeSet(CodePointEscaper.FORCE_ESCAPE) .remove(CodePointEscaper.SP.getCodePoint()) - .remove(CodePointEscaper.TSP.getCodePoint()) - .remove(CodePointEscaper.NBSP.getCodePoint()) - .remove(CodePointEscaper.NBTSP.getCodePoint()) .freeze(); private static final String MISSING_PART = "ⓜⓘⓢⓢⓘⓝⓖ"; private static final CLDRConfig CONFIG = CLDRConfig.getInstance(); @@ -513,12 +509,13 @@ public boolean isPresent(String skeleton) { * @param output */ public void addTable(DateTimeFormats comparison, Appendable output) { + UnicodeSet allEscapedCharactersFound = new UnicodeSet(); try { output.append( "

" + hackDoubleLinked("Patterns") + "

" - + "

Normally, there is one line containing an example in each Native Example cell. " + + "

Normally, there is a single line containing an example in each Native Example cell. " + (!isRTL ? "" : "However, two examples are provided if the locale is right-to-left, like Arabic or Hebrew, " @@ -528,14 +525,13 @@ public void addTable(DateTimeFormats comparison, Appendable output) { + ltrSpan + "and a different background" + spanEnd - + ". If the display of either example causes strings of letters or numbers to collide, " - + "then a ⚠️ is shown. ") + + ". If the display of either example appears to cause strings of letters or numbers to collide, " + + "then a ⚠️ is shown followed by differences (this is still experimental). ") + "When an example has hidden characters, then " + tableSpan + "an extra line" + spanEnd - + " shows those characters " - + "such as ❰RLM❱ for the invisible Right-to-Left Mark. " + + " shows those characters with short IDs ❰…❱: see the Key below the table. " + "So that the ordering of the characters in memory is clear, they are presented left-to-right one at a time. " + "so that the placement is clear. " + "When a pattern (or a component of a pattern) is missing, it is displayed as " @@ -572,8 +568,8 @@ public void addTable(DateTimeFormats comparison, Appendable output) { RowStyle.normal, name, skeleton, - comparison.getExample(skeleton), - getExample(skeleton), + comparison.getExample(skeleton, allEscapedCharactersFound), + getExample(skeleton, allEscapedCharactersFound), diff.isPresent(skeleton)); } } @@ -611,12 +607,21 @@ public void addTable(DateTimeFormats comparison, Appendable output) { RowStyle.normal, skeleton, skeleton, - comparison.getExample(skeleton), - getExample(skeleton), + comparison.getExample(skeleton, allEscapedCharactersFound), + getExample(skeleton, allEscapedCharactersFound), true); } } output.append(""); + if (!allEscapedCharactersFound.isEmpty()) { + output.append("\n

Key to Escaped Characters

\n"); + String keyToEscaped = + CodePointEscaper.getHtmlRows( + allEscapedCharactersFound, + " style='border:1px solid blue; border-collapse: collapse'", + " style='border:1px solid blue'"); + output.append(keyToEscaped); + } } catch (IOException e) { throw new ICUUncheckedIOException(e); } @@ -626,9 +631,10 @@ public void addTable(DateTimeFormats comparison, Appendable output) { * Get an example from the "enhanced" skeleton. * * @param skeleton + * @param escapedCharactersFound Any characters that were escaped are added to this. * @return */ - private String getExample(String skeleton) { + private String getExample(String skeleton, UnicodeSet escapedCharactersFound) { String example; if (skeleton.contains("®")) { example = getRelativeExampleFromSkeleton(skeleton); @@ -663,18 +669,17 @@ private String getExample(String skeleton) { } } String transformedExample = TransliteratorUtilities.toHTML.transform(example); + ArrayList listOfReorderings = new ArrayList<>(); if ((isRTL || BIDI_MARKS.containsSome(example)) && !example.contains(MISSING_PART)) { - Bidi bidiLTR = new Bidi(example, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); - String orderedLTR = bidiLTR.writeReordered(0); - Bidi bidiRTL = new Bidi(example, Bidi.DIRECTION_RIGHT_TO_LEFT); - String orderedRTL = bidiRTL.writeReordered(0); - if (!orderedLTR.equals(orderedRTL)) { - // since this is RTL, we put it first + if (!BidiUtils.isOrderingUnchanged( + example, + listOfReorderings, + Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT, + Bidi.DIRECTION_RIGHT_TO_LEFT)) { + // since this locale is RTL, we put it first String rtlVersion = rtlStart + transformedExample + divEnd; // not colored String autoVersion = autoLtrStart + transformedExample + divEnd; // colored - Set fieldsLTR = getFields(orderedLTR); - Set fieldsRTL = getFields(orderedRTL); - String alert = fieldsLTR.equals(fieldsRTL) ? "" : " ⚠️ "; + String alert = BidiUtils.getAlert(listOfReorderings); transformedExample = rtlVersion + autoVersion + alert; } else { String autoVersion = autoStart + transformedExample + divEnd; // not colored @@ -696,46 +701,11 @@ private String getExample(String skeleton) { }); transformedExample += "" + processed + "
"; + escapedCharactersFound.addAll(new UnicodeSet().addAll(example).retainAll(TO_ESCAPE)); } return transformedExample; } - /** - * Return a list of the fields, where each span is a sequence of: - * - *
    - *
  • numbers (\p{N}) - *
  • letters & marks ([\p{L}\p{M} - *
  • Other - *
- * - * @param orderedLTR - * @return - */ - static final UnicodeSet NUMBERS = new UnicodeSet("\\p{N}").freeze(); - - static final UnicodeSet LETTERS_MARKS = new UnicodeSet("[\\p{L}\\p{M}]").freeze(); - static final UnicodeSet OTHERS = - new UnicodeSet(NUMBERS).addAll(LETTERS_MARKS).complement().freeze(); - static final Set ALL = ImmutableSet.of(NUMBERS, LETTERS_MARKS, OTHERS); - - private Set getFields(String ordered) { - Set result = - new LinkedHashSet<>(); // doesn't have to be a LHS, but helps with debugging - int start = 0; - while (start < ordered.length()) { - for (UnicodeSet us : ALL) { - int end = us.span(ordered, start, SpanCondition.CONTAINED); - if (end != start) { - result.add(ordered.substring(start, end)); - start = end; - break; - } - } - } - return result; - } - static final Pattern RELATIVE_DATE = PatternCache.get("®([a-z]+(?:-[a-z]+)?)+(-[a-z]+)?([+-]?\\d+)([a-zA-Z]+)?"); @@ -1102,11 +1072,14 @@ public static void main(String[] args) throws IOException { String organization = MyOptions.organization.option.getValue(); String filter = MyOptions.filter.option.getValue(); + boolean hasFilter = MyOptions.filter.option.doesOccur(); CLDRFile englishFile = CONFIG.getEnglish(); Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, filter); - System.out.println("Total locales: " + factory.getAvailableLanguages().size()); + final Set availableLocales = + hasFilter ? factory.getAvailable() : factory.getAvailableLanguages(); + System.out.println("Total locales: " + availableLocales.size()); DateTimeFormats english = new DateTimeFormats().set(englishFile, "gregorian"); new File(DIR).mkdirs(); @@ -1118,7 +1091,7 @@ public static void main(String[] args) throws IOException { Map sorted = new TreeMap<>(); SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); Set defaultContent = sdi.getDefaultContentLocales(); - for (String localeID : factory.getAvailable()) { + for (String localeID : availableLocales) { Level level = StandardCodes.make().getLocaleCoverageLevel(organization, localeID); if (Level.MODERN.compareTo(level) > 0) { continue;