Skip to content

Commit

Permalink
CLDR-16468 v43 JSON drop code_fallback and constructed data (#2791)
Browse files Browse the repository at this point in the history
* CLDR-16468 v43: JSON update logging output

- add some symbols to distinguish sections during build

* CLDR-16468 v43: JSON skip code-fallback/constructed data

- also improve javadocs on CLDRFile and CldrUtility

* CLDR-16468 v43: JSON update logging output

- make it clearer how far along we are in the process
  • Loading branch information
srl295 authored Mar 15, 2023
1 parent 4d5c6aa commit e9a77eb
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import org.unicode.cldr.util.CLDRFile.DraftStatus;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
Expand All @@ -42,6 +41,7 @@
import com.ibm.icu.number.LocalizedNumberFormatter;
import com.ibm.icu.number.NumberFormatter;
import com.ibm.icu.number.Precision;
import com.ibm.icu.text.MessageFormat;
import com.ibm.icu.util.NoUnit;
import com.ibm.icu.util.ULocale;

Expand All @@ -53,13 +53,23 @@
*/
@CLDRTool(alias = "ldml2json", description = "Convert CLDR data to JSON")
public class Ldml2JsonConverter {
// Icons
private static final String DONE_ICON = "✅";
private static final String GEAR_ICON = "⚙️";
private static final String NONE_ICON = "∅";
private static final String PACKAGE_ICON = "📦";
private static final String SECTION_ICON = "📍";
private static final String TYPE_ICON = "📂";
private static final String WARN_ICON = "⚠️";

// File prefix
private static final String CLDR_PKG_PREFIX = "cldr-";
private static final String FULL_TIER_SUFFIX = "-full";
private static final String MODERN_TIER_SUFFIX = "-modern";
private static Logger logger = Logger.getLogger(Ldml2JsonConverter.class.getName());

enum RunType {
all,
all, // number zero
main,
supplemental(false, false), // aka 'cldr-core'
segments, rbnf(false, true), annotations, annotationsDerived, bcp47(false, false);
Expand Down Expand Up @@ -131,6 +141,8 @@ private class AvailableLocales {
"Type of CLDR data being generated, such as main, supplemental, or segments. All gets all.")
.add("resolved", 'r', "(true|false)", "false",
"Whether the output JSON for the main directory should be based on resolved or unresolved data")
.add("Redundant", 'R', "(true|false)", "false",
"Include redundant data from code-fallback and constructed")
.add("draftstatus", 's', "(approved|contributed|provisional|unconfirmed)", "unconfirmed",
"The minimum draft status of the output data")
.add("coverage", 'l', "(minimal|basic|moderate|modern|comprehensive|optional)", "optional",
Expand All @@ -150,28 +162,30 @@ private class AvailableLocales {
.add("Modern", 'M', "(true|false)", "true", "Whether to include the -modern tier");

/**
 * Command-line entry point: prints the option banner, parses the options, then
 * runs the conversion for the requested type. When the type option is "all",
 * every {@link RunType} except {@code all} itself is processed in turn, each
 * with its own timer.
 *
 * @param args command-line arguments, handed to {@code options.parse}
 * @throws Exception if processing fails (propagated unchanged from {@code processType})
 */
public static void main(String[] args) throws Exception {
    System.out.println(GEAR_ICON + " " + Ldml2JsonConverter.class.getName() + " options:");
    options.parse(args, true);

    Timer overallTimer = new Timer();
    overallTimer.start();
    final String rawType = options.get("type").getValue();

    if (RunType.all.name().equals(rawType)) {
        // Running all types
        for (final RunType t : RunType.values()) {
            if (t == RunType.all) {
                continue; // 'all' only selects the other types; it is not processed itself
            }
            System.out.println();
            System.out.println(TYPE_ICON + "####################### " + t + " #######################");
            Timer subTimer = new Timer();
            subTimer.start();
            processType(t.name());
            System.out.println(TYPE_ICON + " " + t + "\tFinished in " + subTimer.toMeasureString());
            System.out.println();
        }
    } else {
        processType(rawType);
    }

    System.out.println("\n\n###\n\n" + DONE_ICON + " Finished everything in " + overallTimer.toMeasureString());
}

static void processType(final String runType) throws Exception {
Expand All @@ -188,7 +202,8 @@ static void processType(final String runType) throws Exception {
options.get("pkgversion").getValue(),
Boolean.parseBoolean(options.get("bcp47").getValue()),
Boolean.parseBoolean(options.get("bcp47-no-subtags").getValue()),
Boolean.parseBoolean(options.get("Modern").getValue())
Boolean.parseBoolean(options.get("Modern").getValue()),
Boolean.parseBoolean(options.get("Redundant").getValue())
);

DraftStatus status = DraftStatus.valueOf(options.get("draftstatus").getValue());
Expand All @@ -211,6 +226,9 @@ static void processType(final String runType) throws Exception {
private boolean writePackages;
// Type of run for this converter, e.g. main, supplemental, or segments (see RunType)
final private RunType type;
// include Redundant data such as apc="apc", en_US="en (US)"
private boolean includeRedundant;


static class JSONSection implements Comparable<JSONSection> {
public String section;
Expand All @@ -234,7 +252,7 @@ public int compareTo(JSONSection other) {

public Ldml2JsonConverter(String cldrDir, String outputDir, String runType, boolean fullNumbers, boolean resolve, String coverage, String match,
boolean writePackages, String configFile, String pkgVersion,
boolean strictBcp47, boolean skipBcp47LocalesWithSubtags, boolean writeModernPackage) {
boolean strictBcp47, boolean skipBcp47LocalesWithSubtags, boolean writeModernPackage, boolean includeRedundant) {
this.writeModernPackage = writeModernPackage;
this.strictBcp47 = strictBcp47;
this.skipBcp47LocalesWithSubtags = strictBcp47 && skipBcp47LocalesWithSubtags;
Expand All @@ -260,6 +278,7 @@ public Ldml2JsonConverter(String cldrDir, String outputDir, String runType, bool
this.dependencies = configFileReader.getDependencies();
this.sections = configFileReader.getSections();
this.packages = new TreeSet<>();
this.includeRedundant = includeRedundant;
}

/**
Expand Down Expand Up @@ -364,6 +383,9 @@ private String transformPath(final String pathStr, final String pathPrefix) {
return result;
}

/**
* Read all paths in the file, and assign each to a JSONSection. Return the map.
*/
private Map<JSONSection, List<CldrItem>> mapPathsToSections(AtomicInteger readCount, int totalCount,
CLDRFile file, String pathPrefix, SupplementalDataInfo sdi)
throws IOException, ParseException {
Expand All @@ -383,19 +405,23 @@ private Map<JSONSection, List<CldrItem>> mapPathsToSections(AtomicInteger readCo
}
final DtdType fileDtdType = file.getDtdType();
CoverageInfo covInfo = CLDRConfig.getInstance().getCoverageInfo();
// read paths in DTD order. The order is critical for JSON processing.
final CLDRFile.Status status = new CLDRFile.Status();
for (Iterator<String> it = file.iterator("", DtdData.getInstance(fileDtdType).getDtdComparator(null)); it.hasNext();) {
int cv = Level.UNDETERMINED.getLevel();
final String path = it.next();

// Check for code-fallback and constructed first, even before fullpath and value
final String localeWhereFound = file.getSourceLocaleID(path, status);
if (!includeRedundant && (localeWhereFound.equals(XMLSource.CODE_FALLBACK_ID) || // language[@type="apc"] = apc : missing
status.pathWhereFound.equals(GlossonymConstructor.PSEUDO_PATH))) { // language[@type="fa_AF"] = fa (AF) or Farsi (Afghanistan) : missing
// Don't include these paths.
continue;
}

// now get the fullpath and value
String fullPath = file.getFullXPath(path);
String value = file.getWinningValue(path);
/*
* TODO: check whether this next block is superfluous, and remove it if so
* Reference: https://unicode-org.atlassian.net/browse/CLDR-13263
*/
if (path.startsWith("//ldml/localeDisplayNames/languages") &&
file.getSourceLocaleID(path, null).equals("code-fallback")) {
value = file.getBaileyValue(path, null, null);
}

if (fullPath == null) {
fullPath = path;
Expand Down Expand Up @@ -448,7 +474,7 @@ private Map<JSONSection, List<CldrItem>> mapPathsToSections(AtomicInteger readCo
continue; // skip this path
}

for (JSONSection js : sections) {
for (JSONSection js : sections) { // TODO: move to subfunction, error if >1 section matches
if (js.pattern.matcher(transformedPath).matches()) {
CldrItem item = new CldrItem(transformedPath, transformedFullPath, path, fullPath, value);

Expand All @@ -463,7 +489,8 @@ private Map<JSONSection, List<CldrItem>> mapPathsToSections(AtomicInteger readCo
}
}

Matcher versionInfoMatcher = PatternCache.get(".*/(identity|version).*").matcher("");
// TODO: move matcher out of inner loop
final Matcher versionInfoMatcher = VERSION_INFO_PATTERN.matcher("");
// Automatically copy the version info to any sections that had real data in them.
JSONSection otherSection = sections.get(sections.size() - 1);
List<CldrItem> others = sectionItems.get(otherSection);
Expand All @@ -484,7 +511,7 @@ private Map<JSONSection, List<CldrItem>> mapPathsToSections(AtomicInteger readCo
hit.add(addedItemCount, item);
sectionItems.put(js, hit);
}
if (js.section.equals("other")) {
if (js.section.equals("other")) { // did not match one of the regular sections
List<CldrItem> hit = sectionItems.get(js);
hit.remove(item);
sectionItems.put(js, hit);
Expand All @@ -496,6 +523,7 @@ private Map<JSONSection, List<CldrItem>> mapPathsToSections(AtomicInteger readCo
return sectionItems;
}

// Paths that contain "/identity" or "/version" somewhere after their start.
static final Pattern VERSION_INFO_PATTERN = PatternCache.get(".*/(identity|version).*");
// Text that contains a single lowercase ASCII letter between two hyphens, e.g. "-u-".
static final Pattern HAS_SUBTAG = PatternCache.get(".*-[a-z]-.*");

/**
Expand Down Expand Up @@ -722,7 +750,7 @@ private int convertCldrItems(AtomicInteger readCount, int totalCount,
}

String outPath = new File(outputDir.substring(this.outputDir.length()), outFilename).getPath();
outputProgress.add(Pair.of(js.section+' '+outPath, valueCount));
outputProgress.add(Pair.of(String.format("%20s %s", js.section, outPath), valueCount));
logger.fine(">" + progressPrefix(readCount, totalCount, filename, js.section) + String.format("…%s (%d values)",
outPath, valueCount));

Expand All @@ -735,20 +763,24 @@ private int convertCldrItems(AtomicInteger readCount, int totalCount,
if(!outputProgress.isEmpty()) {
// Put these first, so the percent is at the end.
for(final Pair<String, Integer> outputItem : outputProgress) {
outStr.append(String.format("\t- %s (%d)\n", outputItem.getFirst(), outputItem.getSecond()));
outStr.append(String.format("\t%6d %s\n", outputItem.getSecond(), outputItem.getFirst()));
}
outStr.append(String.format("%s%s (%d values in %d sections)\n",
outStr.append(String.format("%s%-12s\t %s\n",
progressPrefix(readCount, totalCount), filename,
totalItemsInFile, outputProgress.size()));
valueSectionsFormat(totalItemsInFile, outputProgress.size())));
} else {
outStr.append(String.format("%s%s (no items output)\n", progressPrefix(readCount, totalCount), filename));
outStr.append(String.format("%s%-12s\t" + NONE_ICON +" (no output)\n", progressPrefix(readCount, totalCount), filename));
}
synchronized(readCount) { // to prevent interleaved output
System.out.print(outStr);
}
return totalItemsInFile;
}

/**
 * Builds the parenthesized summary of how many values were written into how many
 * sections, using plural-aware message formatting so that the nouns agree with
 * the counts.
 *
 * @param values number of values
 * @param sections number of sections
 * @return the summary, e.g. "(1 value in 2 sections)" under an English default locale
 */
private static String valueSectionsFormat(int values, int sections) {
    final String valuesPart = "{0, plural, one {# value} other {# values}}";
    final String sectionsPart = "{1, plural, one {# section} other {# sections}}";
    return MessageFormat.format("(" + valuesPart + " in " + sectionsPart + ")", values, sections);
}

private boolean localeIsModernTier(String filename) {
Level lev = CalculatedCoverageLevels.getInstance().getEffectiveCoverageLevel(filename);
if (lev == null) return false;
Expand Down Expand Up @@ -1042,7 +1074,7 @@ private static String getDefaultVersion() {

public void writePackageJson(String outputDir, String packageName) throws IOException {
PrintWriter outf = FileUtilities.openUTF8Writer(outputDir + "/" + packageName, "package.json");
logger.fine("Creating packaging file => " + outputDir + File.separator + packageName + File.separator + "package.json");
logger.fine(PACKAGE_ICON+" Creating packaging file => " + outputDir + File.separator + packageName + File.separator + "package.json");
JsonObject obj = new JsonObject();
writeBasicInfo(obj, packageName, true);

Expand Down Expand Up @@ -1088,7 +1120,7 @@ public void writePackageJson(String outputDir, String packageName) throws IOExce

public void writeBowerJson(String outputDir, String packageName) throws IOException {
PrintWriter outf = FileUtilities.openUTF8Writer(outputDir + "/" + packageName, "bower.json");
logger.fine("Creating packaging file => " + outputDir + File.separator + packageName + File.separator + "bower.json");
logger.fine(PACKAGE_ICON+" Creating packaging file => " + outputDir + File.separator + packageName + File.separator + "bower.json");
JsonObject obj = new JsonObject();
writeBasicInfo(obj, packageName, false);
if (type == RunType.supplemental) {
Expand Down Expand Up @@ -1116,18 +1148,17 @@ public void writeBowerJson(String outputDir, String packageName) throws IOExcept

/**
 * Writes {@code defaultContent.json} into the {@code cldr-core} directory under
 * {@code outputDir}. The file holds a single JSON object whose "defaultContent"
 * member is the serialized {@code skippedDefaultContentLocales}.
 *
 * @param outputDir directory that contains the {@code cldr-core} directory
 * @throws IOException propagated from opening the output file
 */
public void writeDefaultContent(String outputDir) throws IOException {
    // try-with-resources: the writer is closed even if serialization throws
    try (PrintWriter outf = FileUtilities.openUTF8Writer(outputDir + "/cldr-core", "defaultContent.json")) {
        System.out.println(PACKAGE_ICON+" Creating packaging file => " + outputDir + "/cldr-core" + File.separator + "defaultContent.json");
        JsonObject obj = new JsonObject();
        obj.add("defaultContent", gson.toJsonTree(skippedDefaultContentLocales));
        outf.println(gson.toJson(obj));
    }
}

public void writeCoverageLevels(String outputDir) throws IOException {
final Splitter SEMICOLON = Splitter.on(';').trimResults();
try (PrintWriter outf = FileUtilities.openUTF8Writer(outputDir + "/cldr-core", "coverageLevels.json");) {
final Map<String, String> covlocs = new TreeMap<>();
System.out.println("Creating packaging file => " + outputDir + "/cldr-core" + File.separator + "coverageLevels.json from coverageLevels.txt");
System.out.println(PACKAGE_ICON+" Creating packaging file => " + outputDir + "/cldr-core" + File.separator + "coverageLevels.json from coverageLevels.txt");
CalculatedCoverageLevels ccl = CalculatedCoverageLevels.getInstance();
for (final Map.Entry<String, org.unicode.cldr.util.Level> e : ccl.getLevels().entrySet()) {
final String uloc = e.getKey();
Expand Down Expand Up @@ -1157,7 +1188,7 @@ public void writeCoverageLevels(String outputDir) throws IOException {

public void writeAvailableLocales(String outputDir) throws IOException {
PrintWriter outf = FileUtilities.openUTF8Writer(outputDir + "/cldr-core", "availableLocales.json");
System.out.println("Creating packaging file => " + outputDir + "/cldr-core" + File.separator + "availableLocales.json");
System.out.println(PACKAGE_ICON+" Creating packaging file => " + outputDir + "/cldr-core" + File.separator + "availableLocales.json");
JsonObject obj = new JsonObject();
obj.add("availableLocales", gson.toJsonTree(avl));
outf.println(gson.toJson(obj));
Expand Down Expand Up @@ -1185,7 +1216,7 @@ public void writeScriptMetadata(String outputDir) throws IOException {

public void writePackageList(String outputDir) throws IOException {
PrintWriter outf = FileUtilities.openUTF8Writer(outputDir + "/cldr-core", "cldr-packages.json");
System.out.println("Creating packaging metadata file => " + outputDir + File.separator + "cldr-core" + File.separator + "cldr-packages.json and PACKAGES.md");
System.out.println(PACKAGE_ICON+" Creating packaging metadata file => " + outputDir + File.separator + "cldr-core" + File.separator + "cldr-packages.json and PACKAGES.md");
PrintWriter pkgs = FileUtilities.openUTF8Writer(outputDir + "/..", "PACKAGES.md");

pkgs.println("# CLDR JSON Packages");
Expand Down Expand Up @@ -1661,15 +1692,17 @@ private final String progressPrefix(AtomicInteger readCount, int totalCount) {
return progressPrefix(readCount.get(), totalCount);
}

LocalizedNumberFormatter percentFormatter = NumberFormatter
final LocalizedNumberFormatter percentFormatter = NumberFormatter
.withLocale(Locale.ENGLISH)
.unit(NoUnit.PERCENT)
.integerWidth(IntegerWidth.zeroFillTo(3))
.precision(Precision.integer());

/**
 * Builds the prefix for a progress line: the run type, which step that type is
 * among the run types, and the percentage of items read so far.
 *
 * @param readCount number of items read so far
 * @param totalCount total number of items to read
 * @return the formatted prefix, ending in a tab
 */
private final String progressPrefix(int readCount, int totalCount) {
    final double asPercent = ((double) readCount / (double) totalCount) * 100.0;
    // RunType.all has ordinal 0 and is not a step of its own, so ordinal() already
    // serves as the 1-based step number and the step count is values().length - 1.
    final int step = type.ordinal();
    final int stepCount = RunType.values().length - 1;
    return String.format(SECTION_ICON+" %s (step %d/%d)\t[%s]:\t",
        type, step, stepCount,
        percentFormatter.format(asPercent));
}

/**
Expand Down Expand Up @@ -1700,7 +1733,8 @@ public void processDirectory(String dirName, DraftStatus minimalDraftStatus)
// This takes a long time (minutes, in 2020), so run it in parallel forkJoinPool threads.
// The result of this pipeline is an array of toString()-able filenames of XML files which
// produced no JSON output, just as a warning.
System.out.println(progressPrefix(0, total) + " Beginning parallel process of " + total + " file(s)");
System.out.println(progressPrefix(0, total) + " " +
MessageFormat.format(GEAR_ICON + " Beginning parallel process of {0, plural, one {# file} other {# files}}", total));
Object noOutputFiles[] = files
.parallelStream()
.unordered()
Expand Down Expand Up @@ -1734,14 +1768,15 @@ public void processDirectory(String dirName, DraftStatus minimalDraftStatus)
.filter(p -> p.getSecond() == 0) // filter out only files which produced no output
.map(p -> p.getFirst())
.toArray();
System.out.println(progressPrefix(total, total) + " Completed parallel process of " + total + " file(s)");
System.out.println(progressPrefix(total, total) + " " + DONE_ICON + MessageFormat.format("Completed parallel process of {0, plural, one {# file} other {# files}}", total));
if (noOutputFiles.length > 0) {
System.err.println("WARNING: These " + noOutputFiles.length + " file(s) did not produce any output (check JSON config):");
System.err.println(WARN_ICON + MessageFormat
.format(" Warning: {0, plural, one {# file} other {# files}} did not produce any output (check JSON config):", noOutputFiles.length));
for (final Object f : noOutputFiles) {
final String loc = f.toString();
final String uloc = unicodeLocaleToString(f.toString());
if (skipBcp47LocalesWithSubtags && type.locales() && HAS_SUBTAG.matcher(uloc).matches()) {
System.err.println("\t- " + loc + " (Skipped due to '-T true': " + uloc + ")");
System.err.println("\t- " + loc + " (Skipped due to '-T true': " + uloc + ")");
} else {
System.err.println("\t- " + loc);
}
Expand Down
Loading

0 comments on commit e9a77eb

Please sign in to comment.