From f14ddb53f83137bc89fb2271d154cb8b45c0ff52 Mon Sep 17 00:00:00 2001 From: Conrad Nied Date: Tue, 3 Dec 2024 11:47:06 -0800 Subject: [PATCH] CLDR-18155 Order languageData's scripts by number of users --- common/supplemental/supplementalData.xml | 36 ++--- .../cldr/tool/ConvertLanguageData.java | 129 +++++++++++------- .../cldr/tool/GenerateLikelySubtags.java | 9 -- .../cldr/util/SupplementalDataInfo.java | 63 ++++++--- .../util/data/country_language_population.tsv | 39 +++--- 5 files changed, 167 insertions(+), 109 deletions(-) diff --git a/common/supplemental/supplementalData.xml b/common/supplemental/supplementalData.xml index 1989ef6be8f..59d143b7ea9 100644 --- a/common/supplemental/supplementalData.xml +++ b/common/supplemental/supplementalData.xml @@ -1346,7 +1346,7 @@ XXX Code for transations where no currency is involved - + @@ -1424,7 +1424,7 @@ XXX Code for transations where no currency is involved - + @@ -1448,7 +1448,7 @@ XXX Code for transations where no currency is involved - + @@ -1477,7 +1477,7 @@ XXX Code for transations where no currency is involved - + @@ -1653,7 +1653,7 @@ XXX Code for transations where no currency is involved - + @@ -1784,7 +1784,7 @@ XXX Code for transations where no currency is involved - + @@ -1824,7 +1824,7 @@ XXX Code for transations where no currency is involved - + @@ -1840,7 +1840,7 @@ XXX Code for transations where no currency is involved - + @@ -1918,7 +1918,7 @@ XXX Code for transations where no currency is involved - + @@ -1973,7 +1973,7 @@ XXX Code for transations where no currency is involved - + @@ -2200,7 +2200,7 @@ XXX Code for transations where no currency is involved - + @@ -2289,7 +2289,7 @@ XXX Code for transations where no currency is involved - + @@ -2300,7 +2300,7 @@ XXX Code for transations where no currency is involved - + @@ -2365,9 +2365,9 @@ XXX Code for transations where no currency is involved - + - + @@ -2435,7 +2435,7 @@ XXX Code for transations where no currency is involved - + @@ -3140,6 +3140,7 @@ XXX Code for transations where no currency is involved + @@ -3591,7 +3592,7 @@ XXX Code for transations where no currency is involved - + @@ -5552,6 +5553,7 @@ XXX Code for transations where no currency is involved English official; the figure is derived from literacy * lang pop Canada 2021 Census language "Knowledge of Language"; official status from Wikipedia Languages_of_Canada [missing] + Actually literacy in Nko writing unknown but historically they used the Latin script English official, the figure is derived from literacy * lang pop Some 99% of users are literate in French or German. For languages not customarily written, the writing population is artificially set to 5% in the absence of better information. 2nd lang literacy 15-25% diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java index 4a93948cfbc..e7a11e52768 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java @@ -312,16 +312,21 @@ private static void getLanguageScriptSpreadsheet(PrintWriter out) { private static void writeNewBasicData2(PrintWriter out, Set sortedInput) { double cutoff = 0.2; // 20% - // Relation newLanguageData = new Relation(new TreeMap(), - // TreeSet.class); LanguageTagParser ltp = new LanguageTagParser(); Map> language_status_territories = new TreeMap<>(); - // Map> languageToBestCountry; - for (RowData rowData : sortedInput) { - if (rowData.countryCode.equals("ZZ")) continue; - ltp.set(rowData.languageCode); + Map> language_script_populations = new TreeMap<>(); + Set warnings = new LinkedHashSet<>(); + + // Get all of the rows of country<->language populations + // At certain thresholds add territory information + // [New] Also add script information + for (RowData languageInCountryData : sortedInput) { + if (languageInCountryData.countryCode.equals("ZZ")) continue; + ltp.set(languageInCountryData.languageCode); String languageCode = ltp.getLanguage(); + + // Add the territory if it is official or otherwise major Relation status_territories = language_status_territories.get(languageCode); if (status_territories == null) { @@ -332,12 +337,43 @@ private static void writeNewBasicData2(PrintWriter out, Set sortedInput new TreeMap>(), TreeSet.class)); } - if (rowData.officialStatus.isMajor()) { - status_territories.put(BasicLanguageData.Type.primary, rowData.countryCode); - } else if (rowData.officialStatus.isOfficial() - || rowData.getLanguagePopulation() >= cutoff * rowData.countryPopulation - || rowData.getLanguagePopulation() >= 1000000) { - status_territories.put(BasicLanguageData.Type.secondary, rowData.countryCode); + if (languageInCountryData.officialStatus.isMajor()) { + // Output will look like + status_territories.put( + BasicLanguageData.Type.primary, languageInCountryData.countryCode); + } else if (languageInCountryData.officialStatus.isOfficial() + || languageInCountryData.getLanguagePopulation() + >= cutoff * languageInCountryData.countryPopulation + || languageInCountryData.getLanguagePopulation() >= 1000000) { + // Output will look like + status_territories.put( + BasicLanguageData.Type.secondary, languageInCountryData.countryCode); + } + + // Add the population for the script + // language_status_territories.get(languageCode); + String script = ltp.getScript(); + + // If the script isn't specified, use the default one from LikelySubtags + if (script == null || script == "") { + script = supplementalData.getDefaultScript(languageCode); + } + + // If we have a script, add to its population to the index + if (script != null && script != "") { + Integer currentPopulation = 0; + TreeMap scriptsByPopulation = + language_script_populations.get(languageCode); + if (scriptsByPopulation == null) { + language_script_populations.put( + languageCode, scriptsByPopulation = new TreeMap()); + } else if (scriptsByPopulation.containsKey(script)) { + currentPopulation = scriptsByPopulation.get(script); + } + scriptsByPopulation.put( + script, + currentPopulation + + (int) languageInCountryData.getLiterateLanguagePopulation()); } } @@ -345,7 +381,6 @@ private static void writeNewBasicData2(PrintWriter out, Set sortedInput allLanguages.addAll(language_status_scripts.keySet()); // now add all the remaining language-script info // - Set warnings = new LinkedHashSet<>(); out.println("\t"); for (String languageSubtag : allLanguages) { Relation status_scripts = @@ -360,16 +395,30 @@ private static void writeNewBasicData2(PrintWriter out, Set sortedInput oldData = Collections.emptyMap(); } + Map scriptsByPopulationAnyLevel = + language_script_populations.get(languageSubtag); EnumMap newData = new EnumMap<>(BasicLanguageData.Type.class); for (BasicLanguageData.Type status : BasicLanguageData.Type.values()) { - Set scripts = status_scripts == null ? null : status_scripts.getAll(status); Set territories = status_territories == null ? null : status_territories.getAll(status); - if (scripts == null && territories == null) continue; + Map scriptsByPopulationAtThisLevel = new TreeMap<>(); + if (status_scripts != null) { + Set scriptsAtThisLevel = status_scripts.getAll(status); + if (scriptsAtThisLevel != null) { + for (String script : scriptsAtThisLevel) { + int population = 0; + if (scriptsByPopulationAnyLevel != null + && scriptsByPopulationAnyLevel.containsKey(script)) { + population = scriptsByPopulationAnyLevel.get(script); + } + scriptsByPopulationAtThisLevel.put(script, population); + } + } + } BasicLanguageData bld = new BasicLanguageData(); bld.setTerritories(territories); - bld.setScripts(scripts); + bld.setScripts(scriptsByPopulationAtThisLevel); bld.setType(status); bld.freeze(); newData.put(status, bld); @@ -377,7 +426,7 @@ private static void writeNewBasicData2(PrintWriter out, Set sortedInput // compare if (!CldrUtility.equals(oldData.entrySet(), newData.entrySet())) { - for (String problem : compare(oldData, newData)) { + for (String problem : compareBasicLanguageData(oldData, newData)) { warnings.add( BadItem.DETAIL.toString( "changing ", @@ -389,25 +438,9 @@ private static void writeNewBasicData2(PrintWriter out, Set sortedInput } for (BasicLanguageData bld : newData.values()) { - Set scripts = bld.getScripts(); - Set territories = bld.getTerritories(); - BasicLanguageData.Type status = bld.getType(); - out.println( - "\t\t"); + if (bld.getTerritories().size() > 0 || bld.getScripts().size() > 0) { + out.println(bld.toString(languageSubtag)); + } } } out.println("\t"); @@ -423,7 +456,7 @@ private static void writeNewBasicData2(PrintWriter out, Set sortedInput } } - private static List compare( + private static List compareBasicLanguageData( Map oldData, Map newData) { Map oldDataToType = getDataToType(oldData.values(), true); @@ -1050,6 +1083,10 @@ public String getLanguagePopulationString() { private double getLanguagePopulation() { return languagePopulation; } + + private double getLiterateLanguagePopulation() { + return languagePopulation * languageLiteracy; + } } public static String getExcelQuote(String comment) { @@ -2260,8 +2297,8 @@ static void addLanguageScriptData() throws IOException { Set fullScriptList = sc.getGoodAvailableCodes("script"); String[] scriptList = parts[2].split("[;,]\\s*"); - Set scripts = new TreeSet<>(); - Set scriptsAlt = new TreeSet<>(); + Map scriptsByPopulation = new TreeMap<>(); + Map scriptsByPopulationSecondary = new TreeMap<>(); for (String script : scriptList) { if (script.length() == 0) continue; boolean alt = false; @@ -2279,9 +2316,9 @@ static void addLanguageScriptData() throws IOException { + "> not found in " + fullScriptList); } else if (alt) { - scriptsAlt.add(script); + scriptsByPopulationSecondary.put(script, 0); } else { - scripts.add(script); + scriptsByPopulation.put(script, 1); } } // now territories @@ -2304,20 +2341,20 @@ static void addLanguageScriptData() throws IOException { } // // we're going to go ahead and set these all to secondary. - if (scripts.size() != 0) { + if (scriptsByPopulation.size() != 0) { language2BasicLanguageData.put( languageSubtag, new BasicLanguageData() - .setType(BasicLanguageData.Type.secondary) - .setScripts(scripts) + .setType(BasicLanguageData.Type.primary) + .setScripts(scriptsByPopulation) .setTerritories(territories)); } - if (scriptsAlt.size() != 0) { + if (scriptsByPopulationSecondary.size() != 0) { language2BasicLanguageData.put( languageSubtag, new BasicLanguageData() .setType(BasicLanguageData.Type.secondary) - .setScripts(scriptsAlt) + .setScripts(scriptsByPopulationSecondary) .setTerritories(territories)); } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java index 443de3e9aa8..d849fb6ae7a 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java @@ -394,9 +394,6 @@ public static void main(String[] args) throws IOException { {"rif_Latn", "rif_Latn_MA"}, {"rif_Tfng", "rif_Tfng_MA"}, {"rif_MA", "rif_Latn_MA"}, // Ibid - {"shi", "shi_Tfng_MA"}, - {"shi_Tfng", "shi_Tfng_MA"}, - {"shi_MA", "shi_Tfng_MA"}, {"sr_Latn", "sr_Latn_RS"}, {"ss", "ss_Latn_ZA"}, {"ss_Latn", "ss_Latn_ZA"}, @@ -431,12 +428,6 @@ public static void main(String[] args) throws IOException { {"und_SS", "en_Latn_SS"}, {"vo", "vo_Latn_001"}, {"vo_Latn", "vo_Latn_001"}, - // {"yi", "yi_Hebr_001"}, - // {"yi_Hebr", "yi_Hebr_001"}, - {"yue", "yue_Hant_HK"}, - {"yue_Hant", "yue_Hant_HK"}, - {"yue_Hans", "yue_Hans_CN"}, - {"yue_CN", "yue_Hans_CN"}, {"zh_Hani", "zh_Hani_CN"}, {"zh_Bopo", "zh_Bopo_TW"}, {"ccp", "ccp_Cakm_BD"}, diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java index a298de5c959..19355579dfb 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java @@ -333,6 +333,8 @@ public enum Type { private Set scripts = Collections.emptySet(); + private Map scriptsByPopulation = new TreeMap<>(); + private Set territories = Collections.emptySet(); public Type getType() { @@ -344,11 +346,23 @@ public BasicLanguageData setType(Type type) { return this; } - public BasicLanguageData setScripts(String scriptTokens) { - return setScripts( - scriptTokens == null - ? null - : Arrays.asList(WHITESPACE_PATTERN.split(scriptTokens))); + // Adding scripts but leaving 0 as a placeholder when there is no population data + // input: a whitespace-separated list of scripts + public BasicLanguageData setScriptsWithoutPopulation(String scriptTokens) { + List scripts = new ArrayList<>(); + if (scriptTokens != null) { + scripts = Arrays.asList(WHITESPACE_PATTERN.split(scriptTokens)); + } + return setScriptsWithoutPopulation(scripts); + } + + // Adding scripts but leaving 0 as a placeholder when there is no population data + public BasicLanguageData setScriptsWithoutPopulation(Collection scripts) { + Map scriptsByPopulation = new TreeMap<>(); + for (String script : scripts) { + scriptsByPopulation.put(script, 0); + } + return setScripts(scriptsByPopulation); } public BasicLanguageData setTerritories(String territoryTokens) { @@ -358,17 +372,14 @@ public BasicLanguageData setTerritories(String territoryTokens) { : Arrays.asList(WHITESPACE_PATTERN.split(territoryTokens))); } - public BasicLanguageData setScripts(Collection scriptTokens) { + public BasicLanguageData setScripts(Map newScripts) { if (frozen) { throw new UnsupportedOperationException(); } // TODO add error checking scripts = Collections.emptySet(); - if (scriptTokens != null) { - for (String script : scriptTokens) { - addScript(script); - } - } + scriptsByPopulation = new TreeMap<>(); + addScripts(newScripts); return this; } @@ -401,12 +412,18 @@ public Set getTerritories() { public String toString(String languageSubtag) { if (scripts.size() == 0 && territories.size() == 0) return ""; + List sortedScripts = + scriptsByPopulation.entrySet().stream() + .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())) + .map(Map.Entry::getKey) + .collect(Collectors.toList()); return "\t\t(); // retain order + scriptsByPopulation = new TreeMap<>(); } scripts.add(script); + + // Add population data + Integer currentPopulation = scriptsByPopulation.get(script); + if (currentPopulation == null) { + scriptsByPopulation.put(script, population); + } else if (population > 0) { + // TODO CLDR-18087 maybe do some ambiguity testing + scriptsByPopulation.put(script, population); + } // Ignore 0 population if we already have a script entry return this; } @@ -496,9 +523,9 @@ public BasicLanguageData cloneAsThawed() { return this; } - public void addScripts(Set scripts2) { - for (String script : scripts2) { - addScript(script); + private void addScripts(Map newScripts) { + for (Map.Entry entry : newScripts.entrySet()) { + addScript(entry.getKey(), entry.getValue()); } } } @@ -2424,7 +2451,7 @@ private void handleLanguageData(XPathValue parts) { ? BasicLanguageData.Type.primary : BasicLanguageData.Type.secondary); languageData - .setScripts(parts.getAttributeValue(2, "scripts")) + .setScriptsWithoutPopulation(parts.getAttributeValue(2, "scripts")) .setTerritories(parts.getAttributeValue(2, "territories")); Map map = languageToBasicLanguageData.get(language); if (map == null) { @@ -3046,7 +3073,7 @@ private Map doMapLanguagesToScriptsRegion() { } String script = locale.getScript(); if (script.length() > 0) { - scriptsAndRegions.addScript(script); + scriptsAndRegions.addScript(script, 0 /* 0 = no population data yet */); } String region = locale.getCountry(); if (region.length() > 0 diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv index befed26015a..04fd3895bd3 100644 --- a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv @@ -59,7 +59,7 @@ Austria AT "8,793,370" 98% "441,000,000,000" official German de 97% Austria AT "8,793,370" 98% "441,000,000,000" official_regional Hungarian hu "23,300" Austria AT "8,793,370" 98% "441,000,000,000" Italian it 9% Austria AT "8,793,370" 98% "441,000,000,000" official_regional Slovenian sl "32,700" -Azerbaijan AZ "10,046,516" 100% "172,200,000,000" official Azerbaijani az 89% "http://www.nvtc.gov/lotw/months/march/Azerbaijani.html#writ Latin script official, used 98.8% of pop * 10% for the usage figure" +Azerbaijan AZ "10,046,516" 100% "172,200,000,000" official Azerbaijani az_Latn 89% "http://www.nvtc.gov/lotw/months/march/Azerbaijani.html#writ Latin script official, used 98.8% of pop * 10% for the usage figure" Azerbaijan AZ "10,046,516" 100% "172,200,000,000" official Azerbaijani (Cyrillic) az_Cyrl 9.9% "http://www.nvtc.gov/lotw/months/march/Azerbaijani.html#writ Latin script official, used 98.8% of pop * 90% for the usage figure" Azerbaijan AZ "10,046,516" 100% "172,200,000,000" Kurdish ku "24,400" Azerbaijan AZ "10,046,516" 100% "172,200,000,000" Muslim Tat ttt "22,100" @@ -104,7 +104,7 @@ Bolivia BO "11,306,341" 91% "83,720,000,000" official Aymara ay 20% Bolivia BO "11,306,341" 91% "83,720,000,000" Guarani gn "51,300" Bolivia BO "11,306,341" 91% "83,720,000,000" official Quechua qu 32% "http://lanic.utexas.edu/project/tilan/reports/rtf359/bolivia1.html Spanish is the official language, only about 60-70% of the population speaks it at all ;" Bolivia BO "11,306,341" 91% "83,720,000,000" official Spanish es 61% "https://www.cia.gov/library/publications/the-world-factbook/geos/bl.html http://lanic.utexas.edu/project/tilan/reports/rtf359/bolivia1.html Spanish is the official language, only about 60-70% of the population speaks it at all ;" -Bosnia & Herzegovina BA "3,849,891" 98% "44,830,000,000" official Bosnian bs 99% +Bosnia & Herzegovina BA "3,849,891" 98% "44,830,000,000" official Bosnian bs_Latn 99% Bosnia & Herzegovina BA "3,849,891" 98% "44,830,000,000" official Bosnian (Cyrillic) bs_Cyrl 99% 5% http://www.bhas.ba/index.php?option=com_content&view=article&id=52&itemid=80&lang=en&Itemid= also: http://en.wikipedia.org/wiki/Bosnian_language Bosnia & Herzegovina BA "3,849,891" 98% "44,830,000,000" official Croatian hr 12% Bosnia & Herzegovina BA "3,849,891" 98% "44,830,000,000" English en 45% @@ -130,7 +130,7 @@ British Indian Ocean Territory IO "3,500" 99% "157,200,000" official English en British Virgin Islands VG "35,802" 98% "500,000,000" official English en 98% Brunei BN "450,565" 95% "33,870,000,000" Chinese (Traditional) zh_Hant 11% Brunei BN "450,565" 95% "33,870,000,000" English en "8,000" -Brunei BN "450,565" 95% "33,870,000,000" official Malay ms 93% +Brunei BN "450,565" 95% "33,870,000,000" official Malay ms_Latn 93% Brunei BN "450,565" 95% "33,870,000,000" official Malay (Arabic) ms_Arab 5% "http://en.wikipedia.org/wiki/Languages_of_Brunei Modern use of Arabic (Jawi) seems to be minimal, but is co-official with ms; set to 5% for now." Bulgaria BG "7,057,504" 98% "153,500,000,000" official Bulgarian bg 100% Bulgaria BG "7,057,504" 98% "153,500,000,000" English en 25% @@ -264,7 +264,7 @@ China CN "1,384,688,986" 95% "23,210,000,000,000" Kyrgyz (Arabic) ky_Arab "466, China CN "1,384,688,986" 95% "23,210,000,000,000" Lisu lis "617,000" China CN "1,384,688,986" 95% "23,210,000,000,000" Literary Chinese lzh 1 No estimate available. China CN "1,384,688,986" 95% "23,210,000,000,000" Lü khb "267,000" "http://www.ethnologue.com/show_language.asp?code=khb (= Tai Lu, Xishuangbanna Dai; New Tai Lue script)" -China CN "1,384,688,986" 95% "23,210,000,000,000" Min Nan Chinese nan "26,800,000" +China CN "1,384,688,986" 95% "23,210,000,000,000" Min Nan Chinese nan_Hans "26,800,000" China CN "1,384,688,986" 95% "23,210,000,000,000" official_regional Mongolian (Mongolian) mn_Mong "3,600,000" China CN "1,384,688,986" 95% "23,210,000,000,000" Naxi nxq "329,000" China CN "1,384,688,986" 95% "23,210,000,000,000" Russian ru "14,400" @@ -349,7 +349,7 @@ Ecuador EC "16,498,502" 92% "193,000,000,000" official Spanish es 96% "percent Egypt EG "99,413,317" 74% "1,204,000,000,000" official Arabic ar 94% Egypt EG "99,413,317" 74% "1,204,000,000,000" Egyptian Arabic arz 64% Egypt EG "99,413,317" 74% "1,204,000,000,000" English en 35% -Egypt EG "99,413,317" 74% "1,204,000,000,000" Coptic cop 6% https://www.wsj.com/articles/BL-263B-3637 Lower estimate of Coptic population, actual language literacy unknown +Egypt EG "99,413,317" 74% "1,204,000,000,000" Coptic cop_Copt 6% https://www.wsj.com/articles/BL-263B-3637 Lower estimate of Coptic population, actual language literacy unknown Egypt EG "99,413,317" 74% "1,204,000,000,000" Greek el "60,900" El Salvador SV "6,187,271" 85% "51,170,000,000" official Spanish es 89% El Salvador SV "6,187,271" 85% "51,170,000,000" Nahaut Pipil ppl 2730 https://translatorswithoutborders.org/language-data-for-el-salvador @@ -433,7 +433,8 @@ Gabon GA "2,119,036" 89% "36,660,000,000" Punu puu 9% Gambia GM "2,092,731" 51% "5,556,000,000" official English en 40% Gambia GM "2,092,731" 51% "5,556,000,000" Fulah ff 1 Gambia GM "2,092,731" 51% "5,556,000,000" Fulah (Adlam) ff_Adlm 1 -Gambia GM "2,092,731" 51% "5,556,000,000" Mandingo man 29% +Gambia GM "2,092,731" 51% "5,556,000,000" Mandingo man_Latn 29% +Gambia GM "2,092,731" 51% "5,556,000,000" Mandingo (Nko) man_Nkoo 29% 5% Actually literacy in Nko writing unknown but historically they used the Latin script Georgia GE "4,926,087" 100% "39,850,000,000" official_regional Abkhazian ab "110,000" Georgia GE "4,926,087" 100% "39,850,000,000" Armenian hy 7% https://www.cia.gov/cia/publications/factbook/fields/2098.html Georgia GE "4,926,087" 100% "39,850,000,000" official Georgian ka 86% @@ -631,7 +632,7 @@ Indonesia ID "262,787,403" 93% "3,250,000,000,000" Komering kge "844,000" Indonesia ID "262,787,403" 93% "3,250,000,000,000" Lampung Api ljp "1,810,000" Indonesia ID "262,787,403" 93% "3,250,000,000,000" Madurese mad 6.3% 40% Indonesia ID "262,787,403" 93% "3,250,000,000,000" Makasar mak "1,930,000" -Indonesia ID "262,787,403" 93% "3,250,000,000,000" Malay ms 3.4% +Indonesia ID "262,787,403" 93% "3,250,000,000,000" Malay ms_Latn 3.4% Indonesia ID "262,787,403" 93% "3,250,000,000,000" Malay (Arabic) ms_Arab 1.2% Indonesia ID "262,787,403" 93% "3,250,000,000,000" Mandar mdr "241,000" Indonesia ID "262,787,403" 93% "3,250,000,000,000" Mentawai mwv "64,100" @@ -725,7 +726,7 @@ Jordan JO "10,458,413" 96% "89,000,000,000" Levantine Arabic apc "6,860,000" Kazakhstan KZ "18,744,548" 100% "478,600,000,000" English en 15% Kazakhstan KZ "18,744,548" 100% "478,600,000,000" German de 6.4% Kazakhstan KZ "18,744,548" 100% "478,600,000,000" Kara-Kalpak kaa 0.019% https://joshuaproject.net/languages/kaa -Kazakhstan KZ "18,744,548" 100% "478,600,000,000" official Kazakh kk 64% https://www.cia.gov/cia/publications/factbook/geos/kz.html CIA Factbook entry on Kazakhstan +Kazakhstan KZ "18,744,548" 100% "478,600,000,000" official Kazakh kk_Cyrl 64% https://www.cia.gov/cia/publications/factbook/geos/kz.html CIA Factbook entry on Kazakhstan Kazakhstan KZ "18,744,548" 100% "478,600,000,000" official Russian ru 72% https://www.cia.gov/cia/publications/factbook/geos/kz.html CIA Factbook entry on Kazakhstan http://windowoneurasia2.blogspot.com/2013/12/window-on-eurasia-de-russianization.html http://www.stoletie.ru/vzglyad/derusifikacija_nabirajet_oboroty_934.htm http://www.stoletie.ru/vzglyad/derusifikacija_nabirajet_oboroty_934.htm Kazakhstan KZ "18,744,548" 100% "478,600,000,000" Uyghur (Cyrillic) ug_Cyrl "375,000" Kenya KE "48,397,527" 87% "163,700,000,000" Arabic ar "22,500" @@ -756,7 +757,7 @@ Kosovo XK "1,907,592" 92% "19,600,000,000" official Serbian sr 5% Information Kosovo XK "1,907,592" 92% "19,600,000,000" official Serbian (Latin) sr_Latn 5% Information on the Latin/Cyrillic script percentages for Kosovo not currently found. Kuwait KW "2,916,467" 94% "289,700,000,000" official Arabic ar 100% Kyrgyzstan KG "5,849,296" 99% "23,150,000,000" Kara-Kalpak kaa 0.02% https://joshuaproject.net/languages/kaa -Kyrgyzstan KG "5,849,296" 99% "23,150,000,000" official Kyrgyz ky 48% +Kyrgyzstan KG "5,849,296" 99% "23,150,000,000" official Kyrgyz ky_Cyrl 48% Kyrgyzstan KG "5,849,296" 99% "23,150,000,000" official Russian ru 36% http://windowoneurasia2.blogspot.com/2013/12/window-on-eurasia-de-russianization.html http://www.stoletie.ru/vzglyad/derusifikacija_nabirajet_oboroty_934.htm Laos LA "7,234,171" 73% "49,340,000,000" Khmu kjg 5.8% http://en.wikipedia.org/wiki/Khmu_language Laos LA "7,234,171" 73% "49,340,000,000" Kuy kdt "69,300" http://en.wikipedia.org/wiki/Kuy_language @@ -820,7 +821,7 @@ Malaysia MY "31,809,660" 93% "933,300,000,000" Chinese (Traditional) zh 17% Malaysia MY "31,809,660" 93% "933,300,000,000" English en 21% Malaysia MY "31,809,660" 93% "933,300,000,000" Iban iba "792,000" Malaysia MY "31,809,660" 93% "933,300,000,000" Javanese jv "379,000" -Malaysia MY "31,809,660" 93% "933,300,000,000" official Malay ms 75% +Malaysia MY "31,809,660" 93% "933,300,000,000" official Malay ms_Latn 75% Malaysia MY "31,809,660" 93% "933,300,000,000" Malayalam ml "46,600" Malaysia MY "31,809,660" 93% "933,300,000,000" Negeri Sembilan Malay zmi "379,000" Malaysia MY "31,809,660" 93% "933,300,000,000" Tamil ta 4.2% @@ -901,8 +902,8 @@ Morocco MA "34,314,130" 67% "298,600,000,000" Riffian (Tifinagh) rif_Tfng 4.9% Morocco MA "34,314,130" 67% "298,600,000,000" Riffian (Latin) rif 4.9% 5% Morocco MA "34,314,130" 67% "298,600,000,000" Spanish es "22,400" Morocco MA "34,314,130" 67% "298,600,000,000" Standard Moroccan Tamazight zgh 22% http://unicode.org/cldr/trac/attachment/ticket/5887/zgh-ISO639-2-certif.pdf -Morocco MA "34,314,130" 67% "298,600,000,000" Tachelhit shi 8.7% -Morocco MA "34,314,130" 67% "298,600,000,000" Tachelhit (Latin) shi_Latn 8.7% "http://www.ethnologue.com/show_language.asp?code=shi Latin is not shown as being used, rather Arabic" +Morocco MA "34,314,130" 67% "298,600,000,000" Tachelhit shi_Tfng 8.7% +Morocco MA "34,314,130" 67% "298,600,000,000" Tachelhit (Latin) shi_Latn 8.7% 5% "http://www.ethnologue.com/show_language.asp?code=shi Latin is not shown as being used, rather Arabic" Mozambique MZ "27,233,789" 56% "37,090,000,000" Lomwe ngl 6.8% Mozambique MZ "27,233,789" 56% "37,090,000,000" Makhuwa vmw 13% Mozambique MZ "27,233,789" 56% "37,090,000,000" Makhuwa-Meetto mgh 4.5% http://www.ethnologue.com/language/mgh but no literacy data @@ -994,7 +995,7 @@ Nigeria NG "203,452,505" 61% "1,121,000,000,000" Efik efi "2,900,000" http:// Nigeria NG "203,452,505" 61% "1,121,000,000,000" official English en 53% Nigeria NG "203,452,505" 61% "1,121,000,000,000" Fulah ff 1 No estimate available. Nigeria NG "203,452,505" 61% "1,121,000,000,000" Fulah (Adlam) ff_Adlm 1 No estimate available. -Nigeria NG "203,452,505" 61% "1,121,000,000,000" Hausa ha 13% +Nigeria NG "203,452,505" 61% "1,121,000,000,000" Hausa ha_Latn 13% Nigeria NG "203,452,505" 61% "1,121,000,000,000" Hausa (Arabic) ha_Arab "2,030,000" Data completely unknown for Hausa in Arabic in Nigeria Nigeria NG "203,452,505" 61% "1,121,000,000,000" Ibibio ibb "2,900,000" Nigeria NG "203,452,505" 61% "1,121,000,000,000" Igbo ig 13% @@ -1304,7 +1305,7 @@ Switzerland CH "8,292,809" 99% "523,100,000,000" Walser wae "10,000" Syria SY "19,454,263" 84% "50,280,000,000" official Arabic ar 80% Syria SY "19,454,263" 84% "50,280,000,000" Armenian hy "350,000" Syria SY "19,454,263" 84% "50,280,000,000" French fr 5.9% http://www.nationsonline.org/oneworld/syria.htm Crude estimate based on import partner data. -Syria SY "19,454,263" 84% "50,280,000,000" Kurdish ku 8% +Syria SY "19,454,263" 84% "50,280,000,000" Kurdish ku_Latn 8% Syria SY "19,454,263" 84% "50,280,000,000" Syriac syr "16,400" 5% "For languages not customarily written, the writing population is artificially set to 5% in the absence of better information." Syria SY "19,454,263" 84% "50,280,000,000" Levantine Arabic apc "16,633,300" https://en.wikipedia.org/wiki/Levantine_Arabic#Speakers_by_country Taiwan TW "23,545,963" 96% "1,189,000,000,000" official Chinese (Traditional) zh_Hant 95% @@ -1314,7 +1315,7 @@ Taiwan TW "23,545,963" 96% "1,189,000,000,000" Taroko trv "4,750" Tajikistan TJ "8,604,882" 100% "28,430,000,000" Arabic ar "1,000" Tajikistan TJ "8,604,882" 100% "28,430,000,000" Persian fa "66,900" Tajikistan TJ "8,604,882" 100% "28,430,000,000" Russian ru 12% http://windowoneurasia2.blogspot.com/2013/12/window-on-eurasia-de-russianization.html http://www.stoletie.ru/vzglyad/derusifikacija_nabirajet_oboroty_934.htm -Tajikistan TJ "8,604,882" 100% "28,430,000,000" official Tajik tg 100% +Tajikistan TJ "8,604,882" 100% "28,430,000,000" official Tajik tg_Cyrl 100% Tanzania TZ "55,451,343" 68% "162,500,000,000" Asu asa "690,000" http://www.ethnologue.com/show_language.asp?code=asa Most also use Swahili with 50% literacy. Only 5% monolingual. Tanzania TZ "55,451,343" 68% "162,500,000,000" Bena bez "924,000" Tanzania TZ "55,451,343" 68% "162,500,000,000" official English en 69% "https://www.cia.gov/cia/publications/factbook/geos/tz.html English (official, primary language of commerce, administration, and higher education)" @@ -1367,7 +1368,7 @@ Turkey TR "81,257,239" 94% "2,186,000,000,000" Adyghe ady "316,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Albanian sq "16,900" Turkey TR "81,257,239" 94% "2,186,000,000,000" Arabic ar "453,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Armenian hy "45,300" -Turkey TR "81,257,239" 94% "2,186,000,000,000" Azerbaijani az "600,000" +Turkey TR "81,257,239" 94% "2,186,000,000,000" Azerbaijani az_Latn "600,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Azerbaijani (Arabic) az_Arab "528,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Balkan Gagauz Turkish bgx "370,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Bulgarian bg "341,000" @@ -1378,7 +1379,7 @@ Turkey TR "81,257,239" 94% "2,186,000,000,000" Greek el "4,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Kabardian kbd "623,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Kazakh kk 600 "http://en.wikipedia.org/wiki/Kazakh_language - the script is an assumption, needs a reference" Turkey TR "81,257,239" 94% "2,186,000,000,000" Kirmanjki kiu "158,000" -Turkey TR "81,257,239" 94% "2,186,000,000,000" Kurdish ku 5.5% +Turkey TR "81,257,239" 94% "2,186,000,000,000" Kurdish ku_Latn 5.5% Turkey TR "81,257,239" 94% "2,186,000,000,000" Kyrgyz (Latin) ky_Latn "1,140" Turkey TR "81,257,239" 94% "2,186,000,000,000" Laz lzz "22,600" Turkey TR "81,257,239" 94% "2,186,000,000,000" Pontic pnt_Latn "5,100" https://joshuaproject.net/people_groups/14444/TU @@ -1391,7 +1392,7 @@ Turkey TR "81,257,239" 94% "2,186,000,000,000" Levantine Arabic apc "4,250,000" Turkmenistan TM "5,411,012" 100% "103,700,000,000" Kurdish ku "21,900" Turkmenistan TM "5,411,012" 100% "103,700,000,000" Kara-Kalpak kaa 0.11% https://joshuaproject.net/languages/kaa Turkmenistan TM "5,411,012" 100% "103,700,000,000" Russian ru 12% -Turkmenistan TM "5,411,012" 100% "103,700,000,000" official Turkmen tk 70% +Turkmenistan TM "5,411,012" 100% "103,700,000,000" official Turkmen tk_Latn 70% Turkmenistan TM "5,411,012" 100% "103,700,000,000" Uzbek uz 9% Turks & Caicos Islands TC "53,701" 98% "632,000,000" official English en 98% Tuvalu TV "11,147" 95% "42,000,000" official English en "1,070" "http://en.wikipedia.org/wiki/Tuvalu The Tuvaluan language is spoken by virtually everyone, while Gilbertese is spoken by some people on Nui. English is also an official language, but is not spoken in daily use. Writing pop set to 10% of Tuvalu." @@ -1493,7 +1494,7 @@ Uruguay UY "3,369,299" 98% "78,160,000,000" official Spanish es 88% Uzbekistan UZ "36,799,756" 99% "223,000,000,000" Kara-Kalpak kaa 2.1% https://joshuaproject.net/languages/kaa Uzbekistan UZ "36,799,756" 99% "223,000,000,000" Russian ru 14% Uzbekistan UZ "36,799,756" 99% "223,000,000,000" Turkish tr "228,000" -Uzbekistan UZ "36,799,756" 99% "223,000,000,000" official Uzbek uz 85% "http://en.wikipedia.org/wiki/Uzbek_language#Writing_systems https://www.cia.gov/library/publications/the-world-factbook/geos/uz.html Latin/Cyrillic balance is estimated, based on literacy; younger education now in Latin" +Uzbekistan UZ "36,799,756" 99% "223,000,000,000" official Uzbek uz_Latn 85% "http://en.wikipedia.org/wiki/Uzbek_language#Writing_systems https://www.cia.gov/library/publications/the-world-factbook/geos/uz.html Latin/Cyrillic balance is estimated, based on literacy; younger education now in Latin" Uzbekistan UZ "36,799,756" 99% "223,000,000,000" official Uzbek (Cyrillic) uz_Cyrl 15% "http://en.wikipedia.org/wiki/Uzbek_language#Writing_systems https://www.cia.gov/library/publications/the-world-factbook/geos/uz.html Latin/Cyrillic balance is estimated, based on literacy; younger education now in Latin" Vanuatu VU "288,037" 83% "772,000,000" official Bislama bi 90% "native speaker pop is low, ~6200; but is most widely spoken 2nd language" Vanuatu VU "288,037" 83% "772,000,000" official English en 83%