Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve name:latin logic #147

Merged
merged 3 commits into from
Mar 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Stream;

Expand All @@ -53,10 +54,23 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* <a href="https://github.com/openmaptiles/openmaptiles-tools/blob/master/sql/zzz_language.sql">openmaptiles-tools</a>.
*/
public class LanguageUtils {
// See https://github.com/onthegomap/planetiler/issues/86

// Name tags that should be eligible for finding a latin name.
// See https://wiki.openstreetmap.org/wiki/Multilingual_names
// The predicate must match the whole key, case-insensitively: "name:" followed by 2-3 letters, then
// optionally "-" plus exactly 4 letters, optionally "-" or "_" plus 2+ letters (which may be prefixed
// with "x-"), and optionally "-" plus either 2 letters or 3 digits
// (e.g. name:fr, name:zh-hant-CN, name:fr-x-gallo, name:ja_rm, name:en-010).
private static final Predicate<String> VALID_NAME_TAGS =
Pattern
.compile("^name:[a-z]{2,3}(-[a-z]{4})?([-_](x-)?[a-z]{2,})?(-([a-z]{2}|[0-9]{3}))?$", Pattern.CASE_INSENSITIVE)
.asMatchPredicate();

// Match strings that only contain latin characters.
// The whole string must match: one or more characters, each of which is either not a letter at all
// or a letter in the Latin script. Because of the "+" quantifier an empty string does not match.
private static final Predicate<String> ONLY_LATIN = Pattern
.compile("^[\\P{IsLetter}[\\p{IsLetter}&&\\p{IsLatin}]]+$")
.asMatchPredicate();

// Match only latin letters
// (one or more consecutive characters that are both letters and in the Latin script).
private static final Pattern LATIN_LETTER = Pattern.compile("[\\p{IsLetter}&&\\p{IsLatin}]+");

// Matches any single character outside the code point ranges U+0000-U+024F, U+1E00-U+1EFF,
// U+0300-U+036F and the single code point U+0259.
// NOTE(review): in this diff view NONLATIN and LETTER look like the pre-change definitions that
// ONLY_LATIN and LATIN_LETTER replace - confirm against the merged file.
private static final Pattern NONLATIN = Pattern
.compile("[^\\x{0000}-\\x{024f}\\x{1E00}-\\x{1EFF}\\x{0300}-\\x{036f}\\x{0259}]");
// Matches a run of one or more characters from the explicit ranges A-Z, a-z, À-Ö, Ø-ö, ø-ÿ and Ā-ɏ.
private static final Pattern LETTER = Pattern.compile("[A-Za-zÀ-ÖØ-öø-ÿĀ-ɏ]+");
// Matches "(...)" or "[...]" whose content is zero or more characters in the range ' ' through '.'
// (the class "[ -.]" is a range, not three literal characters).
// NOTE(review): presumably targets brackets left empty after text was stripped from them - confirm.
private static final Pattern EMPTY_PARENS = Pattern.compile("(\\([ -.]*\\)|\\[[ -.]*])");
// Matches, at the start of the string, optional whitespace followed by any number of '.', '/' or '-'
// characters (each optionally followed by whitespace); or, at the end of the string, any number of
// whitespace-prefixed '.', '/' or '-' characters followed by optional whitespace.
private static final Pattern LEADING_TRAILING_JUNK = Pattern.compile("(^\\s*([./-]\\s*)*|(\\s+[./-])*\\s*$)");
// Matches a run of one or more whitespace characters.
private static final Pattern WHITESPACE = Pattern.compile("\\s+");
Expand All @@ -73,7 +87,7 @@ private static String string(Object obj) {
}

/**
 * Returns true if {@code string} is non-null and passes the latin-only check below; null yields false.
 *
 * NOTE(review): this view shows both sides of a diff, so two consecutive return statements appear.
 * The first (NONLATIN-based) looks like the removed line and the second (ONLY_LATIN-based) like its
 * replacement; compilable code can keep only one of them - confirm against the merged file.
 */
static boolean containsOnlyLatinCharacters(String string) {
// true when no character matching NONLATIN is found anywhere in the string
return string != null && !NONLATIN.matcher(string).find();
// true when the whole string matches the ONLY_LATIN predicate
return string != null && ONLY_LATIN.test(string);
}

private static String transliteratedName(Map<String, Object> tags) {
Expand All @@ -84,7 +98,7 @@ static String removeLatinCharacters(String name) {
if (name == null) {
return null;
}
var matcher = LETTER.matcher(name);
var matcher = LATIN_LETTER.matcher(name);
if (matcher.find()) {
String result = matcher.replaceAll("");
// if the name was "<nonlatin text> (<latin description>)"
Expand Down Expand Up @@ -128,7 +142,8 @@ public static Map<String, Object> getNames(Map<String, Object> tags, Translation

boolean isLatin = containsOnlyLatinCharacters(name);
String latin = isLatin ? name :
Stream.concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags))
Stream
.concat(Stream.of(nameEn, intName, nameDe), getAllNameTranslationsBesidesEnglishAndGerman(tags))
.filter(LanguageUtils::containsOnlyLatinCharacters)
.findFirst().orElse(null);
if (latin == null && translations != null && translations.getShouldTransliterate()) {
Expand Down Expand Up @@ -160,12 +175,8 @@ public static Map<String, Object> getNames(Map<String, Object> tags, Translation

/**
 * Streams the values of the entries in {@code tags} whose keys pass the filters below, each value
 * converted through {@code LanguageUtils::string}.
 *
 * NOTE(review): this view shows both sides of a diff. The block-lambda filter (key starts with
 * "name:" and is not in EN_DE_NAME_KEYS) looks like the removed code, and the single-line filter
 * (key not in EN_DE_NAME_KEYS and accepted by VALID_NAME_TAGS) like its replacement - confirm
 * against the merged file.
 */
private static Stream<String> getAllNameTranslationsBesidesEnglishAndGerman(Map<String, Object> tags) {
return tags.entrySet().stream()
// prefix-based check: accepts every "name:*" key except those in EN_DE_NAME_KEYS
.filter(e -> {
String key = e.getKey();
return key.startsWith("name:") && !EN_DE_NAME_KEYS.contains(key);
})
// pattern-based check: additionally requires the key to match VALID_NAME_TAGS
.filter(e -> !EN_DE_NAME_KEYS.contains(e.getKey()) && VALID_NAME_TAGS.test(e.getKey()))
// keep only the tag values and convert each with the string(Object) helper
.map(Map.Entry::getValue)
.map(LanguageUtils::string);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.ValueSource;

public class LanguageUtilsTest {

Expand Down Expand Up @@ -59,7 +60,7 @@ public void testSimpleExample() {
"é, true",
"éś, true",
"ɏə, true",
"ɐ, false",
"ɐ, true",
"ᵿἀ, false",
"Ḁỿ, true",
"\u02ff\u0370, false",
Expand Down Expand Up @@ -95,26 +96,97 @@ public void testRemoveNonLatin(String in, String out) {
}

@ParameterizedTest
@CsvSource({
"name, a, true",
"name:en, a, true",
"int_name, a, true",
"name:fr, a, true",
"name:es, a, true",
"name:pt, a, true",
"name:de, a, true",
"name:ar, ِغَّ, false",
"name:it, a, true",
"name:jp, ア, false",
"name:jp-Latn, a, true",
"name:jp_rm, a, true",
@ValueSource(strings = {
// OSM tags that SHOULD be eligible for name:latin feature in the output
"name:en",
"name:en-US",
"name:en-010",
"int_name",
"name:fr",
"name:es",
"name:pt",
"name:de",
"name:ar",
"name:it",
"name:ko-Latn",
"name:be-tarask",
// https://wiki.openstreetmap.org/wiki/Multilingual_names#Japan
"name:ja",
"name:ja-Latn",
"name:ja_rm",
"name:ja_kana",
// https://wiki.openstreetmap.org/wiki/Multilingual_names#China
"name:zh-CN",
"name:zh-hant-CN",
"name:zh_pinyin",
"name:zh_zhuyin",
"name:zh-Latn-tongyong",
"name:zh-Latn-pinyin",
"name:zh-Latn-wadegiles",
"name:yue-Latn-jyutping",
// https://wiki.openstreetmap.org/wiki/Multilingual_names#France
"name:fr",
"name:fr-x-gallo",
"name:br",
"name:oc",
"name:vls",
"name:frp",
"name:gcf",
"name:gsw",
})
// NOTE(review): this view interleaves both sides of a diff. The three-argument signature and the
// "use ? value : null" assertion look like the removed version; the single-argument signature and
// the three assertions after it look like the replacement - confirm against the merged file.
public void testLatinFallbacks(String key, String value, boolean use) {
assertEquals(use ? value : null, LanguageUtils.getNames(Map.of(
key, value
public void testLatinFallbacks(String key) {
// a latin value under this key is expected to be returned as name:latin
assertEquals("a", LanguageUtils.getNames(Map.of(
key, "a"
), translations).get("name:latin"));
// non-latin values under the same key are expected to produce no name:latin entry
assertNull(LanguageUtils.getNames(Map.of(
key, "ア"
), translations).get("name:latin"));
assertNull(LanguageUtils.getNames(Map.of(
key, "غ"
), translations).get("name:latin"));
}

@ParameterizedTest
@ValueSource(strings = {
  // "name:*" keys whose values must never be picked up as the name:latin fallback
  "name:signed",
  "name:prefix",
  "name:abbreviation",
  "name:source",
  "name:full",
  "name:adjective",
  "name:proposed",
  "name:pronunciation",
  "name:etymology",
  "name:etymology:wikidata",
  "name:etymology:wikipedia",
  "name:etymology:right",
  "name:etymology:left",
  "name:genitive",
})
public void testNoLatinFallback(String key) {
  // Case 1: the primary name is already latin; the extra tag under `key` must not change any output.
  Map<String, Object> latinNameTags = Map.of(
    "name", "Branch Hill–Loveland Road",
    key, "Q22133584;Q843993"
  );
  Map<String, Object> latinNameResult = LanguageUtils.getNames(latinNameTags, translations);
  assertSubmap(Map.of(
    "name", "Branch Hill–Loveland Road",
    "name_en", "Branch Hill–Loveland Road",
    "name_de", "Branch Hill–Loveland Road",
    "name:latin", "Branch Hill–Loveland Road",
    "name_int", "Branch Hill–Loveland Road"
  ), latinNameResult);

  // Case 2: the primary name is non-latin and the only latin string sits under the ineligible key;
  // the expected name:latin is "rì", not the "other" value stored under `key`.
  Map<String, Object> nonLatinNameTags = Map.of(
    "name", "日",
    key, "other" // don't use this latin string with invalid name keys
  );
  Map<String, Object> nonLatinNameResult = LanguageUtils.getNames(nonLatinNameTags, translations);
  assertSubmap(Map.of(
    "name", "日",
    "name_en", "日",
    "name_de", "日",
    "name:latin", "rì",
    "name_int", "rì"
  ), nonLatinNameResult);
}

@ParameterizedTest
@CsvSource({
"キャンパス, kyanpasu",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ public Translations setShouldTransliterate(boolean shouldTransliterate) {
return this;
}

/**
 * Tests whether {@code language} belongs to the set of language translations to use.
 *
 * @param language the language to look up in {@code languageSet}
 * @return {@code true} if {@code languageSet} contains {@code language}, {@code false} otherwise
 */
public boolean careAboutLanguage(String language) {
  final boolean inLanguageSet = languageSet.contains(language);
  return inLanguageSet;
}

/** A source of name translations. */
public interface TranslationProvider {
Expand Down