From ccdec3dd360966cd01e544501146e398fc0e2cf2 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Wed, 11 May 2022 12:43:57 +0300 Subject: [PATCH] More remarks about Hangul in search collation --- experimental/collator/src/elements.rs | 5 +++++ experimental/collator/src/lib.rs | 15 ++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/experimental/collator/src/elements.rs b/experimental/collator/src/elements.rs index 56a49648c17..114a605e290 100644 --- a/experimental/collator/src/elements.rs +++ b/experimental/collator/src/elements.rs @@ -1024,6 +1024,11 @@ where // XXX This isn't actually true with the current jamo search // expansions! + // TODO: Instead of having a different jamo CE32 table for "search" + // collations, we could decompose the archaic jamo to + // the modern approximation sequences here and then map those + // by looking up the modern jamo from the normal root. + // We need to set data to root, because archaic jamo refer to // the root. data = self.root; diff --git a/experimental/collator/src/lib.rs b/experimental/collator/src/lib.rs index c11cd8ad413..0d861e13735 100644 --- a/experimental/collator/src/lib.rs +++ b/experimental/collator/src/lib.rs @@ -66,6 +66,19 @@ //! the tailored CE32s refer to the expansions table in each collation. To make //! them truly shareable, the archaic jamo expansions need to become self-contained //! the way Latin mini expansions in ICU4C are self-contained. +//! +//! One possible alternative to loading a different table for "search" would be +//! performing the mapping of archaic jamo to the modern approximations as a +//! special preprocessing step for the incoming characters, which would allow +//! the lookup of the resulting modern jamo from the normal root jamo table. +//! +//! "searchjl" is even more problematic than "search", since "searchjl" uses +//! prefix matches with jamo, and currently Hangul is assumed not to participate +//! in prefix or contraction matching. 
Before putting too much effort into "searchjl" +//! it would be good to research its usage. The CLDR issue that introduced it, +//! https://unicode-org.atlassian.net/browse/CLDR-3560 , said that it was for +//! contact search on phones. As of 2022-05-11, I have been unable to find evidence +//! of any Open Source app, including Android Contacts, enabling "searchjl". mod comparison; mod elements; @@ -367,7 +380,7 @@ mod tests { "\u{0041}\u{00E1}\u{0063}\u{0064}", "\u{0041}\u{00E1}\u{0063}\u{0064}", "\u{0061}\u{0062}\u{0063}", - "\u{0061}\u{0062}\u{0063}", + "\u{0061}\u{0062}\u{0063}", "\u{0054}\u{0043}\u{006F}\u{006D}\u{0070}\u{0061}\u{0072}\u{0065}\u{0050}\u{006C}\u{0061}\u{0069}\u{006E}", "\u{0061}\u{0042}\u{0063}", "\u{0061}\u{0023}\u{0042}",