segmenter options

unicode-org · Jul 12, 2023 · e90ee5b · e90ee5b
1 parent 007d867
commit e90ee5b
Show file tree

Hide file tree

Showing 7 changed files with 130 additions and 1,003,609 deletions.
diff --git a/provider/datagen/src/options.rs b/provider/datagen/src/options.rs
@@ -55,6 +55,8 @@ pub struct Options {
     /// The type of fallback that the data should be generated for. If locale fallback is
     /// used at runtime, smaller data can be generated.
     pub fallback: FallbackMode,
+    /// The segmentation models to include
+    pub segmenter_models: SegmenterModelInclude,
 }
 
 /// Defines the locales to include
@@ -138,3 +140,42 @@ impl Default for TrieType {
         Self::Small
     }
 }
+
+#[derive(Debug, PartialEq, Clone, serde::Serialize, serde::Deserialize)]
+/// The segmentation models to include
+pub enum SegmenterModelInclude {
+    /// Include the recommended set of models. This will cover all languages supported
+    /// by ICU4X: Thai, Burmese, Khmer, Lao, Chinese, and Japanese. Both dictionary
+    /// and LSTM models will be included, to the extent required by the chosen data keys.
+    Recommended,
+    /// Include no dictionary or LSTM models. This will make line and word segmenters
+    /// behave like simple rule-based segmenters, which will produce incorrect results for
+    /// text that contains Thai, Burmese, Khmer, Lao, Chinese, or Japanese.
+    None,
+    /// Include an explicit list of LSTM or dictionary models, to the extent required by the
+    /// chosen data keys.
+    ///
+    /// The currently supported dictionary models are:
+    /// * `cjdict`
+    /// * `burmesedict`
+    /// * `khmerdict`
+    /// * `laodict`
+    /// * `thaidict`
+    ///
+    /// The currently supported LSTM models are:
+    /// * `Burmese_codepoints_exclusive_model4_heavy`
+    /// * `Khmer_codepoints_exclusive_model4_heavy`
+    /// * `Lao_codepoints_exclusive_model4_heavy`
+    /// * `Thai_codepoints_exclusive_model4_heavy`
+    ///
+    /// If a model is not included, the resulting line or word segmenter will fall back to
+    /// rule-based segmentation when encountering text in a script that requires the model,
+    /// which will produce incorrect results.
+    Explicit(Vec<String>),
+}
+
+impl Default for SegmenterModelInclude {
+    fn default() -> Self {
+        Self::Recommended
+    }
+}
diff --git a/provider/datagen/src/transform/segmenter/dictionary.rs b/provider/datagen/src/transform/segmenter/dictionary.rs
@@ -14,33 +14,46 @@ struct SegmenterDictionaryData {
     trie_data: Vec<u16>,
 }
 
+fn model_name_to_data_locale(name: &str) -> Option<DataLocale> {
+    match name {
+        "khmerdict" => Some(langid!("km").into()),
+        "cjdict" => Some(langid!("ja").into()),
+        "laodict" => Some(langid!("lo").into()),
+        "burmesedict" => Some(langid!("my").into()),
+        "thaidict" => Some(langid!("th").into()),
+        _ => None,
+    }
+}
+
+fn data_locale_to_model_name(locale: &DataLocale) -> Option<&'static str> {
+    match locale.get_langid() {
+        id if id == langid!("km") => Some("khmerdict"),
+        id if id == langid!("ja") => Some("cjdict"),
+        id if id == langid!("lo") => Some("laodict"),
+        id if id == langid!("my") => Some("burmesedict"),
+        id if id == langid!("th") => Some("thaidict"),
+        _ => None,
+    }
+}
+
 impl crate::DatagenProvider {
     fn load_dictionary_data(
         &self,
         req: DataRequest,
     ) -> Result<UCharDictionaryBreakDataV1<'static>, DataError> {
-        let filename = if req.locale.get_langid() == langid!("km") {
-            "segmenter/dictionary/khmerdict.toml"
-        } else if req.locale.get_langid() == langid!("ja") {
-            "segmenter/dictionary/cjdict.toml"
-        } else if req.locale.get_langid() == langid!("lo") {
-            "segmenter/dictionary/laodict.toml"
-        } else if req.locale.get_langid() == langid!("my") {
-            "segmenter/dictionary/burmesedict.toml"
-        } else if req.locale.get_langid() == langid!("th") {
-            "segmenter/dictionary/thaidict.toml"
-        } else {
-            Err(DataErrorKind::MissingLocale.into_error())?
-        };
+        let model = data_locale_to_model_name(&req.locale)
+            .ok_or(DataErrorKind::MissingLocale.into_error())?;
+
+        let filename = format!("segmenter/dictionary/{model}.toml");
 
         let toml_data: &SegmenterDictionaryData = self
             .source
             .icuexport()
-            .and_then(|e| e.read_and_parse_toml(filename))
+            .and_then(|e| e.read_and_parse_toml(&filename))
             .or_else(|e| {
                 self.source
                     .icuexport_fallback()
-                    .read_and_parse_toml(filename)
+                    .read_and_parse_toml(&filename)
                     .map_err(|_| e)
             })?;
 
@@ -51,12 +64,9 @@ impl crate::DatagenProvider {
 }
 
 macro_rules! implement {
-    ($marker:ident, $($locale:literal),*) => {
+    ($marker:ident, $supported:expr) => {
         impl DataProvider<$marker> for crate::DatagenProvider {
-            fn load(
-                &self,
-                req: DataRequest,
-            ) -> Result<DataResponse<$marker>, DataError> {
+            fn load(&self, req: DataRequest) -> Result<DataResponse<$marker>, DataError> {
                 self.check_req::<$marker>(req)?;
                 let data = self.load_dictionary_data(req)?;
                 Ok(DataResponse {
@@ -67,19 +77,26 @@ macro_rules! implement {
         }
 
         impl IterableDataProvider<$marker> for crate::DatagenProvider {
-            // TODO(#3408): Do we actually want to filter these by the user-selected locales?
             fn supported_locales(&self) -> Result<Vec<DataLocale>, DataError> {
-                Ok(self.filter_data_locales(vec![$(locale!($locale).into()),*]))
+                Ok(match &self.source.options.segmenter_models {
+                    crate::options::SegmenterModelInclude::Recommended => $supported
+                        .into_iter()
+                        .filter_map(model_name_to_data_locale)
+                        .collect(),
+                    crate::options::SegmenterModelInclude::None => Vec::new(),
+                    crate::options::SegmenterModelInclude::Explicit(list) => $supported
+                        .into_iter()
+                        .filter(|&model| list.iter().any(|x| x == model))
+                        .filter_map(model_name_to_data_locale)
+                        .collect(),
+                })
             }
         }
-    }
+    };
 }
 
-implement!(DictionaryForWordOnlyAutoV1Marker, "ja");
+implement!(DictionaryForWordOnlyAutoV1Marker, ["cjdict"]);
 implement!(
     DictionaryForWordLineExtendedV1Marker,
-    "th",
-    "km",
-    "lo",
-    "my"
+    ["khmerdict", "laodict", "burmesedict", "thaidict"]
 );
diff --git a/provider/datagen/src/transform/segmenter/lstm.rs b/provider/datagen/src/transform/segmenter/lstm.rs
@@ -181,25 +181,35 @@ convert!(ndarray_to_lstm_matrix1, LstmMatrix1, 1);
 convert!(ndarray_to_lstm_matrix2, LstmMatrix2, 2);
 convert!(ndarray_to_lstm_matrix3, LstmMatrix3, 3);
 
+fn model_name_to_data_locale(name: &str) -> Option<DataLocale> {
+    match name {
+        "Burmese_codepoints_exclusive_model4_heavy" => Some(langid!("my").into()),
+        "Khmer_codepoints_exclusive_model4_heavy" => Some(langid!("km").into()),
+        "Lao_codepoints_exclusive_model4_heavy" => Some(langid!("lo").into()),
+        "Thai_codepoints_exclusive_model4_heavy" => Some(langid!("th").into()),
+        _ => None,
+    }
+}
+
+fn data_locale_to_model_name(locale: &DataLocale) -> Option<&'static str> {
+    match locale.get_langid() {
+        id if id == langid!("my") => Some("Burmese_codepoints_exclusive_model4_heavy"),
+        id if id == langid!("km") => Some("Khmer_codepoints_exclusive_model4_heavy"),
+        id if id == langid!("lo") => Some("Lao_codepoints_exclusive_model4_heavy"),
+        id if id == langid!("th") => Some("Thai_codepoints_exclusive_model4_heavy"),
+        _ => None,
+    }
+}
+
 impl DataProvider<LstmForWordLineAutoV1Marker> for crate::DatagenProvider {
     fn load(
         &self,
         req: DataRequest,
     ) -> Result<DataResponse<LstmForWordLineAutoV1Marker>, DataError> {
         self.check_req::<LstmForWordLineAutoV1Marker>(req)?;
-        let model = if req.locale.language() == langid!("th").language {
-            "Thai_codepoints_exclusive_model4_heavy"
-        } else if req.locale.language() == langid!("my").language {
-            "Burmese_codepoints_exclusive_model4_heavy"
-        } else if req.locale.language() == langid!("lo").language {
-            "Lao_codepoints_exclusive_model4_heavy"
-        } else if req.locale.language() == langid!("km").language {
-            "Khmer_codepoints_exclusive_model4_heavy"
-        } else {
-            return Err(
-                DataErrorKind::MissingLocale.with_req(LstmForWordLineAutoV1Marker::KEY, req)
-            );
-        };
+
+        let model = data_locale_to_model_name(&req.locale)
+            .ok_or(DataErrorKind::MissingLocale.with_req(LstmForWordLineAutoV1Marker::KEY, req))?;
 
         let lstm_data = self
             .source
@@ -218,13 +228,23 @@ impl DataProvider<LstmForWordLineAutoV1Marker> for crate::DatagenProvider {
 
 impl IterableDataProvider<LstmForWordLineAutoV1Marker> for crate::DatagenProvider {
     fn supported_locales(&self) -> Result<Vec<DataLocale>, DataError> {
-        // TODO(#3408): Do we actually want to filter these by the user-selected locales?
-        Ok(self.filter_data_locales(vec![
-            langid!("km").into(),
-            langid!("lo").into(),
-            langid!("my").into(),
-            langid!("th").into(),
-        ]))
+        Ok(match &self.source.options.segmenter_models {
+            crate::options::SegmenterModelInclude::Recommended => [
+                "Burmese_codepoints_exclusive_model4_heavy",
+                "Khmer_codepoints_exclusive_model4_heavy",
+                "Lao_codepoints_exclusive_model4_heavy",
+                "Thai_codepoints_exclusive_model4_heavy",
+            ]
+            .into_iter()
+            .filter_map(model_name_to_data_locale)
+            .collect(),
+            crate::options::SegmenterModelInclude::None => Vec::new(),
+            crate::options::SegmenterModelInclude::Explicit(list) => list
+                .iter()
+                .map(core::ops::Deref::deref)
+                .filter_map(model_name_to_data_locale)
+                .collect(),
+        })
     }
 }
 

diff --git a/provider/datagen/tests/data/json/fingerprints.csv b/provider/datagen/tests/data/json/fingerprints.csv