From 390b13a6205614f5f9131529b0df94f790a34bde Mon Sep 17 00:00:00 2001 From: Makoto Kato Date: Tue, 14 May 2024 21:33:29 +0900 Subject: [PATCH 1/2] Set boundary_property with complex character and EOT --- components/segmenter/src/rule_segmenter.rs | 1 + components/segmenter/tests/word_rule_status.rs | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/components/segmenter/src/rule_segmenter.rs b/components/segmenter/src/rule_segmenter.rs index 72171cf8f9a..80723a15c4d 100644 --- a/components/segmenter/src/rule_segmenter.rs +++ b/components/segmenter/src/rule_segmenter.rs @@ -63,6 +63,7 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<' self.advance_iter(); if self.is_eof() { self.result_cache.clear(); + self.boundary_property = self.data.complex_property; return Some(self.len); } } diff --git a/components/segmenter/tests/word_rule_status.rs b/components/segmenter/tests/word_rule_status.rs index d0638b047be..e2b2d5fe63e 100644 --- a/components/segmenter/tests/word_rule_status.rs +++ b/components/segmenter/tests/word_rule_status.rs @@ -87,6 +87,14 @@ fn rule_status_th() { assert_eq!(iter.next(), Some(21), "after 2nd word"); assert_eq!(iter.word_type(), WordType::Letter, "letter"); assert!(iter.is_word_like(), "Letter(Thai) is true"); + + assert_eq!(iter.next(), Some(33), "after 3rd word"); + assert_eq!(iter.word_type(), WordType::Letter, "letter"); + assert!(iter.is_word_like(), "Letter(Thai) is true"); + + assert_eq!(iter.next(), Some(42), "after 4th word and next is EOT"); + assert_eq!(iter.word_type(), WordType::Letter, "letter"); + assert!(iter.is_word_like(), "Letter(Thai) is true"); } /* The rule status functions are no longer public to non word break iterators. From a23654d18e437ca594c94303ce87cd6c61474f8f Mon Sep 17 00:00:00 2001 From: Makoto Kato Date: Wed, 22 May 2024 14:43:26 +0900 Subject: [PATCH 2/2] Reset boundary_property when iterator already reaches to EOT --- components/segmenter/src/rule_segmenter.rs | 6 +++++- components/segmenter/tests/word_rule_status.rs | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/components/segmenter/src/rule_segmenter.rs b/components/segmenter/src/rule_segmenter.rs index 80723a15c4d..95490141c3c 100644 --- a/components/segmenter/src/rule_segmenter.rs +++ b/components/segmenter/src/rule_segmenter.rs @@ -78,8 +78,12 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<' self.len = 1; return Some(0); } + let Some(right_prop) = self.get_current_break_property() else { + // iterator already reaches to EOT. Reset boundary property for word-like. + self.boundary_property = 0; + return None; + }; // SOT x anything - let right_prop = self.get_current_break_property()?; if matches!( self.get_break_state_from_table(self.data.sot_property, right_prop), BreakState::Break | BreakState::NoMatch diff --git a/components/segmenter/tests/word_rule_status.rs b/components/segmenter/tests/word_rule_status.rs index e2b2d5fe63e..bb5bc578909 100644 --- a/components/segmenter/tests/word_rule_status.rs +++ b/components/segmenter/tests/word_rule_status.rs @@ -33,6 +33,10 @@ fn rule_status() { assert_eq!(iter.next(), Some(15), "after number"); assert_eq!(iter.word_type(), WordType::Number, "number"); assert!(iter.is_word_like(), "Number is true"); + + assert_eq!(iter.next(), None, "EOT"); + assert_eq!(iter.word_type(), WordType::None, "none"); + assert!(!iter.is_word_like(), "None is false"); } #[test] @@ -51,6 +55,10 @@ fn rule_status_letter_eof() { assert_eq!(iter.next(), Some(4), "after full stop"); assert_eq!(iter.word_type(), WordType::None, "none"); assert!(!iter.is_word_like(), "None is false"); + + assert_eq!(iter.next(), None, "EOT"); + assert_eq!(iter.word_type(), WordType::None, "none"); + assert!(!iter.is_word_like(), "None is false"); } #[test] @@ -69,6 +77,10 @@ fn rule_status_numeric_eof() { assert_eq!(iter.next(), Some(3), "after full stop"); assert_eq!(iter.word_type(), WordType::None, "none"); assert!(!iter.is_word_like(), "None is false"); + + assert_eq!(iter.next(), None, "EOT"); + assert_eq!(iter.word_type(), WordType::None, "none"); + assert!(!iter.is_word_like(), "None is false"); } #[test] @@ -95,6 +107,10 @@ fn rule_status_th() { assert_eq!(iter.next(), Some(42), "after 4th word and next is EOT"); assert_eq!(iter.word_type(), WordType::Letter, "letter"); assert!(iter.is_word_like(), "Letter(Thai) is true"); + + assert_eq!(iter.next(), None, "EOT"); + assert_eq!(iter.word_type(), WordType::None, "none"); + assert!(!iter.is_word_like(), "None is false"); } /* The rule status functions are no longer public to non word break iterators.