Skip to content

Commit

Permalink
Add content_locale member to LineBreakOptions (#5565)
Browse files Browse the repository at this point in the history
Fixes #3284.
  • Loading branch information
makotokato authored Sep 20, 2024
1 parent 3524f8e commit d704ef7
Show file tree
Hide file tree
Showing 18 changed files with 190 additions and 159 deletions.
41 changes: 31 additions & 10 deletions components/segmenter/src/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use alloc::vec;
use alloc::vec::Vec;
use core::char;
use core::str::CharIndices;
use icu_locale_core::subtags::language;
use icu_provider::prelude::*;
use utf8_iter::Utf8CharIndices;

Expand Down Expand Up @@ -183,29 +184,29 @@ pub enum LineBreakWordOption {

/// Options to tailor line-breaking behavior.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct LineBreakOptions {
/// Strictness of line-breaking rules. See [`LineBreakStrictness`].
pub strictness: LineBreakStrictness,

/// Line break opportunities between letters. See [`LineBreakWordOption`].
pub word_option: LineBreakWordOption,

/// Use `true` as a hint to the line segmenter that the writing
/// system is Chinese or Japanese. This allows more break opportunities when
/// `LineBreakStrictness` is `Normal` or `Loose`. See
/// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
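/// Content locale for the line segmenter.
///
/// When the locale's language is Japanese (`ja`) or Chinese (`zh`), this
/// allows more break opportunities when `LineBreakStrictness` is
/// `Normal` or `Loose`. Any other language, or `None` (the default), does
/// not enable these additional break opportunities. See
/// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
///
/// This option has no effect in Latin-1 mode.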
pub ja_zh: bool,
pub content_locale: Option<DataLocale>,
}

impl Default for LineBreakOptions {
fn default() -> Self {
Self {
strictness: LineBreakStrictness::Strict,
word_option: LineBreakWordOption::Normal,
ja_zh: false,
content_locale: None,
}
}
}
Expand Down Expand Up @@ -303,7 +304,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp
/// let mut options = LineBreakOptions::default();
/// options.strictness = LineBreakStrictness::Strict;
/// options.word_option = LineBreakWordOption::BreakAll;
/// options.ja_zh = false;
/// options.content_locale = None;
/// let segmenter = LineSegmenter::new_auto_with_options(options);
///
/// let breakpoints: Vec<usize> =
Expand Down Expand Up @@ -641,6 +642,11 @@ impl LineSegmenter {
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
LineBreakIterator {
iter: input.char_indices(),
len: input.len(),
Expand All @@ -649,6 +655,7 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh,
}
}
/// Creates a line break iterator for a potentially ill-formed UTF-8 string
Expand All @@ -660,6 +667,11 @@ impl LineSegmenter {
&'l self,
input: &'s [u8],
) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
LineBreakIterator {
iter: Utf8CharIndices::new(input),
len: input.len(),
Expand All @@ -668,6 +680,7 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh,
}
}
/// Creates a line break iterator for a Latin-1 (8-bit) string.
Expand All @@ -682,13 +695,19 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh: false,
}
}

/// Creates a line break iterator for a UTF-16 string.
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
LineBreakIterator {
iter: Utf16Indices::new(input),
len: input.len(),
Expand All @@ -697,6 +716,7 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh,
}
}
}
Expand Down Expand Up @@ -853,6 +873,7 @@ pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
data: &'l RuleBreakDataV2<'l>,
options: &'l LineBreakOptions,
complex: &'l ComplexPayloads,
ja_zh: bool,
}

impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
Expand Down Expand Up @@ -948,7 +969,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
right_codepoint.into(),
left_prop,
right_prop,
self.options.ja_zh,
self.ja_zh,
) {
if breakable && !after_zwj {
return self.get_current_position();
Expand Down Expand Up @@ -1151,7 +1172,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {

fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
match codepoint.into() {
0x301C | 0x30A0 => self.options.ja_zh,
0x301C | 0x30A0 => self.ja_zh,
_ => false,
}
}
Expand Down
25 changes: 21 additions & 4 deletions components/segmenter/tests/css_line_break.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use icu::locale::locale;
use icu_segmenter::LineBreakOptions;
use icu_segmenter::LineBreakStrictness;
use icu_segmenter::LineBreakWordOption;
Expand Down Expand Up @@ -31,31 +32,47 @@ fn strict(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Strict;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = ja_zh;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn normal(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Normal;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = ja_zh;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn loose(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Loose;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = ja_zh;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn anywhere(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Anywhere;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = ja_zh;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
check_with_options(s, expect_utf8, expect_utf16, options);
}

Expand Down
6 changes: 3 additions & 3 deletions components/segmenter/tests/css_word_break.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,23 @@ fn break_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Strict;
options.word_option = LineBreakWordOption::BreakAll;
options.ja_zh = false;
options.content_locale = None;
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn keep_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Strict;
options.word_option = LineBreakWordOption::KeepAll;
options.ja_zh = false;
options.content_locale = None;
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn normal(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Strict;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = false;
options.content_locale = None;
check_with_options(s, expect_utf8, expect_utf16, options);
}

Expand Down
26 changes: 0 additions & 26 deletions ffi/capi/bindings/c/LineBreakOptionsV1.d.h

This file was deleted.

25 changes: 25 additions & 0 deletions ffi/capi/bindings/c/LineBreakOptionsV2.d.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 8 additions & 7 deletions ffi/capi/bindings/c/LineSegmenter.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit d704ef7

Please sign in to comment.