Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add content_locale member to LineBreakOptions #5565

Merged
merged 1 commit into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 31 additions & 10 deletions components/segmenter/src/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use alloc::vec;
use alloc::vec::Vec;
use core::char;
use core::str::CharIndices;
use icu_locale_core::subtags::language;
use icu_provider::prelude::*;
use utf8_iter::Utf8CharIndices;

Expand Down Expand Up @@ -183,29 +184,29 @@ pub enum LineBreakWordOption {

/// Options to tailor line-breaking behavior.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct LineBreakOptions {
/// Strictness of line-breaking rules. See [`LineBreakStrictness`].
pub strictness: LineBreakStrictness,

/// Line break opportunities between letters. See [`LineBreakWordOption`].
pub word_option: LineBreakWordOption,

/// Use `true` as a hint to the line segmenter that the writing
/// system is Chinese or Japanese. This allows more break opportunities when
/// `LineBreakStrictness` is `Normal` or `Loose`. See
/// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
/// Content locale for the line segmenter.
///
/// When the language of this locale is Japanese (`ja`) or Chinese (`zh`),
/// more break opportunities are allowed when `LineBreakStrictness` is
/// `Normal` or `Loose`. See
/// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
/// This option has no effect in Latin-1 mode.
pub ja_zh: bool,
pub content_locale: Option<DataLocale>,
}

impl Default for LineBreakOptions {
fn default() -> Self {
Self {
strictness: LineBreakStrictness::Strict,
word_option: LineBreakWordOption::Normal,
ja_zh: false,
content_locale: None,
}
}
}
Expand Down Expand Up @@ -303,7 +304,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp
/// let mut options = LineBreakOptions::default();
/// options.strictness = LineBreakStrictness::Strict;
/// options.word_option = LineBreakWordOption::BreakAll;
/// options.ja_zh = false;
/// options.content_locale = None;
/// let segmenter = LineSegmenter::new_auto_with_options(options);
///
/// let breakpoints: Vec<usize> =
Expand Down Expand Up @@ -641,6 +642,11 @@ impl LineSegmenter {
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
LineBreakIterator {
iter: input.char_indices(),
len: input.len(),
Expand All @@ -649,6 +655,7 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh,
}
}
/// Creates a line break iterator for a potentially ill-formed UTF-8 string
Expand All @@ -660,6 +667,11 @@ impl LineSegmenter {
&'l self,
input: &'s [u8],
) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
LineBreakIterator {
iter: Utf8CharIndices::new(input),
len: input.len(),
Expand All @@ -668,6 +680,7 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh,
}
}
/// Creates a line break iterator for a Latin-1 (8-bit) string.
Expand All @@ -682,13 +695,19 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh: false,
}
}

/// Creates a line break iterator for a UTF-16 string.
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
LineBreakIterator {
iter: Utf16Indices::new(input),
len: input.len(),
Expand All @@ -697,6 +716,7 @@ impl LineSegmenter {
data: self.payload.get(),
options: &self.options,
complex: &self.complex,
ja_zh,
}
}
}
Expand Down Expand Up @@ -853,6 +873,7 @@ pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
data: &'l RuleBreakDataV2<'l>,
options: &'l LineBreakOptions,
complex: &'l ComplexPayloads,
ja_zh: bool,
}

impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
Expand Down Expand Up @@ -948,7 +969,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
right_codepoint.into(),
left_prop,
right_prop,
self.options.ja_zh,
self.ja_zh,
) {
if breakable && !after_zwj {
return self.get_current_position();
Expand Down Expand Up @@ -1151,7 +1172,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {

fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
match codepoint.into() {
0x301C | 0x30A0 => self.options.ja_zh,
0x301C | 0x30A0 => self.ja_zh,
_ => false,
}
}
Expand Down
25 changes: 21 additions & 4 deletions components/segmenter/tests/css_line_break.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use icu::locale::locale;
use icu_segmenter::LineBreakOptions;
use icu_segmenter::LineBreakStrictness;
use icu_segmenter::LineBreakWordOption;
Expand Down Expand Up @@ -31,31 +32,47 @@ fn strict(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Strict;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = ja_zh;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn normal(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Normal;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = ja_zh;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn loose(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Loose;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = ja_zh;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn anywhere(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Anywhere;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = ja_zh;
options.content_locale = if ja_zh {
Some(locale!("ja").into())
} else {
None
};
check_with_options(s, expect_utf8, expect_utf16, options);
}

Expand Down
6 changes: 3 additions & 3 deletions components/segmenter/tests/css_word_break.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,23 @@ fn break_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Strict;
options.word_option = LineBreakWordOption::BreakAll;
options.ja_zh = false;
options.content_locale = None;
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn keep_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Strict;
options.word_option = LineBreakWordOption::KeepAll;
options.ja_zh = false;
options.content_locale = None;
check_with_options(s, expect_utf8, expect_utf16, options);
}

fn normal(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
let mut options = LineBreakOptions::default();
options.strictness = LineBreakStrictness::Strict;
options.word_option = LineBreakWordOption::Normal;
options.ja_zh = false;
options.content_locale = None;
check_with_options(s, expect_utf8, expect_utf16, options);
}

Expand Down
26 changes: 0 additions & 26 deletions ffi/capi/bindings/c/LineBreakOptionsV1.d.h

This file was deleted.

25 changes: 25 additions & 0 deletions ffi/capi/bindings/c/LineBreakOptionsV2.d.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 8 additions & 7 deletions ffi/capi/bindings/c/LineSegmenter.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading