From 1fc0ba3855857fef9a1b466591b596ffdb9394d5 Mon Sep 17 00:00:00 2001 From: Martin Geisler Date: Wed, 30 Sep 2020 23:11:34 +0200 Subject: [PATCH] Reformulate wrapping in terms of boxes, glue, and penalties This is a complete rewrite of the core word wrapping functionality. Before, we would step through the input string and (attempt to) keep track of all aspects of the state. This didn't always work (see at least #122, #158, #158, and #193) and it's inflexible. This commit replaces the old algorithm with a new one which works on a more abstract level. We now: 1. First split the input string into "words". A word is a substring of the original string, including any trailing whitespace. 2. We split each word according to the `WordSplitter`. 3. We then simply put the words into lines based on the display width. This is slower than the previous algorithm. The `fill/1600` benchmark shows that it now takes ~18 microseconds to wrap a 1600 character long string. That is around 8 microseconds longer than before. --- examples/interactive.rs | 2 +- examples/layout.rs | 2 +- src/core.rs | 588 ++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 482 +++++++++++++------------------- src/splitting.rs | 96 +++---- 5 files changed, 803 insertions(+), 367 deletions(-) create mode 100644 src/core.rs diff --git a/examples/interactive.rs b/examples/interactive.rs index 873af000..5510b1d5 100644 --- a/examples/interactive.rs +++ b/examples/interactive.rs @@ -101,7 +101,7 @@ mod unix_only { )?; row += 2; - let mut lines = wrap(text, options).collect::<Vec<_>>(); + let mut lines = wrap(text, options); if let Some(line) = lines.last() { // If `text` ends with a newline, the final wrapped line // contains this newline. 
This will in turn leave the diff --git a/examples/layout.rs b/examples/layout.rs index 04654d99..e2316858 100644 --- a/examples/layout.rs +++ b/examples/layout.rs @@ -17,7 +17,7 @@ fn main() { for width in 15..60 { options.width = width; - let lines = wrap(example, &options).collect::>(); + let lines = wrap(example, &options); if lines != prev_lines { let title = format!(" Width: {} ", width); println!(".{:-^1$}.", title, width + 2); diff --git a/src/core.rs b/src/core.rs new file mode 100644 index 00000000..4b15e5b0 --- /dev/null +++ b/src/core.rs @@ -0,0 +1,588 @@ +//! Building blocks for advanced wrapping functionality. +//! +//! The functions and structs in this module can be used to implement +//! advanced wrapping functionality when the `wrap` and `fill` +//! function don't do what you want. + +use unicode_width::UnicodeWidthChar; +use unicode_width::UnicodeWidthStr; + +/// The CSI or "Control Sequence Introducer" introduces an ANSI escape +/// sequence. This is typically used for colored text and will be +/// ignored when computing the text width. +const CSI: (char, char) = ('\x1b', '['); +/// The final bytes of an ANSI escape sequence must be in this range. +const ANSI_FINAL_BYTE: std::ops::RangeInclusive = '\x40'..='\x7e'; + +/// Skip ANSI escape sequences. The `ch` is the current `char`, the +/// `chars` provide the following characters. The `chars` will be +/// modified if `ch` is the start of an ANSI escape sequence. +fn skip_ansi_escape_sequence>(ch: char, chars: &mut I) -> bool { + if ch == CSI.0 && chars.next() == Some(CSI.1) { + // We have found the start of an ANSI escape code, typically + // used for colored terminal text. We skip until we find a + // "final byte" in the range 0x40–0x7E. + for ch in chars { + if ANSI_FINAL_BYTE.contains(&ch) { + return true; + } + } + } + return false; +} + +/// A (text) fragment denotes the unit which we wrap into lines. 
+/// +/// Fragments represent an abstract _word_ plus the _whitespace_ +/// following the word. In case the word falls at the end of the line, +/// the whitespace is dropped and a so-called _penalty_ is inserted +/// instead (typically `"-"` if the word was hyphenated). +/// +/// For wrapping purposes, the precise content of the word, the +/// whitespace, and the penalty is irrelevant. All we need to know is +/// the displayed width of each part, which this trait provides. +pub trait Fragment: std::fmt::Debug { + /// Displayed width of word represented by this fragment. + fn width(&self) -> usize; + + /// Displayed width of the whitespace that must follow the word + /// when the word is not at the end of a line. + fn whitespace_width(&self) -> usize; + + /// Displayed width of the penalty that must be inserted if the + /// word falls at the end of a line. + fn penalty_width(&self) -> usize; +} + +/// A piece of wrappable text, including any trailing whitespace. +/// +/// A `Word` is an example of a `Fragment`, so it has a width, +/// trailing whitespace, and potentially a penalty item. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct Word<'a> { + word: &'a str, + width: usize, + pub(crate) whitespace: &'a str, + pub(crate) penalty: &'a str, +} + +impl std::ops::Deref for Word<'_> { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.word + } +} + +impl<'a> Word<'a> { + /// Construct a new `Word`. + /// + /// A trailing stretch of `' '` is automatically taken to be the + /// whitespace part of the word. 
+ pub fn from(word: &str) -> Word<'_> { + let trimmed = word.trim_end_matches(' '); + let mut chars = trimmed.chars(); + let mut width = 0; + while let Some(ch) = chars.next() { + if skip_ansi_escape_sequence(ch, &mut chars) { + continue; + }; + width += ch.width().unwrap_or(0); + } + + Word { + word: trimmed, + width: width, + whitespace: &word[trimmed.len()..], + penalty: "", + } + } + + /// Break this word into smaller words with a width of at most + /// `line_width`. The whitespace and penalty from this `Word` is + /// added to the last piece. + /// + /// # Examples + /// + /// ``` + /// use textwrap::core::Word; + /// assert_eq!(Word::from("Hello! ").break_apart(3).collect::>(), + /// vec![Word::from("Hel"), Word::from("lo! ")]); + /// ``` + pub fn break_apart<'b>(&'b self, line_width: usize) -> impl Iterator> + 'b { + let mut char_indices = self.word.char_indices(); + let mut offset = 0; + let mut width = 0; + + std::iter::from_fn(move || { + while let Some((idx, ch)) = char_indices.next() { + if skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) { + continue; + } + + let ch_width = ch.width().unwrap_or(0); + if width > 0 && width + ch_width > line_width { + let word = Word { + word: &self.word[offset..idx], + width: width, + whitespace: "", + penalty: "", + }; + offset = idx; + width = ch_width; + return Some(word); + } + + width += ch_width; + } + + if offset < self.word.len() { + let word = Word { + word: &self.word[offset..], + width: width, + whitespace: self.whitespace, + penalty: self.penalty, + }; + offset = self.word.len(); + return Some(word); + } + + None + }) + } +} + +impl Fragment for Word<'_> { + fn width(&self) -> usize { + self.width + } + + // We assume the whitespace consist of ' ' only. This allows us to + // compute the display width in constant time. + fn whitespace_width(&self) -> usize { + self.whitespace.len() + } + + // We assume the penalty is `""` or `"-"`. 
This allows us to + // compute the display width in constant time. + fn penalty_width(&self) -> usize { + self.penalty.len() + } +} + +/// Split line into words separated by regions of `' '` characters. +/// +/// # Examples +/// +/// ``` +/// use textwrap::core::{Fragment, Word, find_words}; +/// let words = find_words("Hello World!").collect::>(); +/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]); +/// assert_eq!(words[0].width(), 5); +/// assert_eq!(words[0].whitespace_width(), 1); +/// assert_eq!(words[0].penalty_width(), 0); +/// ``` +pub fn find_words(line: &str) -> impl Iterator { + let mut start = 0; + let mut in_whitespace = false; + let mut char_indices = line.char_indices(); + + std::iter::from_fn(move || { + // for (idx, ch) in char_indices does not work, gives this + // error: + // + // > cannot move out of `char_indices`, a captured variable in + // > an `FnMut` closure + #[allow(clippy::while_let_on_iterator)] + while let Some((idx, ch)) = char_indices.next() { + if in_whitespace && ch != ' ' { + let word = Word::from(&line[start..idx]); + start = idx; + in_whitespace = ch == ' '; + return Some(word); + } + + in_whitespace = ch == ' '; + } + + if start < line.len() { + let word = Word::from(&line[start..]); + start = line.len(); + return Some(word); + } + + None + }) +} + +/// Split words into smaller words according to the split points given +/// by `options`. +/// +/// Note that we split all words, regardless of their length. This is +/// to more cleanly separate the business of splitting (including +/// automatic hyphenation) from the business of word wrapping. 
+/// +/// # Examples +/// +/// ``` +/// use textwrap::{Options, NoHyphenation}; +/// use textwrap::core::{Word, split_words}; +/// +/// // The default splitter is HyphenSplitter: +/// let options = Options::new(80); +/// assert_eq!( +/// split_words(vec![Word::from("foo-bar")], &&options).collect::>(), +/// vec![Word::from("foo-"), Word::from("bar")] +/// ); +/// +/// // The NoHyphenation splitter ignores the '-': +/// let options = Options::new(80).splitter(Box::new(NoHyphenation)); +/// assert_eq!( +/// split_words(vec![Word::from("foo-bar")], &&options).collect::>(), +/// vec![Word::from("foo-bar")] +/// ); +/// ``` +pub fn split_words<'a, I, T: crate::WrapOptions>( + words: I, + options: &'a T, +) -> impl Iterator> +where + I: IntoIterator>, +{ + words.into_iter().flat_map(move |word| { + let mut prev = 0; + let mut split_points = options.split_points(&word).into_iter(); + std::iter::from_fn(move || { + if let Some(idx) = split_points.next() { + let need_hyphen = !word[..idx].ends_with('-'); + let w = Word { + word: &word.word[prev..idx], + width: word[prev..idx].width(), + whitespace: "", + penalty: if need_hyphen { "-" } else { "" }, + }; + prev = idx; + return Some(w); + } + + if prev < word.word.len() || prev == 0 { + let w = Word { + word: &word.word[prev..], + width: word[prev..].width(), + whitespace: word.whitespace, + penalty: word.penalty, + }; + prev = word.word.len() + 1; + return Some(w); + } + + None + }) + }) +} + +/// Forcibly break words wider than `line_width` into smaller words. +/// +/// This simply calls `Word::break_apart` on words that are too wide. +/// This means that no extra `'-'` is inserted, the word is simply +/// broken into smaller pieces. 
+pub fn break_words<'a, I>(words: I, line_width: usize) -> Vec<Word<'a>> +where + I: IntoIterator<Item = Word<'a>>, +{ + let mut shortened_words = Vec::new(); + for word in words { + if word.width() > line_width { + shortened_words.extend(word.break_apart(line_width)); + } else { + shortened_words.push(word); + } + } + shortened_words +} + +/// Wrap abstract fragments into lines of different widths. +/// +/// The `line_widths` maps the line number to the desired width. This +/// can be used to implement hanging indentation. +/// +/// The fragments must already have been split into the desired +/// widths, this function will not (and cannot) attempt to split them +/// further when arranging them into lines. +/// +/// # Examples +/// +/// Imagine you're building a house site and you have a number of +/// tasks you need to execute. Things like pour foundation, complete +/// framing, install plumbing, electric cabling, install insulation. +/// +/// The construction workers can only work during daytime, so they +/// need to pack up everything at night. Because they need to secure +/// their tools and move machines back to the garage, this process +/// takes much more time than the time it would take them to simply +/// switch to another task. +/// +/// You would like to make a list of tasks to execute every day based +/// on your estimates. You can model this with a program like this: +/// +/// ``` +/// use textwrap::core::{Fragment, wrap_fragments}; +/// +/// #[derive(Debug)] +/// struct Task<'a> { +/// name: &'a str, +/// hours: usize, // Time needed to complete task. +/// sweep: usize, // Time needed for a quick sweep after task during the day. +/// cleanup: usize, // Time needed to cleanup after task at end of day. 
+/// } +/// +/// impl Fragment for Task<'_> { +/// fn width(&self) -> usize { self.hours } +/// fn whitespace_width(&self) -> usize { self.sweep } +/// fn penalty_width(&self) -> usize { self.cleanup } +/// } +/// +/// // The morning tasks +/// let tasks = vec![ +/// Task { name: "Foundation", hours: 4, sweep: 2, cleanup: 3 }, +/// Task { name: "Framing", hours: 3, sweep: 1, cleanup: 2 }, +/// Task { name: "Plumbing", hours: 2, sweep: 2, cleanup: 2 }, +/// Task { name: "Electrical", hours: 2, sweep: 1, cleanup: 2 }, +/// Task { name: "Insulation", hours: 2, sweep: 1, cleanup: 2 }, +/// Task { name: "Drywall", hours: 3, sweep: 1, cleanup: 2 }, +/// Task { name: "Floors", hours: 3, sweep: 1, cleanup: 2 }, +/// Task { name: "Countertops", hours: 1, sweep: 1, cleanup: 2 }, +/// Task { name: "Bathrooms", hours: 2, sweep: 1, cleanup: 2 }, +/// ]; +/// +/// fn assign_days<'a>(tasks: &[Task<'a>], day_length: usize) -> Vec<(usize, Vec<&'a str>)> { +/// let mut days = Vec::new(); +/// for day in wrap_fragments(&tasks, |i| { day_length }) { +/// let last = day.last().unwrap(); +/// let work_hours: usize = day.iter().map(|t| t.hours + t.sweep).sum(); +/// let names = day.iter().map(|t| t.name).collect::>(); +/// days.push((work_hours - last.sweep + last.cleanup, names)); +/// } +/// days +/// } +/// +/// // With a single crew working 8 hours a day: +/// assert_eq!(assign_days(&tasks, 8), [ +/// (7, vec!["Foundation"]), +/// (8, vec!["Framing", "Plumbing"]), +/// (7, vec!["Electrical", "Insulation"]), +/// (5, vec!["Drywall"]), +/// (7, vec!["Floors", "Countertops"]), +/// (4, vec!["Bathrooms"]), +/// ]); +/// +/// // With two crews working in shifts, 16 hours a day: +/// assert_eq!(assign_days(&tasks, 16), [ +/// (14, vec!["Foundation", "Framing", "Plumbing"]), +/// (15, vec!["Electrical", "Insulation", "Drywall", "Floors"]), +/// (6, vec!["Countertops", "Bathrooms"]), +/// ]); +/// ``` +/// +/// Apologies to anyone who actually knows how to build a house and +/// knows how 
long each step takes :-) +pub fn wrap_fragments usize>( + fragments: &[T], + line_widths: F, +) -> Vec<&[T]> { + let mut lines = Vec::new(); + let mut start = 0; + let mut width = 0; + + for (idx, fragment) in fragments.iter().enumerate() { + let line_width = line_widths(lines.len()); + if width + fragment.width() + fragment.penalty_width() > line_width && idx > start { + lines.push(&fragments[start..idx]); + start = idx; + width = 0; + } + width += fragment.width() + fragment.whitespace_width(); + } + lines.push(&fragments[start..]); + lines +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{Options, WordSplitter}; + #[cfg(feature = "hyphenation")] + use hyphenation::{Language, Load, Standard}; + + // Like assert_eq!, but the left expression is an iterator. + macro_rules! assert_iter_eq { + ($left:expr, $right:expr) => { + assert_eq!($left.collect::>(), $right); + }; + } + + #[test] + fn skip_ansi_escape_sequence_works() { + let blue_text = "\u{1b}[34mHello\u{1b}[0m"; + let mut chars = blue_text.chars(); + let ch = chars.next().unwrap(); + assert!(skip_ansi_escape_sequence(ch, &mut chars)); + assert_eq!(chars.next(), Some('H')); + } + + #[test] + fn find_words_empty() { + assert_iter_eq!(find_words(""), vec![]); + } + + #[test] + fn find_words_single_word() { + assert_iter_eq!(find_words("foo"), vec![Word::from("foo")]); + } + + #[test] + fn find_words_two_words() { + assert_iter_eq!( + find_words("foo bar"), + vec![Word::from("foo "), Word::from("bar")] + ); + } + + #[test] + fn find_words_multiple_words() { + assert_iter_eq!( + find_words("foo bar baz"), + vec![Word::from("foo "), Word::from("bar "), Word::from("baz")] + ); + } + + #[test] + fn find_words_whitespace() { + assert_iter_eq!(find_words(" "), vec![Word::from(" ")]); + } + + #[test] + fn find_words_inter_word_whitespace() { + assert_iter_eq!( + find_words("foo bar"), + vec![Word::from("foo "), Word::from("bar")] + ) + } + + #[test] + fn find_words_trailing_whitespace() { + 
assert_iter_eq!(find_words("foo "), vec![Word::from("foo ")]); + } + + #[test] + fn find_words_leading_whitespace() { + assert_iter_eq!( + find_words(" foo"), + vec![Word::from(" "), Word::from("foo")] + ); + } + + #[test] + fn find_words_multi_column_char() { + assert_iter_eq!( + find_words("\u{1f920}"), // cowboy emoji 🤠 + vec![Word::from("\u{1f920}")] + ); + } + + #[test] + fn find_words_hyphens() { + assert_iter_eq!(find_words("foo-bar"), vec![Word::from("foo-bar")]); + assert_iter_eq!( + find_words("foo- bar"), + vec![Word::from("foo- "), Word::from("bar")] + ); + assert_iter_eq!( + find_words("foo - bar"), + vec![Word::from("foo "), Word::from("- "), Word::from("bar")] + ); + assert_iter_eq!( + find_words("foo -bar"), + vec![Word::from("foo "), Word::from("-bar")] + ); + } + + #[test] + fn split_words_no_words() { + assert_iter_eq!(split_words(vec![], &80), vec![]); + } + + #[test] + fn split_words_empty_word() { + assert_iter_eq!( + split_words(vec![Word::from(" ")], &80), + vec![Word::from(" ")] + ); + } + + #[test] + fn split_words_hyphen_splitter() { + assert_iter_eq!( + split_words(vec![Word::from("foo-bar")], &80), + vec![Word::from("foo-"), Word::from("bar")] + ); + } + + #[test] + fn split_words_short_line() { + // Note that `split_words` does not take the line width into + // account, that is the job of `break_words`. 
+ assert_iter_eq!( + split_words(vec![Word::from("foobar")], &3), + vec![Word::from("foobar")] + ); + } + + #[test] + fn split_words_adds_penalty() { + #[derive(Debug)] + struct FixedSplitPoint; + impl WordSplitter for FixedSplitPoint { + fn split_points(&self, _: &str) -> Vec { + vec![3] + } + } + + let options = Options::new(80).splitter(Box::new(FixedSplitPoint)); + assert_iter_eq!( + split_words(vec![Word::from("foobar")].into_iter(), &&options), + vec![ + Word { + word: "foo", + width: 3, + whitespace: "", + penalty: "-" + }, + Word { + word: "bar", + width: 3, + whitespace: "", + penalty: "" + } + ] + ); + + assert_iter_eq!( + split_words(vec![Word::from("fo-bar")].into_iter(), &&options), + vec![ + Word { + word: "fo-", + width: 3, + whitespace: "", + penalty: "" + }, + Word { + word: "bar", + width: 3, + whitespace: "", + penalty: "" + } + ] + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index a59e6d8d..e92594ab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -95,21 +95,8 @@ #![allow(clippy::redundant_field_names)] use std::borrow::Cow; -use std::str::CharIndices; - -use unicode_width::UnicodeWidthChar; use unicode_width::UnicodeWidthStr; -/// A non-breaking space. -const NBSP: char = '\u{a0}'; - -/// The CSI or "Control Sequence Introducer" introduces an ANSI escape -/// sequence. This is typically used for colored text and will be -/// ignored when computing the text width. -const CSI: (char, char) = ('\u{1b}', '['); -/// The final bytes of an ANSI escape sequence must be in this range. -const ANSI_FINAL_BYTE: std::ops::RangeInclusive = '\x40'..='\x7e'; - mod indentation; pub use crate::indentation::dedent; pub use crate::indentation::indent; @@ -117,6 +104,8 @@ pub use crate::indentation::indent; mod splitting; pub use crate::splitting::{HyphenSplitter, NoHyphenation, WordSplitter}; +pub mod core; + /// Options for wrapping and filling text. Used with the [`wrap`] and /// [`fill`] functions. 
/// @@ -132,8 +121,8 @@ pub trait WrapOptions { /// Allow long words to be broken if they cannot fit on a line. /// When set to `false`, some lines may be longer than `width`. fn break_words(&self) -> bool; - /// Split word as with `WordSplitter::split`. - fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)>; + /// Find indices where `word` can be split. + fn split_points(&self, word: &str) -> Vec; } /// Holds settings for wrapping and filling text. @@ -188,8 +177,8 @@ impl WrapOptions for &Options<'_> { self.break_words } #[inline] - fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)> { - self.splitter.split(word) + fn split_points(&self, word: &str) -> Vec { + self.splitter.split_points(word) } } @@ -226,8 +215,8 @@ impl WrapOptions for usize { true } #[inline] - fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)> { - HyphenSplitter.split(word) + fn split_points(&self, word: &str) -> Vec { + HyphenSplitter.split_points(word) } } @@ -358,210 +347,6 @@ impl<'a> Options<'a> { } } -/// Like `char::is_whitespace`, but non-breaking spaces don't count. -#[inline] -fn is_whitespace(ch: char) -> bool { - ch.is_whitespace() && ch != NBSP -} - -#[derive(Debug)] -struct WrapIter<'input, T: WrapOptions> { - options: T, - - // String to wrap. - source: &'input str, - // CharIndices iterator over self.source. - char_indices: CharIndices<'input>, - // Byte index where the current line starts. - start: usize, - // Byte index of the last place where the string can be split. - split: usize, - // Size in bytes of the character at self.source[self.split]. - split_len: usize, - // Width of self.source[self.start..idx]. - line_width: usize, - // Width of self.source[self.start..self.split]. - line_width_at_split: usize, - // Tracking runs of whitespace characters. - in_whitespace: bool, - // Has iterator finished producing elements? 
- finished: bool, -} - -impl WrapIter<'_, T> { - fn new(options: T, s: &str) -> WrapIter<'_, T> { - let initial_indent_width = options.initial_indent().width(); - - WrapIter { - options: options, - source: s, - char_indices: s.char_indices(), - start: 0, - split: 0, - split_len: 0, - line_width: initial_indent_width, - line_width_at_split: initial_indent_width, - in_whitespace: false, - finished: false, - } - } - - fn create_result_line(&self) -> Cow<'static, str> { - let indent = if self.start == 0 { - self.options.initial_indent() - } else { - self.options.subsequent_indent() - }; - if indent.is_empty() { - Cow::Borrowed("") // return Cow<'static, str> - } else { - // This removes the link between the lifetime of the - // indentation and the input string. The non-empty - // indentation will force us to create an owned `String` - // in any case. - Cow::Owned(String::from(indent)) - } - } -} - -impl<'input, T: WrapOptions> Iterator for WrapIter<'input, T> { - type Item = Cow<'input, str>; - - fn next(&mut self) -> Option> { - if self.finished { - return None; - } - - while let Some((idx, ch)) = self.char_indices.next() { - if ch == CSI.0 && self.char_indices.next().map(|(_, ch)| ch) == Some(CSI.1) { - // We have found the start of an ANSI escape code, - // typically used for colored text. We ignore all - // characters until we find a "final byte" in the - // range 0x40–0x7E. - while let Some((_, ch)) = self.char_indices.next() { - if ANSI_FINAL_BYTE.contains(&ch) { - break; - } - } - // Done with the escape sequence, we continue with - // next character in the outer loop. - continue; - } - - let char_width = ch.width().unwrap_or(0); - let char_len = ch.len_utf8(); - if ch == '\n' { - self.split = idx; - self.split_len = char_len; - self.line_width_at_split = self.line_width; - self.in_whitespace = false; - - // If this is not the final line, return the current line. 
Otherwise, - // we will return the line with its line break after exiting the loop - if self.split + self.split_len < self.source.len() { - let mut line = self.create_result_line(); - line += &self.source[self.start..self.split]; - - self.start = self.split + self.split_len; - self.line_width = self.options.subsequent_indent().width(); - - return Some(line); - } - } else if is_whitespace(ch) { - // Extend the previous split or create a new one. - if self.in_whitespace { - self.split_len += char_len; - } else { - self.split = idx; - self.split_len = char_len; - } - self.line_width_at_split = self.line_width + char_width; - self.in_whitespace = true; - } else if self.line_width + char_width > self.options.width() { - // There is no room for this character on the current - // line. Try to split the final word. - self.in_whitespace = false; - let remaining_text = &self.source[self.split + self.split_len..]; - let final_word = match remaining_text.find(is_whitespace) { - Some(i) => &remaining_text[..i], - None => remaining_text, - }; - - let mut hyphen = ""; - let splits = self.options.split(final_word); - for &(head, hyp, _) in splits.iter().rev() { - if self.line_width_at_split + head.width() + hyp.width() <= self.options.width() - { - // We can fit head into the current line. - // Advance the split point by the width of the - // whitespace and the head length. - self.split += self.split_len + head.len(); - // The new `split_len` is equal to the stretch - // of whitespace following the split. - self.split_len = remaining_text[head.len()..] - .char_indices() - .skip_while(|(_, ch)| is_whitespace(*ch)) - .next() - .map_or(0, |(idx, _)| idx); - self.line_width_at_split += head.width() + hyp.width(); - hyphen = hyp; - break; - } - } - - if self.start >= self.split { - // The word is too big to fit on a single line. - if self.options.break_words() { - // Break work at current index. 
- self.split = idx; - self.split_len = 0; - self.line_width_at_split = self.line_width; - } else { - // Add smallest split. - self.split += self.split_len + splits[0].0.len(); - // The new `split_len` is equal to the stretch - // of whitespace following the smallest split. - self.split_len = remaining_text[splits[0].0.len()..] - .char_indices() - .skip_while(|(_, ch)| is_whitespace(*ch)) - .next() - .map_or(0, |(idx, _)| idx); - self.line_width_at_split = self.line_width; - } - } - - if self.start < self.split { - let mut line = self.create_result_line(); - line += &self.source[self.start..self.split]; - line += hyphen; - - self.start = self.split + self.split_len; - self.line_width += self.options.subsequent_indent().width(); - self.line_width -= self.line_width_at_split; - self.line_width += char_width; - self.line_width_at_split = self.options.subsequent_indent().width(); - - return Some(line); - } - } else { - self.in_whitespace = false; - } - self.line_width += char_width; - } - - self.finished = true; - - // Add final line. - if self.start < self.source.len() { - let mut line = self.create_result_line(); - line += &self.source[self.start..]; - return Some(line); - } - - None - } -} - /// Return the current terminal width. If the terminal width cannot be /// determined (typically because the standard output is not connected /// to a terminal), a default width of 80 characters will be used. @@ -622,7 +407,7 @@ pub fn fill(text: &str, options: T) -> String { // indentation, no hyphenation). let mut result = String::with_capacity(text.len()); - for (i, line) in wrap(text, options).enumerate() { + for (i, line) in wrap(text, options).iter().enumerate() { if i > 0 { result.push('\n'); } @@ -634,7 +419,9 @@ pub fn fill(text: &str, options: T) -> String { /// Wrap a line of text at `width` characters. /// -/// The result is an iterator yielding individual lines. 
Use the +/// The result is a vector of lines, each line is of type `Cow<'_, +/// str>`, which means that the line will borrow from the input `&str` +/// if possible. The lines do not have a trailing `'\n'`. Use the /// [`fill`] function if you need a `String` instead. /// /// The easiest way to use this function is to pass an integer for @@ -644,7 +431,7 @@ pub fn fill(text: &str, options: T) -> String { /// use textwrap::wrap; /// /// let lines = wrap("Memory safety without garbage collection.", 15); -/// assert_eq!(lines.collect::>(), &[ +/// assert_eq!(lines, &[ /// "Memory safety", /// "without garbage", /// "collection.", @@ -659,7 +446,7 @@ pub fn fill(text: &str, options: T) -> String { /// /// let options = Options::new(15).initial_indent("- ").subsequent_indent(" "); /// let lines = wrap("Memory safety without garbage collection.", &options); -/// assert_eq!(lines.collect::>(), &[ +/// assert_eq!(lines, &[ /// "- Memory safety", /// " without", /// " garbage", @@ -681,7 +468,7 @@ pub fn fill(text: &str, options: T) -> String { /// /// let options = Options::new(15).subsequent_indent("...."); /// let lines = wrap("Wrapping text all day long.", &options); -/// let annotated = lines.map(|line| match line { +/// let annotated = lines.iter().map(|line| match line { /// Borrowed(text) => format!("[Borrowed] {}", text), /// Owned(text) => format!("[Owned] {}", text), /// }).collect::>(); @@ -693,8 +480,85 @@ pub fn fill(text: &str, options: T) -> String { /// ``` /// /// [`fill`]: fn.fill.html -pub fn wrap(text: &str, options: T) -> impl Iterator> { - WrapIter::new(options, text) +pub fn wrap(text: &str, options: T) -> Vec> { + let initial_width = options + .width() + .saturating_sub(options.initial_indent().width()); + let subsequent_width = options + .width() + .saturating_sub(options.subsequent_indent().width()); + + let mut lines = Vec::new(); + for line in text.split('\n') { + let words = core::find_words(line); + let split_words = core::split_words(words, 
&options); + let broken_words = if options.break_words() { + let mut broken_words = core::break_words(split_words, subsequent_width); + if !options.initial_indent().is_empty() { + // Without this, the first word will always go into + // the first line. However, since we break words based + // on the _second_ line width, it can be wrong to + // unconditionally put the first word onto the first + // line. An empty zero-width word fixed this. + broken_words.insert(0, core::Word::from("")); + } + broken_words + } else { + split_words.collect::<Vec<_>>() + }; + + #[rustfmt::skip] + let line_lengths = |i| if i == 0 { initial_width } else { subsequent_width }; + let wrapped_words = core::wrap_fragments(&broken_words, line_lengths); + + let mut idx = 0; + for words in wrapped_words { + let last_word = match words.last() { + None => { + lines.push(Cow::from("")); + continue; + } + Some(word) => word, + }; + + // We assume here that all words are contiguous in `line`. + // That is, the sum of their lengths should add up to the + // length of `line`. + let len = words + .iter() + .map(|word| word.len() + word.whitespace.len()) + .sum::<usize>() + - last_word.whitespace.len(); + + // The result is owned if we have indentation, otherwise + // we can simply borrow an empty string. + let mut result = if lines.is_empty() && !options.initial_indent().is_empty() { + Cow::Owned(options.initial_indent().to_owned()) + } else if !lines.is_empty() && !options.subsequent_indent().is_empty() { + Cow::Owned(options.subsequent_indent().to_owned()) + } else { + // We can use an empty string here since string + // concatenation for `Cow` preserves a borrowed value + // when either side is empty. 
+ Cow::from("") + }; + + result += &line[idx..idx + len]; + + if !last_word.penalty.is_empty() { + result.to_mut().push_str(&last_word.penalty); + } + + lines.push(result); + + // Advance by the length of `result`, plus the length of + // `last_word.whitespace` -- even if we had a penalty, we + // need to skip over the whitespace. + idx += len + last_word.whitespace.len(); + } + } + + lines } #[cfg(test)] @@ -703,12 +567,6 @@ mod tests { #[cfg(feature = "hyphenation")] use hyphenation::{Language, Load, Standard}; - macro_rules! assert_iter_eq { - ($left:expr, $right:expr) => { - assert_eq!($left.collect::>(), $right); - }; - } - #[test] fn options_agree_with_usize() { let opt_usize: &dyn WrapOptions = &42; @@ -722,70 +580,59 @@ mod tests { ); assert_eq!(opt_usize.break_words(), opt_options.break_words()); assert_eq!( - opt_usize.split("hello-world"), - opt_options.split("hello-world") + opt_usize.split_points("hello-world"), + opt_options.split_points("hello-world") ); } #[test] fn no_wrap() { - assert_iter_eq!(wrap("foo", 10), vec!["foo"]); + assert_eq!(wrap("foo", 10), vec!["foo"]); } #[test] - fn simple() { - assert_iter_eq!(wrap("foo bar baz", 5), vec!["foo", "bar", "baz"]); + fn wrap_simple() { + assert_eq!(wrap("foo bar baz", 5), vec!["foo", "bar", "baz"]); } #[test] - fn multi_word_on_line() { - assert_iter_eq!(wrap("foo bar baz", 10), vec!["foo bar", "baz"]); + fn multiple_words_on_first_line() { + assert_eq!(wrap("foo bar baz", 10), vec!["foo bar", "baz"]); } #[test] fn long_word() { - assert_iter_eq!(wrap("foo", 0), vec!["f", "o", "o"]); + assert_eq!(wrap("foo", 0), vec!["f", "o", "o"]); } #[test] fn long_words() { - assert_iter_eq!(wrap("foo bar", 0), vec!["f", "o", "o", "b", "a", "r"]); + assert_eq!(wrap("foo bar", 0), vec!["f", "o", "o", "b", "a", "r"]); } #[test] fn max_width() { - assert_iter_eq!(wrap("foo bar", usize::max_value()), vec!["foo bar"]); + assert_eq!(wrap("foo bar", usize::max_value()), vec!["foo bar"]); } #[test] fn leading_whitespace() 
{ - assert_iter_eq!(wrap(" foo bar", 6), vec![" foo", "bar"]); + assert_eq!(wrap(" foo bar", 6), vec![" foo", "bar"]); } #[test] fn trailing_whitespace() { - assert_iter_eq!(wrap("foo bar ", 6), vec!["foo", "bar "]); - } - - #[test] - fn interior_whitespace() { - assert_iter_eq!(wrap("foo: bar baz", 10), vec!["foo: bar", "baz"]); - } - - #[test] - fn extra_whitespace_start_of_line() { // Whitespace is only significant inside a line. After a line // gets too long and is broken, the first word starts in - // column zero and is not indented. The line before might end - // up with trailing whitespace. - assert_iter_eq!(wrap("foo bar", 5), vec!["foo", "bar"]); + // column zero and is not indented. + assert_eq!(wrap("foo bar baz", 5), vec!["foo", "bar", "baz"]); } #[test] fn issue_99() { // We did not reset the in_whitespace flag correctly and did // not handle single-character words after a line break. - assert_iter_eq!( + assert_eq!( wrap("aaabbbccc x yyyzzzwww", 9), vec!["aaabbbccc", "x", "yyyzzzwww"] ); @@ -795,22 +642,26 @@ mod tests { fn issue_129() { // The dash is an em-dash which takes up four bytes. We used // to panic since we tried to index into the character. - assert_iter_eq!(wrap("x – x", 1), vec!["x", "–", "x"]); + assert_eq!(wrap("x – x", 1), vec!["x", "–", "x"]); } #[test] fn wide_character_handling() { - assert_iter_eq!(wrap("Hello, World!", 15), vec!["Hello, World!"]); - assert_iter_eq!( + assert_eq!(wrap("Hello, World!", 15), vec!["Hello, World!"]); + assert_eq!( wrap("Hello, World!", 15), vec!["Hello,", "World!"] ); } #[test] - fn empty_input_not_indented() { + fn empty_line_is_indented() { + // Previously, indentation was not applied to empty lines. + // However, this is somewhat inconsistent and undesirable if + // the indentation is something like a border ("| ") which you + // want to apply to all lines, empty or not. 
let options = Options::new(10).initial_indent("!!!"); - assert_eq!(fill("", &options), ""); + assert_eq!(fill("", &options), "!!!"); } #[test] @@ -819,10 +670,19 @@ mod tests { assert_eq!(fill("foo", &options), ">>>foo"); } + #[test] + fn indent_first() { + let options = Options::new(10).initial_indent("👉👉"); + assert_eq!( + wrap("x x x x x x x x x x x x x", &options), + vec!["👉👉x x x", "x x x x x", "x x x x x"] + ); + } + #[test] fn indent_multiple_lines() { let options = Options::new(6).initial_indent("* ").subsequent_indent(" "); - assert_iter_eq!( + assert_eq!( wrap("foo bar baz", &options), vec!["* foo", " bar", " baz"] ); @@ -831,29 +691,39 @@ mod tests { #[test] fn indent_break_words() { let options = Options::new(5).initial_indent("* ").subsequent_indent(" "); - assert_iter_eq!(wrap("foobarbaz", &options), vec!["* foo", " bar", " baz"]); + assert_eq!(wrap("foobarbaz", &options), vec!["* foo", " bar", " baz"]); + } + + #[test] + fn initial_indent_break_words() { + // This is a corner-case showing how the long word is broken + // according to the width of the subsequent lines. The first + // fragment of the word no longer fits on the first line, + // which ends up being pure indentation. 
+ let options = Options::new(5).initial_indent("-->"); + assert_eq!(wrap("foobarbaz", &options), vec!["-->", "fooba", "rbaz"]); } #[test] fn hyphens() { - assert_iter_eq!(wrap("foo-bar", 5), vec!["foo-", "bar"]); + assert_eq!(wrap("foo-bar", 5), vec!["foo-", "bar"]); } #[test] fn trailing_hyphen() { let options = Options::new(5).break_words(false); - assert_iter_eq!(wrap("foobar-", &options), vec!["foobar-"]); + assert_eq!(wrap("foobar-", &options), vec!["foobar-"]); } #[test] fn multiple_hyphens() { - assert_iter_eq!(wrap("foo-bar-baz", 5), vec!["foo-", "bar-", "baz"]); + assert_eq!(wrap("foo-bar-baz", 5), vec!["foo-", "bar-", "baz"]); } #[test] fn hyphens_flag() { let options = Options::new(5).break_words(false); - assert_iter_eq!( + assert_eq!( wrap("The --foo-bar flag.", &options), vec!["The", "--foo-", "bar", "flag."] ); @@ -862,39 +732,39 @@ mod tests { #[test] fn repeated_hyphens() { let options = Options::new(4).break_words(false); - assert_iter_eq!(wrap("foo--bar", &options), vec!["foo--bar"]); + assert_eq!(wrap("foo--bar", &options), vec!["foo--bar"]); } #[test] fn hyphens_alphanumeric() { - assert_iter_eq!(wrap("Na2-CH4", 5), vec!["Na2-", "CH4"]); + assert_eq!(wrap("Na2-CH4", 5), vec!["Na2-", "CH4"]); } #[test] fn hyphens_non_alphanumeric() { let options = Options::new(5).break_words(false); - assert_iter_eq!(wrap("foo(-)bar", &options), vec!["foo(-)bar"]); + assert_eq!(wrap("foo(-)bar", &options), vec!["foo(-)bar"]); } #[test] fn multiple_splits() { - assert_iter_eq!(wrap("foo-bar-baz", 9), vec!["foo-bar-", "baz"]); + assert_eq!(wrap("foo-bar-baz", 9), vec!["foo-bar-", "baz"]); } #[test] fn forced_split() { let options = Options::new(5).break_words(false); - assert_iter_eq!(wrap("foobar-baz", &options), vec!["foobar-", "baz"]); + assert_eq!(wrap("foobar-baz", &options), vec!["foobar-", "baz"]); } #[test] fn multiple_unbroken_words_issue_193() { let options = Options::new(3).break_words(false); - assert_iter_eq!( + assert_eq!( wrap("small large tiny", 
&options), vec!["small", "large", "tiny"] ); - assert_iter_eq!( + assert_eq!( wrap("small large tiny", &options), vec!["small", "large", "tiny"] ); @@ -903,28 +773,28 @@ mod tests { #[test] fn very_narrow_lines_issue_193() { let options = Options::new(1).break_words(false); - assert_iter_eq!(wrap("fooo x y", &options), vec!["fooo", "x", "y"]); - assert_iter_eq!(wrap("fooo x y", &options), vec!["fooo", "x", "y"]); + assert_eq!(wrap("fooo x y", &options), vec!["fooo", "x", "y"]); + assert_eq!(wrap("fooo x y", &options), vec!["fooo", "x", "y"]); } #[test] fn no_hyphenation() { let options = Options::new(8).splitter(Box::new(NoHyphenation)); - assert_iter_eq!(wrap("foo bar-baz", &options), vec!["foo", "bar-baz"]); + assert_eq!(wrap("foo bar-baz", &options), vec!["foo", "bar-baz"]); } #[test] #[cfg(feature = "hyphenation")] - fn auto_hyphenation() { + fn auto_hyphenation_double_hyphenation() { let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); let options = Options::new(10); - assert_iter_eq!( + assert_eq!( wrap("Internationalization", &options), vec!["Internatio", "nalization"] ); let options = Options::new(10).splitter(Box::new(dictionary)); - assert_iter_eq!( + assert_eq!( wrap("Internationalization", &options), vec!["Interna-", "tionaliza-", "tion"] ); @@ -935,15 +805,15 @@ mod tests { fn auto_hyphenation_issue_158() { let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); let options = Options::new(10); - assert_iter_eq!( + assert_eq!( wrap("participation is the key to success", &options), vec!["participat", "ion is the", "key to", "success"] ); let options = Options::new(10).splitter(Box::new(dictionary)); - assert_iter_eq!( + assert_eq!( wrap("participation is the key to success", &options), - vec!["participa-", "tion is the", "key to", "success"] + vec!["participa-", "tion is", "the key to", "success"] ); } @@ -954,7 +824,7 @@ mod tests { // into account. 
let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); let options = Options::new(15).splitter(Box::new(dictionary)); - assert_iter_eq!( + assert_eq!( wrap("garbage collection", &options), vec!["garbage col-", "lection"] ); @@ -968,7 +838,7 @@ mod tests { use std::borrow::Cow::{Borrowed, Owned}; let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); let options = Options::new(10).splitter(Box::new(dictionary)); - let lines = wrap("Internationalization", &options).collect::<Vec<_>>(); + let lines = wrap("Internationalization", &options); if let Borrowed(s) = lines[0] { assert!(false, "should not have been borrowed: {:?}", s); } @@ -985,13 +855,13 @@ mod tests { fn auto_hyphenation_with_hyphen() { let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap(); let options = Options::new(8).break_words(false); - assert_iter_eq!( + assert_eq!( wrap("over-caffinated", &options), vec!["over-", "caffinated"] ); let options = options.splitter(Box::new(dictionary)); - assert_iter_eq!( + assert_eq!( wrap("over-caffinated", &options), vec!["over-", "caffi-", "nated"] ); @@ -999,17 +869,22 @@ mod tests { #[test] fn break_words() { - assert_iter_eq!(wrap("foobarbaz", 3), vec!["foo", "bar", "baz"]); + assert_eq!(wrap("foobarbaz", 3), vec!["foo", "bar", "baz"]); } #[test] fn break_words_wide_characters() { - assert_iter_eq!(wrap("Ｈｅｌｌｏ", 5), vec!["Ｈｅ", "ｌｌ", "ｏ"]); + assert_eq!(wrap("Ｈｅｌｌｏ", 5), vec!["Ｈｅ", "ｌｌ", "ｏ"]); } #[test] fn break_words_zero_width() { - assert_iter_eq!(wrap("foobar", 0), vec!["f", "o", "o", "b", "a", "r"]); + assert_eq!(wrap("foobar", 0), vec!["f", "o", "o", "b", "a", "r"]); + } + + #[test] + fn break_long_first_word() { + assert_eq!(wrap("testx y", 4), vec!["test", "x y"]); } #[test] @@ -1019,14 +894,21 @@ mod tests { } #[test] - fn preserve_line_breaks() { - assert_eq!(fill("test\n", 11), "test\n"); - assert_eq!(fill("test\n\na\n\n", 11), "test\n\na\n\n"); - assert_eq!(fill("1 3 5 7\n1 3 5 7", 7), "1 3 5 7\n1 3 5 7"); +
fn break_words_empty_lines() { + assert_eq!( + fill("foo\nbar", &Options::new(2).break_words(false)), + "foo\nbar" + ); } #[test] - fn wrap_preserve_line_breaks() { + fn preserve_line_breaks() { + assert_eq!(fill("", 80), ""); + assert_eq!(fill("\n", 80), "\n"); + assert_eq!(fill("\n\n\n", 80), "\n\n\n"); + assert_eq!(fill("test\n", 80), "test\n"); + assert_eq!(fill("test\n\na\n\n", 80), "test\n\na\n\n"); + assert_eq!(fill("1 3 5 7\n1 3 5 7", 7), "1 3 5 7\n1 3 5 7"); assert_eq!(fill("1 3 5 7\n1 3 5 7", 5), "1 3 5\n7\n1 3 5\n7"); } diff --git a/src/splitting.rs b/src/splitting.rs index c35ae584..619804f9 100644 --- a/src/splitting.rs +++ b/src/splitting.rs @@ -5,13 +5,7 @@ //! functionality. [`HyphenSplitter`] is the default implementation of //! this treat: it will simply split words on existing hyphens. -/// An interface for splitting words. -/// -/// When the [`wrap`] function tries to fit text into a line, it will -/// eventually find a word that it too large the current text width. -/// It will then call the currently configured `WordSplitter` to have -/// it attempt to split the word into smaller parts. This trait -/// describes that functionality via the [`split`] method. +/// The `WordSplitter` trait describes where words can be split. /// /// If the `textwrap` crate has been compiled with the `hyphenation` /// feature enabled, you will find an implementation of `WordSplitter` @@ -20,25 +14,23 @@ /// for details. /// /// [`wrap`]: ../fn.wrap.html -/// [`split`]: #tymethod.split /// [`hyphenation` documentation]: https://docs.rs/hyphenation/ pub trait WordSplitter: std::fmt::Debug { - /// Return all possible splits of word. Each split is a triple - /// with a head, a hyphen, and a tail where `head + &tail == word`. - /// The hyphen can be empty if there is already a hyphen in the - /// head. + /// Return all possible indices where `word` can be split. + /// + /// The indices returned must be in range `0..word.len()`. 
They + /// should point to the index _after_ the split point, i.e., after + /// `-` if splitting on hyphens. This way, `word.split_at(idx)` + /// will break the word into two well-formed pieces. /// - /// The splits should go from smallest to longest and should - /// include no split at all. So the word "technology" could be - /// split into + /// # Examples /// - /// ```no_run - /// vec![("tech", "-", "nology"), - /// ("technol", "-", "ogy"), - /// ("technolo", "-", "gy"), - /// ("technology", "", "")]; /// ``` - fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)>; + /// use textwrap::{NoHyphenation, HyphenSplitter, WordSplitter}; + /// assert_eq!(NoHyphenation.split_points("cannot-be-split"), vec![]); + /// assert_eq!(HyphenSplitter.split_points("can-be-split"), vec![4, 7]); + /// ``` + fn split_points(&self, word: &str) -> Vec<usize>; } /// Use this as a [`Options.splitter`] to avoid any kind of @@ -48,7 +40,7 @@ pub trait WordSplitter: std::fmt::Debug { /// use textwrap::{wrap, Options, NoHyphenation}; /// /// let options = Options::new(8).splitter(Box::new(NoHyphenation)); -/// assert_eq!(wrap("foo bar-baz", &options).collect::<Vec<_>>(), +/// assert_eq!(wrap("foo bar-baz", &options), /// vec!["foo", "bar-baz"]); /// ``` /// @@ -59,8 +51,8 @@ pub struct NoHyphenation; /// `NoHyphenation` implements `WordSplitter` by not splitting the /// word at all. impl WordSplitter for NoHyphenation { - fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)> { - vec![(word, "", "")] + fn split_points(&self, _: &str) -> Vec<usize> { + Vec::new() } } @@ -80,40 +72,24 @@ pub struct HyphenSplitter; /// characters, which prevents a word like "--foo-bar" from being /// split on the first or second hyphen. impl WordSplitter for HyphenSplitter { - fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)> { - let mut triples = Vec::new(); - // Split on hyphens, smallest split first. We only use hyphens - // that are surrounded by alphanumeric characters.
This is to - // avoid splitting on repeated hyphens, such as those found in - // --foo-bar. - let mut char_indices = word.char_indices(); - // Early return if the word is empty. - let mut prev = match char_indices.next() { - None => return vec![(word, "", "")], - Some((_, ch)) => ch, - }; + fn split_points(&self, word: &str) -> Vec<usize> { + let mut splits = Vec::new(); - // Find current word, or return early if the word only has a - // single character. - let (mut idx, mut cur) = match char_indices.next() { - None => return vec![(word, "", "")], - Some((idx, cur)) => (idx, cur), - }; + for (idx, _) in word.match_indices('-') { + // We only use hyphens that are surrounded by alphanumeric + // characters. This is to avoid splitting on repeated hyphens, + // such as those found in --foo-bar. + let prev = word[..idx].chars().next_back(); + let next = word[idx + 1..].chars().next(); - for (i, next) in char_indices { - if prev.is_alphanumeric() && cur == '-' && next.is_alphanumeric() { - let (head, tail) = word.split_at(idx + 1); - triples.push((head, "", tail)); + if prev.filter(|ch| ch.is_alphanumeric()).is_some() + && next.filter(|ch| ch.is_alphanumeric()).is_some() + { + splits.push(idx + 1); // +1 due to width of '-'. } - prev = cur; - idx = i; - cur = next; } - // Finally option is no split at all. - triples.push((word, "", "")); - - triples + splits } } @@ -124,18 +100,8 @@ impl WordSplitter for HyphenSplitter { /// enabled. #[cfg(feature = "hyphenation")] impl WordSplitter for hyphenation::Standard { - fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)> { + fn split_points(&self, word: &str) -> Vec<usize> { use hyphenation::Hyphenator; - // Find splits based on language dictionary. - let mut triples = Vec::new(); - for n in self.hyphenate(word).breaks { - let (head, tail) = word.split_at(n); - let hyphen = if head.ends_with('-') { "" } else { "-" }; - triples.push((head, hyphen, tail)); - } - // Finally option is no split at all.
- triples.push((word, "", "")); - - triples + self.hyphenate(word).breaks } }