Use Unicode line breaking algorithm to find words

This adds a new optional dependency on the unicode-linebreak crate, which implements the line breaking algorithm from [Unicode Standard Annex #14](https://www.unicode.org/reports/tr14/). We can use this to find words in non-ASCII text. The new dependency is enabled by default since these line breaks are more correct than what you get by splitting on ASCII spaces. This should help address #220 and #80, though I’m no expert on non-Western languages, so more feedback from the community is needed here.
mgeisler · May 2, 2021 · 6c5220b
1 parent 48b9480
commit 6c5220b
Show file tree

Hide file tree

Showing 9 changed files with 323 additions and 37 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -21,11 +21,12 @@ harness = false
 path = "benches/linear.rs"
 
 [features]
-default = ["unicode-width", "smawk"]
+default = ["unicode-linebreak", "unicode-width", "smawk"]
 
 [dependencies]
 smawk = { version = "0.3", optional = true }
 terminal_size = { version = "0.1", optional = true }
+unicode-linebreak = { version = "0.1", optional = true }
 unicode-width = { version= "0.1", optional = true }
 
 [dependencies.hyphenation]

diff --git a/benches/linear.rs b/benches/linear.rs
@@ -26,10 +26,25 @@ pub fn benchmark(c: &mut Criterion) {
 
         #[cfg(feature = "smawk")]
         {
+            #[cfg(feature = "unicode-linebreak")]
+            {
+                let options = textwrap::Options::new(LINE_LENGTH)
+                    .wrap_algorithm(textwrap::core::WrapAlgorithm::OptimalFit)
+                    .line_breaks(textwrap::UnicodeLineBreaks);
+                group.bench_with_input(
+                    BenchmarkId::new("fill_optimal_fit_unicode", length),
+                    &text,
+                    |b, text| {
+                        b.iter(|| textwrap::fill(text, &options));
+                    },
+                );
+            }
+
             let options = textwrap::Options::new(LINE_LENGTH)
-                .wrap_algorithm(textwrap::core::WrapAlgorithm::OptimalFit);
+                .wrap_algorithm(textwrap::core::WrapAlgorithm::OptimalFit)
+                .word_separator(textwrap::AsciiSpace);
             group.bench_with_input(
-                BenchmarkId::new("fill_optimal_fit", length),
+                BenchmarkId::new("fill_optimal_fit_ascii_space", length),
                 &text,
                 |b, text| {
                     b.iter(|| textwrap::fill(text, &options));

diff --git a/examples/interactive.rs b/examples/interactive.rs
@@ -21,7 +21,7 @@ mod unix_only {
     use termion::{color, cursor, style};
     #[cfg(feature = "smawk")]
     use textwrap::core::WrapAlgorithm::{FirstFit, OptimalFit};
-    use textwrap::{wrap, AsciiSpace, Options};
+    use textwrap::{wrap, AsciiSpace, Options, WordSeparator};
     use textwrap::{HyphenSplitter, NoHyphenation, WordSplitter};
 
     #[cfg(feature = "hyphenation")]
@@ -57,7 +57,7 @@ mod unix_only {
 
     fn draw_text<'a>(
         text: &str,
-        options: &Options<'a, AsciiSpace, Box<dyn WordSplitter>>,
+        options: &Options<'a, Box<dyn WordSeparator>, Box<dyn WordSplitter>>,
         splitter_label: &str,
         stdout: &mut RawTerminal<io::Stdout>,
     ) -> Result<(), io::Error> {
@@ -257,8 +257,9 @@ mod unix_only {
         }
 
         let mut label = labels.pop().unwrap();
-        let mut options =
-            Options::new(35).splitter(Box::new(HyphenSplitter) as Box<dyn WordSplitter>);
+        let mut options = Options::new(35)
+            .splitter(Box::new(HyphenSplitter) as Box<dyn WordSplitter>)
+            .word_separator(Box::new(AsciiSpace) as Box<dyn WordSeparator>);
         options.break_words = false;
         options.splitter = splitters.pop().unwrap();
 

diff --git a/examples/wasm/Cargo.lock b/examples/wasm/Cargo.lock
diff --git a/examples/wasm/src/lib.rs b/examples/wasm/src/lib.rs
@@ -1,7 +1,7 @@
 use wasm_bindgen::prelude::*;
 use wasm_bindgen::JsCast;
 
-use textwrap::core;
+use textwrap::{core, LineBreakAlgorithm};
 
 #[wasm_bindgen]
 extern "C" {

diff --git a/src/core.rs b/src/core.rs
@@ -237,7 +237,7 @@ impl std::ops::Deref for Word<'_> {
 }
 
 impl<'a> Word<'a> {
-    /// Construct a new `Word`.
+    /// Construct a `Word` from a string.
     ///
     /// A trailing stretch of `' '` is automatically taken to be the
     /// whitespace part of the word.
@@ -354,9 +354,9 @@ impl Fragment for Word<'_> {
 ///     vec![Word::from("foo-bar")]
 /// );
 /// ```
-pub fn split_words<'a, I, T, S>(
+pub fn split_words<'a, I, R, S>(
     words: I,
-    options: &'a Options<'a, T, S>,
+    options: &'a Options<'a, R, S>,
 ) -> impl Iterator<Item = Word<'a>>
 where
     I: IntoIterator<Item = Word<'a>>,