perf(markdown): add sifter handling clean

spider-rs · Nov 13, 2024 · 544b923 · 544b923
1 parent 3e7eec8
commit 544b923
Show file tree

Hide file tree

Showing 11 changed files with 228 additions and 59 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/fast_html2md/Cargo.toml b/fast_html2md/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fast_html2md"
-version = "0.0.27"
+version = "0.0.29"
 edition = "2021"
 description = "A fast html2md crate for rust"
 categories = ["development-tools", "parsing", "parser-implementations"]

diff --git a/fast_html2md/src/extended/mod.rs b/fast_html2md/src/extended/mod.rs
@@ -0,0 +1,2 @@
+// initial source from /JumperBot/whitespace-sifter
+pub mod sifter;
diff --git a/fast_html2md/src/extended/sifter.rs b/fast_html2md/src/extended/sifter.rs
@@ -0,0 +1,199 @@
+use std::str;
+
+/// Classification of the byte that starts a UTF-8 encoded character.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Character {
+    /// An ASCII byte; `data` is the byte itself.
+    SingleByte { data: u8 },
+    /// The first byte of a sequence that occupies `len` bytes in total.
+    MultiByte { len: usize },
+}
+
+/// Whitespace-sifting helpers, available on every type that implements `AsRef<str>`.
+pub trait WhitespaceSifter: AsRef<str> {
+    /// Returns a copy of the text with duplicate
+    /// [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) removed.
+    /// Whitespace is detected the same way as
+    /// [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace).
+    /// Carriage-returns are treated as just one `char` of the text.
+    #[must_use]
+    fn sift(&self) -> String {
+        let text = self.as_ref();
+        // The sifted text is never longer than the input, so one allocation is enough.
+        let mut sifted = String::with_capacity(text.len());
+        sift_preallocated(text.as_bytes(), &mut sifted);
+        sifted
+    }
+
+    /// Returns a copy of the text with duplicate
+    /// [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) removed
+    /// while keeping deduplicated newlines.
+    /// Whitespace is detected the same way as
+    /// [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace).
+    /// Carriage-returns are treated as just one `char` of the text.
+    #[must_use]
+    fn sift_preserve_newlines(&self) -> String {
+        let source = self.as_ref().as_bytes();
+        let mut sifted = String::with_capacity(source.len());
+        let mut cursor = 0;
+
+        // Every call sifts up to the next line break and moves `cursor` forward.
+        while cursor < source.len() {
+            sift_preallocated_until_newline(source, &mut cursor, &mut sifted);
+        }
+
+        // Remove one trailing line break (`\r\n` or `\n`) if the output ends with it.
+        let line_break_len = if sifted.ends_with("\r\n") {
+            2
+        } else if sifted.ends_with('\n') {
+            1
+        } else {
+            0
+        };
+        sifted.truncate(sifted.len() - line_break_len);
+
+        sifted
+    }
+}
+
+impl<T> WhitespaceSifter for T where T: AsRef<str> {}
+
+/// A custom take on `str::trim_start`: skips leading ASCII whitespace starting at
+/// `*ind`, then copies the first non-whitespace character into `out`.
+///
+/// On return `*ind` points just past the last byte that was consumed.
+fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
+    while let Some(&lead) = bytes.get(*ind) {
+        let data = match get_char_metadata(lead) {
+            // A multi-byte character is never ASCII whitespace: copy it and stop.
+            Character::MultiByte { len } => {
+                extend_from_bytes_with_len(bytes, ind, out, len);
+                return;
+            }
+            Character::SingleByte { data } => data,
+        };
+        *ind += 1;
+        if is_ascii_whitespace(data) {
+            continue;
+        }
+        out.push(data as char);
+        return;
+    }
+}
+
+/// A custom take on `str::trim_end`: drops the last character of `out` when the
+/// caller reports that it is whitespace, and does nothing otherwise.
+fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
+    if !is_last_whitespace {
+        return;
+    }
+    let _ = out.pop();
+}
+
+/// Appends the `len` bytes starting at `*ind` to `out` and moves `*ind` past them.
+///
+/// Nothing is appended when the range runs past the end of `bytes` or is not
+/// valid UTF-8; `*ind` is advanced in either case.
+fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
+    let start = *ind;
+    let end = start.saturating_add(len);
+    *ind = end;
+
+    // `get` returns `None` for an out-of-bounds range instead of panicking.
+    // Todo: we want to pass in the bytes encoded to string.
+    if let Some(chunk) = bytes.get(start..end) {
+        if let Ok(text) = str::from_utf8(chunk) {
+            out.push_str(text);
+        }
+    }
+}
+
+/// Returns `true` when `codepoint` is the line feed or the carriage return byte.
+#[inline]
+const fn is_newline(codepoint: u8) -> bool {
+    codepoint == LINE_FEED || codepoint == CARRIAGE_RETURN
+}
+
+/// Sifts `bytes` into the preallocated `out`.
+///
+/// Leading and trailing ASCII whitespace is removed and every inner run of ASCII
+/// whitespace is reduced to its first character. A line feed that directly follows
+/// an emitted carriage return is kept, so a `\r\n` pair that opens a run survives
+/// as one unit.
+///
+/// `bytes` is expected to be the UTF-8 encoding of a `str`.
+fn sift_preallocated(bytes: &[u8], out: &mut String) {
+    if bytes.is_empty() {
+        return;
+    }
+
+    let mut ind: usize = 0;
+    sift_trim_start(bytes, &mut ind, out);
+    let mut is_last_whitespace = false;
+    let mut is_last_carriage_return = false;
+
+    while ind < bytes.len() {
+        match get_char_metadata(bytes[ind]) {
+            Character::SingleByte { data } => {
+                ind += 1;
+                if is_ascii_whitespace(data) {
+                    if data == LINE_FEED && is_last_carriage_return {
+                        // Complete the `\r\n` pair whose `\r` was just emitted.
+                        out.push('\n');
+                        is_last_carriage_return = false;
+                        continue;
+                    }
+                    if is_last_whitespace {
+                        // A skipped byte separates any earlier `\r` from a later `\n`.
+                        is_last_carriage_return = false;
+                        continue;
+                    }
+                    is_last_whitespace = true;
+                } else {
+                    is_last_whitespace = false;
+                }
+                out.push(data as char);
+                is_last_carriage_return = data == CARRIAGE_RETURN;
+            }
+            Character::MultiByte { len } => {
+                extend_from_bytes_with_len(bytes, &mut ind, out, len);
+                // A multi-byte character is not ASCII whitespace, so it ends the
+                // current run; without this reset the next space would be dropped
+                // and a trailing multi-byte character would be popped below.
+                is_last_whitespace = false;
+                is_last_carriage_return = false;
+            }
+        }
+    }
+
+    // A trailing run is either one whitespace byte or a `\r\n` pair; remove all of it.
+    if is_last_whitespace && out.ends_with("\r\n") {
+        let _ = out.pop();
+    }
+    sift_trim_end(out, is_last_whitespace);
+}
+
+/// Sifts one line of `bytes`, starting at `*ind`, into the preallocated `out`.
+///
+/// Leading ASCII whitespace (including blank lines) is skipped, inner runs of
+/// whitespace are reduced to their first character, and whitespace directly before
+/// the line break is removed. The function returns after the first line feed or
+/// carriage return, which is written to `out` as `'\n'`, or at the end of the
+/// input. `*ind` is left just past the last byte consumed.
+///
+/// `bytes` is expected to be the UTF-8 encoding of a `str`.
+fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
+    sift_trim_start(bytes, ind, out);
+
+    let mut is_last_whitespace = false;
+
+    while *ind < bytes.len() {
+        match get_char_metadata(bytes[*ind]) {
+            Character::SingleByte { data } => {
+                *ind += 1;
+                if is_newline(data) {
+                    // Trim the line the same way the end-of-input path does.
+                    sift_trim_end(out, is_last_whitespace);
+                    out.push('\n');
+                    return;
+                }
+                if is_ascii_whitespace(data) {
+                    if is_last_whitespace {
+                        continue;
+                    }
+                    is_last_whitespace = true;
+                } else {
+                    is_last_whitespace = false;
+                }
+                out.push(data as char);
+            }
+            Character::MultiByte { len } => {
+                extend_from_bytes_with_len(bytes, ind, out, len);
+                // A multi-byte character is not ASCII whitespace, so it ends the
+                // current run; without this reset the next space would be dropped
+                // and a trailing multi-byte character would be popped below.
+                is_last_whitespace = false;
+            }
+        }
+    }
+    sift_trim_end(out, is_last_whitespace);
+}
+
+/// Classifies `first_byte` as a single-byte character or as the start of a
+/// multi-byte sequence of 2, 3 or 4 bytes.
+///
+/// Thresholds follow [std](https://doc.rust-lang.org/src/core/str/validations.rs.html#36).
+/// Every byte in `0x80..=0xDF` is reported with a length of 2.
+#[inline]
+const fn get_char_metadata(first_byte: u8) -> Character {
+    if first_byte < 0x80 {
+        Character::SingleByte { data: first_byte }
+    } else if first_byte < 0xE0 {
+        Character::MultiByte { len: 2 }
+    } else if first_byte < 0xF0 {
+        Character::MultiByte { len: 3 }
+    } else {
+        Character::MultiByte { len: 4 }
+    }
+}
+
+/// ASCII space (`0x20`).
+const SPACE: u8 = b' ';
+/// ASCII horizontal tab (`0x09`).
+const HORIZONTAL_TAB: u8 = b'\t';
+/// ASCII line feed (`0x0A`).
+const LINE_FEED: u8 = b'\n';
+/// ASCII form feed (`0x0C`).
+const FORM_FEED: u8 = b'\x0C';
+/// ASCII carriage return (`0x0D`).
+const CARRIAGE_RETURN: u8 = b'\r';
+
+/// Returns `true` when `codepoint` is a space, horizontal tab, line feed, form feed
+/// or carriage return.
+///
+/// The set is taken from [std](https://doc.rust-lang.org/src/core/char/methods.rs.html#1680).
+#[inline]
+const fn is_ascii_whitespace(codepoint: u8) -> bool {
+    codepoint == SPACE
+        || codepoint == HORIZONTAL_TAB
+        || codepoint == LINE_FEED
+        || codepoint == FORM_FEED
+        || codepoint == CARRIAGE_RETURN
+}
diff --git a/fast_html2md/src/lib.rs b/fast_html2md/src/lib.rs
@@ -2,20 +2,24 @@ use html5ever::driver::ParseOpts;
 use html5ever::parse_document;
 use html5ever::tendril::TendrilSink;
 use lazy_static::lazy_static;
-pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
 use regex::Regex;
 use std::boxed::Box;
 use std::collections::HashMap;
 use std::sync::Arc;
 use url::Url;
 
+pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
+
 // we want to just use the rewriter instead for v0.1.
+pub mod extended;
 pub mod rewriter;
 pub mod scraper;
-pub use scraper::ignore;
+
+use extended::sifter::WhitespaceSifter;
 
 pub(crate) use scraper::anchors;
 pub(crate) use scraper::codes;
+pub use scraper::ignore;
 // pub(crate) use scraper::common;
 pub(crate) use scraper::containers;
 pub(crate) use scraper::dummy;
@@ -55,14 +59,6 @@ lazy_static! {
     static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern");                  // for Markdown escaping
     static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern");     // for Markdown escaping
     static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern");               // for Markdown escaping
-    static ref CLEANUP_PATTERN: Regex = Regex::new(
-        r"(?x)
-        (?m)
-        (^\s*$\n|\n{3,})|         # Empty lines or excessive newlines
-        (\s+$|^\n+|\s{2,})|      # Trailing, leading, or excessive spaces
-        (!\[\]\(\))              # Empty image syntax
-        "
-    ).expect("Valid regex pattern");
 }
 
 /// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
@@ -389,21 +385,8 @@ fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
 /// Called after all processing has been finished
 ///
 /// Clears excessive punctuation that would be trimmed by renderer anyway
-fn clean_markdown(text: &str) -> String {
-    CLEANUP_PATTERN
-        .replace_all(text, |caps: &regex::Captures| {
-            if caps.get(1).is_some() || caps.get(4).is_some() {
-                "\n\n".to_string() // Consolidate newlines
-            } else if caps.get(3).is_some() {
-                "".to_string() // Remove spaces or empty image syntax
-            } else if caps.get(2).is_some() {
-                " ".to_string() // Remove spaces or empty image syntax
-            } else {
-                caps[0].trim().to_string()
-            }
-        })
-        .trim()
-        .to_string()
+fn clean_markdown(input: &str) -> String {
+    // `sift` already returns an owned `String`, so no conversion is needed.
+    input.sift()
+}
 
 /// Intermediate result of HTML -> Markdown conversion.

diff --git a/fast_html2md/tests/integration.rs b/fast_html2md/tests/integration.rs
@@ -100,8 +100,7 @@ fn test_list_newlines() {
         .read_to_string(&mut html)
         .expect("File must be readable");
     let result = parse_html(&html, false);
-    assert_that(&result).contains(".\n\nxxx xxxx");
-    assert_that(&result).contains("xx x.\n\nxxxxx:");
+    assert_that(&result).is_equal_to("xx, xx xxxxx x xxxxxx xxxxxxxx xxxxx xxxxxxxxx xxxx xx xxxx xxxx xxxxxxxx.\nxxxx, xxx xx xxxxx xx xxxxxxxxxxx xxxx.\nxxxxxxxxxxx:\n* xxxxxxx x xxxxxxxxx (xxxxx)\n* xxxxxxx xx xxxxxx xxxxxxx, xxxxxxxxxx xxxxxxxxxx xxxx\n* xxxxxxxxx xx xxxxx, xx xxxxxx xx xxxxxxxxxxx\n* xxxxxxx xxxxxx xxxxxxxxx x xxxxxxxxxx, xxxxxxx xxxxxx x xxxxxxx, x xxxxxx.\n* xx xx, xxxxxx xx xxxxxxxx, xx-xxxx xxx x xxxxxxx xxx xxx, xxxxxxx xx xxxx. xxxxxxxxx xx x.\nxxxxx:\n1. xxxxxxxxx xxxxxxxxxx - xxxxx -\\_- !\n2. xxxxxx Mother of Learning - xxxx, xxxxxxx, xxxxxxxxxxxx\n3. xxxxxx xxxxxxx xxxxxxx, xxxxxxxx \"xxx xxxxx\". xxxxx xxxxx xxxx, xx x xxxxx xxxxxxx.\n4. xxxxxxxx! xxxx xxx xxxxxxxxx xxxx xxx, xx x xxxxxxxxx.\n5. xxxx xxxxxx - xxxxxx xxxxxxxx xxx x 15-17, xxxxxx xxxxxxxxxxxxx xx xxxxxxxx xxx xxxxxxx xxxxxx.\nxxx xxxx, xxxxx x xxxxxxxxx xx xxxxxxxxxx xxxxxx. xxxxxxxxx spelling puns, xxxxxxx, x xxxxxxxxx, xxxxxxxx xxx xxxxxxxx, xxxxxx xxxxxxxxxx xxxxxx.\nxxx xxxxxxx. xxx xxx xxxxxxxx xxxxxx - x x xxxxxxxxxxx xxxxx xxxx xxxxxxxxxx xxx xxxxx, x xxxxxx xxx xxxxxxxx xxxxxxxxxx xxx xxxxx. xx xxxxxx xxxxxxxx:\n* xxx xxxxx x xxx-xxxx xxxxxxxxx. xxxxxx xxx xxxx xxxxxxxx. x xx x xx xxxxxxxx, xx x xxxxxxx xxxxxx xxxxxx xx xxxxxxxxx. xxxxxxxxxx xxxx xxxxx xxxxxx xxxxxxxxx xxxxxxx xx xxxx.\n* xxxxxx xxxx Kotlin, x xxxxxxx. xxxxxxxxxx, xxxxxxxxxx xxx xxxxx xx xxx x xxxxxxxx\n* xxx xxxxx xxxxxxxxxx Rust, xxx xxx x xx xxx xxxx xxxxxxxxx xxxxxxxxxxxxxx xxxx xxx xxxxx, xxxxxxxx xxxxxxxxxxxxxx HTML x Markdown\n* xxx xxxx xxxxxx xxx xxxxxxxx xxxxxx. xx xxxx xxx - xxxxxxxxxxxxx xxxxxxxxxxx xxxxxx x xxxxxxxxx xxxxx x xxxxxxx.\n* xxxxxxxxx xxxx xxxxxxxx xxxxxxx xx FUSE 3.0. xxxxx xxxxxxx xxxxxxx xxx xxxxxxxxxxx.\n* x xxxxxxxx xxxx xxxxxxxx DevOps-xxxxxxx x xxxxx xxxxxxx. xxxxxxxxx, xxx xx xxxxx xxxxxx. 
x, xx, xxx xxx xxx xxxxxxxxx?\nxxxxx xx xxx:\n\\- xxxxxxxx xxxxxxxx\n\\- xxxxxxx xxxxxxxxx, xxxxxxx xxxxx xxxxx xxxxxxxx\n\\- xxxxxxxxxx xxxx Machine Learning, xxxx xxxxxx xxx xxxxxxxx OpenCL.".to_string());
 }
 
 #[test]
@@ -138,7 +137,7 @@ fn test_tables_with_newlines() {
         .expect("File must be readable");
     let result = parse_html(&html, false);
 
-    assert_that!(result).contains(indoc! {"[![Embedded YouTube video](https://img.youtube.com/vi/ZZZZZZZZZ/0.jpg)](https://www.youtube.com/watch?v=ZZZZZZZZZ)\n\n|Maybe I'm foolish, maybe I'm blind\nThinking I can see through this and see what's behind\nGot no way to prove it so maybe I'm blind\n\nBut I'm only human after all,\nI'm only human after all\nDon't put your blame on me|xxxxx xxxx, x xxxxxx, xxxxx xxxx — xxxxxx\nxxx xxxxx, xxx xxxx xxxxxx xxxxxx xxx, x xxxxxx xxx xxx xx xxx\nxxxx x xxxx xx xxxx xxxxxxx xxxxxxxxxxxxx, xxx xxx xxxxxxxx, x xxxxxx.\n\nxx x xxxxx xxxx xxxxxxx, x xxxxx-xx xxxxxx,\nx xxxxx xxxx xxxxxxx, x xxxxx xxxxxx.\nxx xxxx xxxx|\n|||\n\n[xxxxxx xxxxx xxxxx x xxxxxxx](/)\n\nx xxxx xxxxxxxxx xxxxxxx xxxxxxxxxxx xx xxxx xxxxx. x xxxxx xxxxxxx, xxxx xxxxx xxxxxxx xx xxxxxxxxxx xxxxxx. xxx xxxxxxxx, xxx xxxxxxxxx xxxxxxxxxxxxxx xx xxxxx — xxxxxxxxxx xxxxxxxxxx x xxxxx xxxxxxxxxxxxx xxxxxxxxx. x xxx xxxxxxxxxxxx*xxxx*, xxxxxx xxxx, xxxxxxxxxx xxxxx xxxxxxxx, xxxxxxxxxx x xxxxxxxxx. xx xxxxxx xxxxx xxxxxxxxxxxxxxxxx — x xxxxxx xxx xxxx.\n\nxxxxx xxxxxxxxxx xxxxx x xxxx xxxxxxxxxx xxxxx. xxxxx. x xxxxx: «x xxxxxx xxxxxxx, x xxxxx xxx xxxx, xx xxxxxxxx xxxxxx», — xxx xxxxx xxxxxxxx. xxxxxx xxx x xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxx xxxx xxxxxxxxxxx xxxxxxxxxx, xxxxxxx xxxxxx xxxxxx xxx xxxxx, xxxxxxxxxxx x x xxxxxxx xxxxxxxxx.\n\nxx x xxxxx xxxx xxxxxxx. xxxxxx xxxxx? xxxxxxxxxxx x xxxxxxxxx xxxxxx.\n\nx xxxxx x xxxxxxxxxx x xxxxx... x xxxxxx xxxx xxxxxx xxxxxxx xxxxxxxx. xx xxxx, x xxxxxx xxx-xx xxxxxxxxx xx xxxxxxx, xxx xxxxxx xxxxxx, xxx xxx xxxxx, xxxxx xxxxxxxx xx xxxx... x xxxxxx xxxxxxx xx xxxx xxxxx, xxx, xxxxx xxxx xxxxxxxxxx, x xxxxx xxxxxxxxx xx xxxxx. x xxx-xx xxx xxxxx xxxxxxx xxxxxxxxxxxxx.\n\nxxxxxx xx... xx xxx xx xxxxxxxxxxxxx xxxxxx xxxxxxxxxxxxx x xxxxxxxxxx xxxxx, xxxxx xxx xxxx xxxxxxxxx, x xxxxx xxx xxxxxxxxx, xxx xxxxxxx xxx, xxx xxxx xxxxxxx xxxxxx, x xx xxx, xxx xxxx xxxxxxxx."
+    assert_that!(result).contains(indoc! {"[![Embedded YouTube video](https://img.youtube.com/vi/ZZZZZZZZZ/0.jpg)](https://www.youtube.com/watch?v=ZZZZZZZZZ)\n|Maybe I'm foolish, maybe I'm blind\nThinking I can see through this and see what's behind\nGot no way to prove it so maybe I'm blind\nBut I'm only human after all,\nI'm only human after all\nDon't put your blame on me|xxxxx xxxx, x xxxxxx, xxxxx xxxx —xxxxxx\nxxx xxxxx, xxx xxxx xxxxxx xxxxxx xxx, x xxxxxx xxx xxx xx xxx\nxxxx x xxxx xx xxxx xxxxxxx xxxxxxxxxxxxx, xxx xxx xxxxxxxx, x xxxxxx.\nxx x xxxxx xxxx xxxxxxx, x xxxxx-xx xxxxxx,\nx xxxxx xxxx xxxxxxx, x xxxxx xxxxxx.\nxx xxxx xxxx|\n|||\n[xxxxxx xxxxx xxxxx x xxxxxxx](/)\nx xxxx xxxxxxxxx xxxxxxx xxxxxxxxxxx xx xxxx xxxxx. x xxxxx xxxxxxx, xxxx xxxxx xxxxxxx xx xxxxxxxxxx xxxxxx. xxx xxxxxxxx, xxx xxxxxxxxx xxxxxxxxxxxxxx xx xxxxx —xxxxxxxxxx xxxxxxxxxx x xxxxx xxxxxxxxxxxxx xxxxxxxxx. x xxx xxxxxxxxxxxx*xxxx*, xxxxxx xxxx, xxxxxxxxxx xxxxx xxxxxxxx, xxxxxxxxxx x xxxxxxxxx. xx xxxxxx xxxxx xxxxxxxxxxxxxxxxx —x xxxxxx xxx xxxx.\nxxxxx xxxxxxxxxx xxxxx x xxxx xxxxxxxxxx xxxxx. xxxxx. x xxxxx: «x xxxxxx xxxxxxx, x xxxxx xxx xxxx, xx xxxxxxxx xxxxxx», —xxx xxxxx xxxxxxxx. xxxxxx xxx x xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxx xxxx xxxxxxxxxxx xxxxxxxxxx, xxxxxxx xxxxxx xxxxxx xxx xxxxx, xxxxxxxxxxx x x xxxxxxx xxxxxxxxx.\nxx x xxxxx xxxx xxxxxxx. xxxxxx xxxxx? xxxxxxxxxxx x xxxxxxxxx xxxxxx.\nx xxxxx x xxxxxxxxxx x xxxxx... x xxxxxx xxxx xxxxxx xxxxxxx xxxxxxxx. xx xxxx, x xxxxxx xxx-xx xxxxxxxxx xx xxxxxxx, xxx xxxxxx xxxxxx, xxx xxx xxxxx, xxxxx xxxxxxxx xx xxxx... x xxxxxx xxxxxxx xx xxxx xxxxx, xxx, xxxxx xxxx xxxxxxxxxx, x xxxxx xxxxxxxxx xx xxxxx. x xxx-xx xxx xxxxx xxxxxxx xxxxxxxxxxxxx.\nxxxxxx xx... xx xxx xx xxxxxxxxxxxxx xxxxxx xxxxxxxxxxxxx x xxxxxxxxxx xxxxx, xxxxx xxx xxxx xxxxxxxxx, x xxxxx xxx xxxxxxxxx, xxx xxxxxxx xxx, xxx xxxx xxxxxxx xxxxxx, x xx xxx, xxx xxxx xxxxxxxx."
 });
 }
 
@@ -151,7 +150,7 @@ fn test_tables_crash2() {
         .expect("File must be readable");
     let table_with_vertical_header = parse_html(&html, false);
 
-    assert_that!(table_with_vertical_header).contains(indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n\n## At a Glance\n\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |"
+    assert_that!(table_with_vertical_header).contains(indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |"
     });
 }
 
@@ -197,5 +196,6 @@ fn test_html_from_text_rewrite() {
         // &Some(Url::parse("https://spider.cloud").unwrap()),
     );
 
+    println!("{:?}", result);
     assert!(!result.is_empty());
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		// initial source from /JumperBot/whitespace-sifter
		pub mod sifter;