Skip to content

Commit

Permalink
perf(markdown): add sifter handling clean
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 13, 2024
1 parent 3e7eec8 commit 544b923
Show file tree
Hide file tree
Showing 11 changed files with 228 additions and 59 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion fast_html2md/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.27"
version = "0.0.29"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand Down
2 changes: 2 additions & 0 deletions fast_html2md/src/extended/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// initial source from /JumperBot/whitespace-sifter
pub mod sifter;
199 changes: 199 additions & 0 deletions fast_html2md/src/extended/sifter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
use std::str;

/// Classification of a UTF-8 sequence by its first byte.
///
/// Produced by `get_char_metadata` so the sifter can treat ASCII bytes
/// individually and copy longer sequences through as a unit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Character {
    /// A one-byte (ASCII) character; `data` is the byte itself.
    SingleByte { data: u8 },
    /// The start of a longer sequence; `len` is the total number of bytes to consume.
    MultiByte { len: usize },
}

/// Whitespace-sifting helpers for any value that can be viewed as a `str`.
pub trait WhitespaceSifter: AsRef<str> {
    /// Returns a sifted copy of the text with duplicate
    /// [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) removed.
    ///
    /// Whitespace is recognised the same way as
    /// [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace).
    /// The whole input is handed to `sift_preallocated`, which writes into a
    /// buffer pre-sized to the input's byte length.
    #[must_use]
    fn sift(&self) -> String {
        let text = self.as_ref();
        let mut sifted = String::with_capacity(text.len());
        sift_preallocated(text.as_bytes(), &mut sifted);
        sifted
    }

    /// Returns a sifted copy of the text, processed one line at a time so that
    /// line breaks are kept.
    ///
    /// Whitespace is recognised the same way as
    /// [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace).
    /// `sift_preallocated_until_newline` is called repeatedly until the input
    /// is exhausted; a single trailing `"\r\n"` or `'\n'` is then removed from
    /// the result.
    #[must_use]
    fn sift_preserve_newlines(&self) -> String {
        let raw = self.as_ref().as_bytes();
        let mut sifted = String::with_capacity(raw.len());
        let mut cursor: usize = 0;

        while cursor < raw.len() {
            sift_preallocated_until_newline(raw, &mut cursor, &mut sifted);
        }

        // Both possible terminators are pure ASCII, so cutting this many bytes
        // off the end always lands on a character boundary.
        let trailing_newline_len = if sifted.ends_with("\r\n") {
            2
        } else if sifted.ends_with('\n') {
            1
        } else {
            0
        };
        sifted.truncate(sifted.len() - trailing_newline_len);

        sifted
    }
}

/// Blanket implementation: every `AsRef<str>` type gets the sifting methods
/// through the trait's default bodies.
impl<T> WhitespaceSifter for T where T: AsRef<str> {}

/// Skips leading ASCII whitespace, in the spirit of `str::trim_start`.
///
/// Starting at `*ind`, whitespace bytes are consumed without being written.
/// The first non-whitespace character (single- or multi-byte) is appended to
/// `out` and consumed as well, after which the function returns. If only
/// whitespace remains, `*ind` ends up at `bytes.len()` and nothing is written.
fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
    while let Some(&first) = bytes.get(*ind) {
        match get_char_metadata(first) {
            Character::MultiByte { len } => {
                // A multi-byte sequence is never ASCII whitespace: copy it and stop.
                extend_from_bytes_with_len(bytes, ind, out, len);
                return;
            }
            Character::SingleByte { data } => {
                *ind += 1;
                if is_ascii_whitespace(data) {
                    continue;
                }
                out.push(data as char);
                return;
            }
        }
    }
}

/// Minimal stand-in for `str::trim_end`.
///
/// When `is_last_whitespace` is set, exactly one character is removed from
/// the end of `out`; otherwise `out` is left untouched. Callers are expected
/// to pass `true` only when the last character they wrote was whitespace.
fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
    if !is_last_whitespace {
        return;
    }
    let _ = out.pop();
}

/// Appends the `len`-byte sequence starting at `*ind` to `out` and advances `*ind` past it.
///
/// The bytes are appended only when the range lies inside `bytes` and is
/// valid UTF-8; otherwise nothing is written. In every case `*ind` is moved
/// forward by `len` (saturating), so the caller always makes progress.
fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
    let stop = (*ind).saturating_add(len);
    // `get` yields `None` for an out-of-range window instead of panicking.
    // Todo: we want to pass in the bytes encoded to string.
    if let Some(Ok(text)) = bytes.get(*ind..stop).map(str::from_utf8) {
        out.push_str(text);
    }
    *ind = stop;
}

#[inline]
const fn is_newline(codepoint: u8) -> bool {
matches!(codepoint, LINE_FEED | CARRIAGE_RETURN)
}

/// Sifts `bytes` into `out`, collapsing every run of ASCII whitespace.
///
/// Leading whitespace is skipped, each interior run of whitespace is reduced
/// to its first byte, and a trailing run is removed. A `"\r\n"` pair that
/// starts a run is kept together as one separator.
///
/// `bytes` is expected to be the bytes of a valid `str`; `out` should be
/// preallocated by the caller (the input length is always enough).
fn sift_preallocated(bytes: &[u8], out: &mut String) {
    if bytes.is_empty() {
        return;
    }

    let mut ind: usize = 0;
    sift_trim_start(bytes, &mut ind, out);

    // True while the last thing written to `out` is a whitespace separator.
    let mut is_last_whitespace = false;
    // True only when the byte just before `ind` was a carriage return that was written.
    let mut is_last_carriage_return = false;

    while ind < bytes.len() {
        match get_char_metadata(bytes[ind]) {
            Character::SingleByte { data } => {
                ind += 1;
                if is_ascii_whitespace(data) {
                    if data == LINE_FEED && is_last_carriage_return {
                        // Complete the CRLF pair instead of dropping the LF as a duplicate.
                        out.push('\n');
                        is_last_carriage_return = false;
                        continue;
                    }
                    if is_last_whitespace {
                        // Duplicate whitespace: skip it. Whatever follows is no
                        // longer directly after a written carriage return.
                        is_last_carriage_return = false;
                        continue;
                    }
                    is_last_whitespace = true;
                } else {
                    is_last_whitespace = false;
                }
                out.push(data as char);
                is_last_carriage_return = data == CARRIAGE_RETURN;
            }
            Character::MultiByte { len } => {
                extend_from_bytes_with_len(bytes, &mut ind, out, len);
                // A multi-byte character is never whitespace. Without this
                // reset, the next space would be dropped as a "duplicate" and a
                // trailing multi-byte character would be popped by the end trim.
                is_last_whitespace = false;
                is_last_carriage_return = false;
            }
        }
    }

    // The trailing separator is either a single whitespace byte or a CRLF
    // pair; drop the LF half here so the end trim removes the CR.
    if is_last_whitespace && out.ends_with("\r\n") {
        let _ = out.pop();
    }
    sift_trim_end(out, is_last_whitespace);
}

/// Sifts a single line of `bytes` into `out`, starting at `*ind`.
///
/// Leading whitespace (including blank lines) is skipped, interior runs of
/// ASCII whitespace are reduced to their first byte, and processing stops at
/// the first line feed or carriage return that follows content. That line
/// break is written as a single `'\n'`, with any pending separator space
/// removed first, and `*ind` is left just past it so the caller can continue
/// with the next line. If the input ends first, a trailing separator is
/// trimmed instead.
///
/// `bytes` is expected to be the bytes of a valid `str`.
fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
    sift_trim_start(bytes, ind, out);

    // True while the last thing written to `out` is a whitespace separator.
    let mut is_last_whitespace = false;

    while *ind < bytes.len() {
        match get_char_metadata(bytes[*ind]) {
            Character::SingleByte { data } => {
                *ind += 1;
                if is_ascii_whitespace(data) {
                    if is_newline(data) {
                        // Do not leave a separator space dangling in front of the line break.
                        sift_trim_end(out, is_last_whitespace);
                        // CR and LF both end the line; the LF of a CRLF pair is
                        // swallowed by the next call's leading-whitespace skip.
                        out.push('\n');
                        return;
                    }
                    if is_last_whitespace {
                        continue;
                    }
                    is_last_whitespace = true;
                } else {
                    is_last_whitespace = false;
                }
                out.push(data as char);
            }
            Character::MultiByte { len } => {
                extend_from_bytes_with_len(bytes, ind, out, len);
                // A multi-byte character is never whitespace. Without this
                // reset, the next space would be dropped as a "duplicate" and a
                // trailing multi-byte character would be popped by the end trim.
                is_last_whitespace = false;
            }
        }
    }
    sift_trim_end(out, is_last_whitespace);
}

/// Classifies a byte as a single-byte character or the start of a longer sequence.
///
/// The decision is made from the number of leading one-bits, mirroring the
/// bit patterns used by
/// [std](https://doc.rust-lang.org/src/core/str/validations.rs.html#36):
/// none means a single byte, one or two means a length of 2, three means a
/// length of 3, and four or more means a length of 4.
#[inline]
const fn get_char_metadata(first_byte: u8) -> Character {
    match first_byte.leading_ones() {
        0 => Character::SingleByte { data: first_byte },
        1 | 2 => Character::MultiByte { len: 2 },
        3 => Character::MultiByte { len: 3 },
        _ => Character::MultiByte { len: 4 },
    }
}

// ASCII whitespace bytes recognised by the sifter. Byte literals give the
// `u8` value directly, so no `char` -> `u32` -> `u8` cast (and no lint
// suppression for it) is needed.
const SPACE: u8 = b' ';
const HORIZONTAL_TAB: u8 = b'\t';
const LINE_FEED: u8 = b'\n';
const FORM_FEED: u8 = b'\x0C';
const CARRIAGE_RETURN: u8 = b'\r';

/// Values extracted from [std](https://doc.rust-lang.org/src/core/char/methods.rs.html#1680).
#[inline]
const fn is_ascii_whitespace(codepoint: u8) -> bool {
matches!(
codepoint,
SPACE | HORIZONTAL_TAB | LINE_FEED | FORM_FEED | CARRIAGE_RETURN
)
}
33 changes: 8 additions & 25 deletions fast_html2md/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,24 @@ use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use lazy_static::lazy_static;
pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
use regex::Regex;
use std::boxed::Box;
use std::collections::HashMap;
use std::sync::Arc;
use url::Url;

pub use markup5ever_rcdom::{Handle, NodeData, RcDom};

// we want to just use the rewriter instead for v0.1.
pub mod extended;
pub mod rewriter;
pub mod scraper;
pub use scraper::ignore;

use extended::sifter::WhitespaceSifter;

pub(crate) use scraper::anchors;
pub(crate) use scraper::codes;
pub use scraper::ignore;
// pub(crate) use scraper::common;
pub(crate) use scraper::containers;
pub(crate) use scraper::dummy;
Expand Down Expand Up @@ -55,14 +59,6 @@ lazy_static! {
static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern"); // for Markdown escaping
static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern"); // for Markdown escaping
static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); // for Markdown escaping
static ref CLEANUP_PATTERN: Regex = Regex::new(
r"(?x)
(?m)
(^\s*$\n|\n{3,})| # Empty lines or excessive newlines
(\s+$|^\n+|\s{2,})| # Trailing, leading, or excessive spaces
(!\[\]\(\)) # Empty image syntax
"
).expect("Valid regex pattern");
}

/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
Expand Down Expand Up @@ -389,21 +385,8 @@ fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
/// Called after all processing has been finished
///
/// Clears excessive punctuation that would be trimmed by renderer anyway
fn clean_markdown(text: &str) -> String {
CLEANUP_PATTERN
.replace_all(text, |caps: &regex::Captures| {
if caps.get(1).is_some() || caps.get(4).is_some() {
"\n\n".to_string() // Consolidate newlines
} else if caps.get(3).is_some() {
"".to_string() // Remove spaces or empty image syntax
} else if caps.get(2).is_some() {
" ".to_string() // Remove spaces or empty image syntax
} else {
caps[0].trim().to_string()
}
})
.trim()
.to_string()
fn clean_markdown(input: &str) -> String {
    // `WhitespaceSifter::sift` already returns an owned `String`, so no
    // further conversion is required.
    input.sift()
}

/// Intermediate result of HTML -> Markdown conversion.
Expand Down
8 changes: 4 additions & 4 deletions fast_html2md/tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,7 @@ fn test_list_newlines() {
.read_to_string(&mut html)
.expect("File must be readable");
let result = parse_html(&html, false);
assert_that(&result).contains(".\n\nxxx xxxx");
assert_that(&result).contains("xx x.\n\nxxxxx:");
assert_that(&result).is_equal_to("xx, xx xxxxx x xxxxxx xxxxxxxx xxxxx xxxxxxxxx xxxx xx xxxx xxxx xxxxxxxx.\nxxxx, xxx xx xxxxx xx xxxxxxxxxxx xxxx.\nxxxxxxxxxxx:\n* xxxxxxx x xxxxxxxxx (xxxxx)\n* xxxxxxx xx xxxxxx xxxxxxx, xxxxxxxxxx xxxxxxxxxx xxxx\n* xxxxxxxxx xx xxxxx, xx xxxxxx xx xxxxxxxxxxx\n* xxxxxxx xxxxxx xxxxxxxxx x xxxxxxxxxx, xxxxxxx xxxxxx x xxxxxxx, x xxxxxx.\n* xx xx, xxxxxx xx xxxxxxxx, xx-xxxx xxx x xxxxxxx xxx xxx, xxxxxxx xx xxxx. xxxxxxxxx xx x.\nxxxxx:\n1. xxxxxxxxx xxxxxxxxxx - xxxxx -\\_- !\n2. xxxxxx Mother of Learning - xxxx, xxxxxxx, xxxxxxxxxxxx\n3. xxxxxx xxxxxxx xxxxxxx, xxxxxxxx \"xxx xxxxx\". xxxxx xxxxx xxxx, xx x xxxxx xxxxxxx.\n4. xxxxxxxx! xxxx xxx xxxxxxxxx xxxx xxx, xx x xxxxxxxxx.\n5. xxxx xxxxxx - xxxxxx xxxxxxxx xxx x 15-17, xxxxxx xxxxxxxxxxxxx xx xxxxxxxx xxx xxxxxxx xxxxxx.\nxxx xxxx, xxxxx x xxxxxxxxx xx xxxxxxxxxx xxxxxx. xxxxxxxxx spelling puns, xxxxxxx, x xxxxxxxxx, xxxxxxxx xxx xxxxxxxx, xxxxxx xxxxxxxxxx xxxxxx.\nxxx xxxxxxx. xxx xxx xxxxxxxx xxxxxx - x x xxxxxxxxxxx xxxxx xxxx xxxxxxxxxx xxx xxxxx, x xxxxxx xxx xxxxxxxx xxxxxxxxxx xxx xxxxx. xx xxxxxx xxxxxxxx:\n* xxx xxxxx x xxx-xxxx xxxxxxxxx. xxxxxx xxx xxxx xxxxxxxx. x xx x xx xxxxxxxx, xx x xxxxxxx xxxxxx xxxxxx xx xxxxxxxxx. xxxxxxxxxx xxxx xxxxx xxxxxx xxxxxxxxx xxxxxxx xx xxxx.\n* xxxxxx xxxx Kotlin, x xxxxxxx. xxxxxxxxxx, xxxxxxxxxx xxx xxxxx xx xxx x xxxxxxxx\n* xxx xxxxx xxxxxxxxxx Rust, xxx xxx x xx xxx xxxx xxxxxxxxx xxxxxxxxxxxxxx xxxx xxx xxxxx, xxxxxxxx xxxxxxxxxxxxxx HTML x Markdown\n* xxx xxxx xxxxxx xxx xxxxxxxx xxxxxx. xx xxxx xxx - xxxxxxxxxxxxx xxxxxxxxxxx xxxxxx x xxxxxxxxx xxxxx x xxxxxxx.\n* xxxxxxxxx xxxx xxxxxxxx xxxxxxx xx FUSE 3.0. xxxxx xxxxxxx xxxxxxx xxx xxxxxxxxxxx.\n* x xxxxxxxx xxxx xxxxxxxx DevOps-xxxxxxx x xxxxx xxxxxxx. xxxxxxxxx, xxx xx xxxxx xxxxxx. 
x, xx, xxx xxx xxx xxxxxxxxx?\nxxxxx xx xxx:\n\\- xxxxxxxx xxxxxxxx\n\\- xxxxxxx xxxxxxxxx, xxxxxxx xxxxx xxxxx xxxxxxxx\n\\- xxxxxxxxxx xxxx Machine Learning, xxxx xxxxxx xxx xxxxxxxx OpenCL.".to_string());
}

#[test]
Expand Down Expand Up @@ -138,7 +137,7 @@ fn test_tables_with_newlines() {
.expect("File must be readable");
let result = parse_html(&html, false);

assert_that!(result).contains(indoc! {"[![Embedded YouTube video](https://img.youtube.com/vi/ZZZZZZZZZ/0.jpg)](https://www.youtube.com/watch?v=ZZZZZZZZZ)\n\n|Maybe I'm foolish, maybe I'm blind\nThinking I can see through this and see what's behind\nGot no way to prove it so maybe I'm blind\n\nBut I'm only human after all,\nI'm only human after all\nDon't put your blame on me|xxxxx xxxx, x xxxxxx, xxxxx xxxx — xxxxxx\nxxx xxxxx, xxx xxxx xxxxxx xxxxxx xxx, x xxxxxx xxx xxx xx xxx\nxxxx x xxxx xx xxxx xxxxxxx xxxxxxxxxxxxx, xxx xxx xxxxxxxx, x xxxxxx.\n\nxx x xxxxx xxxx xxxxxxx, x xxxxx-xx xxxxxx,\nx xxxxx xxxx xxxxxxx, x xxxxx xxxxxx.\nxx xxxx xxxx|\n|||\n\n[xxxxxx xxxxx xxxxx x xxxxxxx](/)\n\nx xxxx xxxxxxxxx xxxxxxx xxxxxxxxxxx xx xxxx xxxxx. x xxxxx xxxxxxx, xxxx xxxxx xxxxxxx xx xxxxxxxxxx xxxxxx. xxx xxxxxxxx, xxx xxxxxxxxx xxxxxxxxxxxxxx xx xxxxx — xxxxxxxxxx xxxxxxxxxx x xxxxx xxxxxxxxxxxxx xxxxxxxxx. x xxx xxxxxxxxxxxx*xxxx*, xxxxxx xxxx, xxxxxxxxxx xxxxx xxxxxxxx, xxxxxxxxxx x xxxxxxxxx. xx xxxxxx xxxxx xxxxxxxxxxxxxxxxx — x xxxxxx xxx xxxx.\n\nxxxxx xxxxxxxxxx xxxxx x xxxx xxxxxxxxxx xxxxx. xxxxx. x xxxxx: «x xxxxxx xxxxxxx, x xxxxx xxx xxxx, xx xxxxxxxx xxxxxx», — xxx xxxxx xxxxxxxx. xxxxxx xxx x xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxx xxxx xxxxxxxxxxx xxxxxxxxxx, xxxxxxx xxxxxx xxxxxx xxx xxxxx, xxxxxxxxxxx x x xxxxxxx xxxxxxxxx.\n\nxx x xxxxx xxxx xxxxxxx. xxxxxx xxxxx? xxxxxxxxxxx x xxxxxxxxx xxxxxx.\n\nx xxxxx x xxxxxxxxxx x xxxxx... x xxxxxx xxxx xxxxxx xxxxxxx xxxxxxxx. xx xxxx, x xxxxxx xxx-xx xxxxxxxxx xx xxxxxxx, xxx xxxxxx xxxxxx, xxx xxx xxxxx, xxxxx xxxxxxxx xx xxxx... x xxxxxx xxxxxxx xx xxxx xxxxx, xxx, xxxxx xxxx xxxxxxxxxx, x xxxxx xxxxxxxxx xx xxxxx. x xxx-xx xxx xxxxx xxxxxxx xxxxxxxxxxxxx.\n\nxxxxxx xx... xx xxx xx xxxxxxxxxxxxx xxxxxx xxxxxxxxxxxxx x xxxxxxxxxx xxxxx, xxxxx xxx xxxx xxxxxxxxx, x xxxxx xxx xxxxxxxxx, xxx xxxxxxx xxx, xxx xxxx xxxxxxx xxxxxx, x xx xxx, xxx xxxx xxxxxxxx."
assert_that!(result).contains(indoc! {"[![Embedded YouTube video](https://img.youtube.com/vi/ZZZZZZZZZ/0.jpg)](https://www.youtube.com/watch?v=ZZZZZZZZZ)\n|Maybe I'm foolish, maybe I'm blind\nThinking I can see through this and see what's behind\nGot no way to prove it so maybe I'm blind\nBut I'm only human after all,\nI'm only human after all\nDon't put your blame on me|xxxxx xxxx, x xxxxxx, xxxxx xxxx —xxxxxx\nxxx xxxxx, xxx xxxx xxxxxx xxxxxx xxx, x xxxxxx xxx xxx xx xxx\nxxxx x xxxx xx xxxx xxxxxxx xxxxxxxxxxxxx, xxx xxx xxxxxxxx, x xxxxxx.\nxx x xxxxx xxxx xxxxxxx, x xxxxx-xx xxxxxx,\nx xxxxx xxxx xxxxxxx, x xxxxx xxxxxx.\nxx xxxx xxxx|\n|||\n[xxxxxx xxxxx xxxxx x xxxxxxx](/)\nx xxxx xxxxxxxxx xxxxxxx xxxxxxxxxxx xx xxxx xxxxx. x xxxxx xxxxxxx, xxxx xxxxx xxxxxxx xx xxxxxxxxxx xxxxxx. xxx xxxxxxxx, xxx xxxxxxxxx xxxxxxxxxxxxxx xx xxxxx —xxxxxxxxxx xxxxxxxxxx x xxxxx xxxxxxxxxxxxx xxxxxxxxx. x xxx xxxxxxxxxxxx*xxxx*, xxxxxx xxxx, xxxxxxxxxx xxxxx xxxxxxxx, xxxxxxxxxx x xxxxxxxxx. xx xxxxxx xxxxx xxxxxxxxxxxxxxxxx —x xxxxxx xxx xxxx.\nxxxxx xxxxxxxxxx xxxxx x xxxx xxxxxxxxxx xxxxx. xxxxx. x xxxxx: «x xxxxxx xxxxxxx, x xxxxx xxx xxxx, xx xxxxxxxx xxxxxx», —xxx xxxxx xxxxxxxx. xxxxxx xxx x xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxx xxxx xxxxxxxxxxx xxxxxxxxxx, xxxxxxx xxxxxx xxxxxx xxx xxxxx, xxxxxxxxxxx x x xxxxxxx xxxxxxxxx.\nxx x xxxxx xxxx xxxxxxx. xxxxxx xxxxx? xxxxxxxxxxx x xxxxxxxxx xxxxxx.\nx xxxxx x xxxxxxxxxx x xxxxx... x xxxxxx xxxx xxxxxx xxxxxxx xxxxxxxx. xx xxxx, x xxxxxx xxx-xx xxxxxxxxx xx xxxxxxx, xxx xxxxxx xxxxxx, xxx xxx xxxxx, xxxxx xxxxxxxx xx xxxx... x xxxxxx xxxxxxx xx xxxx xxxxx, xxx, xxxxx xxxx xxxxxxxxxx, x xxxxx xxxxxxxxx xx xxxxx. x xxx-xx xxx xxxxx xxxxxxx xxxxxxxxxxxxx.\nxxxxxx xx... xx xxx xx xxxxxxxxxxxxx xxxxxx xxxxxxxxxxxxx x xxxxxxxxxx xxxxx, xxxxx xxx xxxx xxxxxxxxx, x xxxxx xxx xxxxxxxxx, xxx xxxxxxx xxx, xxx xxxx xxxxxxx xxxxxx, x xx xxx, xxx xxxx xxxxxxxx."
});
}

Expand All @@ -151,7 +150,7 @@ fn test_tables_crash2() {
.expect("File must be readable");
let table_with_vertical_header = parse_html(&html, false);

assert_that!(table_with_vertical_header).contains(indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n\n## At a Glance\n\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |"
assert_that!(table_with_vertical_header).contains(indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |"
});
}

Expand Down Expand Up @@ -197,5 +196,6 @@ fn test_html_from_text_rewrite() {
// &Some(Url::parse("https://spider.cloud").unwrap()),
);

println!("{:?}", result);
assert!(!result.is_empty());
}
Loading

0 comments on commit 544b923

Please sign in to comment.