From 544b9237f657d9244a0b4c992d83b3563ac3ea23 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Wed, 13 Nov 2024 10:17:42 -0500
Subject: [PATCH] perf(markdown): add sifter handling clean
---
Cargo.lock | 2 +-
fast_html2md/Cargo.toml | 2 +-
fast_html2md/src/extended/mod.rs | 2 +
fast_html2md/src/extended/sifter.rs | 199 +++++++++++++++++++++++++
fast_html2md/src/lib.rs | 33 +---
fast_html2md/tests/integration.rs | 8 +-
fast_html2md/tests/lists.rs | 21 +--
fast_html2md/tests/quotes.rs | 7 +-
fast_html2md/tests/tables.rs | 2 +-
fast_html2md/tests/unit.rs | 10 +-
test-samples/markdown-spider-sample.md | 1 +
11 files changed, 228 insertions(+), 59 deletions(-)
create mode 100644 fast_html2md/src/extended/mod.rs
create mode 100644 fast_html2md/src/extended/sifter.rs
create mode 100644 test-samples/markdown-spider-sample.md
diff --git a/Cargo.lock b/Cargo.lock
index 905e603..b9ee2eb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -374,7 +374,7 @@ dependencies = [
[[package]]
name = "fast_html2md"
-version = "0.0.27"
+version = "0.0.29"
dependencies = [
"auto_encoder",
"html5ever",
diff --git a/fast_html2md/Cargo.toml b/fast_html2md/Cargo.toml
index 3b8c8b4..9b57708 100644
--- a/fast_html2md/Cargo.toml
+++ b/fast_html2md/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
-version = "0.0.27"
+version = "0.0.29"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
diff --git a/fast_html2md/src/extended/mod.rs b/fast_html2md/src/extended/mod.rs
new file mode 100644
index 0000000..ac792bb
--- /dev/null
+++ b/fast_html2md/src/extended/mod.rs
@@ -0,0 +1,2 @@
+// initial source from /JumperBot/whitespace-sifter
+pub mod sifter;
diff --git a/fast_html2md/src/extended/sifter.rs b/fast_html2md/src/extended/sifter.rs
new file mode 100644
index 0000000..65d464c
--- /dev/null
+++ b/fast_html2md/src/extended/sifter.rs
@@ -0,0 +1,199 @@
+use std::str;
+
+/// Classification of a UTF-8 character by its first byte.
+enum Character {
+ SingleByte { data: u8 }, // one-byte (ASCII) character; `data` is the byte itself
+ MultiByte { len: usize }, // start of a multi-byte sequence; `len` is the byte length assumed for it
+}
+
+/// A trait containing all `string` whitespace-sifting functions.
+pub trait WhitespaceSifter: AsRef<str> {
+    /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
+    /// This follows the [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace) implementation.
+    /// Leading and trailing whitespace is trimmed; the result is returned as a new `String`.
+    #[must_use]
+    fn sift(&self) -> String {
+        let input: &str = self.as_ref();
+        let mut out: String = String::with_capacity(input.len());
+        sift_preallocated(input.as_bytes(), &mut out);
+        out
+    }
+
+    /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
+    /// This follows the [is_ascii_whitespace](https://doc.rust-lang.org/std/primitive.char.html#method.is_ascii_whitespace) implementation.
+    /// This preserves deduplicated newlines: the input is sifted one line at a time.
+    /// The line ending at the very end of the output, if any, is removed.
+    #[must_use]
+    fn sift_preserve_newlines(&self) -> String {
+        let input = self.as_ref();
+        let mut out = String::with_capacity(input.len());
+        let bytes = input.as_bytes();
+        let mut ind: usize = 0;
+        // Each call consumes the input up to and including the next line break.
+        while ind < bytes.len() {
+            sift_preallocated_until_newline(bytes, &mut ind, &mut out);
+        }
+        // Strip one trailing line ending so the result does not end with a newline.
+        if out.ends_with("\r\n") {
+            let _ = out.pop();
+            let _ = out.pop();
+        } else if out.ends_with('\n') {
+            let _ = out.pop();
+        }
+
+        out
+    }
+}
+
+impl<T: AsRef<str>> WhitespaceSifter for T {}
+
+/// A custom `str::trim_start`: skips ASCII whitespace from `*ind` onwards, then copies the
+/// first non-whitespace character into `out`, leaving `*ind` just past that character.
+fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
+    while let Some(&lead) = bytes.get(*ind) {
+        let data = match get_char_metadata(lead) {
+            Character::MultiByte { len } => {
+                extend_from_bytes_with_len(bytes, ind, out, len);
+                return;
+            }
+            Character::SingleByte { data } => data,
+        };
+        *ind += 1;
+        if !is_ascii_whitespace(data) {
+            out.push(data as char);
+            return;
+        }
+    }
+}
+
+/// A custom `str::trim_end`: pops the last char of `out` if flagged and it really is whitespace.
+fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
+    if is_last_whitespace && out.ends_with(|c: char| c.is_ascii_whitespace()) {
+        out.pop();
+    }
+}
+
+/// Appends the `len`-byte character that starts at `*ind` to `out` and moves `*ind` past it.
+fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
+    let start = *ind;
+    let end = start.saturating_add(len);
+    // The index always advances, even when nothing can be copied.
+    *ind = end;
+    // `get` yields `None` when the range runs past the slice; invalid UTF-8 is skipped too.
+    // Todo: we want to pass in the bytes encoded to string.
+    if let Some(Ok(valid_str)) = bytes.get(start..end).map(str::from_utf8) {
+        out.push_str(valid_str);
+    }
+}
+
+#[inline]
+const fn is_newline(codepoint: u8) -> bool {
+    codepoint == LINE_FEED || codepoint == CARRIAGE_RETURN // `\n` or `\r`
+}
+
+/// Appends a whitespace-sifted copy of `bytes` to `out`.
+///
+/// Leading and trailing ASCII whitespace is dropped and every inner run of ASCII whitespace is
+/// reduced to its first character; a `\r\n` pair counts as one character and is kept intact.
+fn sift_preallocated(bytes: &[u8], out: &mut String) {
+    let mut ind: usize = 0;
+    sift_trim_start(bytes, &mut ind, out);
+    // True while the character most recently appended by this loop is ASCII whitespace.
+    let mut is_last_whitespace = false;
+    while ind < bytes.len() {
+        match get_char_metadata(bytes[ind]) {
+            Character::SingleByte { data } => {
+                ind += 1;
+                let is_whitespace = is_ascii_whitespace(data);
+                // Only the first character of a whitespace run is kept, except for the `\n`
+                // that completes a kept `\r`, so that `\r\n` survives as a pair.
+                let completes_crlf = data == LINE_FEED && out.ends_with('\r');
+                if is_whitespace && is_last_whitespace && !completes_crlf {
+                    continue;
+                }
+                is_last_whitespace = is_whitespace;
+                out.push(data as char);
+            }
+            Character::MultiByte { len } => {
+                extend_from_bytes_with_len(bytes, &mut ind, out, len);
+                // A multi-byte character is never ASCII whitespace, so it ends the run.
+                is_last_whitespace = false;
+            }
+        }
+    }
+    // The trailing whitespace is a single character or a `\r\n` pair; for the pair, drop the
+    // `\n` here and let `sift_trim_end` remove the `\r`.
+    if is_last_whitespace && out.ends_with("\r\n") {
+        out.pop();
+    }
+    sift_trim_end(out, is_last_whitespace);
+}
+
+/// Sifts a single line of `bytes`, starting at `*ind`, into `out`.
+///
+/// Leading whitespace is skipped and whitespace runs are reduced to their first character. A
+/// `\n` or `\r` ends the line: one `\n` is appended and `*ind` is left just past that byte. If
+/// the input ends first, a trailing whitespace character is removed instead.
+fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
+    sift_trim_start(bytes, ind, out);
+
+    let mut is_last_whitespace = false;
+    while *ind < bytes.len() {
+        match get_char_metadata(bytes[*ind]) {
+            Character::SingleByte { data } => {
+                *ind += 1;
+                if is_ascii_whitespace(data) {
+                    if is_newline(data) {
+                        // Either line-break byte is written out as a single `\n`.
+                        out.push('\n');
+                        return;
+                    }
+                    if is_last_whitespace {
+                        continue;
+                    }
+                    is_last_whitespace = true;
+                } else {
+                    is_last_whitespace = false;
+                }
+                out.push(data as char);
+            }
+            Character::MultiByte { len } => {
+                extend_from_bytes_with_len(bytes, ind, out, len);
+                // A multi-byte character is never ASCII whitespace, so it ends the run.
+                is_last_whitespace = false;
+            }
+        }
+    }
+    sift_trim_end(out, is_last_whitespace);
+}
+
+/// Classifies a first byte by its leading one bits; ranges follow [std](https://doc.rust-lang.org/src/core/str/validations.rs.html#36).
+#[inline]
+const fn get_char_metadata(first_byte: u8) -> Character {
+    match first_byte.leading_ones() {
+        0 => Character::SingleByte { data: first_byte }, // 0xxx_xxxx
+        1 | 2 => Character::MultiByte { len: 2 },        // 10xx_xxxx and 110x_xxxx
+        3 => Character::MultiByte { len: 3 },            // 1110_xxxx
+        _ => Character::MultiByte { len: 4 },            // 1111_xxxx
+    }
+}
+
+/// ASCII space (U+0020).
+const SPACE: u8 = b' ';
+/// ASCII horizontal tab (U+0009).
+const HORIZONTAL_TAB: u8 = b'\t';
+/// ASCII line feed (U+000A).
+const LINE_FEED: u8 = b'\n';
+/// ASCII form feed (U+000C).
+const FORM_FEED: u8 = b'\x0C';
+/// ASCII carriage return (U+000D).
+const CARRIAGE_RETURN: u8 = b'\r';
+
+/// Reports whether `codepoint` is ASCII whitespace: space, horizontal tab, line feed, form
+/// feed or carriage return.
+///
+/// Values extracted from [std](https://doc.rust-lang.org/src/core/char/methods.rs.html#1680).
+#[inline]
+const fn is_ascii_whitespace(codepoint: u8) -> bool {
+    is_newline(codepoint) || matches!(codepoint, SPACE | HORIZONTAL_TAB | FORM_FEED)
+}
\ No newline at end of file
diff --git a/fast_html2md/src/lib.rs b/fast_html2md/src/lib.rs
index 2e53230..d0e55bf 100644
--- a/fast_html2md/src/lib.rs
+++ b/fast_html2md/src/lib.rs
@@ -2,20 +2,24 @@ use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use lazy_static::lazy_static;
-pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
use regex::Regex;
use std::boxed::Box;
use std::collections::HashMap;
use std::sync::Arc;
use url::Url;
+pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
+
// we want to just use the rewriter instead for v0.1.
+pub mod extended;
pub mod rewriter;
pub mod scraper;
-pub use scraper::ignore;
+
+use extended::sifter::WhitespaceSifter;
pub(crate) use scraper::anchors;
pub(crate) use scraper::codes;
+pub use scraper::ignore;
// pub(crate) use scraper::common;
pub(crate) use scraper::containers;
pub(crate) use scraper::dummy;
@@ -55,14 +59,6 @@ lazy_static! {
static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern"); // for Markdown escaping
static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern"); // for Markdown escaping
static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); // for Markdown escaping
- static ref CLEANUP_PATTERN: Regex = Regex::new(
- r"(?x)
- (?m)
- (^\s*$\n|\n{3,})| # Empty lines or excessive newlines
- (\s+$|^\n+|\s{2,})| # Trailing, leading, or excessive spaces
- (!\[\]\(\)) # Empty image syntax
- "
- ).expect("Valid regex pattern");
}
/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
@@ -389,21 +385,8 @@ fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
/// Called after all processing has been finished
///
/// Clears excessive punctuation that would be trimmed by renderer anyway
-fn clean_markdown(text: &str) -> String {
- CLEANUP_PATTERN
- .replace_all(text, |caps: &regex::Captures| {
- if caps.get(1).is_some() || caps.get(4).is_some() {
- "\n\n".to_string() // Consolidate newlines
- } else if caps.get(3).is_some() {
- "".to_string() // Remove spaces or empty image syntax
- } else if caps.get(2).is_some() {
- " ".to_string() // Remove spaces or empty image syntax
- } else {
- caps[0].trim().to_string()
- }
- })
- .trim()
- .to_string()
+fn clean_markdown(input: &str) -> String {
+    input.sift() // `sift` already returns an owned `String`, so `.into()` was a no-op
}
/// Intermediate result of HTML -> Markdown conversion.
diff --git a/fast_html2md/tests/integration.rs b/fast_html2md/tests/integration.rs
index e22257f..b5951a8 100644
--- a/fast_html2md/tests/integration.rs
+++ b/fast_html2md/tests/integration.rs
@@ -100,8 +100,7 @@ fn test_list_newlines() {
.read_to_string(&mut html)
.expect("File must be readable");
let result = parse_html(&html, false);
- assert_that(&result).contains(".\n\nxxx xxxx");
- assert_that(&result).contains("xx x.\n\nxxxxx:");
+ assert_that(&result).is_equal_to("xx, xx xxxxx x xxxxxx xxxxxxxx xxxxx xxxxxxxxx xxxx xx xxxx xxxx xxxxxxxx.\nxxxx, xxx xx xxxxx xx xxxxxxxxxxx xxxx.\nxxxxxxxxxxx:\n* xxxxxxx x xxxxxxxxx (xxxxx)\n* xxxxxxx xx xxxxxx xxxxxxx, xxxxxxxxxx xxxxxxxxxx xxxx\n* xxxxxxxxx xx xxxxx, xx xxxxxx xx xxxxxxxxxxx\n* xxxxxxx xxxxxx xxxxxxxxx x xxxxxxxxxx, xxxxxxx xxxxxx x xxxxxxx, x xxxxxx.\n* xx xx, xxxxxx xx xxxxxxxx, xx-xxxx xxx x xxxxxxx xxx xxx, xxxxxxx xx xxxx. xxxxxxxxx xx x.\nxxxxx:\n1. xxxxxxxxx xxxxxxxxxx - xxxxx -\\_- !\n2. xxxxxx Mother of Learning - xxxx, xxxxxxx, xxxxxxxxxxxx\n3. xxxxxx xxxxxxx xxxxxxx, xxxxxxxx \"xxx xxxxx\". xxxxx xxxxx xxxx, xx x xxxxx xxxxxxx.\n4. xxxxxxxx! xxxx xxx xxxxxxxxx xxxx xxx, xx x xxxxxxxxx.\n5. xxxx xxxxxx - xxxxxx xxxxxxxx xxx x 15-17, xxxxxx xxxxxxxxxxxxx xx xxxxxxxx xxx xxxxxxx xxxxxx.\nxxx xxxx, xxxxx x xxxxxxxxx xx xxxxxxxxxx xxxxxx. xxxxxxxxx spelling puns, xxxxxxx, x xxxxxxxxx, xxxxxxxx xxx xxxxxxxx, xxxxxx xxxxxxxxxx xxxxxx.\nxxx xxxxxxx. xxx xxx xxxxxxxx xxxxxx - x x xxxxxxxxxxx xxxxx xxxx xxxxxxxxxx xxx xxxxx, x xxxxxx xxx xxxxxxxx xxxxxxxxxx xxx xxxxx. xx xxxxxx xxxxxxxx:\n* xxx xxxxx x xxx-xxxx xxxxxxxxx. xxxxxx xxx xxxx xxxxxxxx. x xx x xx xxxxxxxx, xx x xxxxxxx xxxxxx xxxxxx xx xxxxxxxxx. xxxxxxxxxx xxxx xxxxx xxxxxx xxxxxxxxx xxxxxxx xx xxxx.\n* xxxxxx xxxx Kotlin, x xxxxxxx. xxxxxxxxxx, xxxxxxxxxx xxx xxxxx xx xxx x xxxxxxxx\n* xxx xxxxx xxxxxxxxxx Rust, xxx xxx x xx xxx xxxx xxxxxxxxx xxxxxxxxxxxxxx xxxx xxx xxxxx, xxxxxxxx xxxxxxxxxxxxxx HTML x Markdown\n* xxx xxxx xxxxxx xxx xxxxxxxx xxxxxx. xx xxxx xxx - xxxxxxxxxxxxx xxxxxxxxxxx xxxxxx x xxxxxxxxx xxxxx x xxxxxxx.\n* xxxxxxxxx xxxx xxxxxxxx xxxxxxx xx FUSE 3.0. xxxxx xxxxxxx xxxxxxx xxx xxxxxxxxxxx.\n* x xxxxxxxx xxxx xxxxxxxx DevOps-xxxxxxx x xxxxx xxxxxxx. xxxxxxxxx, xxx xx xxxxx xxxxxx. 
x, xx, xxx xxx xxx xxxxxxxxx?\nxxxxx xx xxx:\n\\- xxxxxxxx xxxxxxxx\n\\- xxxxxxx xxxxxxxxx, xxxxxxx xxxxx xxxxx xxxxxxxx\n\\- xxxxxxxxxx xxxx Machine Learning, xxxx xxxxxx xxx xxxxxxxx OpenCL.".to_string());
}
#[test]
@@ -138,7 +137,7 @@ fn test_tables_with_newlines() {
.expect("File must be readable");
let result = parse_html(&html, false);
- assert_that!(result).contains(indoc! {"[![Embedded YouTube video](https://img.youtube.com/vi/ZZZZZZZZZ/0.jpg)](https://www.youtube.com/watch?v=ZZZZZZZZZ)\n\n|Maybe I'm foolish, maybe I'm blind\nThinking I can see through this and see what's behind\nGot no way to prove it so maybe I'm blind\n\nBut I'm only human after all,\nI'm only human after all\nDon't put your blame on me|xxxxx xxxx, x xxxxxx, xxxxx xxxx — xxxxxx\nxxx xxxxx, xxx xxxx xxxxxx xxxxxx xxx, x xxxxxx xxx xxx xx xxx\nxxxx x xxxx xx xxxx xxxxxxx xxxxxxxxxxxxx, xxx xxx xxxxxxxx, x xxxxxx.\n\nxx x xxxxx xxxx xxxxxxx, x xxxxx-xx xxxxxx,\nx xxxxx xxxx xxxxxxx, x xxxxx xxxxxx.\nxx xxxx xxxx|\n|||\n\n[xxxxxx xxxxx xxxxx x xxxxxxx](/)\n\nx xxxx xxxxxxxxx xxxxxxx xxxxxxxxxxx xx xxxx xxxxx. x xxxxx xxxxxxx, xxxx xxxxx xxxxxxx xx xxxxxxxxxx xxxxxx. xxx xxxxxxxx, xxx xxxxxxxxx xxxxxxxxxxxxxx xx xxxxx — xxxxxxxxxx xxxxxxxxxx x xxxxx xxxxxxxxxxxxx xxxxxxxxx. x xxx xxxxxxxxxxxx*xxxx*, xxxxxx xxxx, xxxxxxxxxx xxxxx xxxxxxxx, xxxxxxxxxx x xxxxxxxxx. xx xxxxxx xxxxx xxxxxxxxxxxxxxxxx — x xxxxxx xxx xxxx.\n\nxxxxx xxxxxxxxxx xxxxx x xxxx xxxxxxxxxx xxxxx. xxxxx. x xxxxx: «x xxxxxx xxxxxxx, x xxxxx xxx xxxx, xx xxxxxxxx xxxxxx», — xxx xxxxx xxxxxxxx. xxxxxx xxx x xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxx xxxx xxxxxxxxxxx xxxxxxxxxx, xxxxxxx xxxxxx xxxxxx xxx xxxxx, xxxxxxxxxxx x x xxxxxxx xxxxxxxxx.\n\nxx x xxxxx xxxx xxxxxxx. xxxxxx xxxxx? xxxxxxxxxxx x xxxxxxxxx xxxxxx.\n\nx xxxxx x xxxxxxxxxx x xxxxx... x xxxxxx xxxx xxxxxx xxxxxxx xxxxxxxx. xx xxxx, x xxxxxx xxx-xx xxxxxxxxx xx xxxxxxx, xxx xxxxxx xxxxxx, xxx xxx xxxxx, xxxxx xxxxxxxx xx xxxx... x xxxxxx xxxxxxx xx xxxx xxxxx, xxx, xxxxx xxxx xxxxxxxxxx, x xxxxx xxxxxxxxx xx xxxxx. x xxx-xx xxx xxxxx xxxxxxx xxxxxxxxxxxxx.\n\nxxxxxx xx... xx xxx xx xxxxxxxxxxxxx xxxxxx xxxxxxxxxxxxx x xxxxxxxxxx xxxxx, xxxxx xxx xxxx xxxxxxxxx, x xxxxx xxx xxxxxxxxx, xxx xxxxxxx xxx, xxx xxxx xxxxxxx xxxxxx, x xx xxx, xxx xxxx xxxxxxxx."
+ assert_that!(result).contains(indoc! {"[![Embedded YouTube video](https://img.youtube.com/vi/ZZZZZZZZZ/0.jpg)](https://www.youtube.com/watch?v=ZZZZZZZZZ)\n|Maybe I'm foolish, maybe I'm blind\nThinking I can see through this and see what's behind\nGot no way to prove it so maybe I'm blind\nBut I'm only human after all,\nI'm only human after all\nDon't put your blame on me|xxxxx xxxx, x xxxxxx, xxxxx xxxx —xxxxxx\nxxx xxxxx, xxx xxxx xxxxxx xxxxxx xxx, x xxxxxx xxx xxx xx xxx\nxxxx x xxxx xx xxxx xxxxxxx xxxxxxxxxxxxx, xxx xxx xxxxxxxx, x xxxxxx.\nxx x xxxxx xxxx xxxxxxx, x xxxxx-xx xxxxxx,\nx xxxxx xxxx xxxxxxx, x xxxxx xxxxxx.\nxx xxxx xxxx|\n|||\n[xxxxxx xxxxx xxxxx x xxxxxxx](/)\nx xxxx xxxxxxxxx xxxxxxx xxxxxxxxxxx xx xxxx xxxxx. x xxxxx xxxxxxx, xxxx xxxxx xxxxxxx xx xxxxxxxxxx xxxxxx. xxx xxxxxxxx, xxx xxxxxxxxx xxxxxxxxxxxxxx xx xxxxx —xxxxxxxxxx xxxxxxxxxx x xxxxx xxxxxxxxxxxxx xxxxxxxxx. x xxx xxxxxxxxxxxx*xxxx*, xxxxxx xxxx, xxxxxxxxxx xxxxx xxxxxxxx, xxxxxxxxxx x xxxxxxxxx. xx xxxxxx xxxxx xxxxxxxxxxxxxxxxx —x xxxxxx xxx xxxx.\nxxxxx xxxxxxxxxx xxxxx x xxxx xxxxxxxxxx xxxxx. xxxxx. x xxxxx: «x xxxxxx xxxxxxx, x xxxxx xxx xxxx, xx xxxxxxxx xxxxxx», —xxx xxxxx xxxxxxxx. xxxxxx xxx x xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxx xxxx xxxxxxxxxxx xxxxxxxxxx, xxxxxxx xxxxxx xxxxxx xxx xxxxx, xxxxxxxxxxx x x xxxxxxx xxxxxxxxx.\nxx x xxxxx xxxx xxxxxxx. xxxxxx xxxxx? xxxxxxxxxxx x xxxxxxxxx xxxxxx.\nx xxxxx x xxxxxxxxxx x xxxxx... x xxxxxx xxxx xxxxxx xxxxxxx xxxxxxxx. xx xxxx, x xxxxxx xxx-xx xxxxxxxxx xx xxxxxxx, xxx xxxxxx xxxxxx, xxx xxx xxxxx, xxxxx xxxxxxxx xx xxxx... x xxxxxx xxxxxxx xx xxxx xxxxx, xxx, xxxxx xxxx xxxxxxxxxx, x xxxxx xxxxxxxxx xx xxxxx. x xxx-xx xxx xxxxx xxxxxxx xxxxxxxxxxxxx.\nxxxxxx xx... xx xxx xx xxxxxxxxxxxxx xxxxxx xxxxxxxxxxxxx x xxxxxxxxxx xxxxx, xxxxx xxx xxxx xxxxxxxxx, x xxxxx xxx xxxxxxxxx, xxx xxxxxxx xxx, xxx xxxx xxxxxxx xxxxxx, x xx xxx, xxx xxxx xxxxxxxx."
});
}
@@ -151,7 +150,7 @@ fn test_tables_crash2() {
.expect("File must be readable");
let table_with_vertical_header = parse_html(&html, false);
- assert_that!(table_with_vertical_header).contains(indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n\n## At a Glance\n\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |"
+ assert_that!(table_with_vertical_header).contains(indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |"
});
}
@@ -197,5 +196,6 @@ fn test_html_from_text_rewrite() {
// &Some(Url::parse("https://spider.cloud").unwrap()),
);
+ println!("{:?}", result);
assert!(!result.is_empty());
}
diff --git a/fast_html2md/tests/lists.rs b/fast_html2md/tests/lists.rs
index f833066..a6b1cd3 100644
--- a/fast_html2md/tests/lists.rs
+++ b/fast_html2md/tests/lists.rs
@@ -9,7 +9,7 @@ fn test_list_simple() {
);
assert_eq!(
md,
- "\n\n* Seven things has lady Lackless\n* Keeps them underneath her black dress\n* One a thing that's not for wearing\n\n"
+ "* Seven things has lady Lackless\n* Keeps them underneath her black dress\n* One a thing that's not for wearing"
)
}
@@ -35,7 +35,7 @@ fn test_list_formatted() {
);
assert_eq!(
md,
- "\n\n* You should NEVER see this error\n * Broken lines, broken strings\n * Broken threads, broken springs\n * Broken idols, broken heads\n * People sleep in broken beds\n \n* Ain't no use jiving\n* Ain't no use joking\n* EVERYTHING IS BROKEN"
+ "* You should NEVER see this error\n* Broken lines, broken strings\n* Broken threads, broken springs\n* Broken idols, broken heads\n* People sleep in broken beds\n* Ain't no use jiving\n* Ain't no use joking\n* EVERYTHING IS BROKEN"
)
}
@@ -75,7 +75,7 @@ fn test_list_stackedit() {
);
assert_eq!(
md,
- "* You should NEVER see this error\n \n * Broken lines, broken strings\n \n * Broken threads, broken springs\n \n * Broken idols, broken heads\n \n * People sleep in broken beds\n \n \n* Ain’t no use jiving\n \n* Ain’t no use joking\n \n* EVERYTHING IS BROKEN"
+ "* You should NEVER see this error\n* Broken lines, broken strings\n* Broken threads, broken springs\n* Broken idols, broken heads\n* People sleep in broken beds\n* Ain’t no use jiving\n* Ain’t no use joking\n* EVERYTHING IS BROKEN"
)
}
@@ -117,7 +117,7 @@ fn test_list_stackedit_add_brs() {
);
assert_eq!(
md,
- "* You should NEVER see this error\n \n * Broken lines, broken strings\n \n * Broken threads, broken springs\n \n * Broken idols, broken heads\n \n * People sleep in broken beds\n \n \n \n \n* Ain’t no use jiving\n \n* Ain’t no use joking\n \n* EVERYTHING IS BROKEN"
+ "* You should NEVER see this error\n* Broken lines, broken strings\n* Broken threads, broken springs\n* Broken idols, broken heads\n* People sleep in broken beds\n* Ain’t no use jiving\n* Ain’t no use joking\n* EVERYTHING IS BROKEN"
)
}
@@ -138,7 +138,7 @@ fn test_list_multiline() {
);
assert_eq!(
md,
- "1. In the heat and the rains\n \n With whips and chains\n \n Just to see him fly\n So many die!"
+ "1. In the heat and the rains\nWith whips and chains\nJust to see him fly\nSo many die!"
)
}
@@ -166,7 +166,7 @@ fn test_list_multiline_formatted() {
);
assert_eq!(
md,
- "\n\n* You should NEVER see this error\n * Broken lines, broken strings\n * Broken threads, broken springs\n * Broken idols, broken heads\n * People sleep in broken beds\n * Ain't no use jiving\n \n Ain't no use joking\n \n EVERYTHING IS BROKEN"
+ "* You should NEVER see this error\n* Broken lines, broken strings\n* Broken threads, broken springs\n* Broken idols, broken heads\n* People sleep in broken beds\n* Ain't no use jiving\nAin't no use joking\nEVERYTHING IS BROKEN"
)
}
@@ -211,13 +211,6 @@ fn test_list_text_prevsibling() {
);
assert_eq!(
md,
- "\
-Phrases to describe me:
-
-* Awesome
-* Cool
-* Awesome and cool
-* Can count to five
-* Learning to count to six B)"
+ "Phrases to describe me:\n* Awesome\n* Cool\n* Awesome and cool\n* Can count to five\n* Learning to count to six B)"
)
}
diff --git a/fast_html2md/tests/quotes.rs b/fast_html2md/tests/quotes.rs
index 5bb8914..ffb9a65 100644
--- a/fast_html2md/tests/quotes.rs
+++ b/fast_html2md/tests/quotes.rs
@@ -10,17 +10,14 @@ fn test_quotes() {
);
assert_eq!(
md,
- "\n\n> here's a quote next line of it\nAnd some text after it"
+ "> here's a quote next line of it\nAnd some text after it"
)
}
#[test]
fn test_quotes2() {
let md = parse_html("here'snested quote!
a quote\n next line of it
", false);
- assert_eq!(
- md,
- "\n\n> here's\n> > nested quote!\n> a quote next line of it\n\n"
- )
+ assert_eq!(md, "> here's\n> > nested quote!\n> a quote next line of it")
}
#[test]
diff --git a/fast_html2md/tests/tables.rs b/fast_html2md/tests/tables.rs
index 89bc31b..73a83c2 100644
--- a/fast_html2md/tests/tables.rs
+++ b/fast_html2md/tests/tables.rs
@@ -194,5 +194,5 @@ fn test_tables_wild_example() {
false,
);
- assert_eq!(md, "| One ring | Patterns | Titanic | | | |\n|||||||\n| One ring to rule them all |There's one for the sorrow| Roll on, Titanic, roll | | | |\n| One ring to find them | And two for the joy |You're the pride of White Star Line| | | |\n| One ring to bring them all | And three for the girls | Roll on, Titanic, roll | | | |\n|And in the darkness bind them| And four for the boys | Into the mists of time | | | |");
+ assert_eq!(md, "| One ring | Patterns | Titanic | | | |\n|||||||\n| One ring to rule them all |There's one for the sorrow| Roll on, Titanic, roll | | | |\n| One ring to find them | And two for the joy |You're the pride of White Star Line| | | |\n| One ring to bring them all | And three for the girls | Roll on, Titanic, roll | | | |\n|And in the darkness bind them| And four for the boys | Into the mists of time | | | |");
}
diff --git a/fast_html2md/tests/unit.rs b/fast_html2md/tests/unit.rs
index 378dbb0..638919e 100644
--- a/fast_html2md/tests/unit.rs
+++ b/fast_html2md/tests/unit.rs
@@ -35,13 +35,7 @@ fn test_anchor3() {
r#"APOSIMZ
SIDONIA"#,
false,
);
- assert_eq!(
- md,
- "\
-[APOSIMZ](http://ya.ru)
-
-[SIDONIA](http://yandex.ru)"
- )
+ assert_eq!(md, "[APOSIMZ](http://ya.ru)\n[SIDONIA](http://yandex.ru)")
}
#[test]
@@ -128,7 +122,7 @@ fn test_headers() {
);
assert_eq!(
md,
- "# MARC-FS\n\n[Mail.ru](http://Mail.ru)Cloud filesystem written for FUSE\n## Synopsis"
+ "# MARC-FS\n[Mail.ru](http://Mail.ru)Cloud filesystem written for FUSE\n## Synopsis"
)
}
diff --git a/test-samples/markdown-spider-sample.md b/test-samples/markdown-spider-sample.md
new file mode 100644
index 0000000..0e92c14
--- /dev/null
+++ b/test-samples/markdown-spider-sample.md
@@ -0,0 +1 @@
+\nTo help you get started with Spider, we’ll give you $200 in credits when you spend $100.[Terms apply](https://spider.cloud/promotion-spider-credits)\n# The Web Crawler for AI Agents and LLMs\nSpider offers the finest data collecting solution. Engineered for speed and scalability, it allows you to elevate your AI projects.\n[Get Started](https://spider.cloud/credits/new)View Preview\n* Basic\n* Streaming\nExample request\nPython\nJSONL\nCopy\n```\nimport requests, os, json\nheaders = {\n 'Authorization': f'Bearer {os.getenv(\"SPIDER_API_KEY\")}',\n 'Content-Type': 'application/jsonl',\n}\njson_data = {\"limit\":50,\"metadata\":True,\"url\":\"https://spider.cloud\"}\nresponse = requests.post('https://api.spider.cloud/crawl', \n headers=headers, json=json_data, stream=True)\nwith response as r:\n r.raise_for_status()\nfor chunk in r.iter_lines(\n chunk_size=None, \n decode_unicode=True\n ):\n data = json.loads(chunk)\n print(data)\n```\n[Free Trial](https://spider.cloud/credits/new?free-trial=1)\nExample Response\n## Built with the need for**Speed**\nExperience the power of**Spider**, built fully in**Rust**for next-generation scalability.\n### 2.4secs\nTo crawl over 20,000 pages\n### 500-1000x\nFaster than alternatives\n### 500x\nCheaper than traditional scraping services\nSpider API Request Modes · Benchmarked tailwindcss.com ·06/16/2024\n[See framework benchmarks](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md)\n### Seamless Integrations\nSeamlessly integrate Spider with a wide range of platforms, ensuring data curation perfectly aligned with your requirements. 
Compatible with all major AI tools.\n[LangChain integration](https://python.langchain.com/docs/integrations/document_loaders/spider)[LlamaIndex integration](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-spider-reader)[CrewAI integration](https://docs.crewai.com/tools/SpiderTool/)[FlowWiseAI integration](https://docs.flowiseai.com/integrations/langchain/document-loaders/spider-web-scraper-crawler)[Composio integration](https://docs.composio.dev/introduction/foundations/components/list_local_tools#spider-crawler)[PhiData integration](https://docs.phidata.com/tools/spider)\n### Concurrent Streaming\nSave time and money without having to worry about bandwidth concerns by effectively streaming all the results concurrently. The latency cost that is saved becomes drastic as you crawl more websites.\n### Warp Speed\nPowered by the cutting-edge[Spider](https://github.com/spider-rs/spider)open-source project, our robust Rust engine scales effortlessly to handle extreme workloads. 
We ensure continuous maintenance and improvement for top-tier performance.\n## Kickstart Your Data Collecting Projects Today\nJumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping.\n### Performance Tuned\nSpider is written in Rust and runs in full concurrency to achieve crawling thousands of pages in secs.\n### Multiple response formats\nGet clean and formatted markdown, HTML, or text content for fine-tuning or training AI models.\n### Caching\nFurther boost speed by caching repeated web page crawls to minimize expenses while building.\n### Smart Mode\nSpider dynamically switches to Headless Chrome when it needs to quick.\nBeta\n### Scrape with AI\nDo custom browser scripting and data extraction using the latest AI models with no cost step caching.\n### The crawler for LLMs\nDon't let crawling and scraping be the highest latency in your LLM & AI agent stack.\n### Scrape with no headaches\n* Auto Proxy rotations\n* Agent headers\n* Anti-bot detections\n* Headless chrome\n* Markdown responses\n### The Fastest Web Crawler\n* Powered by[spider-rs](https://github.com/spider-rs/spider)\n* 100,000 pages/seconds\n* Unlimited concurrency\n* Simple API\n* 50,000 RPM\n### Do more with AI\n* Browser scripting\n* Advanced extraction\n* Data pipelines\n* Ideal for LLMs and AI Agents\n* Accurate labeling\n## Achieve more with these new API features\nOur API is set to stream so you can act in realtime.\n![A user interface with a search bar containing the text \"Latest sports news,\" a green \"Submit\" button, and two icon buttons to display searching and extracting with the service.](https://spider.cloud/img/search_feature.webp)\n### Search\nGet access to search engine results from anywhere and easily crawl and transform pages to LLM-ready markdown.\n[Explore Search](https://spider.cloud/docs/api#search)\n![A user interface segment showing three icons representing different stages of data 
transformation.](https://spider.cloud/img/transform_feature_example.webp)\n### Transform\nConvert raw HTML into markdown easily by using this API. Transform thousands of html pages in seconds.\n[Explore Transform](https://spider.cloud/docs/api#transform)\n## Join the community\nBacked by a network of early advocates, contributors, and supporters.\n[GitHub discussions\n](https://github.com/orgs/spider-rs/discussions)\n[Discord\n](https://discord.spider.cloud)\n[\n![iammerrick's avatar](https://spider.cloud/img/external/iammerrick_twitter.webp)\n@iammerrick\nRust based crawler Spider is next level for crawling & scraping sites. So fast. Their cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider\n](https://twitter.com/iammerrick/status/1787873425446572462)\n[\n![WilliamEspegren's avatar](https://spider.cloud/img/external/william_twitter.webp)\n@WilliamEspegren\nWeb crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that’s wayyyy cheaper than any competitor Name a reason for me to use anything else? github.com/spider-rs/spid…\n](https://twitter.com/WilliamEspegren/status/1789419820821184764)\n[\n![gasa's avatar](https://spider.cloud/img/external/gaza_twitter.webp)\n@gasa\n@gasathenaper is the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. 
Spider cloud does it in an instant\n](https://x.com/gasathenaper/status/1810612492596383948)\n[\n![Ashpreet Bedi's avatar](https://spider.cloud/img/external/ashpreet_bedi.webp)\n@Ashpreet Bedi\n@ashpreetbedi is THE best crawler out there, give it a try\n](https://x.com/ashpreetbedi/status/1815512219003572315?s=46&t=37F5QP_8oKqOsNpHSo6VVw)\n[\n![Troyusrex's avatar](https://spider.cloud/img/external/troy_twitter.webp)\n@Troyusrex\nI found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week.\n](https://medium.com/@troyusrex/inside-my-virtual-college-advisor-a-deep-dive-into-rag-ai-and-agent-technology-84731b2928f7#1326)\n[\n![Dify.AI's avatar](https://spider.cloud/img/external/difyai.webp)\n@Dify.AI\n🕷️ Spider @spider\\_rust can be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context.\n](https://x.com/dify_ai/status/1818226971056243089)\n## FAQ\nFrequently asked questions about Spider.\n### What is Spider?\nSpider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.\n### Why is my website not crawling?\nYour crawl may fail if it requires JavaScript rendering. Try setting your request to 'chrome' to solve this issue.\n### Can you crawl all pages?\nYes, Spider accurately crawls all necessary content without needing a sitemap.\n### What formats can Spider convert web data into?\nSpider outputs HTML, raw, text, and various markdown formats. 
It supports`JSON`,`JSONL`,`CSV`, and`XML`for API responses.\n### Is Spider suitable for large scraping projects?\nAbsolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management.\n### How can I try Spider?\nPurchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities.\n### Does it respect robots.txt?\nYes, compliance with robots.txt is default, but you can disable this if necessary.\n### Unable to get dynamic content?\nIf you are having trouble getting dynamic pages, try setting the request parameter to \"chrome\" or \"smart.\" You may also need to set `disable\\_intercept` to allow third-party or external scripts to run.\n### Why is my crawl going slow?\nIf you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds.\n### Do you offer a Free Trial?\nYes, you can try out the service before being charged for free at[checkout](https://spider.cloud/credits/new?free-trial=1).\n## Comprehensive Data Curation for Everyone\nTrusted by leading tech businesses worldwide to deliver accurate and insightful data solutions.\n[Zapier](https://zapier.com/apps/spider/integrations)\n### Next generation data for AI, scale to millions\n[Start now](https://spider.cloud/credits/new)\n### Company\n* [About](https://spider.cloud/about)\n* [Privacy](https://spider.cloud/privacy)\n* [Terms](https://spider.cloud/eula)\n* [FAQ](https://spider.cloud/faq)\n### Resources\n* [API](https://spider.cloud/docs/api)\n* [Docs](https://spider.cloud/docs/overview)\n* [Guides](https://spider.cloud/guides)\n* [Spider.rs Docs](https://docs.rs/spider/latest/spider/)\n### Services\n* [Pricing](https://spider.cloud/credits/new)\n* [Web Crawling and Scraping](https://spider.cloud/web-crawling-and-scraping)\n[All systems 
normal.](https://spidercloud.statuspage.io/)\n[GitHub](https://github.com/spider-rs/spider)\n[Discord](https://discord.spider.cloud)\n[Twitter](https://twitter.com/spider_rust)\n
\ No newline at end of file