From 95063e39c9ee628ac13abb19f8817c015bd69b5d Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Sun, 24 Nov 2024 17:46:41 -0500
Subject: [PATCH] chore(lib): cleanup entry scraper and rewriter

---
 fast_html2md/src/lib.rs             | 418 +---------------------------
 fast_html2md/src/rewriter/handle.rs | 151 ++++++++++
 fast_html2md/src/rewriter/mod.rs    |   1 +
 fast_html2md/src/rewriter/writer.rs | 157 +----------
 fast_html2md/src/scraper/mod.rs     | 406 ++++++++++++++++++++++++++-
 fast_html2md/tests/integration.rs   |   8 +-
 6 files changed, 578 insertions(+), 563 deletions(-)
 create mode 100644 fast_html2md/src/rewriter/handle.rs

diff --git a/fast_html2md/src/lib.rs b/fast_html2md/src/lib.rs
index 2dbf4de..ed6812b 100644
--- a/fast_html2md/src/lib.rs
+++ b/fast_html2md/src/lib.rs
@@ -1,60 +1,22 @@
 use extended::sifter::WhitespaceSifterBytes;
-use html5ever::driver::ParseOpts;
-use html5ever::parse_document;
-use html5ever::tendril::TendrilSink;
 use lazy_static::lazy_static;
+pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
 use regex::Regex;
-use std::boxed::Box;
-use std::collections::HashMap;
 use std::collections::HashSet;
-use std::sync::Arc;
 use url::Url;
-
-pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
-
 // we want to just use the rewriter instead for v0.1.
 pub mod extended;
 pub mod rewriter;
 pub mod scraper;
-
 use extended::sifter::WhitespaceSifter;
-pub(crate) use scraper::anchors;
-pub(crate) use scraper::codes;
 pub use scraper::ignore;
-// pub(crate) use scraper::common;
-pub(crate) use scraper::containers;
-pub(crate) use scraper::dummy;
-pub(crate) use scraper::headers;
-pub(crate) use scraper::iframes;
-pub(crate) use scraper::images;
-pub(crate) use scraper::lists;
-pub(crate) use scraper::paragraphs;
-pub(crate) use scraper::quotes;
-pub(crate) use scraper::styles;
-pub(crate) use scraper::tables;
-pub(crate) use scraper::utils;
-
-use anchors::AnchorHandler;
-use codes::CodeHandler;
-use containers::ContainerHandler;
-use dummy::DummyHandler;
-use dummy::HtmlCherryPickHandler;
-use dummy::IdentityHandler;
-use headers::HeaderHandler;
-use iframes::IframeHandler;
-use images::ImgHandler;
-use lists::ListHandler;
-use lists::ListItemHandler;
-use paragraphs::ParagraphHandler;
-use quotes::QuoteHandler;
-use styles::StyleHandler;
-use tables::TableHandler;
+pub use scraper::{
+    parse_html, parse_html_custom, parse_html_custom_base, parse_html_custom_with_url,
+    parse_html_extended,
+};
 
 lazy_static! {
-    static ref EXCESSIVE_WHITESPACE_PATTERN: Regex = Regex::new("\\s{2,}").expect("valid regex pattern"); // for HTML on-the-fly cleanup
-    static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern"); // for Markdown escaping
-    static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern"); // for Markdown escaping
     static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); // for Markdown escaping
     static ref MARKDOWN_MIDDLE_KEYCHARS_SET: regex::RegexSet = regex::RegexSet::new(&[
         r"[<>*\\_~]", // Matches any single markdown character
@@ -62,89 +24,6 @@ lazy_static! {
     ]).expect("valid regex set");
 }
 
-/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
-/// in order to register custom tag hadler for tags you want.
-///
-/// You can also override standard tag handlers this way
-/// # Arguments
-/// `html` is source HTML as `String`
-/// `custom` is custom tag hadler producers for tags you want, can be empty
-/// `commonmark` is for adjusting markdown output to commonmark
-pub fn parse_html_custom_base(
-    html: &str,
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    commonmark: bool,
-    url: &Option<Url>,
-) -> String {
-    let document_parser = parse_document(RcDom::default(), ParseOpts::default());
-
-    match document_parser.from_utf8().read_from(&mut html.as_bytes()) {
-        Ok(dom) => {
-            let mut result = Box::new(StructuredPrinter::default());
-
-            walk(
-                &dom.document,
-                &mut result,
-                custom,
-                commonmark,
-                &if let Some(u) = url {
-                    Some(Arc::new(u.clone()))
-                } else {
-                    None
-                },
-                false,
-            );
-
-            // we want to eventually remove the clean step.
-            clean_markdown(&result.data)
-        }
-        _ => Default::default(),
-    }
-}
-
-/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
-/// in order to register custom tag hadler for tags you want.
-///
-/// You can also override standard tag handlers this way
-/// # Arguments
-/// `html` is source HTML as `String`
-/// `custom` is custom tag hadler producers for tags you want, can be empty
-/// `commonmark` is for adjusting markdown output to commonmark
-pub fn parse_html_custom(
-    html: &str,
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    commonmark: bool,
-) -> String {
-    parse_html_custom_base(html, custom, commonmark, &None)
-}
-
-/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
-/// in order to register custom tag hadler for tags you want.
-///
-/// You can also override standard tag handlers this way
-/// # Arguments
-/// `html` is source HTML as `String`
-/// `custom` is custom tag hadler producers for tags you want, can be empty
-/// `commonmark` is for adjusting markdown output to commonmark
-/// `url` is used to provide absolute url handling
-pub fn parse_html_custom_with_url(
-    html: &str,
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    commonmark: bool,
-    url: &Option<Url>,
-) -> String {
-    parse_html_custom_base(html, custom, commonmark, &url)
-}
-
-/// Main function of this library. Parses incoming HTML, converts it into Markdown
-/// and returns converted string.
-/// # Arguments
-/// `html` is source HTML as `String`
-/// `commonmark` to change the markdown flavor to commonmark as `boolean`
-pub fn parse_html(html: &str, commonmark: bool) -> String {
-    parse_html_custom(html, &HashMap::default(), commonmark)
-}
-
 /// Main function of this library to come. Rewrites incoming HTML, converts it into Markdown
 /// and returns converted string. Incomplete work in progress for major performance increases.
 /// # Arguments
@@ -170,237 +49,6 @@ pub fn rewrite_html_custom_with_url(
     html: &str,
     custom: &Option<HashSet<String>>,
     commonmark: bool,
     url: &Option<Url>,
 ) -> String {
     rewriter::writer::convert_html_to_markdown(html, &custom, commonmark, url).unwrap_or_default()
 }
 
-/// Same as `parse_html` but retains all "span" html elements intact
-/// Markdown parsers usually strip them down when rendering but they
-/// may be useful for later processing.
-pub fn parse_html_extended(html: &str, commonmark: bool) -> String {
-    struct SpanAsIsTagFactory;
-
-    impl TagHandlerFactory for SpanAsIsTagFactory {
-        fn instantiate(&self) -> Box<dyn TagHandler> {
-            Box::new(HtmlCherryPickHandler::default())
-        }
-    }
-
-    let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
-    tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory {}));
-    parse_html_custom(html, &tag_factory, commonmark)
-}
-
-/// Recursively walk through all DOM tree and handle all elements according to
-/// HTML tag -> Markdown syntax mapping. Text content is trimmed to one whitespace according to HTML5 rules.
-///
-/// # Arguments
-/// `input` is DOM tree or its subtree
-/// `result` is output holder with position and context tracking
-/// `custom` is custom tag hadler producers for tags you want, can be empty
-fn walk(
-    input: &Handle,
-    result: &mut StructuredPrinter,
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    commonmark: bool,
-    url: &Option<Arc<Url>>,
-    ignore_parents: bool,
-) {
-    let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
-    let mut tag_name = String::default();
-
-    let mut inside_pre = false;
-    let mut inside_code = false;
-    let mut ignore_write = false;
-    let mut inside_table = false;
-
-    let find_parent_tags = matches!(
-        &input.data,
-        NodeData::Element { .. } | NodeData::Text { .. }
-    );
-
-    if find_parent_tags || ignore_parents {
-        for tag in result.parent_chain.iter() {
-            if ignore_parents && tag == "table" {
-                inside_table = true;
-                break;
-            }
-            if tag == "code" {
-                inside_code = true;
-                break;
-            }
-            if tag == "pre" {
-                inside_pre = true;
-                break;
-            }
-            if tag_name == "script" || tag_name == "style" {
-                ignore_write = true;
-                break;
-            }
-        }
-    }
-
-    match input.data {
-        NodeData::Document
-        | NodeData::Comment { .. }
-        | NodeData::Doctype { .. }
-        | NodeData::ProcessingInstruction { .. } => (),
-        NodeData::Text { ref contents } => {
-            let mut text = contents.borrow().to_string();
-
-            if inside_pre {
-                // this is preformatted text, insert as-is
-                result.append_str(&text);
-            } else if !(text.trim().is_empty()
-                && (result.data.ends_with('\n') || result.data.ends_with(' ')))
-                && !ignore_write
-            {
-                if !inside_code {
-                    text = escape_markdown(result, &text);
-                }
-
-                let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
-
-                result.append_str(minified_text.trim());
-            } else {
-                result.append_str(text.trim());
-            }
-        }
-        NodeData::Element { ref name, .. } => {
-            if !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local) {
-                tag_name = name.local.to_string();
-
-                // do not parse scripts or style tags
-                if tag_name == "script" || tag_name == "style" {
-                    return;
-                }
-
-                if ignore_parents && tag_name == "table" {
-                    inside_table = true;
-                }
-
-                handler = if inside_pre {
-                    // don't add any html tags inside the pre section
-                    Box::new(DummyHandler)
-                } else {
-                    get_handler(custom, &tag_name, commonmark, url)
-                }
-            }
-        }
-    }
-
-    if !inside_table || ignore_parents && inside_table {
-        // handle this tag, while it's not in parent chain
-        // and doesn't have child siblings
-        handler.handle(input, result);
-    }
-
-    result.parent_chain.push(tag_name.clone()); // e.g. it was ["body"] and now it's ["body", "p"]
-
-    let current_depth = result.parent_chain.len(); // e.g. it was 1 and now it's 2
-
-    // create space for siblings of next level
-    result.siblings.insert(current_depth, vec![]);
-
-    if !handler.skip_descendants() {
-        for child in input.children.borrow().iter() {
-            if valid_block_element(&child.data) {
-                walk(&child, result, custom, commonmark, url, ignore_parents);
-
-                if let NodeData::Element { ref name, .. } = child.data {
-                    if let Some(el) = result.siblings.get_mut(&current_depth) {
-                        el.push(name.local.to_string());
-                    }
-                }
-            }
-        }
-    }
-
-    result.siblings.remove(&current_depth);
-    result.parent_chain.pop();
-
-    // finish handling of tag - parent chain now doesn't contain this tag itself again
-    handler.after_handle(result);
-}
-
-/// This conversion should only be applied to text tags
-///
-/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
-/// like list start or bold text style
-fn escape_markdown_base(result: &str, text: &str) -> String {
-    // always escape bold/italic/strikethrough
-    let data: std::borrow::Cow<str> = MARKDOWN_MIDDLE_KEYCHARS.replace_all(text, "\\$0");
-
-    // if we're at the start of the line we need to escape list- and quote-starting sequences
-    let data = if START_OF_LINE_PATTERN.is_match(&result) {
-        MARKDOWN_STARTONLY_KEYCHARS.replace(&data, "$1\\$2")
-    } else {
-        data
-    };
-
-    // no handling of more complicated cases such as
-    // ![] or []() ones, for now this will suffice
-    data.into()
-}
-
-/// Get the handler to use for the element.
-pub(crate) fn get_handler<T: std::borrow::Borrow<String> + std::hash::Hash + std::cmp::Eq>(
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    tag_name: &T,
-    commonmark: bool,
-    url: &Option<Arc<Url>>,
-) -> Box<dyn TagHandler> {
-    let name = tag_name.borrow();
-    match custom.get(name) {
-        Some(factory) => {
-            // have user-supplied factory, instantiate a handler for this tag
-            factory.instantiate()
-        }
-        _ => {
-            match name.as_ref() {
-                // containers
-                "div" | "section" | "header" | "footer" => Box::new(ContainerHandler),
-                // pagination, breaks
-                "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
-                "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
-                // spoiler tag
-                "details" | "summary" => Box::new(HtmlCherryPickHandler::new(commonmark)),
-                // formatting
-                "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
-                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
-                "pre" | "code" => Box::new(CodeHandler::default()),
-                // images, links
-                "img" => Box::new(ImgHandler::new(commonmark, url)),
-                "a" => Box::new(AnchorHandler::new(url)),
-                // lists
-                "ol" | "ul" | "menu" => Box::new(ListHandler),
-                "li" => Box::new(ListItemHandler::default()),
-                // as-is
-                "sub" | "sup" => Box::new(IdentityHandler::new(commonmark)),
-                // tables, handled fully internally as markdown can't have nested content in tables
-                // supports only single tables as of now
-                "table" => Box::new(TableHandler::new(commonmark, url.clone())),
-                "iframe" => Box::new(IframeHandler),
-                _ => Box::new(DummyHandler),
-            }
-        }
-    }
-}
-
-/// A valid HTML block element.
-pub(crate) fn valid_block_element(node: &NodeData) -> bool {
-    match node {
-        NodeData::Element { ref name, .. } => {
-            !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local)
-        }
-        _ => true,
-    }
-}
-
-/// This conversion should only be applied to text tags
-///
-/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
-/// like list start or bold text style
-fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
-    escape_markdown_base(&result.data, text)
-}
-
 /// Called after all processing has been finished
 ///
 /// Clears excessive punctuation that would be trimmed by renderer anyway
@@ -414,59 +62,3 @@ pub fn clean_markdown(input: &str) -> String {
     input.sift()
 }
 
 pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
     input.sift_bytes()
 }
-
-/// Intermediate result of HTML -> Markdown conversion.
-///
-/// Holds context in the form of parent tags and siblings chain
-/// and resulting string of markup content with current position.
-#[derive(Debug, Default)]
-pub struct StructuredPrinter {
-    /// Chain of parents leading to upmost tag
-    pub parent_chain: Vec<String>,
-    /// Siblings of currently processed tag in order where they're appearing in html
-    pub siblings: HashMap<usize, Vec<String>>,
-    /// resulting markdown document
-    pub data: String,
-}
-
-impl StructuredPrinter {
-    /// Inserts newline
-    pub fn insert_newline(&mut self) {
-        self.append_str("\n");
-    }
-
-    /// Append string to the end of the printer
-    pub fn append_str(&mut self, it: &str) {
-        self.data.push_str(it);
-    }
-
-    /// Insert string at specified position of printer, adjust position to the end of inserted string
-    pub fn insert_str(&mut self, pos: usize, it: &str) {
-        self.data.insert_str(pos, it);
-    }
-}
-
-/// Tag handler factory. This class is required in providing proper
-/// custom tag parsing capabilities to users of this library.
-///
-/// The problem with directly providing tag handlers is that they're not stateless.
-/// Once tag handler is parsing some tag, it holds data, such as start position, indent etc.
-/// The only way to create fresh tag handler for each tag is to provide a factory like this one.
-///
-pub trait TagHandlerFactory {
-    fn instantiate(&self) -> Box<dyn TagHandler>;
-}
-
-/// Trait interface describing abstract handler of arbitrary HTML tag.
-pub trait TagHandler {
-    /// Handle tag encountered when walking HTML tree.
-    /// This is executed before the children processing
-    fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter);
-
-    /// Executed after all children of this tag have been processed
-    fn after_handle(&mut self, printer: &mut StructuredPrinter);
-
-    fn skip_descendants(&self) -> bool {
-        false
-    }
-}
diff --git a/fast_html2md/src/rewriter/handle.rs b/fast_html2md/src/rewriter/handle.rs
new file mode 100644
index 0000000..09f6484
--- /dev/null
+++ b/fast_html2md/src/rewriter/handle.rs
@@ -0,0 +1,151 @@
+use super::anchors::rewrite_anchor_element;
+use super::iframes::handle_iframe;
+use super::images::rewrite_image_element;
+use super::lists::handle_list_or_item;
+use super::quotes::rewrite_blockquote_element;
+use super::styles::rewrite_style_element;
+use lol_html::html_content::ContentType::{Html, Text};
+use lol_html::html_content::Element;
+use lol_html::{doc_comments, doctype, text};
+use lol_html::{element, RewriteStrSettings};
+use std::cell::RefCell;
+use std::rc::Rc;
+use url::Url;
+
+/// Insert a new line after
+#[inline]
+pub fn insert_newline_after(element: &mut Element) {
+    element.after("\n", Text);
+}
+
+/// Insert a new line before
+#[inline]
+pub fn insert_newline_before(element: &mut Element) {
+    element.before("\n", Text);
+}
+
+/// Handle the lol_html tag.
+#[inline]
+pub fn handle_tag(
+    element: &mut Element,
+    commonmark: bool,
+    url: &Option<Url>,
+    list_type: Rc<RefCell<Option<String>>>,
+    order_counter: Rc<RefCell<usize>>,
+    quote_depth: Rc<RefCell<usize>>,
+    inside_table: Rc<RefCell<bool>>,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let element_name = element.tag_name();
+
+    let remove_attrs =
+        commonmark && (element_name.as_str() == "sub" || element_name.as_str() == "sup");
+
+    // commonmark: keep sub/sup tags but strip their attributes; otherwise drop the tag and keep its content.
+    if remove_attrs {
+        let attrs = element
+            .attributes()
+            .iter()
+            .map(|f| f.name())
+            .collect::<Vec<_>>();
+
+        for attr in attrs.iter() {
+            element.remove_attribute(&attr);
+        }
+    } else {
+        element.remove_and_keep_content();
+    }
+
+    // Add the markdown equivalents before the element.
+    match element_name.as_str() {
+        "h1" => {
+            element.before("# ", Text);
+            insert_newline_after(element);
+        }
+        "h2" => {
+            element.before("## ", Text);
+            insert_newline_after(element);
+        }
+        "h3" => {
+            element.before("### ", Text);
+            insert_newline_after(element);
+        }
+        "h4" => {
+            element.before("#### ", Text);
+            insert_newline_after(element);
+        }
+        "h5" => {
+            element.before("##### ", Text);
+            insert_newline_after(element);
+        }
+        "h6" => {
+            element.before("###### ", Text);
+            insert_newline_after(element);
+        }
+        "p" => {
+            insert_newline_before(element);
+            insert_newline_after(element);
+        }
+        "hr" => {
+            insert_newline_before(element);
+            element.append("---", Text);
+            insert_newline_after(element);
+        }
+        "br" => insert_newline_after(element),
+        "a" => {
+            let _ = rewrite_anchor_element(element, commonmark, url);
+        }
+        "img" => {
+            let _ = rewrite_image_element(element, commonmark, &url);
+        }
+        "table" => {
+            *inside_table.borrow_mut() = true;
+        }
+        "tr" => {
+            insert_newline_after(element);
+        }
+        "th" => {
+            if commonmark {
+                element.before("** ", Html);
+                element.after("** |", Html);
+            } else {
+                element.after("|", Html);
+            }
+
+            // add the first table row start
+            if *inside_table.borrow() {
+                element.before("|", Html);
+                *inside_table.borrow_mut() = false;
+            }
+        }
+        "td" => {
+            element.after("|", Html);
+        }
+        "iframe" => {
+            let _ = handle_iframe(element);
+        }
+        "b" | "i" | "s" | "strong" | "em" | "del" => {
+            let _ = rewrite_style_element(element);
+        }
+        "ol" | "ul" | "menu" | "li" => {
+            let _ = handle_list_or_item(element, list_type.clone(), order_counter.clone());
+        }
+        "q" | "cite" | "blockquote" => {
+            let _ = rewrite_blockquote_element(element, quote_depth);
+        }
+        "div" | "section" | "header" | "footer" => {
+            insert_newline_before(element);
+            insert_newline_after(element);
+        }
+        "pre" => {
+            element.before("\n```\n", Html);
+            element.after("\n```\n", Html);
+        }
+        "code" | "samp" => {
+            element.before("`", Html);
+            element.after("`", Html);
+        }
+        _ => (),
+    }
+
+    Ok(())
+}
diff --git a/fast_html2md/src/rewriter/mod.rs b/fast_html2md/src/rewriter/mod.rs
index ab5a8e3..1a15ca5 100644
--- a/fast_html2md/src/rewriter/mod.rs
+++ b/fast_html2md/src/rewriter/mod.rs
@@ -1,5 +1,6 @@
 pub(crate) mod anchors;
 pub(crate) mod counter;
+pub(crate) mod handle;
 pub(crate) mod iframes;
 pub(crate) mod images;
 pub(crate) mod lists;
diff --git a/fast_html2md/src/rewriter/writer.rs b/fast_html2md/src/rewriter/writer.rs
index b6dbefc..9d3e379 100644
--- a/fast_html2md/src/rewriter/writer.rs
+++ b/fast_html2md/src/rewriter/writer.rs
@@ -1,11 +1,7 @@
-use super::anchors::rewrite_anchor_element;
-use super::iframes::handle_iframe;
-use super::images::rewrite_image_element;
-use super::lists::handle_list_or_item;
-use super::quotes::{rewrite_blockquote_element, rewrite_blockquote_text};
-use super::styles::rewrite_style_element;
+use super::handle::handle_tag;
+use super::quotes::rewrite_blockquote_text;
 use crate::clean_markdown_bytes;
-use lol_html::html_content::ContentType::{Html, Text};
+use lol_html::html_content::ContentType::Text;
 use lol_html::html_content::Element;
 use lol_html::{doc_comments, doctype, text};
 use lol_html::{element, RewriteStrSettings};
@@ -25,132 +21,6 @@ pub fn insert_newline_before(element: &mut Element) {
     element.before("\n", Text);
 }
 
-/// Handle the lol_html tag.
-#[inline]
-fn handle_tag(
-    element: &mut Element,
-    commonmark: bool,
-    url: &Option<Url>,
-    list_type: Rc<RefCell<Option<String>>>,
-    order_counter: Rc<RefCell<usize>>,
-    quote_depth: Rc<RefCell<usize>>,
-    inside_table: Rc<RefCell<bool>>,
-) -> Result<(), Box<dyn std::error::Error>> {
-    let element_name = element.tag_name();
-
-    let remove_attrs =
-        commonmark && (element_name.as_str() == "sub" || element_name.as_str() == "sup");
-
-    // check common mark includes.
-    if remove_attrs {
-        let attrs = element
-            .attributes()
-            .iter()
-            .map(|f| f.name())
-            .collect::<Vec<_>>();
-
-        for attr in attrs.iter() {
-            element.remove_attribute(&attr);
-        }
-    } else {
-        element.remove_and_keep_content();
-    }
-
-    // Add the markdown equivalents before the element.
-    match element_name.as_str() {
-        "h1" => {
-            element.before("# ", Text);
-            insert_newline_after(element);
-        }
-        "h2" => {
-            element.before("## ", Text);
-            insert_newline_after(element);
-        }
-        "h3" => {
-            element.before("### ", Text);
-            insert_newline_after(element);
-        }
-        "h4" => {
-            element.before("#### ", Text);
-            insert_newline_after(element);
-        }
-        "h5" => {
-            element.before("##### ", Text);
-            insert_newline_after(element);
-        }
-        "h6" => {
-            element.before("###### ", Text);
-            insert_newline_after(element);
-        }
-        "p" => {
-            insert_newline_before(element);
-            insert_newline_after(element);
-        }
-        "hr" => {
-            insert_newline_before(element);
-            element.append("---", Text);
-            insert_newline_after(element);
-        }
-        "br" => insert_newline_after(element),
-        "a" => {
-            let _ = rewrite_anchor_element(element, commonmark, url);
-        }
-        "img" => {
-            let _ = rewrite_image_element(element, commonmark, &url);
-        }
-        "table" => {
-            *inside_table.borrow_mut() = true;
-        }
-        "tr" => {
-            insert_newline_after(element);
-        }
-        "th" => {
-            if commonmark {
-                element.before("** ", Html);
-                element.after("** |", Html);
-            } else {
-                element.after("|", Html);
-            }
-
-            // add the first table row start
-            if *inside_table.borrow() {
-                element.before("|", Html);
-                *inside_table.borrow_mut() = false;
-            }
-        }
-        "td" => {
-            element.after("|", Html);
-        }
-        "iframe" => {
-            let _ = handle_iframe(element);
-        }
-        "b" | "i" | "s" | "strong" | "em" | "del" => {
-            let _ = rewrite_style_element(element);
-        }
-        "ol" | "ul" | "menu" | "li" => {
-            let _ = handle_list_or_item(element, list_type.clone(), order_counter.clone());
-        }
-        "q" | "cite" | "blockquote" => {
-            let _ = rewrite_blockquote_element(element, quote_depth);
-        }
-        "div" | "section" | "header" | "footer" => {
-            insert_newline_before(element);
-            insert_newline_after(element);
-        }
-        "pre" => {
-            element.before("\n```\n", Html);
-            element.after("\n```\n", Html);
-        }
-        "code" | "samp" => {
-            element.before("`", Html);
-            element.after("`", Html);
-        }
-        _ => (),
-    }
-
-    Ok(())
-}
-
 /// Replace the markdown chars cleanly.
 fn replace_markdown_chars(input: &str) -> String {
     use crate::MARKDOWN_MIDDLE_KEYCHARS_SET;
@@ -191,7 +61,7 @@ fn replace_markdown_chars(input: &str) -> String {
     output
 }
 
-/// Get the HTML rewriter settings to convert ot markdown.
+/// Get the HTML rewriter settings to convert to markdown.
 pub fn get_rewriter_settings(
     commonmark: bool,
     custom: &Option<HashSet<String>>,
@@ -201,7 +71,6 @@ pub fn get_rewriter_settings(
     let order_counter = Rc::new(RefCell::new(0));
     let quote_depth = Rc::new(RefCell::new(0));
     let quote_depth1 = quote_depth.clone();
-
     let inside_table = Rc::new(RefCell::new(false));
 
     let mut element_content_handlers =
@@ -251,14 +120,16 @@ pub fn get_rewriter_settings(
     }
 
     RewriteStrSettings {
-        document_content_handlers: vec![doc_comments!(|c| {
-            c.remove();
-            Ok(())
-        }),
-        doctype!(|c| {
-            c.remove();
-            Ok(())
-        })],
+        document_content_handlers: vec![
+            doc_comments!(|c| {
+                c.remove();
+                Ok(())
+            }),
+            doctype!(|c| {
+                c.remove();
+                Ok(())
+            }),
+        ],
         element_content_handlers,
         ..RewriteStrSettings::default()
     }
 }
diff --git a/fast_html2md/src/scraper/mod.rs b/fast_html2md/src/scraper/mod.rs
index 4e1380b..7b94c9b 100644
--- a/fast_html2md/src/scraper/mod.rs
+++ b/fast_html2md/src/scraper/mod.rs
@@ -13,9 +13,405 @@
 pub mod quotes;
 pub mod styles;
 pub mod tables;
 pub mod utils;
+use super::clean_markdown;
+use anchors::AnchorHandler;
+use codes::CodeHandler;
+use containers::ContainerHandler;
+use dummy::DummyHandler;
+use dummy::HtmlCherryPickHandler;
+use dummy::IdentityHandler;
+use headers::HeaderHandler;
+use html5ever::driver::ParseOpts;
+use html5ever::parse_document;
+use html5ever::tendril::TendrilSink;
+use iframes::IframeHandler;
+use images::ImgHandler;
+use lazy_static::lazy_static;
+use lists::ListHandler;
+use lists::ListItemHandler;
+use markup5ever_rcdom::{Handle, NodeData, RcDom};
+use paragraphs::ParagraphHandler;
+use quotes::QuoteHandler;
+use regex::Regex;
+use std::boxed::Box;
+use std::collections::HashMap;
+use std::sync::Arc;
+use styles::StyleHandler;
+use tables::TableHandler;
+use url::Url;
 
-use super::Handle;
-use super::StructuredPrinter;
-use super::TagHandler;
-use super::TagHandlerFactory;
-use super::{clean_markdown, walk};
+lazy_static! {
+    static ref EXCESSIVE_WHITESPACE_PATTERN: Regex = Regex::new("\\s{2,}").expect("valid regex pattern"); // for HTML on-the-fly cleanup
+    static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern"); // for Markdown escaping
+    static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern"); // for Markdown escaping
+}
+
+/// Custom variant of main function. Allows passing custom tag<->tag factory pairs
+/// in order to register custom tag handlers for tags you want.
+///
+/// You can also override standard tag handlers this way
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `custom` is custom tag handler producers for tags you want, can be empty
+/// `commonmark` is for adjusting markdown output to commonmark
+pub fn parse_html_custom_base(
+    html: &str,
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    commonmark: bool,
+    url: &Option<Url>,
+) -> String {
+    let document_parser = parse_document(RcDom::default(), ParseOpts::default());
+
+    match document_parser.from_utf8().read_from(&mut html.as_bytes()) {
+        Ok(dom) => {
+            let mut result = Box::new(StructuredPrinter::default());
+
+            walk(
+                &dom.document,
+                &mut result,
+                custom,
+                commonmark,
+                &if let Some(u) = url {
+                    Some(Arc::new(u.clone()))
+                } else {
+                    None
+                },
+                false,
+            );
+
+            // we want to eventually remove the clean step.
+            clean_markdown(&result.data)
+        }
+        _ => Default::default(),
+    }
+}
+
+/// Custom variant of main function. Allows passing custom tag<->tag factory pairs
+/// in order to register custom tag handlers for tags you want.
+///
+/// You can also override standard tag handlers this way
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `custom` is custom tag handler producers for tags you want, can be empty
+/// `commonmark` is for adjusting markdown output to commonmark
+pub fn parse_html_custom(
+    html: &str,
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    commonmark: bool,
+) -> String {
+    parse_html_custom_base(html, custom, commonmark, &None)
+}
+
+/// Custom variant of main function. Allows passing custom tag<->tag factory pairs
+/// in order to register custom tag handlers for tags you want.
+///
+/// You can also override standard tag handlers this way
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `custom` is custom tag handler producers for tags you want, can be empty
+/// `commonmark` is for adjusting markdown output to commonmark
+/// `url` is used to provide absolute url handling
+pub fn parse_html_custom_with_url(
+    html: &str,
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    commonmark: bool,
+    url: &Option<Url>,
+) -> String {
+    parse_html_custom_base(html, custom, commonmark, &url)
+}
+
+/// Main function of this library. Parses incoming HTML, converts it into Markdown
+/// and returns converted string.
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `commonmark` to change the markdown flavor to commonmark as `boolean`
+pub fn parse_html(html: &str, commonmark: bool) -> String {
+    parse_html_custom(html, &HashMap::default(), commonmark)
+}
+
+/// Same as `parse_html` but retains all "span" html elements intact
+/// Markdown parsers usually strip them down when rendering but they
+/// may be useful for later processing.
+pub fn parse_html_extended(html: &str, commonmark: bool) -> String {
+    struct SpanAsIsTagFactory;
+
+    impl TagHandlerFactory for SpanAsIsTagFactory {
+        fn instantiate(&self) -> Box<dyn TagHandler> {
+            Box::new(HtmlCherryPickHandler::default())
+        }
+    }
+
+    let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
+    tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory {}));
+    parse_html_custom(html, &tag_factory, commonmark)
+}
+
+/// This conversion should only be applied to text tags
+///
+/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
+/// like list start or bold text style
+fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
+    escape_markdown_base(&result.data, text)
+}
+
+/// Recursively walk through the DOM tree and handle all elements according to
+/// HTML tag -> Markdown syntax mapping. Text content is trimmed to one whitespace according to HTML5 rules.
+///
+/// # Arguments
+/// `input` is DOM tree or its subtree
+/// `result` is output holder with position and context tracking
+/// `custom` is custom tag handler producers for tags you want, can be empty
+pub fn walk(
+    input: &Handle,
+    result: &mut StructuredPrinter,
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    commonmark: bool,
+    url: &Option<Arc<Url>>,
+    ignore_parents: bool,
+) {
+    let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
+    let mut tag_name = String::default();
+
+    let mut inside_pre = false;
+    let mut inside_code = false;
+    let mut ignore_write = false;
+    let mut inside_table = false;
+
+    let find_parent_tags = matches!(
+        &input.data,
+        NodeData::Element { .. } | NodeData::Text { .. }
+    );
+
+    if find_parent_tags || ignore_parents {
+        for tag in result.parent_chain.iter() {
+            if ignore_parents && tag == "table" {
+                inside_table = true;
+                break;
+            }
+            if tag == "code" {
+                inside_code = true;
+                break;
+            }
+            if tag == "pre" {
+                inside_pre = true;
+                break;
+            }
+            if tag == "script" || tag == "style" {
+                ignore_write = true;
+                break;
+            }
+        }
+    }
+
+    match input.data {
+        NodeData::Document
+        | NodeData::Comment { .. }
+        | NodeData::Doctype { .. }
+        | NodeData::ProcessingInstruction { .. } => (),
+        NodeData::Text { ref contents } => {
+            let mut text = contents.borrow().to_string();
+
+            if inside_pre {
+                // this is preformatted text, insert as-is
+                result.append_str(&text);
+            } else if !(text.trim().is_empty()
+                && (result.data.ends_with('\n') || result.data.ends_with(' ')))
+                && !ignore_write
+            {
+                if !inside_code {
+                    text = escape_markdown(result, &text);
+                }
+
+                let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
+
+                result.append_str(minified_text.trim());
+            } else {
+                result.append_str(text.trim());
+            }
+        }
+        NodeData::Element { ref name, .. } => {
+            if !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local) {
+                tag_name = name.local.to_string();
+
+                // do not parse scripts or style tags
+                if tag_name == "script" || tag_name == "style" {
+                    return;
+                }
+
+                if ignore_parents && tag_name == "table" {
+                    inside_table = true;
+                }
+
+                handler = if inside_pre {
+                    // don't add any html tags inside the pre section
+                    Box::new(DummyHandler)
+                } else {
+                    get_handler(custom, &tag_name, commonmark, url)
+                }
+            }
+        }
+    }
+
+    if !inside_table || ignore_parents && inside_table {
+        // handle this tag, while it's not in parent chain
+        // and doesn't have child siblings
+        handler.handle(input, result);
+    }
+
+    result.parent_chain.push(tag_name.clone()); // e.g. it was ["body"] and now it's ["body", "p"]
+
+    let current_depth = result.parent_chain.len(); // e.g. it was 1 and now it's 2
+
+    // create space for siblings of next level
+    result.siblings.insert(current_depth, vec![]);
+
+    if !handler.skip_descendants() {
+        for child in input.children.borrow().iter() {
+            if valid_block_element(&child.data) {
+                walk(&child, result, custom, commonmark, url, ignore_parents);
+
+                if let NodeData::Element { ref name, .. } = child.data {
+                    if let Some(el) = result.siblings.get_mut(&current_depth) {
+                        el.push(name.local.to_string());
+                    }
+                }
+            }
+        }
+    }
+
+    result.siblings.remove(&current_depth);
+    result.parent_chain.pop();
+
+    // finish handling of tag - parent chain now doesn't contain this tag itself again
+    handler.after_handle(result);
+}
+
+/// This conversion should only be applied to text tags
+///
+/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
+/// like list start or bold text style
+fn escape_markdown_base(result: &str, text: &str) -> String {
+    // always escape bold/italic/strikethrough
+    let data: std::borrow::Cow<str> = crate::MARKDOWN_MIDDLE_KEYCHARS.replace_all(text, "\\$0");
+
+    // if we're at the start of the line we need to escape list- and quote-starting sequences
+    let data = if START_OF_LINE_PATTERN.is_match(&result) {
+        MARKDOWN_STARTONLY_KEYCHARS.replace(&data, "$1\\$2")
+    } else {
+        data
+    };
+
+    // no handling of more complicated cases such as
+    // ![] or []() ones, for now this will suffice
+    data.into()
+}
+
+/// Get the handler to use for the element.
+pub(crate) fn get_handler<T: std::borrow::Borrow<String> + std::hash::Hash + std::cmp::Eq>(
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    tag_name: &T,
+    commonmark: bool,
+    url: &Option<Arc<Url>>,
+) -> Box<dyn TagHandler> {
+    let name = tag_name.borrow();
+    match custom.get(name) {
+        Some(factory) => {
+            // have user-supplied factory, instantiate a handler for this tag
+            factory.instantiate()
+        }
+        _ => {
+            match name.as_ref() {
+                // containers
+                "div" | "section" | "header" | "footer" => Box::new(ContainerHandler),
+                // pagination, breaks
+                "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
+                "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
+                // spoiler tag
+                "details" | "summary" => Box::new(HtmlCherryPickHandler::new(commonmark)),
+                // formatting
+                "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
+                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
+                "pre" | "code" => Box::new(CodeHandler::default()),
+                // images, links
+                "img" => Box::new(ImgHandler::new(commonmark, url)),
+                "a" => Box::new(AnchorHandler::new(url)),
+                // lists
+                "ol" | "ul" | "menu" => Box::new(ListHandler),
+                "li" => Box::new(ListItemHandler::default()),
+                // as-is
+                "sub" | "sup" => Box::new(IdentityHandler::new(commonmark)),
+                // tables, handled fully internally as markdown can't have nested content in tables
+                // supports only single tables as of now
+                "table" => Box::new(TableHandler::new(commonmark, url.clone())),
+                "iframe" => Box::new(IframeHandler),
+                _ => Box::new(DummyHandler),
+            }
+        }
+    }
+}
+
+/// A valid HTML block element.
+pub(crate) fn valid_block_element(node: &NodeData) -> bool {
+    match node {
+        NodeData::Element { ref name, .. } => {
+            !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local)
+        }
+        _ => true,
+    }
+}
+
+/// Intermediate result of HTML -> Markdown conversion.
+///
+/// Holds context in the form of parent tags and siblings chain
+/// and resulting string of markup content with current position.
+#[derive(Debug, Default)]
+pub struct StructuredPrinter {
+    /// Chain of parents leading to upmost tag
+    pub parent_chain: Vec<String>,
+    /// Siblings of currently processed tag in order where they're appearing in html
+    pub siblings: HashMap<usize, Vec<String>>,
+    /// resulting markdown document
+    pub data: String,
+}
+
+impl StructuredPrinter {
+    /// Inserts newline
+    pub fn insert_newline(&mut self) {
+        self.append_str("\n");
+    }
+
+    /// Append string to the end of the printer
+    pub fn append_str(&mut self, it: &str) {
+        self.data.push_str(it);
+    }
+
+    /// Insert string at specified position of printer, adjust position to the end of inserted string
+    pub fn insert_str(&mut self, pos: usize, it: &str) {
+        self.data.insert_str(pos, it);
+    }
+}
+
+/// Tag handler factory. This trait is required to provide proper
+/// custom tag parsing capabilities to users of this library.
+///
+/// The problem with directly providing tag handlers is that they're not stateless.
+/// Once tag handler is parsing some tag, it holds data, such as start position, indent etc.
+/// The only way to create fresh tag handler for each tag is to provide a factory like this one.
+///
+pub trait TagHandlerFactory {
+    fn instantiate(&self) -> Box<dyn TagHandler>;
+}
+
+/// Trait interface describing an abstract handler of an arbitrary HTML tag.
+pub trait TagHandler {
+    /// Handle tag encountered when walking HTML tree.
+ /// This is executed before the children processing + fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter); + + /// Executed after all children of this tag have been processed + fn after_handle(&mut self, printer: &mut StructuredPrinter); + + fn skip_descendants(&self) -> bool { + false + } +} diff --git a/fast_html2md/tests/integration.rs b/fast_html2md/tests/integration.rs index c41d161..75d31c2 100644 --- a/fast_html2md/tests/integration.rs +++ b/fast_html2md/tests/integration.rs @@ -83,7 +83,10 @@ fn test_real_spider() { .read_to_string(&mut html) .expect("File must be readable"); let result = rewrite_html(&html, false); - assert!(result == r#"To help you get started with Spider, we’ll give you $200 in credits when you spend $100.[Terms apply](https://spider.cloud/promotion-spider-credits)\n# The Web Crawler for AI Agents and LLMs\nSpider offers the finest data collecting solution. Engineered for speed and scalability, it\nallows you to elevate your AI projects.\n[Get Started](https://spider.cloud/credits/new)View Preview\n* Basic\n* Streaming\nExample request\nPython\nJSONL\nCopy\n```\n`import requests, os, json\nheaders = {\n''Authorization '': f ''Bearer {os.getenv(""SPIDER\\_API\\_KEY "")}'',\n''Content-Type '': ''application/jsonl '',\n}\njson\\_data = {""limit "":50,""metadata "":True,""url "":""https://spider.cloud ""}\nresponse = requests.post(''https://api.spider.cloud/crawl '', headers=headers, json=json\\_data, stream=True)\nwith response as r:\nr.raise\\_for\\_status()\nfor chunk in r.iter\\_lines(\nchunk\\_size=None, decode\\_unicode=True\n):\ndata = json.loads(chunk)\nprint(data)`\n```\n[Free Trial](https://spider.cloud/credits/new?free-trial=1)\nExample Response\n## Built with the need for**Speed**\nExperience the power of**Spider**, built fully in**Rust**for\nnext-generation scalability.\n### 2.4secs\nTo crawl over 20,000 pages\n### 500-1000x\nFaster than alternatives\n### 500x\nCheaper than traditional scraping services\nBenchmarks displaying performance between Spider API request modes.\nSpider API Request Modes ·Benchmarked tailwindcss.com ·06/16/2024\n[See framework benchmarks](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md)\n### Seamless Integrations\nSeamlessly integrate Spider with a wide range of platforms, ensuring data curation\nperfectly aligned with your requirements. Compatible with all major AI tools.\n[LangChain integration](https://python.langchain.com/docs/integrations/document_loaders/spider)[LlamaIndex integrationLlama Index Logo](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-spider-reader)[CrewAI integrationCrewAI Logo](https://docs.crewai.com/tools/SpiderTool/)[FlowWiseAI integrationFlowiseAI LogoFlowiseAI](https://docs.flowiseai.com/integrations/langchain/document-loaders/spider-web-scraper-crawler)[Composio integrationComposio Logo](https://docs.composio.dev/introduction/foundations/components/list_local_tools#spider-crawler)[PhiData integrationPhiData Logo](https://docs.phidata.com/tools/spider)\n### Concurrent Streaming\nSave time and money without having to worry about bandwidth concerns by effectively\nstreaming all the results concurrently. The latency cost that is saved becomes drastic as\nyou crawl more websites.\n### Warp Speed\nPowered by the cutting-edge[Spider](https://github.com/spider-rs/spider)open-source project, our robust Rust engine scales effortlessly to handle extreme\nworkloads. 
We ensure continuous maintenance and improvement for top-tier performance.\n## Kickstart Your Data Collecting Projects Today\nJumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping.\n### Performance Tuned\nSpider is written in Rust and runs in full concurrency to achieve crawling thousands of\npages in secs.\n### Multiple response formats\nGet clean and formatted markdown, HTML, or text content for fine-tuning or training AI\nmodels.\n### Caching\nFurther boost speed by caching repeated web page crawls to minimize expenses while\nbuilding.\n### Smart Mode\nSpider dynamically switches to Headless Chrome when it needs to quick.\nBeta\n### Scrape with AI\nDo custom browser scripting and data extraction using the latest AI models with no cost\nstep caching.\n### The crawler for LLMs\nDon't let crawling and scraping be the highest latency in your LLM & AI agent stack.\n### Scrape with no headaches\n* Auto Proxy rotations\n* Agent headers\n* Anti-bot detections\n* Headless chrome\n* Markdown responses\n### The Fastest Web Crawler\n* Powered by[spider-rs](https://github.com/spider-rs/spider)\n* 100,000 pages/seconds\n* Unlimited concurrency\n* Simple API\n* 50,000 RPM\n### Do more with AI\n* Browser scripting\n* Advanced extraction\n* Data pipelines\n* Ideal for LLMs and AI Agents\n* Accurate labeling\n## Achieve more with these new API features\nOur API is set to stream so you can act in realtime.\n![A user interface with a search bar containing the text "Latest sports news," a green "Submit" button, and two icon buttons to display searching and extracting with the service.](/img/search_feature.webp)\n### Search\nGet access to search engine results from anywhere and easily crawl and transform pages to\nLLM-ready markdown.\n[Explore SearchRight Arrow](https://spider.cloud/docs/api#search)\n![A user interface segment showing three icons representing different stages of data transformation.](/img/transform_feature_example.webp)\n### Transform\nConvert raw HTML into markdown easily by using this API. Transform thousands of html pages\nin seconds.\n[Explore TransformRight Arrow](https://spider.cloud/docs/api#transform)\n## Join the community\nBacked by a network of early advocates, contributors, and supporters.\n[GitHub discussions\nChat Icon\n](https://github.com/orgs/spider-rs/discussions)[Discord\nChat Icon\n](https://discord.spider.cloud)\n[\n![iammerrick's avatar](/img/external/iammerrick_twitter.webp)\n@iammerrick\nRust based crawler Spider is next level for crawling &scraping sites. So fast.\nTheir cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider\n](https://twitter.com/iammerrick/status/1787873425446572462)\n[\n![WilliamEspegren's avatar](/img/external/william_twitter.webp)\n@WilliamEspegren\nWeb crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that’s wayyyy cheaper than any competitor\nName a reason for me to use anything else?\ngithub.com/spider-rs/spid…\n](https://twitter.com/WilliamEspegren/status/1789419820821184764)\n[\n![gasa's avatar](/img/external/gaza_twitter.webp)\n@gasa\n@gasathenaper\nis the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. 
Spider does it in an instant\n](https://x.com/gasathenaper/status/1810612492596383948)\n[\n![Ashpreet Bedi's avatar](/img/external/ashpreet_bedi.webp)\n@Ashpreet Bedi\n@ashpreetbedi\nis THE best crawler out there, give it a try\n](https://x.com/ashpreetbedi/status/1815512219003572315?s=46&t=37F5QP_8oKqOsNpHSo6VVw)\n[\n![Troyusrex's avatar](/img/external/troy_twitter.webp)\n@Troyusrex\nI found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week.\n](https://medium.com/@troyusrex/inside-my-virtual-college-advisor-a-deep-dive-into-rag-ai-and-agent-technology-84731b2928f7#1326)\n[\n![Dify.AI's avatar](/img/external/difyai.webp)\n@Dify.AI\n🕷\u{fe0f}Spider @spider\\_rust\ncan be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context.\n](https://x.com/dify_ai/status/1818226971056243089)\n## FAQ\nFrequently asked questions about Spider.\n### What is Spider?\nSpider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.\n### Why is my website not crawling?\nYour crawl may fail if it requires JavaScript rendering. Try setting your request to 'chrome 'to solve this issue.\n### Can you crawl all pages?\nYes, Spider accurately crawls all necessary content without needing a sitemap.\n### What formats can Spider convert web data into?\nSpider outputs HTML, raw, text, and various markdown formats. It supports`JSON`,`JSONL`,`CSV`, and`XML`for API responses.\n### Is Spider suitable for large scraping projects?\nAbsolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management.\n### How can I try Spider?\nPurchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities.\n### Does it respect robots.txt?\nYes, compliance with robots.txt is default, but you can disable this if necessary.\n### Unable to get dynamic content?\nIf you are having trouble getting dynamic pages, try setting the request parameter to ""chrome ""or ""smart.""You may also need to set `disable\\_intercept` to allow third-party or external scripts to run.\n### Why is my crawl going slow?\nIf you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. 
The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds.\n### Do you offer a Free Trial?\nYes, you can try out the service before being charged for free at[checkout](https://spider.cloud/credits/new?free-trial=1).\n## Comprehensive Data Curation for Everyone\nTrusted by leading tech businesses worldwide to deliver accurate and insightful data solutions.\nOuter Labs\n[Zapier LogoZapier](https://zapier.com/apps/spider/integrations)\nElementus Logo\nSuper AI Logo\nLayerX Logo\nSwiss Re\nWrite Sonic Logo\nAlioth Logo\n### Next generation data for AI, scale to millions\n[Start now](https://spider.cloud/credits/new)\n### Company\n* [About](https://spider.cloud/about)\n* [Privacy](https://spider.cloud/privacy)\n* [Terms](https://spider.cloud/eula)\n* [FAQ](https://spider.cloud/faq)\n### Resources\n* [API](https://spider.cloud/docs/api)\n* [Docs](https://spider.cloud/docs/overview)\n* [Guides](https://spider.cloud/guides)\n* [Spider.rs Docs](https://docs.rs/spider/latest/spider/)\n### Services\n* [Pricing](https://spider.cloud/credits/new)\n* [Web Crawling and Scraping](https://spider.cloud/web-crawling-and-scraping)\n[All systems normal.](https://spidercloud.statuspage.io/)\n[\nGithub LogoGitHub\n](https://github.com/spider-rs/spider)[\nDiscord LogoDiscord\n](https://discord.spider.cloud)[\nTwitter LogoTwitter\n](https://twitter.com/spider_rust)"#); + assert!( + result + == r#"To help you get started with Spider, we’ll give you $200 in credits when you spend $100.[Terms apply](https://spider.cloud/promotion-spider-credits)\n# The Web Crawler for AI Agents and LLMs\nSpider offers the finest data collecting solution. Engineered for speed and scalability, it\nallows you to elevate your AI projects.\n[Get Started](https://spider.cloud/credits/new)View Preview\n* Basic\n* Streaming\nExample request\nPython\nJSONL\nCopy\n```\n`import requests, os, json\nheaders = {\n''Authorization '': f ''Bearer {os.getenv(""SPIDER\\_API\\_KEY "")}'',\n''Content-Type '': ''application/jsonl '',\n}\njson\\_data = {""limit "":50,""metadata "":True,""url "":""https://spider.cloud ""}\nresponse = requests.post(''https://api.spider.cloud/crawl '', headers=headers, json=json\\_data, stream=True)\nwith response as r:\nr.raise\\_for\\_status()\nfor chunk in r.iter\\_lines(\nchunk\\_size=None, decode\\_unicode=True\n):\ndata = json.loads(chunk)\nprint(data)`\n```\n[Free Trial](https://spider.cloud/credits/new?free-trial=1)\nExample Response\n## Built with the need for**Speed**\nExperience the power of**Spider**, built fully in**Rust**for\nnext-generation scalability.\n### 2.4secs\nTo crawl over 20,000 pages\n### 500-1000x\nFaster than alternatives\n### 500x\nCheaper than traditional scraping services\nBenchmarks displaying performance between Spider API request modes.\nSpider API Request Modes ·Benchmarked tailwindcss.com ·06/16/2024\n[See framework benchmarks](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md)\n### Seamless Integrations\nSeamlessly integrate Spider with a wide range of platforms, ensuring data curation\nperfectly aligned with your requirements. 
Compatible with all major AI tools.\n[LangChain integration](https://python.langchain.com/docs/integrations/document_loaders/spider)[LlamaIndex integrationLlama Index Logo](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-spider-reader)[CrewAI integrationCrewAI Logo](https://docs.crewai.com/tools/SpiderTool/)[FlowWiseAI integrationFlowiseAI LogoFlowiseAI](https://docs.flowiseai.com/integrations/langchain/document-loaders/spider-web-scraper-crawler)[Composio integrationComposio Logo](https://docs.composio.dev/introduction/foundations/components/list_local_tools#spider-crawler)[PhiData integrationPhiData Logo](https://docs.phidata.com/tools/spider)\n### Concurrent Streaming\nSave time and money without having to worry about bandwidth concerns by effectively\nstreaming all the results concurrently. The latency cost that is saved becomes drastic as\nyou crawl more websites.\n### Warp Speed\nPowered by the cutting-edge[Spider](https://github.com/spider-rs/spider)open-source project, our robust Rust engine scales effortlessly to handle extreme\nworkloads. We ensure continuous maintenance and improvement for top-tier performance.\n## Kickstart Your Data Collecting Projects Today\nJumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping.\n### Performance Tuned\nSpider is written in Rust and runs in full concurrency to achieve crawling thousands of\npages in secs.\n### Multiple response formats\nGet clean and formatted markdown, HTML, or text content for fine-tuning or training AI\nmodels.\n### Caching\nFurther boost speed by caching repeated web page crawls to minimize expenses while\nbuilding.\n### Smart Mode\nSpider dynamically switches to Headless Chrome when it needs to quick.\nBeta\n### Scrape with AI\nDo custom browser scripting and data extraction using the latest AI models with no cost\nstep caching.\n### The crawler for LLMs\nDon't let crawling and scraping be the highest latency in your LLM & AI agent stack.\n### Scrape with no headaches\n* Auto Proxy rotations\n* Agent headers\n* Anti-bot detections\n* Headless chrome\n* Markdown responses\n### The Fastest Web Crawler\n* Powered by[spider-rs](https://github.com/spider-rs/spider)\n* 100,000 pages/seconds\n* Unlimited concurrency\n* Simple API\n* 50,000 RPM\n### Do more with AI\n* Browser scripting\n* Advanced extraction\n* Data pipelines\n* Ideal for LLMs and AI Agents\n* Accurate labeling\n## Achieve more with these new API features\nOur API is set to stream so you can act in realtime.\n![A user interface with a search bar containing the text "Latest sports news," a green "Submit" button, and two icon buttons to display searching and extracting with the service.](/img/search_feature.webp)\n### Search\nGet access to search engine results from anywhere and easily crawl and transform pages to\nLLM-ready markdown.\n[Explore SearchRight Arrow](https://spider.cloud/docs/api#search)\n![A user interface segment showing three icons representing different stages of data transformation.](/img/transform_feature_example.webp)\n### Transform\nConvert raw HTML into markdown easily by using this API. 
Transform thousands of html pages\nin seconds.\n[Explore TransformRight Arrow](https://spider.cloud/docs/api#transform)\n## Join the community\nBacked by a network of early advocates, contributors, and supporters.\n[GitHub discussions\nChat Icon\n](https://github.com/orgs/spider-rs/discussions)[Discord\nChat Icon\n](https://discord.spider.cloud)\n[\n![iammerrick's avatar](/img/external/iammerrick_twitter.webp)\n@iammerrick\nRust based crawler Spider is next level for crawling &scraping sites. So fast.\nTheir cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider\n](https://twitter.com/iammerrick/status/1787873425446572462)\n[\n![WilliamEspegren's avatar](/img/external/william_twitter.webp)\n@WilliamEspegren\nWeb crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that’s wayyyy cheaper than any competitor\nName a reason for me to use anything else?\ngithub.com/spider-rs/spid…\n](https://twitter.com/WilliamEspegren/status/1789419820821184764)\n[\n![gasa's avatar](/img/external/gaza_twitter.webp)\n@gasa\n@gasathenaper\nis the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. Spider does it in an instant\n](https://x.com/gasathenaper/status/1810612492596383948)\n[\n![Ashpreet Bedi's avatar](/img/external/ashpreet_bedi.webp)\n@Ashpreet Bedi\n@ashpreetbedi\nis THE best crawler out there, give it a try\n](https://x.com/ashpreetbedi/status/1815512219003572315?s=46&t=37F5QP_8oKqOsNpHSo6VVw)\n[\n![Troyusrex's avatar](/img/external/troy_twitter.webp)\n@Troyusrex\nI found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week.\n](https://medium.com/@troyusrex/inside-my-virtual-college-advisor-a-deep-dive-into-rag-ai-and-agent-technology-84731b2928f7#1326)\n[\n![Dify.AI's avatar](/img/external/difyai.webp)\n@Dify.AI\n🕷\u{fe0f}Spider @spider\\_rust\ncan be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context.\n](https://x.com/dify_ai/status/1818226971056243089)\n## FAQ\nFrequently asked questions about Spider.\n### What is Spider?\nSpider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.\n### Why is my website not crawling?\nYour crawl may fail if it requires JavaScript rendering. Try setting your request to 'chrome 'to solve this issue.\n### Can you crawl all pages?\nYes, Spider accurately crawls all necessary content without needing a sitemap.\n### What formats can Spider convert web data into?\nSpider outputs HTML, raw, text, and various markdown formats. 
It supports`JSON`,`JSONL`,`CSV`, and`XML`for API responses.\n### Is Spider suitable for large scraping projects?\nAbsolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management.\n### How can I try Spider?\nPurchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities.\n### Does it respect robots.txt?\nYes, compliance with robots.txt is default, but you can disable this if necessary.\n### Unable to get dynamic content?\nIf you are having trouble getting dynamic pages, try setting the request parameter to ""chrome ""or ""smart.""You may also need to set `disable\\_intercept` to allow third-party or external scripts to run.\n### Why is my crawl going slow?\nIf you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds.\n### Do you offer a Free Trial?\nYes, you can try out the service before being charged for free at[checkout](https://spider.cloud/credits/new?free-trial=1).\n## Comprehensive Data Curation for Everyone\nTrusted by leading tech businesses worldwide to deliver accurate and insightful data solutions.\nOuter Labs\n[Zapier LogoZapier](https://zapier.com/apps/spider/integrations)\nElementus Logo\nSuper AI Logo\nLayerX Logo\nSwiss Re\nWrite Sonic Logo\nAlioth Logo\n### Next generation data for AI, scale to millions\n[Start now](https://spider.cloud/credits/new)\n### Company\n* [About](https://spider.cloud/about)\n* [Privacy](https://spider.cloud/privacy)\n* [Terms](https://spider.cloud/eula)\n* [FAQ](https://spider.cloud/faq)\n### Resources\n* [API](https://spider.cloud/docs/api)\n* [Docs](https://spider.cloud/docs/overview)\n* [Guides](https://spider.cloud/guides)\n* [Spider.rs Docs](https://docs.rs/spider/latest/spider/)\n### Services\n* [Pricing](https://spider.cloud/credits/new)\n* [Web Crawling and Scraping](https://spider.cloud/web-crawling-and-scraping)\n[All systems normal.](https://spidercloud.statuspage.io/)\n[\nGithub LogoGitHub\n](https://github.com/spider-rs/spider)[\nDiscord LogoDiscord\n](https://discord.spider.cloud)[\nTwitter LogoTwitter\n](https://twitter.com/spider_rust)"# + ); } #[test] @@ -182,7 +185,8 @@ fn test_html_from_text() { .read_to_string(&mut html) .expect("File must be readable"); - let mut tag_factory: HashMap> = HashMap::new(); + let mut tag_factory: HashMap> = + HashMap::new(); let tag = Box::new(html2md::scraper::ignore::IgnoreTagFactory {}); tag_factory.insert(String::from("script"), tag.clone());
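
Usage note (reviewer addendum, not part of the patch): a minimal sketch of the two entry points this change keeps public, assuming the crate is imported as `html2md` as in `tests/integration.rs`; the `<article>` snippet is an illustrative input, not taken from the test fixtures.

```rust
// Hypothetical smoke example of the two conversion paths.
fn main() {
    let html = "<article><h1>Title</h1><p>Some <em>emphasized</em> text.</p></article>";

    // DOM-walking scraper path: html5ever parse + recursive `walk`,
    // re-exported from the `scraper` module by this patch.
    let parsed = html2md::parse_html(html, false);

    // Streaming rewriter path: lol_html-based, the planned default for v0.1.
    let rewritten = html2md::rewrite_html(html, false);

    println!("{parsed}\n---\n{rewritten}");
}
```

Both calls take the source HTML and a `commonmark` flag and return the converted Markdown as a `String`; the rewriter path avoids building an rcdom tree, which is where the performance work referenced in the `rewrite_html` doc comment comes from.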