From 95063e39c9ee628ac13abb19f8817c015bd69b5d Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Sun, 24 Nov 2024 17:46:41 -0500
Subject: [PATCH] chore(lib): cleanup entry scraper and rewriter

---
 fast_html2md/src/lib.rs             | 418 +---------------------------
 fast_html2md/src/rewriter/handle.rs | 151 ++++++++++
 fast_html2md/src/rewriter/mod.rs    |   1 +
 fast_html2md/src/rewriter/writer.rs | 157 +----------
 fast_html2md/src/scraper/mod.rs     | 406 ++++++++++++++++++++++++++-
 fast_html2md/tests/integration.rs   |   8 +-
 6 files changed, 578 insertions(+), 563 deletions(-)
 create mode 100644 fast_html2md/src/rewriter/handle.rs

diff --git a/fast_html2md/src/lib.rs b/fast_html2md/src/lib.rs
index 2dbf4de..ed6812b 100644
--- a/fast_html2md/src/lib.rs
+++ b/fast_html2md/src/lib.rs
@@ -1,60 +1,22 @@
 use extended::sifter::WhitespaceSifterBytes;
-use html5ever::driver::ParseOpts;
-use html5ever::parse_document;
-use html5ever::tendril::TendrilSink;
 use lazy_static::lazy_static;
+pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
 use regex::Regex;
-use std::boxed::Box;
-use std::collections::HashMap;
 use std::collections::HashSet;
-use std::sync::Arc;
 use url::Url;
-
-pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
-
 // we want to just use the rewriter instead for v0.1.
 pub mod extended;
 pub mod rewriter;
 pub mod scraper;
-
 use extended::sifter::WhitespaceSifter;
-pub(crate) use scraper::anchors;
-pub(crate) use scraper::codes;
 pub use scraper::ignore;
-// pub(crate) use scraper::common;
-pub(crate) use scraper::containers;
-pub(crate) use scraper::dummy;
-pub(crate) use scraper::headers;
-pub(crate) use scraper::iframes;
-pub(crate) use scraper::images;
-pub(crate) use scraper::lists;
-pub(crate) use scraper::paragraphs;
-pub(crate) use scraper::quotes;
-pub(crate) use scraper::styles;
-pub(crate) use scraper::tables;
-pub(crate) use scraper::utils;
-
-use anchors::AnchorHandler;
-use codes::CodeHandler;
-use containers::ContainerHandler;
-use dummy::DummyHandler;
-use dummy::HtmlCherryPickHandler;
-use dummy::IdentityHandler;
-use headers::HeaderHandler;
-use iframes::IframeHandler;
-use images::ImgHandler;
-use lists::ListHandler;
-use lists::ListItemHandler;
-use paragraphs::ParagraphHandler;
-use quotes::QuoteHandler;
-use styles::StyleHandler;
-use tables::TableHandler;
+pub use scraper::{
+    parse_html, parse_html_custom, parse_html_custom_base, parse_html_custom_with_url,
+    parse_html_extended,
+};
 
 lazy_static! {
-    static ref EXCESSIVE_WHITESPACE_PATTERN: Regex = Regex::new("\\s{2,}").expect("valid regex pattern"); // for HTML on-the-fly cleanup
-    static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern"); // for Markdown escaping
-    static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern"); // for Markdown escaping
     static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); // for Markdown escaping
     static ref MARKDOWN_MIDDLE_KEYCHARS_SET: regex::RegexSet = regex::RegexSet::new(&[
         r"[<>*\\_~]", // Matches any single markdown character
@@ -62,89 +24,6 @@ lazy_static! {
     ]).expect("valid regex set");
 }
 
-/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
-/// in order to register custom tag hadler for tags you want.
-///
-/// You can also override standard tag handlers this way
-/// # Arguments
-/// `html` is source HTML as `String`
-/// `custom` is custom tag hadler producers for tags you want, can be empty
-/// `commonmark` is for adjusting markdown output to commonmark
-pub fn parse_html_custom_base(
-    html: &str,
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    commonmark: bool,
-    url: &Option<Url>,
-) -> String {
-    let document_parser = parse_document(RcDom::default(), ParseOpts::default());
-
-    match document_parser.from_utf8().read_from(&mut html.as_bytes()) {
-        Ok(dom) => {
-            let mut result = Box::new(StructuredPrinter::default());
-
-            walk(
-                &dom.document,
-                &mut result,
-                custom,
-                commonmark,
-                &if let Some(u) = url {
-                    Some(Arc::new(u.clone()))
-                } else {
-                    None
-                },
-                false,
-            );
-
-            // we want to eventually remove the clean step.
-            clean_markdown(&result.data)
-        }
-        _ => Default::default(),
-    }
-}
-
-/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
-/// in order to register custom tag hadler for tags you want.
-///
-/// You can also override standard tag handlers this way
-/// # Arguments
-/// `html` is source HTML as `String`
-/// `custom` is custom tag hadler producers for tags you want, can be empty
-/// `commonmark` is for adjusting markdown output to commonmark
-pub fn parse_html_custom(
-    html: &str,
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    commonmark: bool,
-) -> String {
-    parse_html_custom_base(html, custom, commonmark, &None)
-}
-
-/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
-/// in order to register custom tag hadler for tags you want.
-///
-/// You can also override standard tag handlers this way
-/// # Arguments
-/// `html` is source HTML as `String`
-/// `custom` is custom tag hadler producers for tags you want, can be empty
-/// `commonmark` is for adjusting markdown output to commonmark
-/// `url` is used to provide absolute url handling
-pub fn parse_html_custom_with_url(
-    html: &str,
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    commonmark: bool,
-    url: &Option<Url>,
-) -> String {
-    parse_html_custom_base(html, custom, commonmark, &url)
-}
-
-/// Main function of this library. Parses incoming HTML, converts it into Markdown
-/// and returns converted string.
-/// # Arguments
-/// `html` is source HTML as `String`
-/// `commonmark` to change the markdown flavor to commonmark as `boolean`
-pub fn parse_html(html: &str, commonmark: bool) -> String {
-    parse_html_custom(html, &HashMap::default(), commonmark)
-}
-
 /// Main function of this library to come. Rewrites incoming HTML, converts it into Markdown
 /// and returns converted string. Incomplete work in progress for major performance increases.
 /// # Arguments
@@ -170,237 +49,6 @@ pub fn rewrite_html_custom_with_url(
     html: &str,
     custom: &Option<HashSet<String>>,
     commonmark: bool,
     url: &Option<Url>,
 ) -> String {
     rewriter::writer::convert_html_to_markdown(html, &custom, commonmark, url).unwrap_or_default()
 }
 
-/// Same as `parse_html` but retains all "span" html elements intact
-/// Markdown parsers usually strip them down when rendering but they
-/// may be useful for later processing.
-pub fn parse_html_extended(html: &str, commonmark: bool) -> String {
-    struct SpanAsIsTagFactory;
-
-    impl TagHandlerFactory for SpanAsIsTagFactory {
-        fn instantiate(&self) -> Box<dyn TagHandler> {
-            Box::new(HtmlCherryPickHandler::default())
-        }
-    }
-
-    let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
-    tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory {}));
-    parse_html_custom(html, &tag_factory, commonmark)
-}
-
-/// Recursively walk through all DOM tree and handle all elements according to
-/// HTML tag -> Markdown syntax mapping. Text content is trimmed to one whitespace according to HTML5 rules.
-///
-/// # Arguments
-/// `input` is DOM tree or its subtree
-/// `result` is output holder with position and context tracking
-/// `custom` is custom tag hadler producers for tags you want, can be empty
-fn walk(
-    input: &Handle,
-    result: &mut StructuredPrinter,
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    commonmark: bool,
-    url: &Option<Arc<Url>>,
-    ignore_parents: bool,
-) {
-    let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
-    let mut tag_name = String::default();
-
-    let mut inside_pre = false;
-    let mut inside_code = false;
-    let mut ignore_write = false;
-    let mut inside_table = false;
-
-    let find_parent_tags = matches!(
-        &input.data,
-        NodeData::Element { .. } | NodeData::Text { .. }
-    );
-
-    if find_parent_tags || ignore_parents {
-        for tag in result.parent_chain.iter() {
-            if ignore_parents && tag == "table" {
-                inside_table = true;
-                break;
-            }
-            if tag == "code" {
-                inside_code = true;
-                break;
-            }
-            if tag == "pre" {
-                inside_pre = true;
-                break;
-            }
-            if tag_name == "script" || tag_name == "style" {
-                ignore_write = true;
-                break;
-            }
-        }
-    }
-
-    match input.data {
-        NodeData::Document
-        | NodeData::Comment { .. }
-        | NodeData::Doctype { .. }
-        | NodeData::ProcessingInstruction { .. } => (),
-        NodeData::Text { ref contents } => {
-            let mut text = contents.borrow().to_string();
-
-            if inside_pre {
-                // this is preformatted text, insert as-is
-                result.append_str(&text);
-            } else if !(text.trim().is_empty()
-                && (result.data.ends_with('\n') || result.data.ends_with(' ')))
-                && !ignore_write
-            {
-                if !inside_code {
-                    text = escape_markdown(result, &text);
-                }
-
-                let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
-
-                result.append_str(minified_text.trim());
-            } else {
-                result.append_str(text.trim());
-            }
-        }
-        NodeData::Element { ref name, .. } => {
-            if !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local) {
-                tag_name = name.local.to_string();
-
-                // do not parse scripts or style tags
-                if tag_name == "script" || tag_name == "style" {
-                    return;
-                }
-
-                if ignore_parents && tag_name == "table" {
-                    inside_table = true;
-                }
-
-                handler = if inside_pre {
-                    // don't add any html tags inside the pre section
-                    Box::new(DummyHandler)
-                } else {
-                    get_handler(custom, &tag_name, commonmark, url)
-                }
-            }
-        }
-    }
-
-    if !inside_table || ignore_parents && inside_table {
-        // handle this tag, while it's not in parent chain
-        // and doesn't have child siblings
-        handler.handle(input, result);
-    }
-
-    result.parent_chain.push(tag_name.clone()); // e.g. it was ["body"] and now it's ["body", "p"]
-
-    let current_depth = result.parent_chain.len(); // e.g. it was 1 and now it's 2
-
-    // create space for siblings of next level
-    result.siblings.insert(current_depth, vec![]);
-
-    if !handler.skip_descendants() {
-        for child in input.children.borrow().iter() {
-            if valid_block_element(&child.data) {
-                walk(&child, result, custom, commonmark, url, ignore_parents);
-
-                if let NodeData::Element { ref name, .. } = child.data {
-                    if let Some(el) = result.siblings.get_mut(&current_depth) {
-                        el.push(name.local.to_string());
-                    }
-                }
-            }
-        }
-    }
-
-    result.siblings.remove(&current_depth);
-    result.parent_chain.pop();
-
-    // finish handling of tag - parent chain now doesn't contain this tag itself again
-    handler.after_handle(result);
-}
-
-/// This conversion should only be applied to text tags
-///
-/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
-/// like list start or bold text style
-fn escape_markdown_base(result: &str, text: &str) -> String {
-    // always escape bold/italic/strikethrough
-    let data: std::borrow::Cow<str> = MARKDOWN_MIDDLE_KEYCHARS.replace_all(text, "\\$0");
-
-    // if we're at the start of the line we need to escape list- and quote-starting sequences
-    let data = if START_OF_LINE_PATTERN.is_match(&result) {
-        MARKDOWN_STARTONLY_KEYCHARS.replace(&data, "$1\\$2")
-    } else {
-        data
-    };
-
-    // no handling of more complicated cases such as
-    // ![] or []() ones, for now this will suffice
-    data.into()
-}
-
-/// Get the handler to use for the element.
-pub(crate) fn get_handler<T: std::borrow::Borrow<String> + std::hash::Hash + std::cmp::Eq>(
-    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
-    tag_name: &T,
-    commonmark: bool,
-    url: &Option<Arc<Url>>,
-) -> Box<dyn TagHandler> {
-    let name = tag_name.borrow();
-    match custom.get(name) {
-        Some(factory) => {
-            // have user-supplied factory, instantiate a handler for this tag
-            factory.instantiate()
-        }
-        _ => {
-            match name.as_ref() {
-                // containers
-                "div" | "section" | "header" | "footer" => Box::new(ContainerHandler),
-                // pagination, breaks
-                "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
-                "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
-                // spoiler tag
-                "details" | "summary" => Box::new(HtmlCherryPickHandler::new(commonmark)),
-                // formatting
-                "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
-                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
-                "pre" | "code" => Box::new(CodeHandler::default()),
-                // images, links
-                "img" => Box::new(ImgHandler::new(commonmark, url)),
-                "a" => Box::new(AnchorHandler::new(url)),
-                // lists
-                "ol" | "ul" | "menu" => Box::new(ListHandler),
-                "li" => Box::new(ListItemHandler::default()),
-                // as-is
-                "sub" | "sup" => Box::new(IdentityHandler::new(commonmark)),
-                // tables, handled fully internally as markdown can't have nested content in tables
-                // supports only single tables as of now
-                "table" => Box::new(TableHandler::new(commonmark, url.clone())),
-                "iframe" => Box::new(IframeHandler),
-                _ => Box::new(DummyHandler),
-            }
-        }
-    }
-}
-
-/// A valid HTML block element.
-pub(crate) fn valid_block_element(node: &NodeData) -> bool {
-    match node {
-        NodeData::Element { ref name, .. } => {
-            !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local)
-        }
-        _ => true,
-    }
-}
-
-/// This conversion should only be applied to text tags
-///
-/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
-/// like list start or bold text style
-fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
-    escape_markdown_base(&result.data, text)
-}
-
 /// Called after all processing has been finished
 ///
 /// Clears excessive punctuation that would be trimmed by renderer anyway
@@ -414,59 +62,3 @@ pub fn clean_markdown(input: &str) -> String {
     input.sift()
 }
 
 pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
     input.sift_bytes()
 }
-
-/// Intermediate result of HTML -> Markdown conversion.
-///
-/// Holds context in the form of parent tags and siblings chain
-/// and resulting string of markup content with current position.
-#[derive(Debug, Default)]
-pub struct StructuredPrinter {
-    /// Chain of parents leading to upmost tag
-    pub parent_chain: Vec<String>,
-    /// Siblings of currently processed tag in order where they're appearing in html
-    pub siblings: HashMap<usize, Vec<String>>,
-    /// resulting markdown document
-    pub data: String,
-}
-
-impl StructuredPrinter {
-    /// Inserts newline
-    pub fn insert_newline(&mut self) {
-        self.append_str("\n");
-    }
-
-    /// Append string to the end of the printer
-    pub fn append_str(&mut self, it: &str) {
-        self.data.push_str(it);
-    }
-
-    /// Insert string at specified position of printer, adjust position to the end of inserted string
-    pub fn insert_str(&mut self, pos: usize, it: &str) {
-        self.data.insert_str(pos, it);
-    }
-}
-
-/// Tag handler factory. This class is required in providing proper
-/// custom tag parsing capabilities to users of this library.
-///
-/// The problem with directly providing tag handlers is that they're not stateless.
-/// Once tag handler is parsing some tag, it holds data, such as start position, indent etc.
-/// The only way to create fresh tag handler for each tag is to provide a factory like this one.
-///
-pub trait TagHandlerFactory {
-    fn instantiate(&self) -> Box<dyn TagHandler>;
-}
-
-/// Trait interface describing abstract handler of arbitrary HTML tag.
-pub trait TagHandler {
-    /// Handle tag encountered when walking HTML tree.
-    /// This is executed before the children processing
-    fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter);
-
-    /// Executed after all children of this tag have been processed
-    fn after_handle(&mut self, printer: &mut StructuredPrinter);
-
-    fn skip_descendants(&self) -> bool {
-        false
-    }
-}
diff --git a/fast_html2md/src/rewriter/handle.rs b/fast_html2md/src/rewriter/handle.rs
new file mode 100644
index 0000000..09f6484
--- /dev/null
+++ b/fast_html2md/src/rewriter/handle.rs
@@ -0,0 +1,151 @@
+use super::anchors::rewrite_anchor_element;
+use super::iframes::handle_iframe;
+use super::images::rewrite_image_element;
+use super::lists::handle_list_or_item;
+use super::quotes::rewrite_blockquote_element;
+use super::styles::rewrite_style_element;
+use lol_html::html_content::ContentType::{Html, Text};
+use lol_html::html_content::Element;
+use lol_html::{doc_comments, doctype, text};
+use lol_html::{element, RewriteStrSettings};
+use std::cell::RefCell;
+use std::rc::Rc;
+use url::Url;
+
+/// Insert a new line after
+#[inline]
+pub fn insert_newline_after(element: &mut Element) {
+    element.after("\n", Text);
+}
+
+/// Insert a new line before
+#[inline]
+pub fn insert_newline_before(element: &mut Element) {
+    element.before("\n", Text);
+}
+
+/// Handle the lol_html tag.
+#[inline]
+pub fn handle_tag(
+    element: &mut Element,
+    commonmark: bool,
+    url: &Option<Url>,
+    list_type: Rc<RefCell<Option<String>>>,
+    order_counter: Rc<RefCell<usize>>,
+    quote_depth: Rc<RefCell<usize>>,
+    inside_table: Rc<RefCell<bool>>,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let element_name = element.tag_name();
+
+    let remove_attrs =
+        commonmark && (element_name.as_str() == "sub" || element_name.as_str() == "sup");
+
+    // commonmark: keep sub/sup tags but strip their attributes; otherwise drop the tag and keep its content.
+    if remove_attrs {
+        let attrs = element
+            .attributes()
+            .iter()
+            .map(|f| f.name())
+            .collect::<Vec<_>>();
+
+        for attr in attrs.iter() {
+            element.remove_attribute(&attr);
+        }
+    } else {
+        element.remove_and_keep_content();
+    }
+
+    // Add the markdown equivalents before the element.
+    match element_name.as_str() {
+        "h1" => {
+            element.before("# ", Text);
+            insert_newline_after(element);
+        }
+        "h2" => {
+            element.before("## ", Text);
+            insert_newline_after(element);
+        }
+        "h3" => {
+            element.before("### ", Text);
+            insert_newline_after(element);
+        }
+        "h4" => {
+            element.before("#### ", Text);
+            insert_newline_after(element);
+        }
+        "h5" => {
+            element.before("##### ", Text);
+            insert_newline_after(element);
+        }
+        "h6" => {
+            element.before("###### ", Text);
+            insert_newline_after(element);
+        }
+        "p" => {
+            insert_newline_before(element);
+            insert_newline_after(element);
+        }
+        "hr" => {
+            insert_newline_before(element);
+            element.append("---", Text);
+            insert_newline_after(element);
+        }
+        "br" => insert_newline_after(element),
+        "a" => {
+            let _ = rewrite_anchor_element(element, commonmark, url);
+        }
+        "img" => {
+            let _ = rewrite_image_element(element, commonmark, &url);
+        }
+        "table" => {
+            *inside_table.borrow_mut() = true;
+        }
+        "tr" => {
+            insert_newline_after(element);
+        }
+        "th" => {
+            if commonmark {
+                element.before("** ", Html);
+                element.after("** |", Html);
+            } else {
+                element.after("|", Html);
+            }
+
+            // add the first table row start
+            if *inside_table.borrow() {
+                element.before("|", Html);
+                *inside_table.borrow_mut() = false;
+            }
+        }
+        "td" => {
+            element.after("|", Html);
+        }
+        "iframe" => {
+            let _ = handle_iframe(element);
+        }
+        "b" | "i" | "s" | "strong" | "em" | "del" => {
+            let _ = rewrite_style_element(element);
+        }
+        "ol" | "ul" | "menu" | "li" => {
+            let _ = handle_list_or_item(element, list_type.clone(), order_counter.clone());
+        }
+        "q" | "cite" | "blockquote" => {
+            let _ = rewrite_blockquote_element(element, quote_depth);
+        }
+        "div" | "section" | "header" | "footer" => {
+            insert_newline_before(element);
+            insert_newline_after(element);
+        }
+        "pre" => {
+            element.before("\n```\n", Html);
+            element.after("\n```\n", Html);
+        }
+        "code" | "samp" => {
+            element.before("`", Html);
+            element.after("`", Html);
+        }
+        _ => (),
+    }
+
+    Ok(())
+}
diff --git a/fast_html2md/src/rewriter/mod.rs b/fast_html2md/src/rewriter/mod.rs
index ab5a8e3..1a15ca5 100644
--- a/fast_html2md/src/rewriter/mod.rs
+++ b/fast_html2md/src/rewriter/mod.rs
@@ -1,5 +1,6 @@
 pub(crate) mod anchors;
 pub(crate) mod counter;
+pub(crate) mod handle;
 pub(crate) mod iframes;
 pub(crate) mod images;
 pub(crate) mod lists;
diff --git a/fast_html2md/src/rewriter/writer.rs b/fast_html2md/src/rewriter/writer.rs
index b6dbefc..9d3e379 100644
--- a/fast_html2md/src/rewriter/writer.rs
+++ b/fast_html2md/src/rewriter/writer.rs
@@ -1,11 +1,7 @@
-use super::anchors::rewrite_anchor_element;
-use super::iframes::handle_iframe;
-use super::images::rewrite_image_element;
-use super::lists::handle_list_or_item;
-use super::quotes::{rewrite_blockquote_element, rewrite_blockquote_text};
-use super::styles::rewrite_style_element;
+use super::handle::handle_tag;
+use super::quotes::rewrite_blockquote_text;
 use crate::clean_markdown_bytes;
-use lol_html::html_content::ContentType::{Html, Text};
+use lol_html::html_content::ContentType::Text;
 use lol_html::html_content::Element;
 use lol_html::{doc_comments, doctype, text};
 use lol_html::{element, RewriteStrSettings};
@@ -25,132 +21,6 @@ pub fn insert_newline_before(element: &mut Element) {
     element.before("\n", Text);
 }
 
-/// Handle the lol_html tag.
-#[inline]
-fn handle_tag(
-    element: &mut Element,
-    commonmark: bool,
-    url: &Option<Url>,
-    list_type: Rc<RefCell<Option<String>>>,
-    order_counter: Rc<RefCell<usize>>,
-    quote_depth: Rc<RefCell<usize>>,
-    inside_table: Rc<RefCell<bool>>,
-) -> Result<(), Box<dyn std::error::Error>> {
-    let element_name = element.tag_name();
-
-    let remove_attrs =
-        commonmark && (element_name.as_str() == "sub" || element_name.as_str() == "sup");
-
-    // check common mark includes.
-    if remove_attrs {
-        let attrs = element
-            .attributes()
-            .iter()
-            .map(|f| f.name())
-            .collect::<Vec<_>>();
-
-        for attr in attrs.iter() {
-            element.remove_attribute(&attr);
-        }
-    } else {
-        element.remove_and_keep_content();
-    }
-
-    // Add the markdown equivalents before the element.
-    match element_name.as_str() {
-        "h1" => {
-            element.before("# ", Text);
-            insert_newline_after(element);
-        }
-        "h2" => {
-            element.before("## ", Text);
-            insert_newline_after(element);
-        }
-        "h3" => {
-            element.before("### ", Text);
-            insert_newline_after(element);
-        }
-        "h4" => {
-            element.before("#### ", Text);
-            insert_newline_after(element);
-        }
-        "h5" => {
-            element.before("##### ", Text);
-            insert_newline_after(element);
-        }
-        "h6" => {
-            element.before("###### ", Text);
-            insert_newline_after(element);
-        }
-        "p" => {
-            insert_newline_before(element);
-            insert_newline_after(element);
-        }
-        "hr" => {
-            insert_newline_before(element);
-            element.append("---", Text);
-            insert_newline_after(element);
-        }
-        "br" => insert_newline_after(element),
-        "a" => {
-            let _ = rewrite_anchor_element(element, commonmark, url);
-        }
-        "img" => {
-            let _ = rewrite_image_element(element, commonmark, &url);
-        }
-        "table" => {
-            *inside_table.borrow_mut() = true;
-        }
-        "tr" => {
-            insert_newline_after(element);
-        }
-        "th" => {
-            if commonmark {
-                element.before("** ", Html);
-                element.after("** |", Html);
-            } else {
-                element.after("|", Html);
-            }
-
-            // add the first table row start
-            if *inside_table.borrow() {
-                element.before("|", Html);
-                *inside_table.borrow_mut() = false;
-            }
-        }
-        "td" => {
-            element.after("|", Html);
-        }
-        "iframe" => {
-            let _ = handle_iframe(element);
-        }
-        "b" | "i" | "s" | "strong" | "em" | "del" => {
-            let _ = rewrite_style_element(element);
-        }
-        "ol" | "ul" | "menu" | "li" => {
-            let _ = handle_list_or_item(element, list_type.clone(), order_counter.clone());
-        }
-        "q" | "cite" | "blockquote" => {
-            let _ = rewrite_blockquote_element(element, quote_depth);
-        }
-        "div" | "section" | "header" | "footer" => {
-            insert_newline_before(element);
-            insert_newline_after(element);
-        }
-        "pre" => {
-            element.before("\n```\n", Html);
-            element.after("\n```\n", Html);
-        }
-        "code" | "samp" => {
-            element.before("`", Html);
-            element.after("`", Html);
-        }
-        _ => (),
-    }
-
-    Ok(())
-}
-
 /// Replace the markdown chars cleanly.
 fn replace_markdown_chars(input: &str) -> String {
     use crate::MARKDOWN_MIDDLE_KEYCHARS_SET;
@@ -191,7 +61,7 @@ fn replace_markdown_chars(input: &str) -> String {
     output
 }
 
-/// Get the HTML rewriter settings to convert ot markdown.
+/// Get the HTML rewriter settings to convert to markdown.
 pub fn get_rewriter_settings(
     commonmark: bool,
     custom: &Option<HashSet<String>>,
@@ -201,7 +71,6 @@ pub fn get_rewriter_settings(
     let order_counter = Rc::new(RefCell::new(0));
     let quote_depth = Rc::new(RefCell::new(0));
     let quote_depth1 = quote_depth.clone();
-
     let inside_table = Rc::new(RefCell::new(false));
 
     let mut element_content_handlers =
@@ -251,14 +120,16 @@ pub fn get_rewriter_settings(
     }
 
     RewriteStrSettings {
-        document_content_handlers: vec![doc_comments!(|c| {
-            c.remove();
-            Ok(())
-        }),
-        doctype!(|c| {
-            c.remove();
-            Ok(())
-        })],
+        document_content_handlers: vec![
+            doc_comments!(|c| {
+                c.remove();
+                Ok(())
+            }),
+            doctype!(|c| {
+                c.remove();
+                Ok(())
+            }),
+        ],
         element_content_handlers,
         ..RewriteStrSettings::default()
     }
 }
diff --git a/fast_html2md/src/scraper/mod.rs b/fast_html2md/src/scraper/mod.rs
index 4e1380b..7b94c9b 100644
--- a/fast_html2md/src/scraper/mod.rs
+++ b/fast_html2md/src/scraper/mod.rs
@@ -13,9 +13,405 @@
 pub mod quotes;
 pub mod styles;
 pub mod tables;
 pub mod utils;
+use super::clean_markdown;
+use anchors::AnchorHandler;
+use codes::CodeHandler;
+use containers::ContainerHandler;
+use dummy::DummyHandler;
+use dummy::HtmlCherryPickHandler;
+use dummy::IdentityHandler;
+use headers::HeaderHandler;
+use html5ever::driver::ParseOpts;
+use html5ever::parse_document;
+use html5ever::tendril::TendrilSink;
+use iframes::IframeHandler;
+use images::ImgHandler;
+use lazy_static::lazy_static;
+use lists::ListHandler;
+use lists::ListItemHandler;
+use markup5ever_rcdom::{Handle, NodeData, RcDom};
+use paragraphs::ParagraphHandler;
+use quotes::QuoteHandler;
+use regex::Regex;
+use std::boxed::Box;
+use std::collections::HashMap;
+use std::sync::Arc;
+use styles::StyleHandler;
+use tables::TableHandler;
+use url::Url;
 
-use super::Handle;
-use super::StructuredPrinter;
-use super::TagHandler;
-use super::TagHandlerFactory;
-use super::{clean_markdown, walk};
+lazy_static! {
+    static ref EXCESSIVE_WHITESPACE_PATTERN: Regex = Regex::new("\\s{2,}").expect("valid regex pattern"); // for HTML on-the-fly cleanup
+    static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern"); // for Markdown escaping
+    static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern"); // for Markdown escaping
+}
+
+/// Custom variant of main function. Allows passing custom tag<->tag factory pairs
+/// in order to register custom tag handlers for tags you want.
+///
+/// You can also override standard tag handlers this way
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `custom` is custom tag handler producers for tags you want, can be empty
+/// `commonmark` is for adjusting markdown output to commonmark
+pub fn parse_html_custom_base(
+    html: &str,
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    commonmark: bool,
+    url: &Option<Url>,
+) -> String {
+    let document_parser = parse_document(RcDom::default(), ParseOpts::default());
+
+    match document_parser.from_utf8().read_from(&mut html.as_bytes()) {
+        Ok(dom) => {
+            let mut result = Box::new(StructuredPrinter::default());
+
+            walk(
+                &dom.document,
+                &mut result,
+                custom,
+                commonmark,
+                &if let Some(u) = url {
+                    Some(Arc::new(u.clone()))
+                } else {
+                    None
+                },
+                false,
+            );
+
+            // we want to eventually remove the clean step.
+            clean_markdown(&result.data)
+        }
+        _ => Default::default(),
+    }
+}
+
+/// Custom variant of main function. Allows passing custom tag<->tag factory pairs
+/// in order to register custom tag handlers for tags you want.
+///
+/// You can also override standard tag handlers this way
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `custom` is custom tag handler producers for tags you want, can be empty
+/// `commonmark` is for adjusting markdown output to commonmark
+pub fn parse_html_custom(
+    html: &str,
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    commonmark: bool,
+) -> String {
+    parse_html_custom_base(html, custom, commonmark, &None)
+}
+
+/// Custom variant of main function. Allows passing custom tag<->tag factory pairs
+/// in order to register custom tag handlers for tags you want.
+///
+/// You can also override standard tag handlers this way
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `custom` is custom tag handler producers for tags you want, can be empty
+/// `commonmark` is for adjusting markdown output to commonmark
+/// `url` is used to provide absolute url handling
+pub fn parse_html_custom_with_url(
+    html: &str,
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    commonmark: bool,
+    url: &Option<Url>,
+) -> String {
+    parse_html_custom_base(html, custom, commonmark, &url)
+}
+
+/// Main function of this library. Parses incoming HTML, converts it into Markdown
+/// and returns converted string.
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `commonmark` to change the markdown flavor to commonmark as `boolean`
+pub fn parse_html(html: &str, commonmark: bool) -> String {
+    parse_html_custom(html, &HashMap::default(), commonmark)
+}
+
+/// Same as `parse_html` but retains all "span" html elements intact
+/// Markdown parsers usually strip them down when rendering but they
+/// may be useful for later processing.
+pub fn parse_html_extended(html: &str, commonmark: bool) -> String {
+    struct SpanAsIsTagFactory;
+
+    impl TagHandlerFactory for SpanAsIsTagFactory {
+        fn instantiate(&self) -> Box<dyn TagHandler> {
+            Box::new(HtmlCherryPickHandler::default())
+        }
+    }
+
+    let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
+    tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory {}));
+    parse_html_custom(html, &tag_factory, commonmark)
+}
+
+/// This conversion should only be applied to text tags
+///
+/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
+/// like list start or bold text style
+fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
+    escape_markdown_base(&result.data, text)
+}
+
+/// Recursively walk through the DOM tree and handle all elements according to
+/// HTML tag -> Markdown syntax mapping. Text content is trimmed to one whitespace according to HTML5 rules.
+///
+/// # Arguments
+/// `input` is DOM tree or its subtree
+/// `result` is output holder with position and context tracking
+/// `custom` is custom tag handler producers for tags you want, can be empty
+pub fn walk(
+    input: &Handle,
+    result: &mut StructuredPrinter,
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    commonmark: bool,
+    url: &Option<Arc<Url>>,
+    ignore_parents: bool,
+) {
+    let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
+    let mut tag_name = String::default();
+
+    let mut inside_pre = false;
+    let mut inside_code = false;
+    let mut ignore_write = false;
+    let mut inside_table = false;
+
+    let find_parent_tags = matches!(
+        &input.data,
+        NodeData::Element { .. } | NodeData::Text { .. }
+    );
+
+    if find_parent_tags || ignore_parents {
+        for tag in result.parent_chain.iter() {
+            if ignore_parents && tag == "table" {
+                inside_table = true;
+                break;
+            }
+            if tag == "code" {
+                inside_code = true;
+                break;
+            }
+            if tag == "pre" {
+                inside_pre = true;
+                break;
+            }
+            if tag == "script" || tag == "style" {
+                ignore_write = true;
+                break;
+            }
+        }
+    }
+
+    match input.data {
+        NodeData::Document
+        | NodeData::Comment { .. }
+        | NodeData::Doctype { .. }
+        | NodeData::ProcessingInstruction { .. } => (),
+        NodeData::Text { ref contents } => {
+            let mut text = contents.borrow().to_string();
+
+            if inside_pre {
+                // this is preformatted text, insert as-is
+                result.append_str(&text);
+            } else if !(text.trim().is_empty()
+                && (result.data.ends_with('\n') || result.data.ends_with(' ')))
+                && !ignore_write
+            {
+                if !inside_code {
+                    text = escape_markdown(result, &text);
+                }
+
+                let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
+
+                result.append_str(minified_text.trim());
+            } else {
+                result.append_str(text.trim());
+            }
+        }
+        NodeData::Element { ref name, .. } => {
+            if !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local) {
+                tag_name = name.local.to_string();
+
+                // do not parse scripts or style tags
+                if tag_name == "script" || tag_name == "style" {
+                    return;
+                }
+
+                if ignore_parents && tag_name == "table" {
+                    inside_table = true;
+                }
+
+                handler = if inside_pre {
+                    // don't add any html tags inside the pre section
+                    Box::new(DummyHandler)
+                } else {
+                    get_handler(custom, &tag_name, commonmark, url)
+                }
+            }
+        }
+    }
+
+    if !inside_table || ignore_parents && inside_table {
+        // handle this tag, while it's not in parent chain
+        // and doesn't have child siblings
+        handler.handle(input, result);
+    }
+
+    result.parent_chain.push(tag_name.clone()); // e.g. it was ["body"] and now it's ["body", "p"]
+
+    let current_depth = result.parent_chain.len(); // e.g. it was 1 and now it's 2
+
+    // create space for siblings of next level
+    result.siblings.insert(current_depth, vec![]);
+
+    if !handler.skip_descendants() {
+        for child in input.children.borrow().iter() {
+            if valid_block_element(&child.data) {
+                walk(&child, result, custom, commonmark, url, ignore_parents);
+
+                if let NodeData::Element { ref name, .. } = child.data {
+                    if let Some(el) = result.siblings.get_mut(&current_depth) {
+                        el.push(name.local.to_string());
+                    }
+                }
+            }
+        }
+    }
+
+    result.siblings.remove(&current_depth);
+    result.parent_chain.pop();
+
+    // finish handling of tag - parent chain now doesn't contain this tag itself again
+    handler.after_handle(result);
+}
+
+/// This conversion should only be applied to text tags
+///
+/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
+/// like list start or bold text style
+fn escape_markdown_base(result: &str, text: &str) -> String {
+    // always escape bold/italic/strikethrough
+    let data: std::borrow::Cow<str> = crate::MARKDOWN_MIDDLE_KEYCHARS.replace_all(text, "\\$0");
+
+    // if we're at the start of the line we need to escape list- and quote-starting sequences
+    let data = if START_OF_LINE_PATTERN.is_match(&result) {
+        MARKDOWN_STARTONLY_KEYCHARS.replace(&data, "$1\\$2")
+    } else {
+        data
+    };
+
+    // no handling of more complicated cases such as
+    // ![] or []() ones, for now this will suffice
+    data.into()
+}
+
+/// Get the handler to use for the element.
+pub(crate) fn get_handler<T: std::borrow::Borrow<String> + std::hash::Hash + std::cmp::Eq>(
+    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
+    tag_name: &T,
+    commonmark: bool,
+    url: &Option<Arc<Url>>,
+) -> Box<dyn TagHandler> {
+    let name = tag_name.borrow();
+    match custom.get(name) {
+        Some(factory) => {
+            // have user-supplied factory, instantiate a handler for this tag
+            factory.instantiate()
+        }
+        _ => {
+            match name.as_ref() {
+                // containers
+                "div" | "section" | "header" | "footer" => Box::new(ContainerHandler),
+                // pagination, breaks
+                "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
+                "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
+                // spoiler tag
+                "details" | "summary" => Box::new(HtmlCherryPickHandler::new(commonmark)),
+                // formatting
+                "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
+                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
+                "pre" | "code" => Box::new(CodeHandler::default()),
+                // images, links
+                "img" => Box::new(ImgHandler::new(commonmark, url)),
+                "a" => Box::new(AnchorHandler::new(url)),
+                // lists
+                "ol" | "ul" | "menu" => Box::new(ListHandler),
+                "li" => Box::new(ListItemHandler::default()),
+                // as-is
+                "sub" | "sup" => Box::new(IdentityHandler::new(commonmark)),
+                // tables, handled fully internally as markdown can't have nested content in tables
+                // supports only single tables as of now
+                "table" => Box::new(TableHandler::new(commonmark, url.clone())),
+                "iframe" => Box::new(IframeHandler),
+                _ => Box::new(DummyHandler),
+            }
+        }
+    }
+}
+
+/// A valid HTML block element.
+pub(crate) fn valid_block_element(node: &NodeData) -> bool {
+    match node {
+        NodeData::Element { ref name, .. } => {
+            !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local)
+        }
+        _ => true,
+    }
+}
+
+/// Intermediate result of HTML -> Markdown conversion.
+///
+/// Holds context in the form of parent tags and siblings chain
+/// and resulting string of markup content with current position.
+#[derive(Debug, Default)]
+pub struct StructuredPrinter {
+    /// Chain of parents leading to upmost tag
+    pub parent_chain: Vec<String>,
+    /// Siblings of currently processed tag in order where they're appearing in html
+    pub siblings: HashMap<usize, Vec<String>>,
+    /// resulting markdown document
+    pub data: String,
+}
+
+impl StructuredPrinter {
+    /// Inserts newline
+    pub fn insert_newline(&mut self) {
+        self.append_str("\n");
+    }
+
+    /// Append string to the end of the printer
+    pub fn append_str(&mut self, it: &str) {
+        self.data.push_str(it);
+    }
+
+    /// Insert string at specified position of printer, adjust position to the end of inserted string
+    pub fn insert_str(&mut self, pos: usize, it: &str) {
+        self.data.insert_str(pos, it);
+    }
+}
+
+/// Tag handler factory. This trait is required to provide proper
+/// custom tag parsing capabilities to users of this library.
+///
+/// The problem with directly providing tag handlers is that they're not stateless.
+/// Once tag handler is parsing some tag, it holds data, such as start position, indent etc.
+/// The only way to create fresh tag handler for each tag is to provide a factory like this one.
+///
+pub trait TagHandlerFactory {
+    fn instantiate(&self) -> Box<dyn TagHandler>;
+}
+
+/// Trait interface describing an abstract handler of an arbitrary HTML tag.
+pub trait TagHandler {
+    /// Handle tag encountered when walking HTML tree.
+ /// This is executed before the children processing + fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter); + + /// Executed after all children of this tag have been processed + fn after_handle(&mut self, printer: &mut StructuredPrinter); + + fn skip_descendants(&self) -> bool { + false + } +} diff --git a/fast_html2md/tests/integration.rs b/fast_html2md/tests/integration.rs index c41d161..75d31c2 100644 --- a/fast_html2md/tests/integration.rs +++ b/fast_html2md/tests/integration.rs @@ -83,7 +83,10 @@ fn test_real_spider() { .read_to_string(&mut html) .expect("File must be readable"); let result = rewrite_html(&html, false); - assert!(result == r#"To help you get started with Spider, we’ll give you $200 in credits when you spend $100.[Terms apply](https://spider.cloud/promotion-spider-credits)\n# The Web Crawler for AI Agents and LLMs\nSpider offers the finest data collecting solution. Engineered for speed and scalability, it\nallows you to elevate your AI projects.\n[Get Started](https://spider.cloud/credits/new)View Preview\n* Basic\n* Streaming\nExample request\nPython\nJSONL\nCopy\n```\n`import requests, os, json\nheaders = {\n''Authorization '': f ''Bearer {os.getenv(""SPIDER\\_API\\_KEY "")}'',\n''Content-Type '': ''application/jsonl '',\n}\njson\\_data = {""limit "":50,""metadata "":True,""url "":""https://spider.cloud ""}\nresponse = requests.post(''https://api.spider.cloud/crawl '', headers=headers, json=json\\_data, stream=True)\nwith response as r:\nr.raise\\_for\\_status()\nfor chunk in r.iter\\_lines(\nchunk\\_size=None, decode\\_unicode=True\n):\ndata = json.loads(chunk)\nprint(data)`\n```\n[Free Trial](https://spider.cloud/credits/new?free-trial=1)\nExample Response\n## Built with the need for**Speed**\nExperience the power of**Spider**, built fully in**Rust**for\nnext-generation scalability.\n### 2.4secs\nTo crawl over 20,000 pages\n### 500-1000x\nFaster than alternatives\n### 500x\nCheaper than traditional scraping services\nBenchmarks displaying performance between Spider API request modes.\nSpider API Request Modes ·Benchmarked tailwindcss.com ·06/16/2024\n[See framework benchmarks](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md)\n### Seamless Integrations\nSeamlessly integrate Spider with a wide range of platforms, ensuring data curation\nperfectly aligned with your requirements. Compatible with all major AI tools.\n[LangChain integration](https://python.langchain.com/docs/integrations/document_loaders/spider)[LlamaIndex integrationLlama Index Logo](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-spider-reader)[CrewAI integrationCrewAI Logo](https://docs.crewai.com/tools/SpiderTool/)[FlowWiseAI integrationFlowiseAI LogoFlowiseAI](https://docs.flowiseai.com/integrations/langchain/document-loaders/spider-web-scraper-crawler)[Composio integrationComposio Logo](https://docs.composio.dev/introduction/foundations/components/list_local_tools#spider-crawler)[PhiData integrationPhiData Logo](https://docs.phidata.com/tools/spider)\n### Concurrent Streaming\nSave time and money without having to worry about bandwidth concerns by effectively\nstreaming all the results concurrently. The latency cost that is saved becomes drastic as\nyou crawl more websites.\n### Warp Speed\nPowered by the cutting-edge[Spider](https://github.com/spider-rs/spider)open-source project, our robust Rust engine scales effortlessly to handle extreme\nworkloads. 
We ensure continuous maintenance and improvement for top-tier performance.\n## Kickstart Your Data Collecting Projects Today\nJumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping.\n### Performance Tuned\nSpider is written in Rust and runs in full concurrency to achieve crawling thousands of\npages in secs.\n### Multiple response formats\nGet clean and formatted markdown, HTML, or text content for fine-tuning or training AI\nmodels.\n### Caching\nFurther boost speed by caching repeated web page crawls to minimize expenses while\nbuilding.\n### Smart Mode\nSpider dynamically switches to Headless Chrome when it needs to quick.\nBeta\n### Scrape with AI\nDo custom browser scripting and data extraction using the latest AI models with no cost\nstep caching.\n### The crawler for LLMs\nDon't let crawling and scraping be the highest latency in your LLM & AI agent stack.\n### Scrape with no headaches\n* Auto Proxy rotations\n* Agent headers\n* Anti-bot detections\n* Headless chrome\n* Markdown responses\n### The Fastest Web Crawler\n* Powered by[spider-rs](https://github.com/spider-rs/spider)\n* 100,000 pages/seconds\n* Unlimited concurrency\n* Simple API\n* 50,000 RPM\n### Do more with AI\n* Browser scripting\n* Advanced extraction\n* Data pipelines\n* Ideal for LLMs and AI Agents\n* Accurate labeling\n## Achieve more with these new API features\nOur API is set to stream so you can act in realtime.\n![A user interface with a search bar containing the text "Latest sports news," a green "Submit" button, and two icon buttons to display searching and extracting with the service.](/img/search_feature.webp)\n### Search\nGet access to search engine results from anywhere and easily crawl and transform pages to\nLLM-ready markdown.\n[Explore SearchRight Arrow](https://spider.cloud/docs/api#search)\n![A user interface segment showing three icons representing different stages of data transformation.](/img/transform_feature_example.webp)\n### Transform\nConvert raw HTML into markdown easily by using this API. Transform thousands of html pages\nin seconds.\n[Explore TransformRight Arrow](https://spider.cloud/docs/api#transform)\n## Join the community\nBacked by a network of early advocates, contributors, and supporters.\n[GitHub discussions\nChat Icon\n](https://github.com/orgs/spider-rs/discussions)[Discord\nChat Icon\n](https://discord.spider.cloud)\n[\n![iammerrick's avatar](/img/external/iammerrick_twitter.webp)\n@iammerrick\nRust based crawler Spider is next level for crawling &scraping sites. So fast.\nTheir cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider\n](https://twitter.com/iammerrick/status/1787873425446572462)\n[\n![WilliamEspegren's avatar](/img/external/william_twitter.webp)\n@WilliamEspegren\nWeb crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that’s wayyyy cheaper than any competitor\nName a reason for me to use anything else?\ngithub.com/spider-rs/spid…\n](https://twitter.com/WilliamEspegren/status/1789419820821184764)\n[\n![gasa's avatar](/img/external/gaza_twitter.webp)\n@gasa\n@gasathenaper\nis the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. 
Spider does it in an instant\n](https://x.com/gasathenaper/status/1810612492596383948)\n[\n![Ashpreet Bedi's avatar](/img/external/ashpreet_bedi.webp)\n@Ashpreet Bedi\n@ashpreetbedi\nis THE best crawler out there, give it a try\n](https://x.com/ashpreetbedi/status/1815512219003572315?s=46&t=37F5QP_8oKqOsNpHSo6VVw)\n[\n![Troyusrex's avatar](/img/external/troy_twitter.webp)\n@Troyusrex\nI found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week.\n](https://medium.com/@troyusrex/inside-my-virtual-college-advisor-a-deep-dive-into-rag-ai-and-agent-technology-84731b2928f7#1326)\n[\n![Dify.AI's avatar](/img/external/difyai.webp)\n@Dify.AI\n🕷\u{fe0f}Spider @spider\\_rust\ncan be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context.\n](https://x.com/dify_ai/status/1818226971056243089)\n## FAQ\nFrequently asked questions about Spider.\n### What is Spider?\nSpider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.\n### Why is my website not crawling?\nYour crawl may fail if it requires JavaScript rendering. Try setting your request to 'chrome 'to solve this issue.\n### Can you crawl all pages?\nYes, Spider accurately crawls all necessary content without needing a sitemap.\n### What formats can Spider convert web data into?\nSpider outputs HTML, raw, text, and various markdown formats. It supports`JSON`,`JSONL`,`CSV`, and`XML`for API responses.\n### Is Spider suitable for large scraping projects?\nAbsolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management.\n### How can I try Spider?\nPurchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities.\n### Does it respect robots.txt?\nYes, compliance with robots.txt is default, but you can disable this if necessary.\n### Unable to get dynamic content?\nIf you are having trouble getting dynamic pages, try setting the request parameter to ""chrome ""or ""smart.""You may also need to set `disable\\_intercept` to allow third-party or external scripts to run.\n### Why is my crawl going slow?\nIf you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. 
The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds.\n### Do you offer a Free Trial?\nYes, you can try out the service before being charged for free at[checkout](https://spider.cloud/credits/new?free-trial=1).\n## Comprehensive Data Curation for Everyone\nTrusted by leading tech businesses worldwide to deliver accurate and insightful data solutions.\nOuter Labs\n[Zapier LogoZapier](https://zapier.com/apps/spider/integrations)\nElementus Logo\nSuper AI Logo\nLayerX Logo\nSwiss Re\nWrite Sonic Logo\nAlioth Logo\n### Next generation data for AI, scale to millions\n[Start now](https://spider.cloud/credits/new)\n### Company\n* [About](https://spider.cloud/about)\n* [Privacy](https://spider.cloud/privacy)\n* [Terms](https://spider.cloud/eula)\n* [FAQ](https://spider.cloud/faq)\n### Resources\n* [API](https://spider.cloud/docs/api)\n* [Docs](https://spider.cloud/docs/overview)\n* [Guides](https://spider.cloud/guides)\n* [Spider.rs Docs](https://docs.rs/spider/latest/spider/)\n### Services\n* [Pricing](https://spider.cloud/credits/new)\n* [Web Crawling and Scraping](https://spider.cloud/web-crawling-and-scraping)\n[All systems normal.](https://spidercloud.statuspage.io/)\n[\nGithub LogoGitHub\n](https://github.com/spider-rs/spider)[\nDiscord LogoDiscord\n](https://discord.spider.cloud)[\nTwitter LogoTwitter\n](https://twitter.com/spider_rust)"#); + assert!( + result + == r#"To help you get started with Spider, we’ll give you $200 in credits when you spend $100.[Terms apply](https://spider.cloud/promotion-spider-credits)\n# The Web Crawler for AI Agents and LLMs\nSpider offers the finest data collecting solution. Engineered for speed and scalability, it\nallows you to elevate your AI projects.\n[Get Started](https://spider.cloud/credits/new)View Preview\n* Basic\n* Streaming\nExample request\nPython\nJSONL\nCopy\n```\n`import requests, os, json\nheaders = {\n''Authorization '': f ''Bearer {os.getenv(""SPIDER\\_API\\_KEY "")}'',\n''Content-Type '': ''application/jsonl '',\n}\njson\\_data = {""limit "":50,""metadata "":True,""url "":""https://spider.cloud ""}\nresponse = requests.post(''https://api.spider.cloud/crawl '', headers=headers, json=json\\_data, stream=True)\nwith response as r:\nr.raise\\_for\\_status()\nfor chunk in r.iter\\_lines(\nchunk\\_size=None, decode\\_unicode=True\n):\ndata = json.loads(chunk)\nprint(data)`\n```\n[Free Trial](https://spider.cloud/credits/new?free-trial=1)\nExample Response\n## Built with the need for**Speed**\nExperience the power of**Spider**, built fully in**Rust**for\nnext-generation scalability.\n### 2.4secs\nTo crawl over 20,000 pages\n### 500-1000x\nFaster than alternatives\n### 500x\nCheaper than traditional scraping services\nBenchmarks displaying performance between Spider API request modes.\nSpider API Request Modes ·Benchmarked tailwindcss.com ·06/16/2024\n[See framework benchmarks](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md)\n### Seamless Integrations\nSeamlessly integrate Spider with a wide range of platforms, ensuring data curation\nperfectly aligned with your requirements. 
Compatible with all major AI tools.\n[LangChain integration](https://python.langchain.com/docs/integrations/document_loaders/spider)[LlamaIndex integrationLlama Index Logo](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-spider-reader)[CrewAI integrationCrewAI Logo](https://docs.crewai.com/tools/SpiderTool/)[FlowWiseAI integrationFlowiseAI LogoFlowiseAI](https://docs.flowiseai.com/integrations/langchain/document-loaders/spider-web-scraper-crawler)[Composio integrationComposio Logo](https://docs.composio.dev/introduction/foundations/components/list_local_tools#spider-crawler)[PhiData integrationPhiData Logo](https://docs.phidata.com/tools/spider)\n### Concurrent Streaming\nSave time and money without having to worry about bandwidth concerns by effectively\nstreaming all the results concurrently. The latency cost that is saved becomes drastic as\nyou crawl more websites.\n### Warp Speed\nPowered by the cutting-edge[Spider](https://github.com/spider-rs/spider)open-source project, our robust Rust engine scales effortlessly to handle extreme\nworkloads. We ensure continuous maintenance and improvement for top-tier performance.\n## Kickstart Your Data Collecting Projects Today\nJumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping.\n### Performance Tuned\nSpider is written in Rust and runs in full concurrency to achieve crawling thousands of\npages in secs.\n### Multiple response formats\nGet clean and formatted markdown, HTML, or text content for fine-tuning or training AI\nmodels.\n### Caching\nFurther boost speed by caching repeated web page crawls to minimize expenses while\nbuilding.\n### Smart Mode\nSpider dynamically switches to Headless Chrome when it needs to quick.\nBeta\n### Scrape with AI\nDo custom browser scripting and data extraction using the latest AI models with no cost\nstep caching.\n### The crawler for LLMs\nDon't let crawling and scraping be the highest latency in your LLM & AI agent stack.\n### Scrape with no headaches\n* Auto Proxy rotations\n* Agent headers\n* Anti-bot detections\n* Headless chrome\n* Markdown responses\n### The Fastest Web Crawler\n* Powered by[spider-rs](https://github.com/spider-rs/spider)\n* 100,000 pages/seconds\n* Unlimited concurrency\n* Simple API\n* 50,000 RPM\n### Do more with AI\n* Browser scripting\n* Advanced extraction\n* Data pipelines\n* Ideal for LLMs and AI Agents\n* Accurate labeling\n## Achieve more with these new API features\nOur API is set to stream so you can act in realtime.\n![A user interface with a search bar containing the text "Latest sports news," a green "Submit" button, and two icon buttons to display searching and extracting with the service.](/img/search_feature.webp)\n### Search\nGet access to search engine results from anywhere and easily crawl and transform pages to\nLLM-ready markdown.\n[Explore SearchRight Arrow](https://spider.cloud/docs/api#search)\n![A user interface segment showing three icons representing different stages of data transformation.](/img/transform_feature_example.webp)\n### Transform\nConvert raw HTML into markdown easily by using this API. 
Transform thousands of html pages\nin seconds.\n[Explore TransformRight Arrow](https://spider.cloud/docs/api#transform)\n## Join the community\nBacked by a network of early advocates, contributors, and supporters.\n[GitHub discussions\nChat Icon\n](https://github.com/orgs/spider-rs/discussions)[Discord\nChat Icon\n](https://discord.spider.cloud)\n[\n![iammerrick's avatar](/img/external/iammerrick_twitter.webp)\n@iammerrick\nRust based crawler Spider is next level for crawling &scraping sites. So fast.\nTheir cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider\n](https://twitter.com/iammerrick/status/1787873425446572462)\n[\n![WilliamEspegren's avatar](/img/external/william_twitter.webp)\n@WilliamEspegren\nWeb crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that’s wayyyy cheaper than any competitor\nName a reason for me to use anything else?\ngithub.com/spider-rs/spid…\n](https://twitter.com/WilliamEspegren/status/1789419820821184764)\n[\n![gasa's avatar](/img/external/gaza_twitter.webp)\n@gasa\n@gasathenaper\nis the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. Spider does it in an instant\n](https://x.com/gasathenaper/status/1810612492596383948)\n[\n![Ashpreet Bedi's avatar](/img/external/ashpreet_bedi.webp)\n@Ashpreet Bedi\n@ashpreetbedi\nis THE best crawler out there, give it a try\n](https://x.com/ashpreetbedi/status/1815512219003572315?s=46&t=37F5QP_8oKqOsNpHSo6VVw)\n[\n![Troyusrex's avatar](/img/external/troy_twitter.webp)\n@Troyusrex\nI found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week.\n](https://medium.com/@troyusrex/inside-my-virtual-college-advisor-a-deep-dive-into-rag-ai-and-agent-technology-84731b2928f7#1326)\n[\n![Dify.AI's avatar](/img/external/difyai.webp)\n@Dify.AI\n🕷\u{fe0f}Spider @spider\\_rust\ncan be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context.\n](https://x.com/dify_ai/status/1818226971056243089)\n## FAQ\nFrequently asked questions about Spider.\n### What is Spider?\nSpider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.\n### Why is my website not crawling?\nYour crawl may fail if it requires JavaScript rendering. Try setting your request to 'chrome 'to solve this issue.\n### Can you crawl all pages?\nYes, Spider accurately crawls all necessary content without needing a sitemap.\n### What formats can Spider convert web data into?\nSpider outputs HTML, raw, text, and various markdown formats. 
It supports`JSON`,`JSONL`,`CSV`, and`XML`for API responses.\n### Is Spider suitable for large scraping projects?\nAbsolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management.\n### How can I try Spider?\nPurchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities.\n### Does it respect robots.txt?\nYes, compliance with robots.txt is default, but you can disable this if necessary.\n### Unable to get dynamic content?\nIf you are having trouble getting dynamic pages, try setting the request parameter to ""chrome ""or ""smart.""You may also need to set `disable\\_intercept` to allow third-party or external scripts to run.\n### Why is my crawl going slow?\nIf you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds.\n### Do you offer a Free Trial?\nYes, you can try out the service before being charged for free at[checkout](https://spider.cloud/credits/new?free-trial=1).\n## Comprehensive Data Curation for Everyone\nTrusted by leading tech businesses worldwide to deliver accurate and insightful data solutions.\nOuter Labs\n[Zapier LogoZapier](https://zapier.com/apps/spider/integrations)\nElementus Logo\nSuper AI Logo\nLayerX Logo\nSwiss Re\nWrite Sonic Logo\nAlioth Logo\n### Next generation data for AI, scale to millions\n[Start now](https://spider.cloud/credits/new)\n### Company\n* [About](https://spider.cloud/about)\n* [Privacy](https://spider.cloud/privacy)\n* [Terms](https://spider.cloud/eula)\n* [FAQ](https://spider.cloud/faq)\n### Resources\n* [API](https://spider.cloud/docs/api)\n* [Docs](https://spider.cloud/docs/overview)\n* [Guides](https://spider.cloud/guides)\n* [Spider.rs Docs](https://docs.rs/spider/latest/spider/)\n### Services\n* [Pricing](https://spider.cloud/credits/new)\n* [Web Crawling and Scraping](https://spider.cloud/web-crawling-and-scraping)\n[All systems normal.](https://spidercloud.statuspage.io/)\n[\nGithub LogoGitHub\n](https://github.com/spider-rs/spider)[\nDiscord LogoDiscord\n](https://discord.spider.cloud)[\nTwitter LogoTwitter\n](https://twitter.com/spider_rust)"# + ); } #[test] @@ -182,7 +185,8 @@ fn test_html_from_text() { .read_to_string(&mut html) .expect("File must be readable"); - let mut tag_factory: HashMap> = HashMap::new(); + let mut tag_factory: HashMap> = + HashMap::new(); let tag = Box::new(html2md::scraper::ignore::IgnoreTagFactory {}); tag_factory.insert(String::from("script"), tag.clone());
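
Usage note (reviewer addendum, not part of the patch): a minimal sketch of the two entry points this change keeps public, assuming the crate is imported as `html2md` as in `tests/integration.rs`; the `<article>` snippet is an illustrative input, not taken from the test fixtures.

```rust
// Hypothetical smoke example of the two conversion paths.
fn main() {
    let html = "<article><h1>Title</h1><p>Some <em>emphasized</em> text.</p></article>";

    // DOM-walking scraper path: html5ever parse + recursive `walk`,
    // re-exported from the `scraper` module by this patch.
    let parsed = html2md::parse_html(html, false);

    // Streaming rewriter path: lol_html-based, the planned default for v0.1.
    let rewritten = html2md::rewrite_html(html, false);

    println!("{parsed}\n---\n{rewritten}");
}
```

Both calls take the source HTML and a `commonmark` flag and return the converted Markdown as a `String`; the rewriter path avoids building an rcdom tree, which is where the performance work referenced in the `rewrite_html` doc comment comes from.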