From c0e121667a7d6b5cc7a1e221c80b40adc7067cff Mon Sep 17 00:00:00 2001 From: j-mendez Date: Thu, 24 Oct 2024 05:27:01 -0400 Subject: [PATCH] chore(tables): remove
auto inject tables --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/dummy.rs | 18 +++++++++++++-- src/lib.rs | 60 ++++++++++++++++++++++++++++++++++++++----------- src/tables.rs | 5 +---- tests/quotes.rs | 2 ++ tests/unit.rs | 10 +++++++++ 7 files changed, 78 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d02e2fb..aa4a756 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,7 +95,7 @@ dependencies = [ [[package]] name = "fast_html2md" -version = "0.0.15" +version = "0.0.18" dependencies = [ "auto_encoder", "html5ever", diff --git a/Cargo.toml b/Cargo.toml index 554ce05..676e0a6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fast_html2md" -version = "0.0.15" +version = "0.0.18" edition = "2021" description = "A fast html2md crate for rust" categories = ["development-tools", "parsing", "parser-implementations"] diff --git a/src/dummy.rs b/src/dummy.rs index ffadeb1..b952bf9 100644 --- a/src/dummy.rs +++ b/src/dummy.rs @@ -16,14 +16,28 @@ impl TagHandler for DummyHandler { /// Handler that completely copies tag to printer as HTML with all descendants #[derive(Default)] -pub(super) struct IdentityHandler; +pub(super) struct IdentityHandler { + /// Commonmark spec + pub commonmark: bool, +} + +impl IdentityHandler { + /// A new identity handler. 
+ pub fn new(commonmark: bool) -> Self { + Self { commonmark } + } +} impl TagHandler for IdentityHandler { fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) { let mut buffer = vec![]; let options = SerializeOpts { - traversal_scope: TraversalScope::IncludeNode, + traversal_scope: if self.commonmark { + TraversalScope::IncludeNode + } else { + TraversalScope::ChildrenOnly(None) + }, ..Default::default() }; let to_be_serialized = SerializableHandle::from(tag.clone()); diff --git a/src/lib.rs b/src/lib.rs index 43c0a95..71cda7f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -125,12 +125,38 @@ fn walk( let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler::default()); let mut tag_name = String::default(); + let mut inside_pre = false; + let mut inside_code = false; + let mut ignore_write = false; + + let find_parent_tags = match &input.data { + NodeData::Element { .. } => true, + NodeData::Text { .. } => true, + _ => false, + }; + + if find_parent_tags { + for tag in result.parent_chain.iter() { + if tag == "code" { + inside_code = true; + break; + } + if tag == "pre" { + inside_pre = true; + break; + } + if tag_name == "script" || tag_name == "style" { + ignore_write = true; + break; + } + } + } + match input.data { NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. 
} => {} NodeData::Text { ref contents } => { let mut text = contents.borrow().to_string(); - let inside_pre = result.parent_chain.iter().any(|t| t == "pre"); if inside_pre { // this is preformatted text, insert as-is result.append_str(&text); @@ -138,22 +164,25 @@ fn walk( && (result.data.chars().last() == Some('\n') || result.data.chars().last() == Some(' '))) { - // in case it's not just a whitespace after the newline or another whitespace + if !ignore_write { + if !inside_code { + text = escape_markdown(result, &text); + } - // regular text, collapse whitespace and newlines in text - let inside_code = result.parent_chain.iter().any(|t| t == "code"); - if !inside_code { - text = escape_markdown(result, &text); + let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " "); + result.append_str(&minified_text.trim()); } - let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " "); - result.append_str(&minified_text.trim()); } } NodeData::Comment { .. } => {} // ignore comments NodeData::Element { ref name, .. 
} => { - let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre"); tag_name = name.local.to_string(); + // do not parse scripts or style tags + if tag_name == "script" || tag_name == "style" { + return; + } + if inside_pre { // don't add any html tags inside the pre section handler = Box::new(DummyHandler::default()); @@ -191,13 +220,11 @@ fn walk( "ol" | "ul" | "menu" => Box::new(ListHandler::default()), "li" => Box::new(ListItemHandler::default()), // as-is - "sub" | "sup" => Box::new(IdentityHandler::default()), + "sub" | "sup" => Box::new(IdentityHandler::new(commonmark)), // tables, handled fully internally as markdown can't have nested content in tables // supports only single tables as of now "table" => Box::new(TableHandler::default()), "iframe" => Box::new(IframeHandler::default()), - // other - "html" | "head" | "body" => Box::new(DummyHandler::default()), _ => Box::new(DummyHandler::default()), } } @@ -226,7 +253,14 @@ fn walk( match child.data { NodeData::Element { ref name, .. } => match result.siblings.get_mut(&current_depth) { - Some(el) => el.push(name.local.to_string()), + Some(el) => { + let eln = name.local.to_string(); + let ignore_push = eln == "script" || eln == "style"; + + if !ignore_push { + el.push(eln) + } + } _ => (), }, _ => (), diff --git a/src/tables.rs b/src/tables.rs index cd435a0..0b1f8cf 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -229,8 +229,5 @@ where fn to_text(tag: &Handle, commonmark: bool) -> String { let mut printer = StructuredPrinter::default(); walk(tag, &mut printer, &HashMap::default(), commonmark); - - let result = clean_markdown(&printer.data); - - result.replace("\n", "
") + clean_markdown(&printer.data) } diff --git a/tests/quotes.rs b/tests/quotes.rs index f894e6b..5e27d0c 100644 --- a/tests/quotes.rs +++ b/tests/quotes.rs @@ -50,5 +50,7 @@ fn test_details() { #[test] fn test_subsup() { let md = parse_html("X2", false); + assert_eq!(md, r#"X2"#); + let md = parse_html("X2", true); assert_eq!(md, r#"X2"#) } diff --git a/tests/unit.rs b/tests/unit.rs index 6bbff10..6e0434c 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -145,3 +145,13 @@ fn test_escaping_start_hyphen_space() { let md = parse_html(r#"

This is NOT a header!
-------

"#, false); assert_eq!(md, "This is NOT a header!\n\\-------") } + +/// Note: Also strips multiple spaces +#[test] +fn test_escaping_sup_tags() { + let md = parse_html( + r#"

This is NOT a header!
something -------

"#, + false, + ); + assert_eq!(md, "This is NOT a header!\nsomething-------") +}