From c0e121667a7d6b5cc7a1e221c80b40adc7067cff Mon Sep 17 00:00:00 2001
From: j-mendez <jeff@a11ywatch.com>
Date: Thu, 24 Oct 2024 05:27:01 -0400
Subject: [PATCH] chore(tables): remove <br /> auto inject tables

---
 Cargo.lock      |  2 +-
 Cargo.toml      |  2 +-
 src/dummy.rs    | 18 +++++++++++++--
 src/lib.rs      | 60 ++++++++++++++++++++++++++++++++++++++-----------
 src/tables.rs   |  5 +----
 tests/quotes.rs |  2 ++
 tests/unit.rs   | 10 +++++++++
 7 files changed, 78 insertions(+), 21 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index d02e2fb..aa4a756 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -95,7 +95,7 @@ dependencies = [
 
 [[package]]
 name = "fast_html2md"
-version = "0.0.15"
+version = "0.0.18"
 dependencies = [
  "auto_encoder",
  "html5ever",
diff --git a/Cargo.toml b/Cargo.toml
index 554ce05..676e0a6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fast_html2md"
-version = "0.0.15"
+version = "0.0.18"
 edition = "2021"
 description = "A fast html2md crate for rust"
 categories = ["development-tools", "parsing", "parser-implementations"]
diff --git a/src/dummy.rs b/src/dummy.rs
index ffadeb1..b952bf9 100644
--- a/src/dummy.rs
+++ b/src/dummy.rs
@@ -16,14 +16,28 @@ impl TagHandler for DummyHandler {
 
 /// Handler that completely copies tag to printer as HTML with all descendants
 #[derive(Default)]
-pub(super) struct IdentityHandler;
+pub(super) struct IdentityHandler {
+    /// Commonmark mode: serialize the tag itself; when false only its children are serialized.
+    pub commonmark: bool,
+}
+
+impl IdentityHandler {
+    /// Creates a handler; `commonmark = true` keeps the tag markup, `false` keeps only its children.
+    pub fn new(commonmark: bool) -> Self {
+        Self { commonmark }
+    }
+}
 
 impl TagHandler for IdentityHandler {
     fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) {
         let mut buffer = vec![];
 
         let options = SerializeOpts {
-            traversal_scope: TraversalScope::IncludeNode,
+            traversal_scope: if self.commonmark {
+                TraversalScope::IncludeNode
+            } else {
+                TraversalScope::ChildrenOnly(None)
+            },
             ..Default::default()
         };
         let to_be_serialized = SerializableHandle::from(tag.clone());
diff --git a/src/lib.rs b/src/lib.rs
index 43c0a95..71cda7f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -125,12 +125,38 @@ fn walk(
     let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler::default());
     let mut tag_name = String::default();
 
+    // Ancestor flags, derived once per node from `result.parent_chain`.
+    let mut inside_pre = false;
+    let mut inside_code = false;
+    let mut ignore_write = false;
+
+    // Only element and text nodes read these flags, so other node kinds skip the scan.
+    let find_parent_tags = matches!(
+        input.data,
+        NodeData::Element { .. } | NodeData::Text { .. }
+    );
+
+    if find_parent_tags {
+        // No early `break`: the flags are independent, so one ancestor must not mask another.
+        for tag in result.parent_chain.iter() {
+            if tag == "code" {
+                inside_code = true;
+            }
+            if tag == "pre" {
+                inside_pre = true;
+            }
+            // Compare the ancestor `tag`; `tag_name` is still empty at this point.
+            if tag == "script" || tag == "style" {
+                ignore_write = true;
+            }
+        }
+    }
+
     match input.data {
         NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {}
         NodeData::Text { ref contents } => {
             let mut text = contents.borrow().to_string();
 
-            let inside_pre = result.parent_chain.iter().any(|t| t == "pre");
             if inside_pre {
                 // this is preformatted text, insert as-is
                 result.append_str(&text);
@@ -138,22 +164,25 @@ fn walk(
                 && (result.data.chars().last() == Some('\n')
                     || result.data.chars().last() == Some(' ')))
             {
-                // in case it's not just a whitespace after the newline or another whitespace
+                if !ignore_write {
+                    if !inside_code {
+                        text = escape_markdown(result, &text);
+                    }
 
-                // regular text, collapse whitespace and newlines in text
-                let inside_code = result.parent_chain.iter().any(|t| t == "code");
-                if !inside_code {
-                    text = escape_markdown(result, &text);
+                    let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
+                    result.append_str(&minified_text.trim());
                 }
-                let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
-                result.append_str(&minified_text.trim());
             }
         }
         NodeData::Comment { .. } => {} // ignore comments
         NodeData::Element { ref name, .. } => {
-            let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
             tag_name = name.local.to_string();
 
+            // do not parse scripts or style tags
+            if tag_name == "script" || tag_name == "style" {
+                return;
+            }
+
             if inside_pre {
                 // don't add any html tags inside the pre section
                 handler = Box::new(DummyHandler::default());
@@ -191,13 +220,11 @@ fn walk(
                             "ol" | "ul" | "menu" => Box::new(ListHandler::default()),
                             "li" => Box::new(ListItemHandler::default()),
                             // as-is
-                            "sub" | "sup" => Box::new(IdentityHandler::default()),
+                            "sub" | "sup" => Box::new(IdentityHandler::new(commonmark)),
                             // tables, handled fully internally as markdown can't have nested content in tables
                             // supports only single tables as of now
                             "table" => Box::new(TableHandler::default()),
                             "iframe" => Box::new(IframeHandler::default()),
-                            // other
-                            "html" | "head" | "body" => Box::new(DummyHandler::default()),
                             _ => Box::new(DummyHandler::default()),
                         }
                     }
@@ -226,7 +253,14 @@ fn walk(
 
         match child.data {
             NodeData::Element { ref name, .. } => match result.siblings.get_mut(&current_depth) {
-                Some(el) => el.push(name.local.to_string()),
+                Some(el) => {
+                    let eln = name.local.to_string();
+                    let ignore_push = eln == "script" || eln == "style";
+
+                    if !ignore_push {
+                        el.push(eln)
+                    }
+                }
                 _ => (),
             },
             _ => (),
diff --git a/src/tables.rs b/src/tables.rs
index cd435a0..0b1f8cf 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -229,8 +229,5 @@ where
 fn to_text(tag: &Handle, commonmark: bool) -> String {
     let mut printer = StructuredPrinter::default();
     walk(tag, &mut printer, &HashMap::default(), commonmark);
-
-    let result = clean_markdown(&printer.data);
-
-    result.replace("\n", "<br/>")
+    clean_markdown(&printer.data)
 }
diff --git a/tests/quotes.rs b/tests/quotes.rs
index f894e6b..5e27d0c 100644
--- a/tests/quotes.rs
+++ b/tests/quotes.rs
@@ -50,5 +50,7 @@ fn test_details() {
 #[test]
 fn test_subsup() {
     let md = parse_html("X<sub>2</sub>", false);
+    assert_eq!(md, r#"X2"#);
+    let md = parse_html("X<sub>2</sub>", true);
     assert_eq!(md, r#"X<sub>2</sub>"#)
 }
diff --git a/tests/unit.rs b/tests/unit.rs
index 6bbff10..6e0434c 100644
--- a/tests/unit.rs
+++ b/tests/unit.rs
@@ -145,3 +145,13 @@ fn test_escaping_start_hyphen_space() {
     let md = parse_html(r#"<p>This is NOT a header!<br/>     -------</p>"#, false);
     assert_eq!(md, "This is NOT a header!\n\\-------")
 }
+
+/// Non-commonmark mode: `<sup>` contributes only its text, and the spaces after it are dropped.
+#[test]
+fn test_escaping_sup_tags() {
+    let md = parse_html(
+        r#"<p>This is NOT a header!<br/><sup>something</sup>     -------</p>"#,
+        false,
+    );
+    assert_eq!(md, "This is NOT a header!\nsomething-------")
+}