Skip to content

Commit

Permalink
chore(tables): remove <br /> auto inject tables
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 24, 2024
1 parent 3093a04 commit c0e1216
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.15"
version = "0.0.18"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand Down
18 changes: 16 additions & 2 deletions src/dummy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,28 @@ impl TagHandler for DummyHandler {

/// Handler that copies a tag to the printer as HTML: the tag itself plus all of its descendants in commonmark mode, otherwise only its descendants
#[derive(Default)]
pub(super) struct IdentityHandler;
pub(super) struct IdentityHandler {
/// Selects what `handle` serializes: when `true` the tag itself is included
/// (`TraversalScope::IncludeNode`), when `false` only its children are
/// (`TraversalScope::ChildrenOnly`).
pub commonmark: bool,
}

impl IdentityHandler {
/// A new identity handler.
pub fn new(commonmark: bool) -> Self {
Self { commonmark }
}
}

impl TagHandler for IdentityHandler {
fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) {
let mut buffer = vec![];

let options = SerializeOpts {
traversal_scope: TraversalScope::IncludeNode,
traversal_scope: if self.commonmark {
TraversalScope::IncludeNode
} else {
TraversalScope::ChildrenOnly(None)
},
..Default::default()
};
let to_be_serialized = SerializableHandle::from(tag.clone());
Expand Down
60 changes: 47 additions & 13 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,35 +125,64 @@ fn walk(
let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler::default());
let mut tag_name = String::default();

let mut inside_pre = false;
let mut inside_code = false;
let mut ignore_write = false;

let find_parent_tags = match &input.data {
NodeData::Element { .. } => true,
NodeData::Text { .. } => true,
_ => false,
};

if find_parent_tags {
for tag in result.parent_chain.iter() {
if tag == "code" {
inside_code = true;
break;
}
if tag == "pre" {
inside_pre = true;
break;
}
if tag_name == "script" || tag_name == "style" {
ignore_write = true;
break;
}
}
}

match input.data {
NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {}
NodeData::Text { ref contents } => {
let mut text = contents.borrow().to_string();

let inside_pre = result.parent_chain.iter().any(|t| t == "pre");
if inside_pre {
// this is preformatted text, insert as-is
result.append_str(&text);
} else if !(text.trim().len() == 0
&& (result.data.chars().last() == Some('\n')
|| result.data.chars().last() == Some(' ')))
{
// in case it's not just a whitespace after the newline or another whitespace
if !ignore_write {
if !inside_code {
text = escape_markdown(result, &text);
}

// regular text, collapse whitespace and newlines in text
let inside_code = result.parent_chain.iter().any(|t| t == "code");
if !inside_code {
text = escape_markdown(result, &text);
let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
result.append_str(&minified_text.trim());
}
let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
result.append_str(&minified_text.trim());
}
}
NodeData::Comment { .. } => {} // ignore comments
NodeData::Element { ref name, .. } => {
let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
tag_name = name.local.to_string();

// do not parse scripts or style tags
if tag_name == "script" || tag_name == "style" {
return;
}

if inside_pre {
// don't add any html tags inside the pre section
handler = Box::new(DummyHandler::default());
Expand Down Expand Up @@ -191,13 +220,11 @@ fn walk(
"ol" | "ul" | "menu" => Box::new(ListHandler::default()),
"li" => Box::new(ListItemHandler::default()),
// as-is
"sub" | "sup" => Box::new(IdentityHandler::default()),
"sub" | "sup" => Box::new(IdentityHandler::new(commonmark)),
// tables, handled fully internally as markdown can't have nested content in tables
// supports only single tables as of now
"table" => Box::new(TableHandler::default()),
"iframe" => Box::new(IframeHandler::default()),
// other
"html" | "head" | "body" => Box::new(DummyHandler::default()),
_ => Box::new(DummyHandler::default()),
}
}
Expand Down Expand Up @@ -226,7 +253,14 @@ fn walk(

match child.data {
NodeData::Element { ref name, .. } => match result.siblings.get_mut(&current_depth) {
Some(el) => el.push(name.local.to_string()),
Some(el) => {
let eln = name.local.to_string();
let ignore_push = eln == "script" || eln == "style";

if !ignore_push {
el.push(eln)
}
}
_ => (),
},
_ => (),
Expand Down
5 changes: 1 addition & 4 deletions src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,5 @@ where
/// Renders the subtree rooted at `tag` to a markdown string.
///
/// A fresh `StructuredPrinter` is filled by `walk`, called with an empty map
/// and the given `commonmark` flag, and the printer's collected data is then
/// passed through `clean_markdown`.
fn to_text(tag: &Handle, commonmark: bool) -> String {
let mut printer = StructuredPrinter::default();
walk(tag, &mut printer, &HashMap::default(), commonmark);

let result = clean_markdown(&printer.data);

result.replace("\n", "<br/>")
clean_markdown(&printer.data)
}
2 changes: 2 additions & 0 deletions tests/quotes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,7 @@ fn test_details() {
/// `<sub>` content is emitted as plain text when the flag is `false`, and the
/// tag is kept verbatim when the flag is `true`.
#[test]
fn test_subsup() {
    // Each case pairs the second `parse_html` argument (presumably the
    // commonmark switch — confirm against `parse_html`) with the expected output.
    let cases = [(false, r#"X2"#), (true, r#"X<sub>2</sub>"#)];

    for &(flag, expected) in cases.iter() {
        let md = parse_html("X<sub>2</sub>", flag);
        assert_eq!(md, expected);
    }
}
10 changes: 10 additions & 0 deletions tests/unit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,13 @@ fn test_escaping_start_hyphen_space() {
let md = parse_html(r#"<p>This is NOT a header!<br/> -------</p>"#, false);
assert_eq!(md, "This is NOT a header!\n\\-------")
}

/// A `<sup>` element placed right after a `<br/>` contributes only its text:
/// the expected output has no tag, no space before the dashes, and the dashes
/// are left unescaped.
#[test]
fn test_escaping_sup_tags() {
    let html = r#"<p>This is NOT a header!<br/><sup>something</sup> -------</p>"#;
    let expected = "This is NOT a header!\nsomething-------";

    let md = parse_html(html, false);

    assert_eq!(md, expected)
}

0 comments on commit c0e1216

Please sign in to comment.