diff --git a/Cargo.lock b/Cargo.lock index d02e2fb..e4d9bc5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,7 +95,7 @@ dependencies = [ [[package]] name = "fast_html2md" -version = "0.0.15" +version = "0.0.16" dependencies = [ "auto_encoder", "html5ever", diff --git a/Cargo.toml b/Cargo.toml index 554ce05..4197e89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fast_html2md" -version = "0.0.15" +version = "0.0.16" edition = "2021" description = "A fast html2md crate for rust" categories = ["development-tools", "parsing", "parser-implementations"] diff --git a/src/lib.rs b/src/lib.rs index 43c0a95..a90352f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -125,12 +125,38 @@ fn walk( let mut handler: Box = Box::new(DummyHandler::default()); let mut tag_name = String::default(); + let mut inside_pre = false; + let mut inside_code = false; + let mut ignore_write = false; + + let find_parent_tags = match &input.data { + NodeData::Element { .. } => true, + NodeData::Text { .. } => true, + _ => false, + }; + + if find_parent_tags { + for tag in result.parent_chain.iter() { + if tag == "code" { + inside_code = true; + break; + } + if tag == "pre" { + inside_pre = true; + break; + } + if tag_name == "script" || tag_name == "style" { + ignore_write = true; + break; + } + } + } + match input.data { NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. 
} => {} NodeData::Text { ref contents } => { let mut text = contents.borrow().to_string(); - let inside_pre = result.parent_chain.iter().any(|t| t == "pre"); if inside_pre { // this is preformatted text, insert as-is result.append_str(&text); @@ -138,22 +164,25 @@ fn walk( && (result.data.chars().last() == Some('\n') || result.data.chars().last() == Some(' '))) { - // in case it's not just a whitespace after the newline or another whitespace + if !ignore_write { + if !inside_code { + text = escape_markdown(result, &text); + } - // regular text, collapse whitespace and newlines in text - let inside_code = result.parent_chain.iter().any(|t| t == "code"); - if !inside_code { - text = escape_markdown(result, &text); + let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " "); + result.append_str(&minified_text.trim()); } - let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " "); - result.append_str(&minified_text.trim()); } } NodeData::Comment { .. } => {} // ignore comments NodeData::Element { ref name, .. } => { - let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre"); tag_name = name.local.to_string(); + // do not parse scripts or style tags + if tag_name == "script" || tag_name == "style" { + return; + } + if inside_pre { // don't add any html tags inside the pre section handler = Box::new(DummyHandler::default()); @@ -196,8 +225,6 @@ fn walk( // supports only single tables as of now "table" => Box::new(TableHandler::default()), "iframe" => Box::new(IframeHandler::default()), - // other - "html" | "head" | "body" => Box::new(DummyHandler::default()), _ => Box::new(DummyHandler::default()), } } @@ -226,7 +253,14 @@ fn walk( match child.data { NodeData::Element { ref name, .. 
} => match result.siblings.get_mut(&current_depth) { - Some(el) => el.push(name.local.to_string()), + Some(el) => { + let eln = name.local.to_string(); + let ignore_push = eln == "script" || eln == "style"; + + if !ignore_push { + el.push(eln) + } + } _ => (), }, _ => (), diff --git a/src/tables.rs b/src/tables.rs index cd435a0..bf83fe4 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -229,8 +229,6 @@ where fn to_text(tag: &Handle, commonmark: bool) -> String { let mut printer = StructuredPrinter::default(); walk(tag, &mut printer, &HashMap::default(), commonmark); - let result = clean_markdown(&printer.data); - - result.replace("\n", "
") + result.replace("\n", "") } diff --git a/tests/integration.rs b/tests/integration.rs index 223b842..b7dc488 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -114,6 +114,8 @@ fn test_tables_with_newlines() { .filter(|line| !line.ends_with("|")) .collect(); + println!("{:?}", result); + assert_that(&invalid_table_lines).is_empty(); }