diff --git a/Cargo.lock b/Cargo.lock index d2727bc..a1e9289 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,7 +95,7 @@ dependencies = [ [[package]] name = "fast_html2md" -version = "0.0.12" +version = "0.0.14" dependencies = [ "auto_encoder", "html5ever", diff --git a/Cargo.toml b/Cargo.toml index 93be3af..e22249e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fast_html2md" -version = "0.0.12" +version = "0.0.14" edition = "2021" description = "A fast html2md crate for rust" categories = ["development-tools", "parsing", "parser-implementations"] diff --git a/README.md b/README.md index c9cdd4f..28b32a8 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,20 @@ let md = parse_html("

JAMES

", false); assert_eq!(md, "JAMES") ``` +## Ignoring Tags + +```rust + let mut tag_factory: HashMap> = + HashMap::new(); + + let tag = Box::new(IgnoreTagFactory {}); + + tag_factory.insert(String::from("script"), tag.clone()); + tag_factory.insert(String::from("style"), tag.clone()); + tag_factory.insert(String::from("noscript"), tag.clone()); + let html = html2md::parse_html_custom(&html, &tag_factory, false); +``` + ## Notes -This project is a practical rewrite from the original `html2md` with major bug fixes and performance improvements. \ No newline at end of file +This project is a practical rewrite from the original `html2md` with major bug fixes and performance improvements. diff --git a/src/anchors.rs b/src/anchors.rs index 7a46e20..ebc477e 100644 --- a/src/anchors.rs +++ b/src/anchors.rs @@ -20,7 +20,7 @@ impl TagHandler for AnchorHandler { .find(|attr| attr.name.local.as_bytes() == b"href"); match href { - Some(link) => link.value.trim_ascii().into(), + Some(link) => link.value.trim().into(), None => String::new(), } } diff --git a/src/lib.rs b/src/lib.rs index 68dbeb9..a1f4fd4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -129,12 +129,11 @@ fn walk( NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {} NodeData::Text { ref contents } => { let mut text = contents.borrow().to_string(); - + let inside_pre = result.parent_chain.iter().any(|t| t == "pre"); if inside_pre { // this is preformatted text, insert as-is result.append_str(&text); - } else if !(text.trim().len() == 0 && (result.data.chars().last() == Some('\n') || result.data.chars().last() == Some(' '))) @@ -147,7 +146,7 @@ fn walk( text = escape_markdown(result, &text); } let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " "); - result.append_str(&minified_text.trim_ascii()); + result.append_str(&minified_text.trim()); } } NodeData::Comment { .. } => {} // ignore comments @@ -202,8 +201,6 @@ fn walk( } } - let ignore_tags = tag_name == "style" || tag_name == "script"; - // handle this tag, while it's not in parent chain // and doesn't have child siblings handler.handle(&input, result); @@ -216,10 +213,10 @@ fn walk( result.siblings.insert(current_depth, vec![]); for child in input.children.borrow().iter() { - if handler.skip_descendants() || ignore_tags { + if handler.skip_descendants() { continue; } - + walk(&child, result, custom, commonmark); match child.data { diff --git a/tests/images.rs b/tests/images.rs index a8c6ade..9c150bd 100644 --- a/tests/images.rs +++ b/tests/images.rs @@ -22,20 +22,23 @@ fn test_image_native_without_title() { #[test] fn test_image_embedded_html() { let md = parse_html("\"comics", false); - assert_eq!(md, "\"comics") + assert_eq!(md, "![comics about Mac and GNU/Linux](https://i.redd.it/un4h28uwtp711.png \"Look at me, brother\")") } #[test] fn test_image_embedded_with_unsupported_html() { // srcset is unsupported in Markdown let md = parse_html("\"HACKERMAN\"", false); - assert_eq!(md, "\"HACKERMAN\"") + assert_eq!(md, "![HACKERMAN](https://i.redd.it/07onlc10x5711.png \"When you reboot instead of exiting vim\")") } #[test] fn test_image_src_issue() { let md = parse_html("", false); - assert_eq!(md, "") + assert_eq!( + md, + "![](https://dybr.ru/img/43/1532265494_android-Kanedias)" + ) } #[test] diff --git a/tests/integration.rs b/tests/integration.rs index fa6140d..223b842 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -43,7 +43,6 @@ fn test_real_world_ja() { println!("{}", result); } - #[test] #[ignore] fn test_cheatsheet() { @@ -127,11 +126,8 @@ fn test_tables_crash2() { .expect("File must be readable"); let table_with_vertical_header = parse_html(&html, false); - assert_that!(table_with_vertical_header).contains(indoc! {" - |Current Conditions:|Open all year. No reservations. No services.| - |-------------------|--------------------------------------------| - | Reservations: | No reservations. | - | Fees | No fee. | - | Water: | No water. |" + println!("{:?}", table_with_vertical_header); + + assert_that!(table_with_vertical_header).contains(indoc! {"\n\n## At a Glance\n\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |\n\n" }); } diff --git a/tests/lists.rs b/tests/lists.rs index 9a3238a..f833066 100644 --- a/tests/lists.rs +++ b/tests/lists.rs @@ -9,10 +9,7 @@ fn test_list_simple() { ); assert_eq!( md, - "\ -* Seven things has lady Lackless -* Keeps them underneath her black dress -* One a thing that's not for wearing" + "\n\n* Seven things has lady Lackless\n* Keeps them underneath her black dress\n* One a thing that's not for wearing\n\n" ) } @@ -38,16 +35,7 @@ fn test_list_formatted() { ); assert_eq!( md, - "\ -* You should NEVER see this error - * Broken lines, broken strings - * Broken threads, broken springs - * Broken idols, broken heads - * People sleep in broken beds - -* Ain't no use jiving -* Ain't no use joking -* EVERYTHING IS BROKEN" + "\n\n* You should NEVER see this error\n * Broken lines, broken strings\n * Broken threads, broken springs\n * Broken idols, broken heads\n * People sleep in broken beds\n \n* Ain't no use jiving\n* Ain't no use joking\n* EVERYTHING IS BROKEN" ) } @@ -87,22 +75,7 @@ fn test_list_stackedit() { ); assert_eq!( md, - "\ -* You should NEVER see this error - - * Broken lines, broken strings - - * Broken threads, broken springs - - * Broken idols, broken heads - - * People sleep in broken beds - -* Ain’t no use jiving - -* Ain’t no use joking - -* EVERYTHING IS BROKEN" + "* You should NEVER see this error\n \n * Broken lines, broken strings\n \n * Broken threads, broken springs\n \n * Broken idols, broken heads\n \n * People sleep in broken beds\n \n \n* Ain’t no use jiving\n \n* Ain’t no use joking\n \n* EVERYTHING IS BROKEN" ) } @@ -144,22 +117,7 @@ fn test_list_stackedit_add_brs() { ); assert_eq!( md, - "\ -* You should NEVER see this error - - * Broken lines, broken strings - - * Broken threads, broken springs - - * Broken idols, broken heads - - * People sleep in broken beds - -* Ain’t no use jiving - -* Ain’t no use joking - -* EVERYTHING IS BROKEN" + "* You should NEVER see this error\n \n * Broken lines, broken strings\n \n * Broken threads, broken springs\n \n * Broken idols, broken heads\n \n * People sleep in broken beds\n \n \n \n \n* Ain’t no use jiving\n \n* Ain’t no use joking\n \n* EVERYTHING IS BROKEN" ) } @@ -180,13 +138,7 @@ fn test_list_multiline() { ); assert_eq!( md, - "\ -1. In the heat and the rains - - With whips and chains - - Just to see him fly - So many die!" + "1. In the heat and the rains\n \n With whips and chains\n \n Just to see him fly\n So many die!" ) } @@ -214,17 +166,7 @@ fn test_list_multiline_formatted() { ); assert_eq!( md, - "\ -* You should NEVER see this error - * Broken lines, broken strings - * Broken threads, broken springs - * Broken idols, broken heads - * People sleep in broken beds - * Ain't no use jiving - - Ain't no use joking - - EVERYTHING IS BROKEN" + "\n\n* You should NEVER see this error\n * Broken lines, broken strings\n * Broken threads, broken springs\n * Broken idols, broken heads\n * People sleep in broken beds\n * Ain't no use jiving\n \n Ain't no use joking\n \n EVERYTHING IS BROKEN" ) } diff --git a/tests/quotes.rs b/tests/quotes.rs index b63862e..f894e6b 100644 --- a/tests/quotes.rs +++ b/tests/quotes.rs @@ -10,10 +10,7 @@ fn test_quotes() { ); assert_eq!( md, - "\ -> here's a quote next line of it - -And some text after it" + "\n\n> here's a quote next line of it\nAnd some text after it" ) } @@ -22,11 +19,7 @@ fn test_quotes2() { let md = parse_html("

here's
nested quote!
a quote\n next line of it

", false); assert_eq!( md, - "\ -> here's -> > nested quote! -> -> a quote next line of it" + "\n\n> here's\n> > nested quote!\n> a quote next line of it\n\n" ) } @@ -38,10 +31,7 @@ fn test_blockquotes() { ); assert_eq!( md, - "\ -> Quote at the start of the message - -Should not crash the parser" + "> Quote at the start of the message\nShould not crash the parser" ) } @@ -54,7 +44,7 @@ fn test_details() { "}; let md = parse_html(&html, false); - assert_eq!(md, "
There are more things in heaven and Earth, **Horatio**\n\nThan are dreamt of in your philosophy\n\n
") + assert_eq!(md, "There are more things in heaven and Earth,**Horatio**\nThan are dreamt of in your philosophy") } #[test] diff --git a/tests/styles.rs b/tests/styles.rs index 67b3787..dc58303 100644 --- a/tests/styles.rs +++ b/tests/styles.rs @@ -4,7 +4,7 @@ use pretty_assertions::assert_eq; #[test] fn test_styles_with_spaces() { let md = parse_html(r#"It read: Nobody will ever love you"#, false); - assert_eq!(md, r#"It read: ~~Nobody will ever love you~~"#) + assert_eq!(md, r#"It read:~~Nobody will ever love you~~"#) } #[test] @@ -24,13 +24,6 @@ They gathered for the feast
); assert_eq!( md, - "\ -And she said: -~~We are all just prisoners here - Of our own device~~ -And in the master's chambers -They gathered for the feast -*They stab it with their steely knives* -**But they just can't kill the beast**" + "And she said:\n~~We are all just prisoners here\nOf our own device~~\nAnd in the master's chambers\nThey gathered for the feast\n*They stab it with their steely knives*\n**But they just can't kill the beast**" ) } diff --git a/tests/tables.rs b/tests/tables.rs index 2a229e0..89bc31b 100644 --- a/tests/tables.rs +++ b/tests/tables.rs @@ -27,10 +27,7 @@ fn test_tables() { assert_eq!( md, - "\ -|Minor1|Minor2|Minor3|Minor4| -|------|------|------|------| -| col1 | col2 | col3 | col4 |" + "|Minor1|Minor2|Minor3|Minor4|\n|||||\n| col1 | col2 | col3 | col4 |" ); } @@ -62,10 +59,7 @@ fn test_tables_invalid_more_headers() { assert_eq!( md, - "\ -|Minor1|Minor2|Minor3|Minor4|Minor5|Minor6| -|------|------|------|------|------|------| -| col1 | col2 | col3 | col4 | | |" + "|Minor1|Minor2|Minor3|Minor4|Minor5|Minor6|\n|||||||\n| col1 | col2 | col3 | col4 | | |" ); } @@ -91,13 +85,7 @@ fn test_tables_invalid_more_rows() { false, ); - assert_eq!( - md, - "\ -|Minor1|Minor2| | | -|------|------|----|----| -| col1 | col2 |col3|col4|" - ); + assert_eq!(md, "|Minor1|Minor2| | |\n|||||\n| col1 | col2 |col3|col4|"); } #[test] @@ -120,13 +108,7 @@ fn test_tables_odd_column_width() { false, ); - assert_eq!( - md, - "\ -|Minor|Major| -|-----|-----| -|col1 |col2 |" - ); + assert_eq!(md, "|Minor|Major|\n|||\n|col1 |col2 |"); } #[test] @@ -155,10 +137,7 @@ fn test_tables_alignment() { assert_eq!( md, - "\ -|Minor1|Minor2|Minor3|Minor4| -|-----:|:----:|-----:|:-----| -| col1 | col2 | col3 | col4 |" + "|Minor1|Minor2|Minor3|Minor4|\n|||||\n| col1 | col2 | col3 | col4 |" ); } @@ -215,11 +194,5 @@ fn test_tables_wild_example() { false, ); - assert_eq!(md, "\ -| One ring | Patterns | Titanic | | | | -|-----------------------------|--------------------------|-----------------------------------|---|---|---| -| One ring to rule them all |There's one for the sorrow| Roll on, Titanic, roll | | | | -| One ring to find them | And two for the joy |You're the pride of White Star Line| | | | -| One ring to bring them all | And three for the girls | Roll on, Titanic, roll | | | | -|And in the darkness bind them| And four for the boys | Into the mists of time | | | |"); + assert_eq!(md, "| One ring | Patterns | Titanic | | | |\n|||||||\n| One ring to rule them all |There's one for the sorrow| Roll on, Titanic, roll | | | |\n| One ring to find them | And two for the joy |You're the pride of White Star Line| | | |\n| One ring to bring them all | And three for the girls | Roll on, Titanic, roll | | | |\n|And in the darkness bind them| And four for the boys | Into the mists of time | | | |"); } diff --git a/tests/unit.rs b/tests/unit.rs index c0b965c..6bbff10 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -11,7 +11,7 @@ fn test_dumb() { // fixme fn test_space() { let md = parse_html(r#"

APOSIMZ

\n"#, false); - assert_eq!(md, "[APOSIMZ](http://ya.ru)\n\n\\\\n") + assert_eq!(md, "[APOSIMZ](http://ya.ru)\n\\\\n") } #[test] @@ -68,7 +68,7 @@ fn test_escaping() { #[test] fn test_escaping_mid_hyphens() { let md = parse_html(r#"

This is a header with-hyphen!

"#, false); - assert_eq!(md, "This is a header with-hyphen!\n==========") + assert_eq!(md, "# This is a header with-hyphen!") } #[test] @@ -77,7 +77,7 @@ fn test_escaping_start_hyphens() { r#"

- This is a header with starting hyphen!

"#, false, ); - assert_eq!(md, "\\- This is a header with starting hyphen!\n==========") + assert_eq!(md, "# - This is a header with starting hyphen!") } #[test] @@ -93,10 +93,7 @@ fn test_escaping_start_hyphens_space() { r#"

- This is a header with starting hyphen!

"#, false, ); - assert_eq!( - md, - " \\- This is a header with starting hyphen!\n==========" - ) + assert_eq!(md, "# - This is a header with starting hyphen!") } #[test] @@ -119,39 +116,32 @@ fn test_headers() { ); assert_eq!( md, - "\ -MARC-FS -========== - -[Mail.ru](http://Mail.ru) Cloud filesystem written for FUSE - -Synopsis -----------" + "# MARC-FS\n\n[Mail.ru](http://Mail.ru)Cloud filesystem written for FUSE\n## Synopsis" ) } #[test] fn test_escaping_start_equal() { let md = parse_html(r#"

This is NOT a header!
===========

"#, false); - assert_eq!(md, "This is NOT a header! \n\\===========") + assert_eq!(md, "This is NOT a header!\n\\===========") } /// Note: Also strips multiple spaces #[test] fn test_escaping_start_equal_space() { let md = parse_html(r#"

This is NOT a header!
===========

"#, false); - assert_eq!(md, "This is NOT a header! \n \\===========") + assert_eq!(md, "This is NOT a header!\n\\===========") } #[test] fn test_escaping_start_hyphen() { let md = parse_html(r#"

This is NOT a header!
-------

"#, false); - assert_eq!(md, "This is NOT a header! \n\\-------") + assert_eq!(md, "This is NOT a header!\n\\-------") } /// Note: Also strips multiple spaces #[test] fn test_escaping_start_hyphen_space() { let md = parse_html(r#"

This is NOT a header!
-------

"#, false); - assert_eq!(md, "This is NOT a header! \n \\-------") + assert_eq!(md, "This is NOT a header!\n\\-------") }