diff --git a/Cargo.lock b/Cargo.lock
index d2727bc..a1e9289 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -95,7 +95,7 @@ dependencies = [
[[package]]
name = "fast_html2md"
-version = "0.0.12"
+version = "0.0.14"
dependencies = [
"auto_encoder",
"html5ever",
diff --git a/Cargo.toml b/Cargo.toml
index 93be3af..e22249e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
-version = "0.0.12"
+version = "0.0.14"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
diff --git a/README.md b/README.md
index c9cdd4f..28b32a8 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,20 @@ let md = parse_html("
JAMES
", false);
assert_eq!(md, "JAMES")
```
+## Ignoring Tags
+
+```rust
+ let mut tag_factory: HashMap<String, Box<dyn html2md::TagHandlerFactory>> =
+ HashMap::new();
+
+ let tag = Box::new(IgnoreTagFactory {});
+
+ tag_factory.insert(String::from("script"), tag.clone());
+ tag_factory.insert(String::from("style"), tag.clone());
+ tag_factory.insert(String::from("noscript"), tag.clone());
+ let md = html2md::parse_html_custom(&html, &tag_factory, false);
+```
+
## Notes
-This project is a practical rewrite from the original `html2md` with major bug fixes and performance improvements.
\ No newline at end of file
+This project is a practical rewrite from the original `html2md` with major bug fixes and performance improvements.
diff --git a/src/anchors.rs b/src/anchors.rs
index 7a46e20..ebc477e 100644
--- a/src/anchors.rs
+++ b/src/anchors.rs
@@ -20,7 +20,7 @@ impl TagHandler for AnchorHandler {
.find(|attr| attr.name.local.as_bytes() == b"href");
match href {
- Some(link) => link.value.trim_ascii().into(),
+ Some(link) => link.value.trim().into(),
None => String::new(),
}
}
diff --git a/src/lib.rs b/src/lib.rs
index 68dbeb9..a1f4fd4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -129,12 +129,11 @@ fn walk(
NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {}
NodeData::Text { ref contents } => {
let mut text = contents.borrow().to_string();
-
+
let inside_pre = result.parent_chain.iter().any(|t| t == "pre");
if inside_pre {
// this is preformatted text, insert as-is
result.append_str(&text);
-
} else if !(text.trim().len() == 0
&& (result.data.chars().last() == Some('\n')
|| result.data.chars().last() == Some(' ')))
@@ -147,7 +146,7 @@ fn walk(
text = escape_markdown(result, &text);
}
let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
- result.append_str(&minified_text.trim_ascii());
+ result.append_str(&minified_text.trim());
}
}
NodeData::Comment { .. } => {} // ignore comments
@@ -202,8 +201,6 @@ fn walk(
}
}
- let ignore_tags = tag_name == "style" || tag_name == "script";
-
// handle this tag, while it's not in parent chain
// and doesn't have child siblings
handler.handle(&input, result);
@@ -216,10 +213,10 @@ fn walk(
result.siblings.insert(current_depth, vec![]);
for child in input.children.borrow().iter() {
- if handler.skip_descendants() || ignore_tags {
+ if handler.skip_descendants() {
continue;
}
-
+
walk(&child, result, custom, commonmark);
match child.data {
diff --git a/tests/images.rs b/tests/images.rs
index a8c6ade..9c150bd 100644
--- a/tests/images.rs
+++ b/tests/images.rs
@@ -22,20 +22,23 @@ fn test_image_native_without_title() {
#[test]
fn test_image_embedded_html() {
let md = parse_html("", false);
- assert_eq!(md, "")
+ assert_eq!(md, "![comics about Mac and GNU/Linux](https://i.redd.it/un4h28uwtp711.png \"Look at me, brother\")")
}
#[test]
fn test_image_embedded_with_unsupported_html() {
// srcset is unsupported in Markdown
let md = parse_html("", false);
- assert_eq!(md, "")
+ assert_eq!(md, "![HACKERMAN](https://i.redd.it/07onlc10x5711.png \"When you reboot instead of exiting vim\")")
}
#[test]
fn test_image_src_issue() {
let md = parse_html("", false);
- assert_eq!(md, "")
+ assert_eq!(
+ md,
+ "![](https://dybr.ru/img/43/1532265494_android-Kanedias)"
+ )
}
#[test]
diff --git a/tests/integration.rs b/tests/integration.rs
index fa6140d..223b842 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -43,7 +43,6 @@ fn test_real_world_ja() {
println!("{}", result);
}
-
#[test]
#[ignore]
fn test_cheatsheet() {
@@ -127,11 +126,8 @@ fn test_tables_crash2() {
.expect("File must be readable");
let table_with_vertical_header = parse_html(&html, false);
- assert_that!(table_with_vertical_header).contains(indoc! {"
- |Current Conditions:|Open all year. No reservations. No services.|
- |-------------------|--------------------------------------------|
- | Reservations: | No reservations. |
- | Fees | No fee. |
- | Water: | No water. |"
+ println!("{:?}", table_with_vertical_header);
+
+ assert_that!(table_with_vertical_header).contains(indoc! {"\n\n## At a Glance\n\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |\n\n"
});
}
diff --git a/tests/lists.rs b/tests/lists.rs
index 9a3238a..f833066 100644
--- a/tests/lists.rs
+++ b/tests/lists.rs
@@ -9,10 +9,7 @@ fn test_list_simple() {
);
assert_eq!(
md,
- "\
-* Seven things has lady Lackless
-* Keeps them underneath her black dress
-* One a thing that's not for wearing"
+ "\n\n* Seven things has lady Lackless\n* Keeps them underneath her black dress\n* One a thing that's not for wearing\n\n"
)
}
@@ -38,16 +35,7 @@ fn test_list_formatted() {
);
assert_eq!(
md,
- "\
-* You should NEVER see this error
- * Broken lines, broken strings
- * Broken threads, broken springs
- * Broken idols, broken heads
- * People sleep in broken beds
-
-* Ain't no use jiving
-* Ain't no use joking
-* EVERYTHING IS BROKEN"
+ "\n\n* You should NEVER see this error\n * Broken lines, broken strings\n * Broken threads, broken springs\n * Broken idols, broken heads\n * People sleep in broken beds\n \n* Ain't no use jiving\n* Ain't no use joking\n* EVERYTHING IS BROKEN"
)
}
@@ -87,22 +75,7 @@ fn test_list_stackedit() {
);
assert_eq!(
md,
- "\
-* You should NEVER see this error
-
- * Broken lines, broken strings
-
- * Broken threads, broken springs
-
- * Broken idols, broken heads
-
- * People sleep in broken beds
-
-* Ain’t no use jiving
-
-* Ain’t no use joking
-
-* EVERYTHING IS BROKEN"
+ "* You should NEVER see this error\n \n * Broken lines, broken strings\n \n * Broken threads, broken springs\n \n * Broken idols, broken heads\n \n * People sleep in broken beds\n \n \n* Ain’t no use jiving\n \n* Ain’t no use joking\n \n* EVERYTHING IS BROKEN"
)
}
@@ -144,22 +117,7 @@ fn test_list_stackedit_add_brs() {
);
assert_eq!(
md,
- "\
-* You should NEVER see this error
-
- * Broken lines, broken strings
-
- * Broken threads, broken springs
-
- * Broken idols, broken heads
-
- * People sleep in broken beds
-
-* Ain’t no use jiving
-
-* Ain’t no use joking
-
-* EVERYTHING IS BROKEN"
+ "* You should NEVER see this error\n \n * Broken lines, broken strings\n \n * Broken threads, broken springs\n \n * Broken idols, broken heads\n \n * People sleep in broken beds\n \n \n \n \n* Ain’t no use jiving\n \n* Ain’t no use joking\n \n* EVERYTHING IS BROKEN"
)
}
@@ -180,13 +138,7 @@ fn test_list_multiline() {
);
assert_eq!(
md,
- "\
-1. In the heat and the rains
-
- With whips and chains
-
- Just to see him fly
- So many die!"
+ "1. In the heat and the rains\n \n With whips and chains\n \n Just to see him fly\n So many die!"
)
}
@@ -214,17 +166,7 @@ fn test_list_multiline_formatted() {
);
assert_eq!(
md,
- "\
-* You should NEVER see this error
- * Broken lines, broken strings
- * Broken threads, broken springs
- * Broken idols, broken heads
- * People sleep in broken beds
- * Ain't no use jiving
-
- Ain't no use joking
-
- EVERYTHING IS BROKEN"
+ "\n\n* You should NEVER see this error\n * Broken lines, broken strings\n * Broken threads, broken springs\n * Broken idols, broken heads\n * People sleep in broken beds\n * Ain't no use jiving\n \n Ain't no use joking\n \n EVERYTHING IS BROKEN"
)
}
diff --git a/tests/quotes.rs b/tests/quotes.rs
index b63862e..f894e6b 100644
--- a/tests/quotes.rs
+++ b/tests/quotes.rs
@@ -10,10 +10,7 @@ fn test_quotes() {
);
assert_eq!(
md,
- "\
-> here's a quote next line of it
-
-And some text after it"
+ "\n\n> here's a quote next line of it\nAnd some text after it"
)
}
@@ -22,11 +19,7 @@ fn test_quotes2() {
let md = parse_html("here'snested quote!
a quote\n next line of it
", false);
assert_eq!(
md,
- "\
-> here's
-> > nested quote!
->
-> a quote next line of it"
+ "\n\n> here's\n> > nested quote!\n> a quote next line of it\n\n"
)
}
@@ -38,10 +31,7 @@ fn test_blockquotes() {
);
assert_eq!(
md,
- "\
-> Quote at the start of the message
-
-Should not crash the parser"
+ "> Quote at the start of the message\nShould not crash the parser"
)
}
@@ -54,7 +44,7 @@ fn test_details() {
"};
let md = parse_html(&html, false);
- assert_eq!(md, " There are more things in heaven and Earth, **Horatio**
\n\nThan are dreamt of in your philosophy\n\n ")
+ assert_eq!(md, "There are more things in heaven and Earth,**Horatio**\nThan are dreamt of in your philosophy")
}
#[test]
diff --git a/tests/styles.rs b/tests/styles.rs
index 67b3787..dc58303 100644
--- a/tests/styles.rs
+++ b/tests/styles.rs
@@ -4,7 +4,7 @@ use pretty_assertions::assert_eq;
#[test]
fn test_styles_with_spaces() {
let md = parse_html(r#"It read: Nobody will ever love you"#, false);
- assert_eq!(md, r#"It read: ~~Nobody will ever love you~~"#)
+ assert_eq!(md, r#"It read:~~Nobody will ever love you~~"#)
}
#[test]
@@ -24,13 +24,6 @@ They gathered for the feast
);
assert_eq!(
md,
- "\
-And she said:
-~~We are all just prisoners here
- Of our own device~~
-And in the master's chambers
-They gathered for the feast
-*They stab it with their steely knives*
-**But they just can't kill the beast**"
+ "And she said:\n~~We are all just prisoners here\nOf our own device~~\nAnd in the master's chambers\nThey gathered for the feast\n*They stab it with their steely knives*\n**But they just can't kill the beast**"
)
}
diff --git a/tests/tables.rs b/tests/tables.rs
index 2a229e0..89bc31b 100644
--- a/tests/tables.rs
+++ b/tests/tables.rs
@@ -27,10 +27,7 @@ fn test_tables() {
assert_eq!(
md,
- "\
-|Minor1|Minor2|Minor3|Minor4|
-|------|------|------|------|
-| col1 | col2 | col3 | col4 |"
+ "|Minor1|Minor2|Minor3|Minor4|\n|||||\n| col1 | col2 | col3 | col4 |"
);
}
@@ -62,10 +59,7 @@ fn test_tables_invalid_more_headers() {
assert_eq!(
md,
- "\
-|Minor1|Minor2|Minor3|Minor4|Minor5|Minor6|
-|------|------|------|------|------|------|
-| col1 | col2 | col3 | col4 | | |"
+ "|Minor1|Minor2|Minor3|Minor4|Minor5|Minor6|\n|||||||\n| col1 | col2 | col3 | col4 | | |"
);
}
@@ -91,13 +85,7 @@ fn test_tables_invalid_more_rows() {
false,
);
- assert_eq!(
- md,
- "\
-|Minor1|Minor2| | |
-|------|------|----|----|
-| col1 | col2 |col3|col4|"
- );
+ assert_eq!(md, "|Minor1|Minor2| | |\n|||||\n| col1 | col2 |col3|col4|");
}
#[test]
@@ -120,13 +108,7 @@ fn test_tables_odd_column_width() {
false,
);
- assert_eq!(
- md,
- "\
-|Minor|Major|
-|-----|-----|
-|col1 |col2 |"
- );
+ assert_eq!(md, "|Minor|Major|\n|||\n|col1 |col2 |");
}
#[test]
@@ -155,10 +137,7 @@ fn test_tables_alignment() {
assert_eq!(
md,
- "\
-|Minor1|Minor2|Minor3|Minor4|
-|-----:|:----:|-----:|:-----|
-| col1 | col2 | col3 | col4 |"
+ "|Minor1|Minor2|Minor3|Minor4|\n|||||\n| col1 | col2 | col3 | col4 |"
);
}
@@ -215,11 +194,5 @@ fn test_tables_wild_example() {
false,
);
- assert_eq!(md, "\
-| One ring | Patterns | Titanic | | | |
-|-----------------------------|--------------------------|-----------------------------------|---|---|---|
-| One ring to rule them all |There's one for the sorrow| Roll on, Titanic, roll | | | |
-| One ring to find them | And two for the joy |You're the pride of White Star Line| | | |
-| One ring to bring them all | And three for the girls | Roll on, Titanic, roll | | | |
-|And in the darkness bind them| And four for the boys | Into the mists of time | | | |");
+ assert_eq!(md, "| One ring | Patterns | Titanic | | | |\n|||||||\n| One ring to rule them all |There's one for the sorrow| Roll on, Titanic, roll | | | |\n| One ring to find them | And two for the joy |You're the pride of White Star Line| | | |\n| One ring to bring them all | And three for the girls | Roll on, Titanic, roll | | | |\n|And in the darkness bind them| And four for the boys | Into the mists of time | | | |");
}
diff --git a/tests/unit.rs b/tests/unit.rs
index c0b965c..6bbff10 100644
--- a/tests/unit.rs
+++ b/tests/unit.rs
@@ -11,7 +11,7 @@ fn test_dumb() {
// fixme
fn test_space() {
let md = parse_html(r#"APOSIMZ
\n"#, false);
- assert_eq!(md, "[APOSIMZ](http://ya.ru)\n\n\\\\n")
+ assert_eq!(md, "[APOSIMZ](http://ya.ru)\n\\\\n")
}
#[test]
@@ -68,7 +68,7 @@ fn test_escaping() {
#[test]
fn test_escaping_mid_hyphens() {
let md = parse_html(r#"This is a header with-hyphen!
"#, false);
- assert_eq!(md, "This is a header with-hyphen!\n==========")
+ assert_eq!(md, "# This is a header with-hyphen!")
}
#[test]
@@ -77,7 +77,7 @@ fn test_escaping_start_hyphens() {
r#"- This is a header with starting hyphen!
"#,
false,
);
- assert_eq!(md, "\\- This is a header with starting hyphen!\n==========")
+ assert_eq!(md, "# - This is a header with starting hyphen!")
}
#[test]
@@ -93,10 +93,7 @@ fn test_escaping_start_hyphens_space() {
r#" - This is a header with starting hyphen!
"#,
false,
);
- assert_eq!(
- md,
- " \\- This is a header with starting hyphen!\n=========="
- )
+ assert_eq!(md, "# - This is a header with starting hyphen!")
}
#[test]
@@ -119,39 +116,32 @@ fn test_headers() {
);
assert_eq!(
md,
- "\
-MARC-FS
-==========
-
-[Mail.ru](http://Mail.ru) Cloud filesystem written for FUSE
-
-Synopsis
-----------"
+ "# MARC-FS\n\n[Mail.ru](http://Mail.ru)Cloud filesystem written for FUSE\n## Synopsis"
)
}
#[test]
fn test_escaping_start_equal() {
let md = parse_html(r#"This is NOT a header!
===========
"#, false);
- assert_eq!(md, "This is NOT a header! \n\\===========")
+ assert_eq!(md, "This is NOT a header!\n\\===========")
}
/// Note: Also strips multiple spaces
#[test]
fn test_escaping_start_equal_space() {
let md = parse_html(r#"This is NOT a header!
===========
"#, false);
- assert_eq!(md, "This is NOT a header! \n \\===========")
+ assert_eq!(md, "This is NOT a header!\n\\===========")
}
#[test]
fn test_escaping_start_hyphen() {
let md = parse_html(r#"This is NOT a header!
-------
"#, false);
- assert_eq!(md, "This is NOT a header! \n\\-------")
+ assert_eq!(md, "This is NOT a header!\n\\-------")
}
/// Note: Also strips multiple spaces
#[test]
fn test_escaping_start_hyphen_space() {
let md = parse_html(r#"This is NOT a header!
-------
"#, false);
- assert_eq!(md, "This is NOT a header! \n \\-------")
+ assert_eq!(md, "This is NOT a header!\n\\-------")
}