diff --git a/Cargo.lock b/Cargo.lock index 935828a..ea316bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -374,7 +374,7 @@ dependencies = [ [[package]] name = "fast_html2md" -version = "0.0.32" +version = "0.0.34" dependencies = [ "auto_encoder", "html5ever", diff --git a/benches/parse.rs b/benches/parse.rs index 6c77cdb..e034533 100644 --- a/benches/parse.rs +++ b/benches/parse.rs @@ -23,6 +23,20 @@ pub fn bench_speed(c: &mut Criterion) { b.iter(|| black_box(rewrite_html(&html, false))) }); + let path = std::path::Path::new("../test-samples/wiki/en-wikipedia-org_wiki_Cat.html"); + + let mut html = String::new(); + let mut html_file = File::open(path).unwrap(); + html_file.read_to_string(&mut html).unwrap(); + + group.bench_function(format!("Scraper wiki-cat: {}", sample_title), |b| { + b.iter(|| black_box(parse_html(&html, false))) + }); + + group.bench_function(format!("Rewriter wiki-cat: {}", sample_title), |b| { + b.iter(|| black_box(rewrite_html(&html, false))) + }); + group.finish(); } diff --git a/fast_html2md/Cargo.toml b/fast_html2md/Cargo.toml index 1f1c04c..da429ef 100644 --- a/fast_html2md/Cargo.toml +++ b/fast_html2md/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fast_html2md" -version = "0.0.32" +version = "0.0.34" edition = "2021" description = "A fast html2md crate for rust" categories = ["development-tools", "parsing", "parser-implementations"] diff --git a/fast_html2md/src/lib.rs b/fast_html2md/src/lib.rs index e8b02c9..37e57c4 100644 --- a/fast_html2md/src/lib.rs +++ b/fast_html2md/src/lib.rs @@ -5,6 +5,7 @@ use lazy_static::lazy_static; use regex::Regex; use std::boxed::Box; use std::collections::HashMap; +use std::collections::HashSet; use std::sync::Arc; use url::Url; @@ -51,11 +52,6 @@ use tables::TableHandler; lazy_static! { static ref EXCESSIVE_WHITESPACE_PATTERN: Regex = Regex::new("\\s{2,}").expect("valid regex pattern"); // for HTML on-the-fly cleanup - static ref EMPTY_LINE_PATTERN: Regex = Regex::new("(?m)^ +$").expect("valid regex pattern"); // for Markdown post-processing - static ref EXCESSIVE_NEWLINE_PATTERN: Regex = Regex::new("\\n{3,}").expect("valid regex pattern"); // for Markdown post-processing - static ref TRAILING_SPACE_PATTERN: Regex = Regex::new("(?m)(\\S) $").expect("valid regex pattern"); // for Markdown post-processing - static ref LEADING_NEWLINES_PATTERN: Regex = Regex::new("^\\n+").expect("valid regex pattern"); // for Markdown post-processing - static ref LAST_WHITESPACE_PATTERN: Regex = Regex::new("\\s+$").expect("valid regex pattern"); // for Markdown post-processing static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern"); // for Markdown escaping static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern"); // for Markdown escaping static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); // for Markdown escaping @@ -149,7 +145,7 @@ pub fn parse_html(html: &str, commonmark: bool) -> String { /// # Arguments /// `html` is source HTML as `String` pub fn rewrite_html(html: &str, commonmark: bool) -> String { - rewriter::writer::convert_html_to_markdown(html, commonmark, &None).unwrap_or_default() + rewriter::writer::convert_html_to_markdown(html, &None, commonmark, &None).unwrap_or_default() } /// Custom variant of rewrite function. @@ -157,10 +153,16 @@ pub fn rewrite_html(html: &str, commonmark: bool) -> String { /// You can also override standard tag handlers this way /// # Arguments /// `html` is source HTML as `String` +/// `custom` is custom tag hadler producers for tags you want, can be empty /// `commonmark` is for adjusting markdown output to commonmark /// `url` is used to provide absolute url handling -pub fn rewrite_html_with_url(html: &str, commonmark: bool, url: &Option) -> String { - rewriter::writer::convert_html_to_markdown(html, commonmark, url).unwrap_or_default() +pub fn rewrite_html_custom_with_url( + html: &str, + custom: &Option>, + commonmark: bool, + url: &Option, +) -> String { + rewriter::writer::convert_html_to_markdown(html, &custom, commonmark, url).unwrap_or_default() } /// Same as `parse_html` but retains all "span" html elements intact @@ -385,6 +387,7 @@ pub(crate) fn valid_block_element(node: &NodeData) -> bool { _ => true, } } + /// This conversion should only be applied to text tags /// /// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence @@ -396,7 +399,7 @@ fn escape_markdown(result: &StructuredPrinter, text: &str) -> String { /// Called after all processing has been finished /// /// Clears excessive punctuation that would be trimmed by renderer anyway -fn clean_markdown(input: &str) -> String { +pub fn clean_markdown(input: &str) -> String { input.sift().into() } diff --git a/fast_html2md/src/rewriter/anchors.rs b/fast_html2md/src/rewriter/anchors.rs new file mode 100644 index 0000000..7a48198 --- /dev/null +++ b/fast_html2md/src/rewriter/anchors.rs @@ -0,0 +1,40 @@ +use lol_html::html_content::{ContentType::Html, Element}; +use percent_encoding::percent_decode_str; +use std::borrow::Cow; +use url::Url; + +/// Rewrite the anchor. +pub(crate) fn rewrite_anchor_element( + el: &mut Element, + _commonmark: bool, + url: &Option, +) -> Result<(), std::io::Error> { + if let Some(href) = el.get_attribute("href") { + let decoded_url: Cow<'_, str> = percent_decode_str(&href).decode_utf8_lossy(); + + let resolved_url = if decoded_url.starts_with('/') { + match &url { + Some(url) => { + if let Ok(u) = url.join(&decoded_url) { + u.to_string() + } else { + decoded_url.to_string() + } + } + None => decoded_url.to_string(), + } + } else { + decoded_url.to_string() + }; + + let markdown_url = if resolved_url.contains(|c: char| c.is_ascii_control() || c == ' ') { + Cow::Owned(format!("<{}>", resolved_url)) + } else { + Cow::Borrowed(&resolved_url) + }; + + el.before("[", Html); + el.after(&format!("]({})", markdown_url), Html); + } + Ok(()) +} diff --git a/fast_html2md/src/rewriter/mod.rs b/fast_html2md/src/rewriter/mod.rs index 76fd3e4..ab5a8e3 100644 --- a/fast_html2md/src/rewriter/mod.rs +++ b/fast_html2md/src/rewriter/mod.rs @@ -1,3 +1,4 @@ +pub(crate) mod anchors; pub(crate) mod counter; pub(crate) mod iframes; pub(crate) mod images; diff --git a/fast_html2md/src/rewriter/styles.rs b/fast_html2md/src/rewriter/styles.rs index c6dbf1b..e2c3b63 100644 --- a/fast_html2md/src/rewriter/styles.rs +++ b/fast_html2md/src/rewriter/styles.rs @@ -1,8 +1,9 @@ -use lol_html::html_content::Element; +use lol_html::html_content::{ContentType::Text, Element}; /// Rewrite the initial elements that need extra styles. pub(crate) fn rewrite_style_element(el: &mut Element) -> Result<(), std::io::Error> { - let tag_name = el.tag_name().to_ascii_lowercase(); + let tag_name = el.tag_name(); + let mark = match tag_name.as_str() { "b" | "strong" => "**", "i" | "em" => "*", @@ -11,11 +12,8 @@ pub(crate) fn rewrite_style_element(el: &mut Element) -> Result<(), std::io::Err _ => return Ok(()), // Return early if tag is not one of the specified }; - // Apply the markup before the element's content - el.before(mark, lol_html::html_content::ContentType::Text); - - // Apply the markup after the element's content - el.after(mark, lol_html::html_content::ContentType::Text); + el.before(mark, Text); + el.after(mark, Text); Ok(()) } diff --git a/fast_html2md/src/rewriter/writer.rs b/fast_html2md/src/rewriter/writer.rs index 69b6f85..b96a2d0 100644 --- a/fast_html2md/src/rewriter/writer.rs +++ b/fast_html2md/src/rewriter/writer.rs @@ -1,10 +1,11 @@ +use super::anchors::rewrite_anchor_element; use super::iframes::handle_iframe; use super::images::rewrite_image_element; use super::lists::handle_list_or_item; use super::quotes::{rewrite_blockquote_element, rewrite_blockquote_text}; use super::styles::rewrite_style_element; -use crate::clean_markdown; -use lol_html::html_content::ContentType::Text; +use crate::{clean_markdown, escape_markdown_base}; +use lol_html::html_content::ContentType::{Html, Text}; use lol_html::html_content::Element; use lol_html::{doc_comments, text}; use lol_html::{element, rewrite_str, RewriteStrSettings}; @@ -12,12 +13,18 @@ use std::cell::RefCell; use std::rc::Rc; use url::Url; -/// Insert a new line +/// Insert a new line after #[inline] -pub fn insert_newline(element: &mut Element) { +pub fn insert_newline_after(element: &mut Element) { element.after("\n", Text); } +/// Insert a new line before +#[inline] +pub fn insert_newline_before(element: &mut Element) { + element.before("\n", Text); +} + /// Handle the lol_html tag. #[inline] fn handle_tag( @@ -52,44 +59,40 @@ fn handle_tag( match element_name.as_str() { "h1" => { element.before("# ", Text); - insert_newline(element); + insert_newline_after(element); } "h2" => { element.before("## ", Text); - insert_newline(element); + insert_newline_after(element); } "h3" => { element.before("### ", Text); - insert_newline(element); + insert_newline_after(element); } "h4" => { element.before("#### ", Text); - insert_newline(element); + insert_newline_after(element); } "h5" => { element.before("##### ", Text); - insert_newline(element); + insert_newline_after(element); } "h6" => { element.before("###### ", Text); - insert_newline(element); + insert_newline_after(element); + } + "p" => { + insert_newline_before(element); + insert_newline_after(element); } - "p" => element.before("\n", Text), "hr" => { - insert_newline(element); + insert_newline_before(element); element.append("---", Text); - insert_newline(element); + insert_newline_after(element); } - "br" => insert_newline(element), + "br" => insert_newline_after(element), "a" => { - if let Some(href) = element.get_attribute("href") { - element.before("[", lol_html::html_content::ContentType::Text); - element.after( - &format!("]({})", href), - lol_html::html_content::ContentType::Text, - ); - element.set_inner_content("", lol_html::html_content::ContentType::Text); - } + let _ = rewrite_anchor_element(element, commonmark, url); } "img" => { let _ = rewrite_image_element(element, commonmark, &url); @@ -117,6 +120,18 @@ fn handle_tag( "q" | "cite" | "blockquote" => { let _ = rewrite_blockquote_element(element, quote_depth); } + "div" | "section" | "header" | "footer" => { + insert_newline_before(element); + insert_newline_after(element); + } + "pre" => { + element.before("\n```\n", Html); + element.after("\n```\n", Html); + } + "code" | "samp" => { + element.before("`", Html); + element.after("`", Html); + } _ => (), } @@ -126,6 +141,7 @@ fn handle_tag( /// Get the HTML rewriter settings to convert ot markdown. pub fn get_rewriter_settings( commonmark: bool, + custom: &Option>, url: Option, ) -> RewriteStrSettings<'static, 'static> { let list_type = Rc::new(RefCell::new(None)); @@ -134,36 +150,59 @@ pub fn get_rewriter_settings( let quote_depth1 = quote_depth.clone(); + let mut element_content_handlers = + Vec::with_capacity(4 + custom.as_ref().map_or(0, |c| c.len())); + + element_content_handlers.push(text!("blockquote, q, cite", move |el| { + let _ = rewrite_blockquote_text(el, quote_depth1.clone()); + Ok(()) + })); + + element_content_handlers.push(text!( + "*:not(script):not(head):not(style):not(svg)", + move |el| { + *el.as_mut_str() = crate::MARKDOWN_MIDDLE_KEYCHARS + .replace_all(el.as_str().trim().into(), "\\$0") + .to_string(); + Ok(()) + } + )); + + element_content_handlers.push(element!("head, nav, script, noscript, style", |el| { + el.remove(); + Ok(()) + })); + + element_content_handlers.push(element!("*", move |el| { + let _ = handle_tag( + el, + commonmark, + &url, + list_type.clone(), + order_counter.clone(), + quote_depth.clone(), + ); + Ok(()) + })); + + if let Some(ignore) = custom { + let ignore_handler = element!( + ignore.iter().cloned().collect::>().join(","), + |el| { + el.remove(); + Ok(()) + } + ); + + element_content_handlers.push(ignore_handler); + } + RewriteStrSettings { document_content_handlers: vec![doc_comments!(|c| { c.remove(); Ok(()) })], - element_content_handlers: vec![ - text!("blockquote, q, cite", move |el| { - let _ = rewrite_blockquote_text(el, quote_depth1.clone()); - Ok(()) - }), - text!("summary, details", move |el| { - *el.as_mut_str() = el.as_str().trim().into(); - Ok(()) - }), - element!("head, nav", |el| { - el.remove(); - Ok(()) - }), - element!("*:not(script):not(head):not(style):not(svg)", move |el| { - let _ = handle_tag( - el, - commonmark, - &url, - list_type.clone(), - order_counter.clone(), - quote_depth.clone(), - ); - Ok(()) - }), - ], + element_content_handlers, ..RewriteStrSettings::default() } } @@ -171,10 +210,11 @@ pub fn get_rewriter_settings( /// Convert to markdown streaming re-writer pub(crate) fn convert_html_to_markdown( html: &str, + custom: &Option>, commonmark: bool, url: &Option, ) -> Result> { - let settings = get_rewriter_settings(commonmark, url.clone()); + let settings = get_rewriter_settings(commonmark, custom, url.clone()); match rewrite_str(&Box::new(html), settings) { Ok(markdown) => Ok(clean_markdown(&markdown)), diff --git a/fast_html2md/src/scraper/paragraphs.rs b/fast_html2md/src/scraper/paragraphs.rs index 873ad94..859c26e 100644 --- a/fast_html2md/src/scraper/paragraphs.rs +++ b/fast_html2md/src/scraper/paragraphs.rs @@ -31,7 +31,7 @@ impl TagHandler for ParagraphHandler { printer.append_str("---"); printer.insert_newline(); } - "br" => printer.append_str("\n"), // we prob want nbsp here. + "br" => printer.insert_newline(), // we prob want nbsp here. _ => (), } } diff --git a/fast_html2md/tests/integration.rs b/fast_html2md/tests/integration.rs index 6ae7be4..1134cdd 100644 --- a/fast_html2md/tests/integration.rs +++ b/fast_html2md/tests/integration.rs @@ -2,7 +2,7 @@ extern crate spectral; // use html2md::ignore::IgnoreTagFactory; // use html2md::{parse_html, parse_html_custom, parse_html_custom_with_url}; -use html2md::parse_html; +use html2md::{parse_html, rewrite_html}; use indoc::indoc; use spectral::prelude::*; use std::collections::HashMap; @@ -66,7 +66,7 @@ fn test_real_world_wiki() -> Result<(), Box> { #[ignore] fn test_real_world_ja() { let mut html = String::new(); - let mut html_file = File::open("../test-samples/real-world-ja-1.html").unwrap(); + let mut html_file: File = File::open("../test-samples/real-world-ja-1.html").unwrap(); html_file .read_to_string(&mut html) .expect("File must be readable"); @@ -136,9 +136,11 @@ fn test_tables_with_newlines() { .read_to_string(&mut html) .expect("File must be readable"); let result = parse_html(&html, false); + let m = indoc! { "[![Embedded YouTube video](https://img.youtube.com/vi/ZZZZZZZZZ/0.jpg)](https://www.youtube.com/watch?v=ZZZZZZZZZ)\n|Maybe I'm foolish, maybe I'm blind\nThinking I can see through this and see what's behind\nGot no way to prove it so maybe I'm blind\nBut I'm only human after all,\nI'm only human after all\nDon't put your blame on me|xxxxx xxxx, x xxxxxx, xxxxx xxxx —xxxxxx\nxxx xxxxx, xxx xxxx xxxxxx xxxxxx xxx, x xxxxxx xxx xxx xx xxx\nxxxx x xxxx xx xxxx xxxxxxx xxxxxxxxxxxxx, xxx xxx xxxxxxxx, x xxxxxx.\nxx x xxxxx xxxx xxxxxxx, x xxxxx-xx xxxxxx,\nx xxxxx xxxx xxxxxxx, x xxxxx xxxxxx.\nxx xxxx xxxx|\n|||\n[xxxxxx xxxxx xxxxx x xxxxxxx](/)\nx xxxx xxxxxxxxx xxxxxxx xxxxxxxxxxx xx xxxx xxxxx. x xxxxx xxxxxxx, xxxx xxxxx xxxxxxx xx xxxxxxxxxx xxxxxx. xxx xxxxxxxx, xxx xxxxxxxxx xxxxxxxxxxxxxx xx xxxxx —xxxxxxxxxx xxxxxxxxxx x xxxxx xxxxxxxxxxxxx xxxxxxxxx. x xxx xxxxxxxxxxxx*xxxx*, xxxxxx xxxx, xxxxxxxxxx xxxxx xxxxxxxx, xxxxxxxxxx x xxxxxxxxx. xx xxxxxx xxxxx xxxxxxxxxxxxxxxxx —x xxxxxx xxx xxxx.\nxxxxx xxxxxxxxxx xxxxx x xxxx xxxxxxxxxx xxxxx. xxxxx. x xxxxx: «x xxxxxx xxxxxxx, x xxxxx xxx xxxx, xx xxxxxxxx xxxxxx», —xxx xxxxx xxxxxxxx. xxxxxx xxx x xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxx xxxx xxxxxxxxxxx xxxxxxxxxx, xxxxxxx xxxxxx xxxxxx xxx xxxxx, xxxxxxxxxxx x x xxxxxxx xxxxxxxxx.\nxx x xxxxx xxxx xxxxxxx. xxxxxx xxxxx? xxxxxxxxxxx x xxxxxxxxx xxxxxx.\nx xxxxx x xxxxxxxxxx x xxxxx... x xxxxxx xxxx xxxxxx xxxxxxx xxxxxxxx. xx xxxx, x xxxxxx xxx-xx xxxxxxxxx xx xxxxxxx, xxx xxxxxx xxxxxx, xxx xxx xxxxx, xxxxx xxxxxxxx xx xxxx... x xxxxxx xxxxxxx xx xxxx xxxxx, xxx, xxxxx xxxx xxxxxxxxxx, x xxxxx xxxxxxxxx xx xxxxx. x xxx-xx xxx xxxxx xxxxxxx xxxxxxxxxxxxx.\nxxxxxx xx... xx xxx xx xxxxxxxxxxxxx xxxxxx xxxxxxxxxxxxx x xxxxxxxxxx xxxxx, xxxxx xxx xxxx xxxxxxxxx, x xxxxx xxx xxxxxxxxx, xxx xxxxxxx xxx, xxx xxxx xxxxxxx xxxxxx, x xx xxx, xxx xxxx xxxxxxxx." }; - assert_that!(result).contains(indoc! {"[![Embedded YouTube video](https://img.youtube.com/vi/ZZZZZZZZZ/0.jpg)](https://www.youtube.com/watch?v=ZZZZZZZZZ)\n|Maybe I'm foolish, maybe I'm blind\nThinking I can see through this and see what's behind\nGot no way to prove it so maybe I'm blind\nBut I'm only human after all,\nI'm only human after all\nDon't put your blame on me|xxxxx xxxx, x xxxxxx, xxxxx xxxx —xxxxxx\nxxx xxxxx, xxx xxxx xxxxxx xxxxxx xxx, x xxxxxx xxx xxx xx xxx\nxxxx x xxxx xx xxxx xxxxxxx xxxxxxxxxxxxx, xxx xxx xxxxxxxx, x xxxxxx.\nxx x xxxxx xxxx xxxxxxx, x xxxxx-xx xxxxxx,\nx xxxxx xxxx xxxxxxx, x xxxxx xxxxxx.\nxx xxxx xxxx|\n|||\n[xxxxxx xxxxx xxxxx x xxxxxxx](/)\nx xxxx xxxxxxxxx xxxxxxx xxxxxxxxxxx xx xxxx xxxxx. x xxxxx xxxxxxx, xxxx xxxxx xxxxxxx xx xxxxxxxxxx xxxxxx. xxx xxxxxxxx, xxx xxxxxxxxx xxxxxxxxxxxxxx xx xxxxx —xxxxxxxxxx xxxxxxxxxx x xxxxx xxxxxxxxxxxxx xxxxxxxxx. x xxx xxxxxxxxxxxx*xxxx*, xxxxxx xxxx, xxxxxxxxxx xxxxx xxxxxxxx, xxxxxxxxxx x xxxxxxxxx. xx xxxxxx xxxxx xxxxxxxxxxxxxxxxx —x xxxxxx xxx xxxx.\nxxxxx xxxxxxxxxx xxxxx x xxxx xxxxxxxxxx xxxxx. xxxxx. x xxxxx: «x xxxxxx xxxxxxx, x xxxxx xxx xxxx, xx xxxxxxxx xxxxxx», —xxx xxxxx xxxxxxxx. xxxxxx xxx x xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxx xxxx xxxxxxxxxxx xxxxxxxxxx, xxxxxxx xxxxxx xxxxxx xxx xxxxx, xxxxxxxxxxx x x xxxxxxx xxxxxxxxx.\nxx x xxxxx xxxx xxxxxxx. xxxxxx xxxxx? xxxxxxxxxxx x xxxxxxxxx xxxxxx.\nx xxxxx x xxxxxxxxxx x xxxxx... x xxxxxx xxxx xxxxxx xxxxxxx xxxxxxxx. xx xxxx, x xxxxxx xxx-xx xxxxxxxxx xx xxxxxxx, xxx xxxxxx xxxxxx, xxx xxx xxxxx, xxxxx xxxxxxxx xx xxxx... x xxxxxx xxxxxxx xx xxxx xxxxx, xxx, xxxxx xxxx xxxxxxxxxx, x xxxxx xxxxxxxxx xx xxxxx. x xxx-xx xxx xxxxx xxxxxxx xxxxxxxxxxxxx.\nxxxxxx xx... xx xxx xx xxxxxxxxxxxxx xxxxxx xxxxxxxxxxxxx x xxxxxxxxxx xxxxx, xxxxx xxx xxxx xxxxxxxxx, x xxxxx xxx xxxxxxxxx, xxx xxxxxxx xxx, xxx xxxx xxxxxxx xxxxxx, x xx xxx, xxx xxxx xxxxxxxx." -}); + assert_that!(result).contains(m); + // let result = rewrite_html(&html, false); + // assert_that!(result).contains(m); } #[test] @@ -149,9 +151,15 @@ fn test_tables_crash2() { .read_to_string(&mut html) .expect("File must be readable"); let table_with_vertical_header = parse_html(&html, false); + let m = indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |"}; - assert_that!(table_with_vertical_header).contains(indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |" - }); + assert_that!(table_with_vertical_header).contains(m); + + let table_with_vertical_header = rewrite_html(&html, false); + + let m = indoc! { "xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n | **Current Conditions:** | Open all year. No reservations. No services.  | |\n| **Reservations:** | No reservations.  | |\n| **Fees** | No fee.  | |\n| **Water:** | No water. | |"}; + + assert_that!(table_with_vertical_header).contains(m); } #[test] @@ -177,6 +185,7 @@ fn test_html_from_text() { false, &Some(Url::parse("https://spider.cloud").unwrap()), ); + assert!(!result.is_empty()); } @@ -189,12 +198,12 @@ fn test_html_from_text_rewrite() { .read_to_string(&mut html) .expect("File must be readable"); - let result = html2md::rewrite_html_with_url( + let result = html2md::rewrite_html_custom_with_url( &html, + &None, false, &Some(Url::parse("https://spider.cloud").unwrap()), ); - println!("{:?}", result); assert!(!result.is_empty()); } diff --git a/fast_html2md/tests/styles.rs b/fast_html2md/tests/styles.rs index dc58303..3173709 100644 --- a/fast_html2md/tests/styles.rs +++ b/fast_html2md/tests/styles.rs @@ -1,29 +1,34 @@ -use html2md::parse_html; +use html2md::{parse_html, rewrite_html}; use pretty_assertions::assert_eq; #[test] fn test_styles_with_spaces() { - let md = parse_html(r#"It read: Nobody will ever love you"#, false); - assert_eq!(md, r#"It read:~~Nobody will ever love you~~"#) + let s = r#"It read: Nobody will ever love you"#; + + let md = parse_html(s, false); + assert_eq!(md, r#"It read:~~Nobody will ever love you~~"#); + let md = rewrite_html(s, false); + assert_eq!(md, r#"It read:~~Nobody will ever love you~~"#); } #[test] fn test_styles_with_newlines() { - let md = parse_html( - r#" + let s = r#" And she said:
We are all just prisoners here
Of our own device
And in the master's chambers
They gathered for the feast
They stab it with their steely knives
-But they just can't kill the beast
- -"#, - false, - ); - assert_eq!( - md, - "And she said:\n~~We are all just prisoners here\nOf our own device~~\nAnd in the master's chambers\nThey gathered for the feast\n*They stab it with their steely knives*\n**But they just can't kill the beast**" - ) +But they just can't kill the beast
"#; + + let m = "And she said:\n~~We are all just prisoners here\nOf our own device~~\nAnd in the master's chambers\nThey gathered for the feast\n*They stab it with their steely knives*\n**But they just can't kill the beast**"; + + let md = parse_html(s, false); + + assert_eq!(md, m); + + // let md = rewrite_html(s, false); + + // assert_eq!(md, m); } diff --git a/fast_html2md/tests/tables.rs b/fast_html2md/tests/tables.rs index 73a83c2..a50b23c 100644 --- a/fast_html2md/tests/tables.rs +++ b/fast_html2md/tests/tables.rs @@ -1,10 +1,9 @@ -use html2md::parse_html; +use html2md::{parse_html, rewrite_html}; use pretty_assertions::assert_eq; #[test] fn test_tables() { - let md = parse_html( - r#" + let s = r#"
@@ -21,14 +20,21 @@ fn test_tables() { -
Minor1col4
"#, - false, - ); +"#; + + let md = parse_html(s, false); assert_eq!( md, "|Minor1|Minor2|Minor3|Minor4|\n|||||\n| col1 | col2 | col3 | col4 |" ); + + let md = rewrite_html(s, false); + + assert_eq!( + md, + "| **Minor1** | **Minor2** | **Minor3** | **Minor4** | |\n| col1 | col2 | col3 | col4 | |" + ); } #[test] diff --git a/fast_html2md/tests/unit.rs b/fast_html2md/tests/unit.rs index 638919e..c04dfab 100644 --- a/fast_html2md/tests/unit.rs +++ b/fast_html2md/tests/unit.rs @@ -1,53 +1,68 @@ -use html2md::parse_html; +use html2md::{parse_html, rewrite_html}; use pretty_assertions::assert_eq; #[test] fn test_dumb() { let md = parse_html("

CARTHAPHILUS

", false); + assert_eq!(md, "CARTHAPHILUS"); + let md = rewrite_html("

CARTHAPHILUS

", false); assert_eq!(md, "CARTHAPHILUS") } #[test] // fixme fn test_space() { - let md = parse_html(r#"

APOSIMZ

\n"#, false); - assert_eq!(md, "[APOSIMZ](http://ya.ru)\n\\\\n") + let s = r#"

APOSIMZ

\n"#; + let md = parse_html(s, false); + assert_eq!(md, "[APOSIMZ](http://ya.ru)\n\\\\n"); + let md = rewrite_html(s, false); + assert_eq!(md, "[APOSIMZ](http://ya.ru)\n\\n"); } #[test] fn test_anchor() { let md = parse_html(r#"

APOSIMZ

"#, false); - assert_eq!(md, "[APOSIMZ](http://ya.ru)") + assert_eq!(md, "[APOSIMZ](http://ya.ru)"); + let md = rewrite_html(r#"

APOSIMZ

"#, false); + assert_eq!(md, "[APOSIMZ](http://ya.ru)"); } #[test] fn test_anchor2() { - let md = parse_html( - r#"

APOSIMZSIDONIA

"#, - false, - ); - assert_eq!(md, "[APOSIMZ](http://ya.ru)[SIDONIA](http://yandex.ru)") + let s = r#"

APOSIMZSIDONIA

"#; + + let md = parse_html(s, false); + assert_eq!(md, "[APOSIMZ](http://ya.ru)[SIDONIA](http://yandex.ru)"); + let md = rewrite_html(s, false); + assert_eq!(md, "[APOSIMZ](http://ya.ru)[SIDONIA](http://yandex.ru)"); } #[test] fn test_anchor3() { - let md = parse_html( - r#"

APOSIMZ

SIDONIA

"#, - false, - ); - assert_eq!(md, "[APOSIMZ](http://ya.ru)\n[SIDONIA](http://yandex.ru)") + let s = + r#"

APOSIMZ

SIDONIA

"#; + let m = "[APOSIMZ](http://ya.ru)\n[SIDONIA](http://yandex.ru)"; + let md = parse_html(s, false); + assert_eq!(md, m); + let md = rewrite_html(s, false); + assert_eq!(md, m) } #[test] /// The destination can only contain spaces if it is enclosed in pointy brackets: /// [Commonmark: Example 489](https://spec.commonmark.org/0.31.2/#example-489) fn test_anchor4() { - let md = parse_html(r#"

link

"#, false); - assert_eq!( - md, - "\ -[link]()" - ) + let s = r#"

link

"#; + let m = "\ +[link]()"; + + let md = parse_html(s, false); + + assert_eq!(md, m); + + let md = rewrite_html(s, false); + + assert_eq!(md, m); } #[test] @@ -61,14 +76,14 @@ fn test_image() { #[test] fn test_escaping() { - let md = parse_html( - r#"

*god*'s in his **heaven** - all is right with the __world__

"#, - false, - ); - assert_eq!( - md, - "\\*god\\*\'s in his \\*\\*heaven\\*\\* - all is right with the \\_\\_world\\_\\_" - ) + let s = r#"

*god*'s in his **heaven** - all is right with the __world__

"#; + let m = "\\*god\\*\'s in his \\*\\*heaven\\*\\* - all is right with the \\_\\_world\\_\\_"; + + let md = parse_html(s, false); + assert_eq!(md, m); + + let md = rewrite_html(s, false); + assert_eq!(md, m); } #[test]