Skip to content

Commit

Permalink
chore(rewriter): add full spec
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 15, 2024
1 parent 196296d commit fbdeaf4
Show file tree
Hide file tree
Showing 13 changed files with 254 additions and 123 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions benches/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,20 @@ pub fn bench_speed(c: &mut Criterion) {
b.iter(|| black_box(rewrite_html(&html, false)))
});

let path = std::path::Path::new("../test-samples/wiki/en-wikipedia-org_wiki_Cat.html");

let mut html = String::new();
let mut html_file = File::open(path).unwrap();
html_file.read_to_string(&mut html).unwrap();

group.bench_function(format!("Scraper wiki-cat: {}", sample_title), |b| {
b.iter(|| black_box(parse_html(&html, false)))
});

group.bench_function(format!("Rewriter wiki-cat: {}", sample_title), |b| {
b.iter(|| black_box(rewrite_html(&html, false)))
});

group.finish();
}

Expand Down
2 changes: 1 addition & 1 deletion fast_html2md/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.32"
version = "0.0.34"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand Down
21 changes: 12 additions & 9 deletions fast_html2md/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use lazy_static::lazy_static;
use regex::Regex;
use std::boxed::Box;
use std::collections::HashMap;
use std::collections::HashSet;
use std::sync::Arc;
use url::Url;

Expand Down Expand Up @@ -51,11 +52,6 @@ use tables::TableHandler;

lazy_static! {
static ref EXCESSIVE_WHITESPACE_PATTERN: Regex = Regex::new("\\s{2,}").expect("valid regex pattern"); // for HTML on-the-fly cleanup
static ref EMPTY_LINE_PATTERN: Regex = Regex::new("(?m)^ +$").expect("valid regex pattern"); // for Markdown post-processing
static ref EXCESSIVE_NEWLINE_PATTERN: Regex = Regex::new("\\n{3,}").expect("valid regex pattern"); // for Markdown post-processing
static ref TRAILING_SPACE_PATTERN: Regex = Regex::new("(?m)(\\S) $").expect("valid regex pattern"); // for Markdown post-processing
static ref LEADING_NEWLINES_PATTERN: Regex = Regex::new("^\\n+").expect("valid regex pattern"); // for Markdown post-processing
static ref LAST_WHITESPACE_PATTERN: Regex = Regex::new("\\s+$").expect("valid regex pattern"); // for Markdown post-processing
static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern"); // for Markdown escaping
static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern"); // for Markdown escaping
static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); // for Markdown escaping
Expand Down Expand Up @@ -149,18 +145,24 @@ pub fn parse_html(html: &str, commonmark: bool) -> String {
/// # Arguments
/// `html` is source HTML as `String`
pub fn rewrite_html(html: &str, commonmark: bool) -> String {
rewriter::writer::convert_html_to_markdown(html, commonmark, &None).unwrap_or_default()
rewriter::writer::convert_html_to_markdown(html, &None, commonmark, &None).unwrap_or_default()
}

/// Custom variant of rewrite function.
///
/// You can also override standard tag handlers this way
/// # Arguments
/// `html` is source HTML as `String`
/// `custom` is an optional set of CSS selectors whose matching elements are removed from the output, can be `None`
/// `commonmark` is for adjusting markdown output to commonmark
/// `url` is used to provide absolute url handling
pub fn rewrite_html_with_url(html: &str, commonmark: bool, url: &Option<Url>) -> String {
rewriter::writer::convert_html_to_markdown(html, commonmark, url).unwrap_or_default()
pub fn rewrite_html_custom_with_url(
html: &str,
custom: &Option<HashSet<String>>,
commonmark: bool,
url: &Option<Url>,
) -> String {
rewriter::writer::convert_html_to_markdown(html, &custom, commonmark, url).unwrap_or_default()
}

/// Same as `parse_html` but retains all "span" html elements intact
Expand Down Expand Up @@ -385,6 +387,7 @@ pub(crate) fn valid_block_element(node: &NodeData) -> bool {
_ => true,
}
}

/// This conversion should only be applied to text tags
///
/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
Expand All @@ -396,7 +399,7 @@ fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
/// Called after all processing has been finished
///
/// Clears excessive punctuation that would be trimmed by renderer anyway
fn clean_markdown(input: &str) -> String {
pub fn clean_markdown(input: &str) -> String {
input.sift().into()
}

Expand Down
40 changes: 40 additions & 0 deletions fast_html2md/src/rewriter/anchors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
use lol_html::html_content::{ContentType::Html, Element};
use percent_encoding::percent_decode_str;
use std::borrow::Cow;
use url::Url;

/// Wrap an anchor element in Markdown link syntax.
///
/// Emits `[` before the element and `](target)` after it, where `target` is
/// the percent-decoded `href`. Anchors without an `href` are left untouched.
///
/// # Arguments
/// * `el` - the anchor element being rewritten.
/// * `_commonmark` - currently unused.
/// * `url` - optional base URL; when present, an `href` beginning with `/`
///   is joined onto it.
///
/// # Errors
/// The visible body always returns `Ok(())`.
pub(crate) fn rewrite_anchor_element(
    el: &mut Element,
    _commonmark: bool,
    url: &Option<Url>,
) -> Result<(), std::io::Error> {
    let href = match el.get_attribute("href") {
        Some(value) => value,
        None => return Ok(()),
    };

    let decoded: Cow<'_, str> = percent_decode_str(&href).decode_utf8_lossy();

    // Only targets starting with `/` are resolved against the base URL; if
    // the join fails, or there is no base, the decoded href is used verbatim.
    let target = match url {
        Some(base) if decoded.starts_with('/') => base
            .join(&decoded)
            .map(|joined| joined.to_string())
            .unwrap_or_else(|_| decoded.to_string()),
        _ => decoded.to_string(),
    };

    // Targets containing a space or an ASCII control character are wrapped
    // in angle brackets.
    let needs_brackets = target.chars().any(|c| c == ' ' || c.is_ascii_control());
    let link = if needs_brackets {
        format!("<{}>", target)
    } else {
        target
    };

    el.before("[", Html);
    el.after(&format!("]({})", link), Html);

    Ok(())
}
1 change: 1 addition & 0 deletions fast_html2md/src/rewriter/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub(crate) mod anchors;
pub(crate) mod counter;
pub(crate) mod iframes;
pub(crate) mod images;
Expand Down
12 changes: 5 additions & 7 deletions fast_html2md/src/rewriter/styles.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use lol_html::html_content::Element;
use lol_html::html_content::{ContentType::Text, Element};

/// Rewrite the initial elements that need extra styles.
pub(crate) fn rewrite_style_element(el: &mut Element) -> Result<(), std::io::Error> {
let tag_name = el.tag_name().to_ascii_lowercase();
let tag_name = el.tag_name();

let mark = match tag_name.as_str() {
"b" | "strong" => "**",
"i" | "em" => "*",
Expand All @@ -11,11 +12,8 @@ pub(crate) fn rewrite_style_element(el: &mut Element) -> Result<(), std::io::Err
_ => return Ok(()), // Return early if tag is not one of the specified
};

// Apply the markup before the element's content
el.before(mark, lol_html::html_content::ContentType::Text);

// Apply the markup after the element's content
el.after(mark, lol_html::html_content::ContentType::Text);
el.before(mark, Text);
el.after(mark, Text);

Ok(())
}
136 changes: 88 additions & 48 deletions fast_html2md/src/rewriter/writer.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,30 @@
use super::anchors::rewrite_anchor_element;
use super::iframes::handle_iframe;
use super::images::rewrite_image_element;
use super::lists::handle_list_or_item;
use super::quotes::{rewrite_blockquote_element, rewrite_blockquote_text};
use super::styles::rewrite_style_element;
use crate::clean_markdown;
use lol_html::html_content::ContentType::Text;
use crate::{clean_markdown, escape_markdown_base};
use lol_html::html_content::ContentType::{Html, Text};
use lol_html::html_content::Element;
use lol_html::{doc_comments, text};
use lol_html::{element, rewrite_str, RewriteStrSettings};
use std::cell::RefCell;
use std::rc::Rc;
use url::Url;

/// Insert a new line
/// Insert a new line after
#[inline]
pub fn insert_newline(element: &mut Element) {
pub fn insert_newline_after(element: &mut Element) {
element.after("\n", Text);
}

/// Insert a new line before
///
/// Places a single `"\n"` immediately before `element`, passed with the
/// `Text` content type.
#[inline]
pub fn insert_newline_before(element: &mut Element) {
element.before("\n", Text);
}

/// Handle the lol_html tag.
#[inline]
fn handle_tag(
Expand Down Expand Up @@ -52,44 +59,40 @@ fn handle_tag(
match element_name.as_str() {
"h1" => {
element.before("# ", Text);
insert_newline(element);
insert_newline_after(element);
}
"h2" => {
element.before("## ", Text);
insert_newline(element);
insert_newline_after(element);
}
"h3" => {
element.before("### ", Text);
insert_newline(element);
insert_newline_after(element);
}
"h4" => {
element.before("#### ", Text);
insert_newline(element);
insert_newline_after(element);
}
"h5" => {
element.before("##### ", Text);
insert_newline(element);
insert_newline_after(element);
}
"h6" => {
element.before("###### ", Text);
insert_newline(element);
insert_newline_after(element);
}
"p" => {
insert_newline_before(element);
insert_newline_after(element);
}
"p" => element.before("\n", Text),
"hr" => {
insert_newline(element);
insert_newline_before(element);
element.append("---", Text);
insert_newline(element);
insert_newline_after(element);
}
"br" => insert_newline(element),
"br" => insert_newline_after(element),
"a" => {
if let Some(href) = element.get_attribute("href") {
element.before("[", lol_html::html_content::ContentType::Text);
element.after(
&format!("]({})", href),
lol_html::html_content::ContentType::Text,
);
element.set_inner_content("", lol_html::html_content::ContentType::Text);
}
let _ = rewrite_anchor_element(element, commonmark, url);
}
"img" => {
let _ = rewrite_image_element(element, commonmark, &url);
Expand Down Expand Up @@ -117,6 +120,18 @@ fn handle_tag(
"q" | "cite" | "blockquote" => {
let _ = rewrite_blockquote_element(element, quote_depth);
}
"div" | "section" | "header" | "footer" => {
insert_newline_before(element);
insert_newline_after(element);
}
"pre" => {
element.before("\n```\n", Html);
element.after("\n```\n", Html);
}
"code" | "samp" => {
element.before("`", Html);
element.after("`", Html);
}
_ => (),
}

Expand All @@ -126,6 +141,7 @@ fn handle_tag(
/// Get the HTML rewriter settings to convert to markdown.
pub fn get_rewriter_settings(
commonmark: bool,
custom: &Option<std::collections::HashSet<String>>,
url: Option<Url>,
) -> RewriteStrSettings<'static, 'static> {
let list_type = Rc::new(RefCell::new(None));
Expand All @@ -134,47 +150,71 @@ pub fn get_rewriter_settings(

let quote_depth1 = quote_depth.clone();

let mut element_content_handlers =
Vec::with_capacity(4 + custom.as_ref().map_or(0, |c| c.len()));

element_content_handlers.push(text!("blockquote, q, cite", move |el| {
let _ = rewrite_blockquote_text(el, quote_depth1.clone());
Ok(())
}));

element_content_handlers.push(text!(
"*:not(script):not(head):not(style):not(svg)",
move |el| {
*el.as_mut_str() = crate::MARKDOWN_MIDDLE_KEYCHARS
.replace_all(el.as_str().trim().into(), "\\$0")
.to_string();
Ok(())
}
));

element_content_handlers.push(element!("head, nav, script, noscript, style", |el| {
el.remove();
Ok(())
}));

element_content_handlers.push(element!("*", move |el| {
let _ = handle_tag(
el,
commonmark,
&url,
list_type.clone(),
order_counter.clone(),
quote_depth.clone(),
);
Ok(())
}));

if let Some(ignore) = custom {
let ignore_handler = element!(
ignore.iter().cloned().collect::<Vec<String>>().join(","),
|el| {
el.remove();
Ok(())
}
);

element_content_handlers.push(ignore_handler);
}

RewriteStrSettings {
document_content_handlers: vec![doc_comments!(|c| {
c.remove();
Ok(())
})],
element_content_handlers: vec![
text!("blockquote, q, cite", move |el| {
let _ = rewrite_blockquote_text(el, quote_depth1.clone());
Ok(())
}),
text!("summary, details", move |el| {
*el.as_mut_str() = el.as_str().trim().into();
Ok(())
}),
element!("head, nav", |el| {
el.remove();
Ok(())
}),
element!("*:not(script):not(head):not(style):not(svg)", move |el| {
let _ = handle_tag(
el,
commonmark,
&url,
list_type.clone(),
order_counter.clone(),
quote_depth.clone(),
);
Ok(())
}),
],
element_content_handlers,
..RewriteStrSettings::default()
}
}

/// Convert to markdown streaming re-writer
pub(crate) fn convert_html_to_markdown(
html: &str,
custom: &Option<std::collections::HashSet<String>>,
commonmark: bool,
url: &Option<Url>,
) -> Result<String, Box<dyn std::error::Error>> {
let settings = get_rewriter_settings(commonmark, url.clone());
let settings = get_rewriter_settings(commonmark, custom, url.clone());

match rewrite_str(&Box::new(html), settings) {
Ok(markdown) => Ok(clean_markdown(&markdown)),
Expand Down
Loading

0 comments on commit fbdeaf4

Please sign in to comment.