Skip to content

Commit

Permalink
feat(transform): add full rewriter spec handling
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 15, 2024
1 parent fbdeaf4 commit 81ef4ed
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 40 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 12 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,27 @@
# fast_html2md

A Rust html to markdown crate built for performance.
The fastest Rust html to markdown transformer.

`cargo add fast_html2md`

You can transform HTML with either a scraper or a rewriter. The rewriter is 2-3 times faster.

```rust
use html2md::parse_html;

let md = parse_html("<p>JAMES</p>", false);
assert_eq!(md, "JAMES")
```

Using a rewriter.

```rust
use html2md::rewrite_html;

let md = rewrite_html("<p>JAMES</p>", false);
assert_eq!(md, "JAMES")
```

## Ignoring Tags

```rust
Expand All @@ -24,7 +35,3 @@ assert_eq!(md, "JAMES")
tag_factory.insert(String::from("noscript"), tag.clone());
let html = html2md::parse_html_custom(&html, &tag_factory, false);
```

## Notes

This project is a practical rewrite from the original `html2md` with major bug fixes and performance improvements.
2 changes: 1 addition & 1 deletion fast_html2md/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.34"
version = "0.0.35"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand Down
4 changes: 2 additions & 2 deletions fast_html2md/src/extended/sifter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, l
let end = ind.saturating_add(len);
// Check bounds to ensure we don't run into an out-of-bounds error.
if *ind <= end && end <= bytes.len() {
// Todo: we want to pass in the bytes encoded to string.
out.push_str(&auto_encode_bytes(&bytes[*ind..end]));
let output = auto_encode_bytes(&bytes[*ind..end]);
out.push_str(&output);
}
*ind = end;
}
Expand Down
4 changes: 4 additions & 0 deletions fast_html2md/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ lazy_static! {
static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").expect("valid regex pattern"); // for Markdown escaping
static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern"); // for Markdown escaping
static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); // for Markdown escaping
static ref MARKDOWN_MIDDLE_KEYCHARS_SET: regex::RegexSet = regex::RegexSet::new(&[
r"[<>*\\_~]", // Matches any single markdown character
r"&nbsp;" // Matches the entire "&nbsp;" string
]).expect("valid regex set");
}

/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
Expand Down
71 changes: 62 additions & 9 deletions fast_html2md/src/rewriter/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ fn handle_tag(
list_type: Rc<RefCell<Option<String>>>,
order_counter: Rc<RefCell<usize>>,
quote_depth: Rc<RefCell<usize>>,
inside_table: Rc<RefCell<bool>>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let element_name = element.tag_name();

Expand Down Expand Up @@ -97,16 +98,28 @@ fn handle_tag(
"img" => {
let _ = rewrite_image_element(element, commonmark, &url);
}
"table" => {
*inside_table.borrow_mut() = true;
}
"tr" => {
element.before("| ", Text);
element.after(" |\n", Text);
insert_newline_after(element);
}
"th" => {
element.before("**", Text);
element.after("** | ", Text);
if commonmark {
element.before("** ", Html);
element.after("** |", Html);
} else {
element.after("|", Html);
}

// add the first table row start
if *inside_table.borrow() {
element.before("|", Html);
*inside_table.borrow_mut() = false;
}
}
"td" => {
element.after(" | ", Text);
element.after("|", Html);
}
"iframe" => {
let _ = handle_iframe(element);
Expand Down Expand Up @@ -138,6 +151,46 @@ fn handle_tag(
Ok(())
}

/// Escape markdown control characters in a text chunk and drop `&nbsp;` entities.
///
/// * Every occurrence of `<`, `>`, `*`, `\`, `_` or `~` is prefixed with a backslash.
/// * Every literal `&nbsp;` sequence is removed from the output.
/// * All other text, including other HTML entities such as `&amp;`, is copied
///   through unchanged and exactly once.
///
/// Returns an owned copy of `input` untouched when the crate-level
/// `MARKDOWN_MIDDLE_KEYCHARS_SET` reports that nothing needs rewriting.
fn replace_markdown_chars(input: &str) -> String {
    use crate::MARKDOWN_MIDDLE_KEYCHARS_SET;

    /// The only entity that is stripped rather than passed through.
    const NBSP_ENTITY: &str = "&nbsp;";
    /// Characters that must be backslash-escaped in the markdown output.
    const ESCAPED_CHARS: &str = "<>*\\_~";

    // Fast path: nothing to escape and no `&nbsp;` to strip.
    if !MARKDOWN_MIDDLE_KEYCHARS_SET.is_match(input) {
        return input.to_string();
    }

    let mut output = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(ch) = rest.chars().next() {
        // Drop `&nbsp;` as a unit. Any other `&...` sequence falls through and is
        // handled character by character, so it is emitted exactly once and any
        // markdown characters that follow a bare `&` are still escaped.
        if let Some(tail) = rest.strip_prefix(NBSP_ENTITY) {
            rest = tail;
            continue;
        }

        if ESCAPED_CHARS.contains(ch) {
            output.push('\\');
        }
        output.push(ch);

        // Advance by the full UTF-8 width so `rest` always starts on a char boundary.
        rest = &rest[ch.len_utf8()..];
    }

    output
}

/// Get the HTML rewriter settings to convert to markdown.
pub fn get_rewriter_settings(
commonmark: bool,
Expand All @@ -147,9 +200,10 @@ pub fn get_rewriter_settings(
let list_type = Rc::new(RefCell::new(None));
let order_counter = Rc::new(RefCell::new(0));
let quote_depth = Rc::new(RefCell::new(0));

let quote_depth1 = quote_depth.clone();

let inside_table = Rc::new(RefCell::new(false));

let mut element_content_handlers =
Vec::with_capacity(4 + custom.as_ref().map_or(0, |c| c.len()));

Expand All @@ -161,9 +215,7 @@ pub fn get_rewriter_settings(
element_content_handlers.push(text!(
"*:not(script):not(head):not(style):not(svg)",
move |el| {
*el.as_mut_str() = crate::MARKDOWN_MIDDLE_KEYCHARS
.replace_all(el.as_str().trim().into(), "\\$0")
.to_string();
*el.as_mut_str() = replace_markdown_chars(el.as_str().trim().into());
Ok(())
}
));
Expand All @@ -181,6 +233,7 @@ pub fn get_rewriter_settings(
list_type.clone(),
order_counter.clone(),
quote_depth.clone(),
inside_table.clone(),
);
Ok(())
}));
Expand Down
2 changes: 1 addition & 1 deletion fast_html2md/tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ fn test_tables_crash2() {

let table_with_vertical_header = rewrite_html(&html, false);

let m = indoc! { "xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n&nbsp;| **Current Conditions:** | Open all year. No reservations. No services.&nbsp; | |\n| **Reservations:** | No reservations.&nbsp; | |\n| **Fees** | No fee.&nbsp; | |\n| **Water:** | No water. | |"};
let m = indoc! { "xxxxx xxxxxxxxxx xxxxxxx x xxxxx))~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n|Current Conditions:|Open all year. No reservations. No services.|\nReservations:|No reservations.|\nFees|No fee.|\nWater:|No water.|"};

assert_that!(table_with_vertical_header).contains(m);
}
Expand Down
49 changes: 28 additions & 21 deletions fast_html2md/tests/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,12 @@ fn test_tables() {

let md = rewrite_html(s, false);

assert_eq!(
md,
"| **Minor1** | **Minor2** | **Minor3** | **Minor4** | |\n| col1 | col2 | col3 | col4 | |"
);
assert_eq!(md, "|Minor1|Minor2|Minor3|Minor4|\ncol1|col2|col3|col4|");
}

#[test]
fn test_tables_invalid_more_headers() {
let md = parse_html(
r#"<table>
let s = r#"<table>
<thead>
<tr>
<th scope='col'>Minor1</th>
Expand All @@ -59,20 +55,26 @@ fn test_tables_invalid_more_headers() {
<td>col4</td>
</tr>
</tbody>
</table>"#,
false,
);
</table>"#;

let m =
"|Minor1|Minor2|Minor3|Minor4|Minor5|Minor6|\n|||||||\n| col1 | col2 | col3 | col4 | | |";

let md = parse_html(s, false);

assert_eq!(md, m);

let md = rewrite_html(s, false);

assert_eq!(
md,
"|Minor1|Minor2|Minor3|Minor4|Minor5|Minor6|\n|||||||\n| col1 | col2 | col3 | col4 | | |"
"|Minor1|Minor2|Minor3|Minor4|Minor5|Minor6|\ncol1|col2|col3|col4|"
);
}

#[test]
fn test_tables_invalid_more_rows() {
let md = parse_html(
r#"<table>
let s = r#"<table>
<thead>
<tr>
<th scope='col'>Minor1</th>
Expand All @@ -87,17 +89,22 @@ fn test_tables_invalid_more_rows() {
<td>col4</td>
</tr>
</tbody>
</table>"#,
false,
);
</table>"#;

let m = "|Minor1|Minor2| | |\n|||||\n| col1 | col2 |col3|col4|";

let md = parse_html(s, false);

assert_eq!(md, m);

let md = rewrite_html(s, false);

assert_eq!(md, "|Minor1|Minor2| | |\n|||||\n| col1 | col2 |col3|col4|");
assert_eq!(md, "|Minor1|Minor2|\ncol1|col2|col3|col4|");
}

#[test]
fn test_tables_odd_column_width() {
let md = parse_html(
r#"<table>
let s = r#"<table>
<thead>
<tr>
<th scope='col'>Minor</th>
Expand All @@ -110,9 +117,9 @@ fn test_tables_odd_column_width() {
<td>col2</td>
</tr>
</tbody>
</table>"#,
false,
);
</table>"#;

let md = parse_html(s, false);

assert_eq!(md, "|Minor|Major|\n|||\n|col1 |col2 |");
}
Expand Down

0 comments on commit 81ef4ed

Please sign in to comment.