Skip to content

Commit

Permalink
perf(writer): remove tokio channel sending rewrite
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Jan 25, 2025
1 parent 38ed8c1 commit 6a6fdbe
Show file tree
Hide file tree
Showing 8 changed files with 262 additions and 256 deletions.
400 changes: 204 additions & 196 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ The fastest Rust html to markdown transformer.

`cargo add fast_html2md`

You can use a html5ever or lol_html to transform.
You can use either [html5ever](https://docs.rs/html5ever/latest/html5ever/) or [lol_html](https://docs.rs/lol_html/latest/lol_html/) to transform.

Using the rewriter with the default `rewriter` feature flag.
Using the rewriter with the default `rewriter` feature flag (recommended, and about 2x faster as a baseline).

```rust
let md = html2md::rewrite_html("<p>JAMES</p>", false);
Expand Down
4 changes: 2 additions & 2 deletions benches/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pub fn bench_speed(c: &mut Criterion) {
b.iter(|| black_box(rewrite_html(&html, false)))
});

group.bench_function(format!("Async real-world-1: {}", sample_title), |b| {
group.bench_function(format!("Rewriter(async,streaming) real-world-1: {}", sample_title), |b| {
let rt = tokio::runtime::Runtime::new().unwrap();
b.to_async(rt)
.iter(|| async { black_box(rewrite_html_streaming(&html, false).await) });
Expand All @@ -43,7 +43,7 @@ pub fn bench_speed(c: &mut Criterion) {
b.iter(|| black_box(rewrite_html(&html, false)))
});

group.bench_function(format!("Async Scraper wiki-cat: {}", sample_title), |b| {
group.bench_function(format!("Rewriter(async,streaming) wiki-cat: {}", sample_title), |b| {
let rt = tokio::runtime::Runtime::new().unwrap();
b.to_async(rt)
.iter(|| async { black_box(rewrite_html_streaming(&html, false).await) });
Expand Down
2 changes: 1 addition & 1 deletion fast_html2md/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.41"
version = "0.0.42"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand Down
7 changes: 3 additions & 4 deletions fast_html2md/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use extended::sifter::{WhitespaceSifter, WhitespaceSifterBytes};
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashSet;
use url::Url;

// we want to just use the rewriter instead for v0.1.
Expand Down Expand Up @@ -59,7 +58,7 @@ pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
#[cfg(all(feature = "tokio", feature = "rewriter"))]
pub fn rewrite_html_custom_with_url(
html: &str,
custom: &Option<HashSet<String>>,
custom: &Option<std::collections::HashSet<String>>,
commonmark: bool,
url: &Option<Url>,
) -> String {
Expand All @@ -78,7 +77,7 @@ pub fn rewrite_html_custom_with_url(
#[cfg(all(feature = "tokio", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_and_chunk(
html: &str,
custom: &Option<HashSet<String>>,
custom: &Option<std::collections::HashSet<String>>,
commonmark: bool,
url: &Option<Url>,
chunk_size: usize,
Expand All @@ -101,7 +100,7 @@ pub async fn rewrite_html_custom_with_url_and_chunk(
#[cfg(all(feature = "tokio", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_streaming(
html: &str,
custom: &Option<HashSet<String>>,
custom: &Option<std::collections::HashSet<String>>,
commonmark: bool,
url: &Option<Url>,
) -> String {
Expand Down
26 changes: 8 additions & 18 deletions fast_html2md/src/rewriter/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ use lol_html::{doc_comments, doctype, text};
use lol_html::{element, RewriteStrSettings};
use std::cell::RefCell;
use std::rc::Rc;
use std::sync::RwLock;
use std::sync::{Arc, Mutex};
use std::sync::{Arc, RwLock};
use url::Url;

/// Get the HTML rewriter settings to convert to markdown.
Expand Down Expand Up @@ -209,40 +208,31 @@ pub async fn convert_html_to_markdown_send_with_size(
) -> Result<String, Box<dyn std::error::Error>> {
use tokio_stream::StreamExt;
let settings = get_rewriter_settings_send(commonmark, custom, url.clone());
let (txx, mut rxx) = tokio::sync::mpsc::unbounded_channel();

let mut rewrited_bytes: Vec<u8> = Vec::new();

let mut rewriter = lol_html::send::HtmlRewriter::new(settings.into(), |c: &[u8]| {
let _ = txx.send(c.to_vec());
rewrited_bytes.extend_from_slice(&c);
});

let html_bytes = html.as_bytes();
let chunks = html_bytes.chunks(chunk_size);

let mut stream = tokio_stream::iter(chunks).map(Ok::<&[u8], ()>);
let mut stream = tokio_stream::iter(chunks);

let mut wrote_error = false;

while let Some(chunk) = stream.next().await {
if let Ok(chunk) = chunk {
if rewriter.write(chunk).is_err() {
wrote_error = true;
break;
}
if rewriter.write(chunk).is_err() {
wrote_error = true;
break;
}
}

if !wrote_error {
let _ = rewriter.end();
}

drop(txx);

let mut rewrited_bytes: Vec<u8> = Vec::new();

while let Some(c) = rxx.recv().await {
rewrited_bytes.extend_from_slice(&c);
}

Ok(clean_markdown_bytes(&rewrited_bytes))
}

Expand Down
29 changes: 14 additions & 15 deletions fast_html2md/tests/images.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
use html2md::{parse_html, rewrite_html};
use pretty_assertions::assert_eq;

#[test]
fn test_image_native_simple() {
let md = parse_html("<img src=\"https://i.redd.it/vesfbmwfkz811.png\" alt=\"image of Linus holding his laptop\" title=\"Daddy Linus\" />", false);
let md = html2md::parse_html("<img src=\"https://i.redd.it/vesfbmwfkz811.png\" alt=\"image of Linus holding his laptop\" title=\"Daddy Linus\" />", false);
assert_eq!(
md,
"![image of Linus holding his laptop](https://i.redd.it/vesfbmwfkz811.png \"Daddy Linus\")"
);
let md = rewrite_html("<img src=\"https://i.redd.it/vesfbmwfkz811.png\" alt=\"image of Linus holding his laptop\" title=\"Daddy Linus\" />", false);
let md = html2md::rewrite_html("<img src=\"https://i.redd.it/vesfbmwfkz811.png\" alt=\"image of Linus holding his laptop\" title=\"Daddy Linus\" />", false);
assert_eq!(
md,
"![image of Linus holding his laptop](https://i.redd.it/vesfbmwfkz811.png \"Daddy Linus\")"
Expand All @@ -17,12 +16,12 @@ fn test_image_native_simple() {

#[test]
fn test_image_native_without_title() {
let md = parse_html("<img src=\"https://i.redd.it/l0ne52x7fh611.png\" alt=\"image of usual kill -9 sequence\" />", false);
let md = html2md::parse_html("<img src=\"https://i.redd.it/l0ne52x7fh611.png\" alt=\"image of usual kill -9 sequence\" />", false);
assert_eq!(
md,
"![image of usual kill -9 sequence](https://i.redd.it/l0ne52x7fh611.png)"
);
let md = rewrite_html("<img src=\"https://i.redd.it/l0ne52x7fh611.png\" alt=\"image of usual kill -9 sequence\" />", false);
let md = html2md::rewrite_html("<img src=\"https://i.redd.it/l0ne52x7fh611.png\" alt=\"image of usual kill -9 sequence\" />", false);
assert_eq!(
md,
"![image of usual kill -9 sequence](https://i.redd.it/l0ne52x7fh611.png)"
Expand All @@ -31,30 +30,30 @@ fn test_image_native_without_title() {

#[test]
fn test_image_embedded_html() {
let md = parse_html("<img src=\"https://i.redd.it/un4h28uwtp711.png\" alt=\"comics about Mac and GNU/Linux\" title=\"Look at me, brother\" height=\"150\" width=\"150\" />", false);
let md = html2md::parse_html("<img src=\"https://i.redd.it/un4h28uwtp711.png\" alt=\"comics about Mac and GNU/Linux\" title=\"Look at me, brother\" height=\"150\" width=\"150\" />", false);
assert_eq!(md, "![comics about Mac and GNU/Linux](https://i.redd.it/un4h28uwtp711.png \"Look at me, brother\")");
let md = rewrite_html("<img src=\"https://i.redd.it/un4h28uwtp711.png\" alt=\"comics about Mac and GNU/Linux\" title=\"Look at me, brother\" height=\"150\" width=\"150\" />", false);
let md = html2md::rewrite_html("<img src=\"https://i.redd.it/un4h28uwtp711.png\" alt=\"comics about Mac and GNU/Linux\" title=\"Look at me, brother\" height=\"150\" width=\"150\" />", false);
assert_eq!(md, "![comics about Mac and GNU/Linux](https://i.redd.it/un4h28uwtp711.png \"Look at me, brother\")")
}

#[test]
fn test_image_embedded_with_unsupported_html() {
// srcset is unsupported in Markdown
let md = parse_html("<img src=\"https://i.redd.it/07onlc10x5711.png\" alt=\"HACKERMAN\" title=\"When you reboot instead of exiting vim\" height=\"150\" width=\"150\" srcset=\"image1 image2\" align=\"center\" />", false);
let md = html2md::parse_html("<img src=\"https://i.redd.it/07onlc10x5711.png\" alt=\"HACKERMAN\" title=\"When you reboot instead of exiting vim\" height=\"150\" width=\"150\" srcset=\"image1 image2\" align=\"center\" />", false);
assert_eq!(md, "![HACKERMAN](https://i.redd.it/07onlc10x5711.png \"When you reboot instead of exiting vim\")");
// srcset is unsupported in Markdown
let md = rewrite_html("<img src=\"https://i.redd.it/07onlc10x5711.png\" alt=\"HACKERMAN\" title=\"When you reboot instead of exiting vim\" height=\"150\" width=\"150\" srcset=\"image1 image2\" align=\"center\" />", false);
let md = html2md::rewrite_html("<img src=\"https://i.redd.it/07onlc10x5711.png\" alt=\"HACKERMAN\" title=\"When you reboot instead of exiting vim\" height=\"150\" width=\"150\" srcset=\"image1 image2\" align=\"center\" />", false);
assert_eq!(md, "![HACKERMAN](https://i.redd.it/07onlc10x5711.png \"When you reboot instead of exiting vim\")");
}

#[test]
fn test_image_src_issue() {
let md = parse_html("<img src=\"https://dybr.ru/img/43/1532265494_android-Kanedias\" width=\"auto\" height=\"500\" >", false);
let md = html2md::parse_html("<img src=\"https://dybr.ru/img/43/1532265494_android-Kanedias\" width=\"auto\" height=\"500\" >", false);
assert_eq!(
md,
"![](https://dybr.ru/img/43/1532265494_android-Kanedias)"
);
let md = rewrite_html("<img src=\"https://dybr.ru/img/43/1532265494_android-Kanedias\" width=\"auto\" height=\"500\" >", false);
let md = html2md::rewrite_html("<img src=\"https://dybr.ru/img/43/1532265494_android-Kanedias\" width=\"auto\" height=\"500\" >", false);
assert_eq!(
md,
"![](https://dybr.ru/img/43/1532265494_android-Kanedias)"
Expand All @@ -63,12 +62,12 @@ fn test_image_src_issue() {

#[test]
fn test_image_with_space_issue() {
let md = parse_html("<img src=\"https://i.redd.it/l0ne 52x7f h611.png\" alt=\"image of usual kill -9 sequence\" />", false);
let md = html2md::parse_html("<img src=\"https://i.redd.it/l0ne 52x7f h611.png\" alt=\"image of usual kill -9 sequence\" />", false);
assert_eq!(
md,
"![image of usual kill -9 sequence](https://i.redd.it/l0ne%2052x7f%20h611.png)"
);
let md = rewrite_html("<img src=\"https://i.redd.it/l0ne 52x7f h611.png\" alt=\"image of usual kill -9 sequence\" />", false);
let md = html2md::rewrite_html("<img src=\"https://i.redd.it/l0ne 52x7f h611.png\" alt=\"image of usual kill -9 sequence\" />", false);
assert_eq!(
md,
"![image of usual kill -9 sequence](https://i.redd.it/l0ne%2052x7f%20h611.png)"
Expand All @@ -77,8 +76,8 @@ fn test_image_with_space_issue() {

#[test]
fn test_image_with_query_issue() {
let md = parse_html("<img src=\"https://instagram.ftll1-1.fna.fbcdn.net/vp/4c753762a3cd58ec2cd55f7e20f87e5c/5D39A8B3/t51.2885-15/sh0.08/e35/p640x640/54511922_267736260775264_8482507773977053160_n.jpg?_nc_ht=instagram.ftll1-1.fna.fbcdn.net\" style=\"width: 494px;\">", false);
let md = html2md::parse_html("<img src=\"https://instagram.ftll1-1.fna.fbcdn.net/vp/4c753762a3cd58ec2cd55f7e20f87e5c/5D39A8B3/t51.2885-15/sh0.08/e35/p640x640/54511922_267736260775264_8482507773977053160_n.jpg?_nc_ht=instagram.ftll1-1.fna.fbcdn.net\" style=\"width: 494px;\">", false);
assert_eq!(md, "![](https://instagram.ftll1-1.fna.fbcdn.net/vp/4c753762a3cd58ec2cd55f7e20f87e5c/5D39A8B3/t51.2885-15/sh0.08/e35/p640x640/54511922_267736260775264_8482507773977053160_n.jpg?_nc_ht=instagram.ftll1-1.fna.fbcdn.net)");
let md = rewrite_html("<img src=\"https://instagram.ftll1-1.fna.fbcdn.net/vp/4c753762a3cd58ec2cd55f7e20f87e5c/5D39A8B3/t51.2885-15/sh0.08/e35/p640x640/54511922_267736260775264_8482507773977053160_n.jpg?_nc_ht=instagram.ftll1-1.fna.fbcdn.net\" style=\"width: 494px;\">", false);
let md = html2md::rewrite_html("<img src=\"https://instagram.ftll1-1.fna.fbcdn.net/vp/4c753762a3cd58ec2cd55f7e20f87e5c/5D39A8B3/t51.2885-15/sh0.08/e35/p640x640/54511922_267736260775264_8482507773977053160_n.jpg?_nc_ht=instagram.ftll1-1.fna.fbcdn.net\" style=\"width: 494px;\">", false);
assert_eq!(md, "![](https://instagram.ftll1-1.fna.fbcdn.net/vp/4c753762a3cd58ec2cd55f7e20f87e5c/5D39A8B3/t51.2885-15/sh0.08/e35/p640x640/54511922_267736260775264_8482507773977053160_n.jpg?_nc_ht=instagram.ftll1-1.fna.fbcdn.net)");
}
Loading

0 comments on commit 6a6fdbe

Please sign in to comment.