Skip to content

Commit

Permalink
chore(rewriter): add lists
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 14, 2024
1 parent 2975e49 commit 8a31dc6
Show file tree
Hide file tree
Showing 8 changed files with 300 additions and 87 deletions.
22 changes: 22 additions & 0 deletions fast_html2md/src/rewriter/counter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/// Counting helpers for integer state that must never wrap around or panic.
pub trait Counter {
    /// Sets the counter back to zero.
    fn reset(&mut self);
    /// Adds one to the counter and returns the updated value.
    fn increment(&mut self) -> usize;
    /// Subtracts one from the counter and returns the updated value.
    fn decrement(&mut self) -> usize;
}

impl Counter for usize {
    fn reset(&mut self) {
        *self = 0;
    }

    fn increment(&mut self) -> usize {
        // Saturating: a counter already at `usize::MAX` keeps its value.
        let next = self.saturating_add(1);
        *self = next;
        next
    }

    fn decrement(&mut self) -> usize {
        // Saturating: a counter already at zero keeps its value.
        let previous = self.saturating_sub(1);
        *self = previous;
        previous
    }
}
35 changes: 35 additions & 0 deletions fast_html2md/src/rewriter/lists.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use super::counter::Counter;
use lol_html::html_content::ContentType;
use lol_html::html_content::Element;
use std::cell::RefCell;
use std::rc::Rc;

/// Tracks list context and emits Markdown markers for list items.
///
/// * `ul` / `menu` — records the current list kind as `"ul"` and resets the counter.
/// * `ol` — records the current list kind as `"ol"` and resets the counter.
/// * `li` — inserts `"\n<n>. "` before the element when the recorded kind is
///   `"ol"` (bumping the shared counter first), otherwise inserts `"\n* "`.
///
/// Any other tag is left untouched. The function always returns `Ok(())`.
#[inline]
pub(crate) fn handle_list_or_item(
    element: &mut Element,
    list_type: Rc<RefCell<Option<String>>>,
    order_counter: Rc<RefCell<usize>>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let tag = element.tag_name();

    // List items only read the recorded list kind; they never change it.
    if tag.as_str() == "li" {
        let inside_ordered_list = matches!(list_type.borrow().as_deref(), Some("ol"));

        if inside_ordered_list {
            let position = order_counter.borrow_mut().increment();
            let marker = format!("\n{}. ", position);
            element.before(&marker, ContentType::Text);
        } else {
            element.before("\n* ", ContentType::Text);
        }

        return Ok(());
    }

    // List containers: remember which kind was opened and restart numbering.
    let opened_kind = match tag.as_str() {
        "ul" | "menu" => "ul",
        "ol" => "ol",
        _ => return Ok(()),
    };

    *list_type.borrow_mut() = Some(opened_kind.to_string());
    order_counter.borrow_mut().reset();

    Ok(())
}
5 changes: 5 additions & 0 deletions fast_html2md/src/rewriter/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
pub(crate) mod counter;
pub(crate) mod iframes;
pub(crate) mod images;
pub(crate) mod lists;
pub(crate) mod quotes;
pub(crate) mod styles;

pub mod writer;
58 changes: 58 additions & 0 deletions fast_html2md/src/rewriter/quotes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
use crate::rewriter::counter::Counter;
use lol_html::html_content::{ContentType, Element, TextChunk};
use std::error::Error;
use std::{cell::RefCell, rc::Rc};

/// Tracks quote nesting for a quote-like element.
///
/// The shared `quote_depth` is incremented immediately. When the element
/// exposes end-tag handlers, a handler is registered that decrements the depth
/// again once the end tag is reached. Always returns `Ok(())`.
pub(crate) fn rewrite_blockquote_element(
    el: &mut Element,
    quote_depth: Rc<RefCell<usize>>,
) -> Result<(), Box<dyn Error + Send + Sync>> {
    // Entering the element: one level deeper.
    quote_depth.borrow_mut().increment();

    if let Some(handlers) = el.end_tag_handlers() {
        // The handler outlives this call, so it needs its own handle on the depth.
        let depth_on_close = Rc::clone(&quote_depth);

        handlers.push(Box::new(move |_end| {
            depth_on_close.borrow_mut().decrement();
            Ok(())
        }));
    }

    Ok(())
}

/// Prefixes text found inside quote-like elements with Markdown quote markers.
///
/// The prefix is `"> "` repeated once per current nesting level. The chunk is
/// split with `str::lines`; every line receives the prefix except the final
/// line of a chunk that contains more than one line. The pieces are then
/// concatenated without separators and written back as HTML content. When the
/// chunk is the last one of its text node, a `"\n"` is appended after it.
///
/// NOTE(review): line breaks inside the chunk are not re-inserted when the
/// lines are concatenated — confirm this is intended.
pub(crate) fn rewrite_blockquote_text(
    text_chunk: &mut TextChunk<'_>,
    quote_depth: Rc<RefCell<usize>>,
) -> Result<(), Box<dyn Error + Send + Sync>> {
    let prefix = "> ".repeat(*quote_depth.borrow());
    let ends_text_node = text_chunk.last_in_text_node();

    let source = text_chunk.as_str();
    let line_count = source.lines().count();
    let mut rewritten = String::with_capacity(source.len() + prefix.len() * line_count);

    for (index, line) in source.lines().enumerate() {
        // Only the trailing line of a multi-line chunk is emitted without a prefix.
        let is_unprefixed_tail = index >= 1 && index + 1 == line_count;
        if !is_unprefixed_tail {
            rewritten.push_str(&prefix);
        }
        rewritten.push_str(line);
    }

    text_chunk.replace(&rewritten, ContentType::Html);

    if ends_text_node {
        text_chunk.after("\n", ContentType::Text);
    }

    Ok(())
}
21 changes: 21 additions & 0 deletions fast_html2md/src/rewriter/styles.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
use lol_html::html_content::Element;

/// Wraps inline styling elements in their Markdown delimiters.
///
/// The delimiter is chosen from the element's lowercased tag name:
/// `b`/`strong` → `**`, `i`/`em` → `*`, `s`/`del` → `~~`, `u`/`ins` → `__`.
/// The same delimiter is inserted before and after the element. Elements with
/// any other tag are left untouched. Always returns `Ok(())`.
pub(crate) fn rewrite_style_element(el: &mut Element) -> Result<(), std::io::Error> {
    use lol_html::html_content::ContentType;

    let tag = el.tag_name().to_ascii_lowercase();

    let delimiter = match tag.as_str() {
        "strong" | "b" => Some("**"),
        "em" | "i" => Some("*"),
        "del" | "s" => Some("~~"),
        "ins" | "u" => Some("__"),
        _ => None,
    };

    if let Some(delimiter) = delimiter {
        // Opening delimiter goes in front of the element, closing one behind it.
        el.before(delimiter, ContentType::Text);
        el.after(delimiter, ContentType::Text);
    }

    Ok(())
}
69 changes: 61 additions & 8 deletions fast_html2md/src/rewriter/writer.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
use super::iframes::handle_iframe;
use super::images::rewrite_image_element;
use super::lists::handle_list_or_item;
use super::quotes::{rewrite_blockquote_element, rewrite_blockquote_text};
use super::styles::rewrite_style_element;
use crate::clean_markdown;
use lol_html::doc_comments;
use lol_html::html_content::ContentType::Text;
use lol_html::html_content::Element;
use lol_html::{doc_comments, text};
use lol_html::{element, rewrite_str, RewriteStrSettings};
use std::sync::Arc;
use std::cell::RefCell;
use std::rc::Rc;
use url::Url;

/// Insert a new line
Expand All @@ -20,11 +24,32 @@ fn handle_tag(
element: &mut Element,
commonmark: bool,
url: &Option<Url>,
list_type: Rc<RefCell<Option<String>>>,
order_counter: Rc<RefCell<usize>>,
quote_depth: Rc<RefCell<usize>>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
element.remove_and_keep_content();
let element_name = element.tag_name();

let keep_tags =
commonmark && (element_name.as_str() == "sub" || element_name.as_str() == "sup");

// check common mark includes.
if !keep_tags {
let attrs = element
.attributes()
.iter()
.map(|f| f.name())
.collect::<Vec<String>>();

for attr in attrs.iter() {
element.remove_attribute(&attr);
}

element.remove_and_keep_content();
}

// Add the markdown equivalents before the element.
match element.tag_name().as_str() {
match element_name.as_str() {
"h1" => {
element.before("# ", Text);
insert_newline(element);
Expand Down Expand Up @@ -56,7 +81,6 @@ fn handle_tag(
insert_newline(element);
}
"br" => insert_newline(element),
"li" => element.before("* ", Text),
"a" => {
if let Some(href) = element.get_attribute("href") {
element.before("[", lol_html::html_content::ContentType::Text);
Expand All @@ -65,7 +89,6 @@ fn handle_tag(
lol_html::html_content::ContentType::Text,
);
element.set_inner_content("", lol_html::html_content::ContentType::Text);
// Remove content tags.
}
}
"img" => {
Expand All @@ -85,6 +108,15 @@ fn handle_tag(
"iframe" => {
let _ = handle_iframe(element);
}
"b" | "i" | "s" | "strong" | "em" | "del" => {
let _ = rewrite_style_element(element);
}
"ol" | "ul" | "menu" | "li" => {
let _ = handle_list_or_item(element, list_type.clone(), order_counter.clone());
}
"q" | "cite" | "blockquote" => {
let _ = rewrite_blockquote_element(element, quote_depth);
}
_ => (),
}

Expand All @@ -96,18 +128,39 @@ pub fn get_rewriter_settings(
commonmark: bool,
url: Option<Url>,
) -> RewriteStrSettings<'static, 'static> {
let list_type = Rc::new(RefCell::new(None));
let order_counter = Rc::new(RefCell::new(0));
let quote_depth = Rc::new(RefCell::new(0));

let quote_depth1 = quote_depth.clone();

RewriteStrSettings {
document_content_handlers: vec![doc_comments!(|c| {
c.remove();
Ok(())
})],
element_content_handlers: vec![
element!("head, nav, svg", |el| {
text!("blockquote, q, cite", move |el| {
let _ = rewrite_blockquote_text(el, quote_depth1.clone());
Ok(())
}),
text!("summary, details", move |el| {
*el.as_mut_str() = el.as_str().trim().into();
Ok(())
}),
element!("head, nav", |el| {
el.remove();
Ok(())
}),
element!("*:not(script):not(head):not(style):not(svg)", move |el| {
let _ = handle_tag(el, commonmark, &url);
let _ = handle_tag(
el,
commonmark,
&url,
list_type.clone(),
order_counter.clone(),
quote_depth.clone(),
);
Ok(())
}),
],
Expand Down
Loading

0 comments on commit 8a31dc6

Please sign in to comment.