Skip to content

Commit

Permalink
chore(tables): fix double table walk
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 11, 2024
1 parent adefda0 commit 5b08a87
Show file tree
Hide file tree
Showing 14 changed files with 79,817 additions and 106 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.21"
version = "0.0.22"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand Down
51 changes: 29 additions & 22 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ use dummy::DummyHandler;
use dummy::HtmlCherryPickHandler;
use dummy::IdentityHandler;
use headers::HeaderHandler;
use html5ever::LocalName;
use iframes::IframeHandler;
use images::ImgHandler;
use lists::ListHandler;
Expand Down Expand Up @@ -75,12 +76,11 @@ pub fn parse_html_custom_base(
commonmark: bool,
url: &Option<Url>,
) -> String {
match parse_document(RcDom::default(), ParseOpts::default())
.from_utf8()
.read_from(&mut html.as_bytes())
{
let document_parser = parse_document(RcDom::default(), ParseOpts::default());

match document_parser.from_utf8().read_from(&mut html.as_bytes()) {
Ok(dom) => {
let mut result = StructuredPrinter::default();
let mut result = Box::new(StructuredPrinter::default());

walk(
&dom.document,
Expand All @@ -92,6 +92,7 @@ pub fn parse_html_custom_base(
} else {
None
},
false,
);

// we want to eventually remove the clean step.
Expand Down Expand Up @@ -174,6 +175,7 @@ fn walk(
custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
commonmark: bool,
url: &Option<Arc<Url>>,
ignore_parents: bool,
) {
let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
let mut tag_name = String::default();
Expand All @@ -187,7 +189,7 @@ fn walk(
NodeData::Element { .. } | NodeData::Text { .. }
);

if find_parent_tags {
if find_parent_tags && !ignore_parents {
for tag in result.parent_chain.iter() {
if tag == "code" {
inside_code = true;
Expand All @@ -205,7 +207,10 @@ fn walk(
}

match input.data {
NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {}
NodeData::Document
| NodeData::Comment { .. }
| NodeData::Doctype { .. }
| NodeData::ProcessingInstruction { .. } => (),
NodeData::Text { ref contents } => {
let mut text = contents.borrow().to_string();

Expand All @@ -221,10 +226,10 @@ fn walk(
}

let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");

result.append_str(minified_text.trim());
}
}
NodeData::Comment { .. } => {} // ignore comments
NodeData::Element { ref name, .. } => {
tag_name = name.local.to_string();

Expand Down Expand Up @@ -271,7 +276,7 @@ fn walk(
"sub" | "sup" => Box::new(IdentityHandler::new(commonmark)),
// tables, handled fully internally as markdown can't have nested content in tables
// supports only single tables as of now
"table" => Box::new(TableHandler::default()),
"table" => Box::new(TableHandler::new(commonmark, url.clone())),
"iframe" => Box::new(IframeHandler),
_ => Box::new(DummyHandler),
}
Expand All @@ -287,28 +292,30 @@ fn walk(

// save this tag name as parent for child nodes
result.parent_chain.push(tag_name.clone()); // e.g. it was ["body"] and now it's ["body", "p"]

let current_depth = result.parent_chain.len(); // e.g. it was 1 and now it's 2

// create space for siblings of next level
result.siblings.insert(current_depth, vec![]);

for child in input.children.borrow().iter() {
if handler.skip_descendants() {
continue;
}
if !handler.skip_descendants() {
let children = input.children.borrow();

walk(child, result, custom, commonmark, url);
for child in children.iter() {
walk(child, result, custom, commonmark, url, false);

if let NodeData::Element { ref name, .. } = child.data {
if let Some(el) = result.siblings.get_mut(&current_depth) {
let eln = name.local.to_string();
let ignore_push = eln == "script" || eln == "style";
if let NodeData::Element { ref name, .. } = child.data {
if let Some(el) = result.siblings.get_mut(&current_depth) {
let eln = name.local.to_string();

if !ignore_push {
el.push(eln)
let ignore_push = eln == "script" || eln == "style";

if !ignore_push {
el.push(eln)
}
}
}
};
};
}
}

// clear siblings of next level
Expand Down
130 changes: 65 additions & 65 deletions src/tables.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,36 @@
use super::StructuredPrinter;
use super::TagHandler;
use super::{clean_markdown, walk};

use std::sync::Arc;
use std::{cmp, collections::HashMap};

use html5ever::LocalName;
use markup5ever_rcdom::{Handle, NodeData};
use url::Url;

/// Tag handler for `table` elements.
///
/// The handler assembles the whole table markup itself and appends it to the
/// printer, and reports `skip_descendants() == true` so the table's children
/// are not walked a second time by the caller.
#[derive(Default)]
pub struct TableHandler {
/// Commonmark flag, forwarded to `to_text` when rendering each cell.
commonmark: bool,
/// Optional base URL, forwarded to `to_text` when rendering each cell.
url: Option<Arc<Url>>,
}

// Local names of the two table cell elements. Cells are matched by comparing
// an element's `name.local` against these constants.
const TD: LocalName = html5ever::local_name!("td");
const TH: LocalName = html5ever::local_name!("th");

impl TableHandler {
/// A new table handler.
pub fn new(commonmark: bool, url: Option<std::sync::Arc<Url>>) -> Self {
TableHandler { commonmark, url }
}
}

impl TagHandler for TableHandler {
fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) {
let mut table_markup = String::new();

let any_matcher = |cell: &Handle| {
let name = tag_name(cell);
name == "td" || name == "th"
let any_matcher = |cell: &Handle| match cell.data {
NodeData::Element { ref name, .. } => name.local == TD || name.local == TH,
_ => false,
};

// detect cell width, counts
Expand All @@ -39,34 +52,33 @@ impl TagHandler for TableHandler {

column_widths = vec![3; column_count];

// detect max column width
// header row must always be present
for (idx, row) in rows.iter().enumerate() {
table_markup.push('|');
let cells = collect_children(row, any_matcher);

let mut inner_table_markup = String::new();

for index in 0..column_count {
// from regular rows
if index >= 10000 {
break;
}

if let Some(cell) = cells.get(index) {
let text = to_text(cell, self.commonmark);
let mut text = to_text(cell, self.commonmark, &self.url);

column_widths[index] = cmp::max(column_widths[index], text.chars().count());
}
}
if idx >= 1000 {
break;
}
}

// header row must always be present
for (idx, row) in rows.iter().enumerate() {
table_markup.push('|');
let cells = collect_children(row, any_matcher);
// we need to fill all cells in a column, even if some rows don't have enough
pad_cell_text(&mut text, column_widths[index]);

for index in 0..column_count {
// we need to fill all cells in a column, even if some rows don't have enough
let padded_cell_text =
pad_cell_text(&cells.get(index), column_widths[index], self.commonmark);
table_markup.push_str(&padded_cell_text);
table_markup.push('|');
inner_table_markup.push_str(&text);
}

inner_table_markup.push('|');
}

table_markup.push_str(&inner_table_markup);
table_markup.push('\n');

if idx == 0 {
Expand Down Expand Up @@ -131,13 +143,14 @@ impl TagHandler for TableHandler {
}
}

printer.insert_newline();
printer.insert_newline();
printer.append_str(&table_markup);
}
}

fn after_handle(&mut self, _printer: &mut StructuredPrinter) {}
fn after_handle(&mut self, printer: &mut StructuredPrinter) {
printer.insert_newline();
}

fn skip_descendants(&self) -> bool {
true
Expand All @@ -149,43 +162,19 @@ impl TagHandler for TableHandler {
/// `text` - already rendered cell text; it is padded in place
///
/// `column_width` - precomputed column width to compute padding length from
///
/// The padding is decided from `column_width - text.chars().count()`, counted
/// in characters rather than bytes:
/// * difference of 0, or text longer than the column: `text` is left as is;
/// * difference of 1: a single space is appended;
/// * difference of 2 or more: one space is prepended and one is appended.
fn pad_cell_text(text: &mut String, column_width: usize) {
    // Saturate instead of subtracting directly: a cell may be wider than the
    // recorded column width, and a plain `-` would underflow in that case.
    let len_diff = column_width.saturating_sub(text.chars().count());

    match len_diff {
        // Text fills (or overflows) the cell, nothing to add.
        0 => {}
        // Only one free slot, put it at the end.
        1 => text.push(' '),
        // Room on both sides.
        _ => {
            text.insert(0, ' ');
            text.push(' ');
        }
    }
}

/// Returns the local tag name of `tag` as an owned string.
/// Any node that is not an element yields an empty string.
fn tag_name(tag: &Handle) -> String {
    if let NodeData::Element { ref name, .. } = tag.data {
        name.local.to_string()
    } else {
        String::new()
    }
}

Expand All @@ -194,12 +183,16 @@ fn tag_name(tag: &Handle) -> String {
fn find_children(tag: &Handle, name: &str) -> Vec<Handle> {
    let mut result: Vec<Handle> = vec![];
    let children = tag.children.borrow();

    for child in children.iter() {
        // Inspect the CHILD (not the parent `tag`) and bind its qualified name
        // under a distinct identifier: reusing `name` here would shadow the
        // `name` parameter and turn the comparison into `x == x`, which
        // collects every child regardless of its tag.
        if let NodeData::Element {
            name: ref child_name,
            ..
        } = child.data
        {
            if &*child_name.local == name {
                result.push(child.clone());
            }
        }

        // Depth-first: matching descendants of this child follow the child
        // itself, so results stay in document order.
        result.extend(find_children(child, name));
    }

Expand All @@ -212,8 +205,8 @@ fn collect_children<P>(tag: &Handle, predicate: P) -> Vec<Handle>
where
P: Fn(&Handle) -> bool,
{
let mut result: Vec<Handle> = vec![];
let children = tag.children.borrow();
let mut result: Vec<Handle> = Vec::with_capacity(children.len());

for child in children.iter() {
if predicate(child) {
Expand All @@ -226,8 +219,15 @@ where

/// Convert html tag to text. This collects all tag children in correct order where they're observed
/// and concatenates their text, recursively.
fn to_text(tag: &Handle, commonmark: bool) -> String {
fn to_text(tag: &Handle, commonmark: bool, url: &Option<std::sync::Arc<Url>>) -> String {
let mut printer = StructuredPrinter::default();
walk(tag, &mut printer, &HashMap::default(), commonmark, &None);
walk(
tag,
&mut printer,
&HashMap::default(),
commonmark,
&url,
false,
);
clean_markdown(&printer.data)
}
Loading

0 comments on commit 5b08a87

Please sign in to comment.