Skip to content

Commit

Permalink
chore(lib): add style and script ignore
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Sep 23, 2024
1 parent a4e5ab7 commit 2ac4f09
Show file tree
Hide file tree
Showing 9 changed files with 1,546 additions and 16 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# CI workflow: builds the crate in release mode and runs the test suite.
name: Rust
# Triggered by pushes to, and pull requests against, the main branch.
on:
push:
branches: [main]
pull_request:
branches: [main]

env:
# Have cargo always emit coloured terminal output in the job logs.
CARGO_TERM_COLOR: always

jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
# Cache cargo's binaries, registry, git checkouts and the target/ build directory.
# The cache key depends on the runner OS and on the hash of every Cargo.lock in the repo.
- uses: actions/cache@v3
id: cache
with:
path: |
~/.cargo/bin/
~/.cargo/registry/index/
~/.cargo/registry/cache/
~/.cargo/git/db/
target/
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
- name: Build
run: cargo build --verbose --release
# NOTE(review): tests run without --release, so they use a different profile than the Build step.
- name: Run tests
run: cargo test
21 changes: 21 additions & 0 deletions .github/workflows/stale.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Scheduled housekeeping: marks inactive issues and pull requests as stale, then closes them.
name: Mark stale issues and pull requests

on:
schedule:
# Runs once a day at 01:30 (GitHub Actions evaluates cron schedules in UTC).
- cron: "30 1 * * *"

jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
# The day counts quoted in these messages must be kept in sync with the days-before-* values below.
stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days."
stale-pr-message: "This PR is stale because it has been open 45 days with no activity. Remove stale label or comment or this will be closed in 10 days."
close-issue-message: "This issue was closed because it has been stalled for 5 days with no activity."
close-pr-message: "This PR was closed because it has been stalled for 10 days with no activity."
# Issues: stale after 30 days, closed 5 days later. Pull requests: stale after 45 days, closed 10 days later.
days-before-issue-stale: 30
days-before-pr-stale: 45
days-before-issue-close: 5
days-before-pr-close: 10
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.11"
version = "0.0.12"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand Down
37 changes: 31 additions & 6 deletions src/anchors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ pub struct AnchorHandler {
impl TagHandler for AnchorHandler {
fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) {
self.start_pos = printer.data.len();

// try to extract a hyperlink
self.url = match tag.data {
NodeData::Element { ref attrs, .. } => {
let attrs = attrs.borrow();
Expand All @@ -22,7 +20,7 @@ impl TagHandler for AnchorHandler {
.find(|attr| attr.name.local.as_bytes() == b"href");

match href {
Some(link) => link.value.trim().into(),
Some(link) => link.value.trim_ascii().into(),
None => String::new(),
}
}
Expand All @@ -31,8 +29,35 @@ impl TagHandler for AnchorHandler {
}

/// Turns everything printed since `handle` recorded `start_pos` into a Markdown
/// link of the form `[text](url)`.
///
/// If the printed text begins with a newline, the opening `[` is placed after it;
/// if it ends with a newline, the closing `](url)` is placed before it. This keeps
/// the link markup on the same line as the link text.
fn after_handle(&mut self, printer: &mut StructuredPrinter) {
    // Look at the text emitted for this anchor. If the slice cannot be taken
    // (e.g. `start_pos` is out of range) behave as if there were no newlines.
    let (starts_new_line, ends_new_line) = printer
        .data
        .get(self.start_pos..)
        .map_or((false, false), |text| {
            (text.starts_with('\n'), text.ends_with('\n'))
        });

    // Opening bracket: skip over a leading newline.
    let open_at = if starts_new_line {
        self.start_pos + 1
    } else {
        self.start_pos
    };
    printer.insert_str(open_at, "[");

    // Closing bracket and URL.
    let link_target = format!("]({})", self.url);
    let mut placed = false;
    if ends_new_line {
        let newline_at = printer.data.len() - 1;
        // Only tuck the closing part in front of the trailing newline when that
        // newline comes after the "[" inserted above. For text that is a single
        // "\n" the leading and trailing newline are the same character, and
        // inserting there would produce "](url)[" instead of "[](url)".
        if newline_at > open_at {
            printer.insert_str(newline_at, &link_target);
            placed = true;
        }
    }
    if !placed {
        printer.append_str(&link_target);
    }
}
}
20 changes: 12 additions & 8 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ pub fn parse_html_custom(
Ok(dom) => {
let mut result = StructuredPrinter::default();
walk(&dom.document, &mut result, custom, commonmark);
// we want to eventually remove the clean step.
clean_markdown(&result.data)
}
_ => Default::default(),
Expand Down Expand Up @@ -128,30 +129,31 @@ fn walk(
NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {}
NodeData::Text { ref contents } => {
let mut text = contents.borrow().to_string();
let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");

let inside_pre = result.parent_chain.iter().any(|t| t == "pre");
if inside_pre {
// this is preformatted text, insert as-is
result.append_str(&text);

} else if !(text.trim().len() == 0
&& (result.data.chars().last() == Some('\n')
|| result.data.chars().last() == Some(' ')))
{
// reached unless the text is whitespace-only and the output already ends with a newline or a space

// regular text, collapse whitespace and newlines in text
let inside_code = result.parent_chain.iter().any(|tag| tag == "code");
let inside_code = result.parent_chain.iter().any(|t| t == "code");
if !inside_code {
text = escape_markdown(result, &text);
}
let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
let minified_text = minified_text.trim_matches(|ch: char| ch == '\n' || ch == '\r');
result.append_str(&minified_text);
result.append_str(&minified_text.trim_ascii());
}
}
NodeData::Comment { .. } => {} // ignore comments
NodeData::Element { ref name, .. } => {
tag_name = name.local.to_string();
let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
tag_name = name.local.to_string();

if inside_pre {
// don't add any html tags inside the pre section
Expand Down Expand Up @@ -200,22 +202,24 @@ fn walk(
}
}

let ignore_tags = tag_name == "style" || tag_name == "script";

// let the handler process this tag before it is pushed onto the parent chain
// and before a sibling list is created for its children
handler.handle(&input, result);

// save this tag name as parent for child nodes
result.parent_chain.push(tag_name.to_string()); // e.g. it was ["body"] and now it's ["body", "p"]
result.parent_chain.push(tag_name.clone()); // e.g. it was ["body"] and now it's ["body", "p"]
let current_depth = result.parent_chain.len(); // e.g. it was 1 and now it's 2

// create space for siblings of next level
result.siblings.insert(current_depth, vec![]);

for child in input.children.borrow().iter() {
if handler.skip_descendants() {
if handler.skip_descendants() || ignore_tags {
continue;
}

walk(&child, result, custom, commonmark);

match child.data {
Expand Down
Loading

0 comments on commit 2ac4f09

Please sign in to comment.