Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update pulldown_cmark to 0.10 #2308

Merged
merged 1 commit into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ handlebars = "5.0"
log = "0.4.17"
memchr = "2.5.0"
opener = "0.6.1"
pulldown-cmark = { version = "0.9.3", default-features = false }
pulldown-cmark = { version = "0.10.0", default-features = false, features = ["html"] }
regex = "1.8.1"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
Expand Down
55 changes: 36 additions & 19 deletions src/book/summary.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::errors::*;
use log::{debug, trace, warn};
use memchr::{self, Memchr};
use pulldown_cmark::{self, Event, HeadingLevel, Tag};
use memchr::Memchr;
use pulldown_cmark::{DefaultBrokenLinkCallback, Event, HeadingLevel, Tag, TagEnd};
use serde::{Deserialize, Serialize};
use std::fmt::{self, Display, Formatter};
use std::iter::FromIterator;
Expand Down Expand Up @@ -163,7 +163,7 @@ impl From<Link> for SummaryItem {
/// > match the following regex: "[^<>\n[]]+".
struct SummaryParser<'a> {
src: &'a str,
stream: pulldown_cmark::OffsetIter<'a, 'a>,
stream: pulldown_cmark::OffsetIter<'a, DefaultBrokenLinkCallback>,
offset: usize,

/// We can't actually put an event back into the `OffsetIter` stream, so instead we store it
Expand Down Expand Up @@ -210,7 +210,7 @@ macro_rules! collect_events {
}

impl<'a> SummaryParser<'a> {
fn new(text: &str) -> SummaryParser<'_> {
fn new(text: &'a str) -> SummaryParser<'a> {
let pulldown_parser = pulldown_cmark::Parser::new(text).into_offset_iter();

SummaryParser {
Expand Down Expand Up @@ -265,7 +265,12 @@ impl<'a> SummaryParser<'a> {
loop {
match self.next_event() {
Some(ev @ Event::Start(Tag::List(..)))
| Some(ev @ Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => {
| Some(
ev @ Event::Start(Tag::Heading {
level: HeadingLevel::H1,
..
}),
) => {
if is_prefix {
// we've finished prefix chapters and are at the start
// of the numbered section.
Expand All @@ -275,8 +280,8 @@ impl<'a> SummaryParser<'a> {
bail!(self.parse_error("Suffix chapters cannot be followed by a list"));
}
}
Some(Event::Start(Tag::Link(_type, href, _title))) => {
let link = self.parse_link(href.to_string());
Some(Event::Start(Tag::Link { dest_url, .. })) => {
let link = self.parse_link(dest_url.to_string());
items.push(SummaryItem::Link(link));
}
Some(Event::Rule) => items.push(SummaryItem::Separator),
Expand Down Expand Up @@ -304,10 +309,13 @@ impl<'a> SummaryParser<'a> {
break;
}

Some(Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => {
Some(Event::Start(Tag::Heading {
level: HeadingLevel::H1,
..
})) => {
debug!("Found a h1 in the SUMMARY");

let tags = collect_events!(self.stream, end Tag::Heading(HeadingLevel::H1, ..));
let tags = collect_events!(self.stream, end TagEnd::Heading(HeadingLevel::H1));
Some(stringify_events(tags))
}

Expand Down Expand Up @@ -336,7 +344,7 @@ impl<'a> SummaryParser<'a> {
/// Finishes parsing a link once the `Event::Start(Tag::Link(..))` has been opened.
fn parse_link(&mut self, href: String) -> Link {
let href = href.replace("%20", " ");
let link_content = collect_events!(self.stream, end Tag::Link(..));
let link_content = collect_events!(self.stream, end TagEnd::Link);
let name = stringify_events(link_content);

let path = if href.is_empty() {
Expand Down Expand Up @@ -377,7 +385,12 @@ impl<'a> SummaryParser<'a> {
}
// The expectation is that pulldown cmark will terminate a paragraph before a new
// heading, so we can always count on this to return without skipping headings.
Some(ev @ Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => {
Some(
ev @ Event::Start(Tag::Heading {
level: HeadingLevel::H1,
..
}),
) => {
// we're starting a new part
self.back(ev);
break;
Expand All @@ -398,7 +411,7 @@ impl<'a> SummaryParser<'a> {

// Skip over the contents of this tag
while let Some(event) = self.next_event() {
if event == Event::End(other_tag.clone()) {
if event == Event::End(other_tag.clone().into()) {
break;
}
}
Expand Down Expand Up @@ -469,7 +482,7 @@ impl<'a> SummaryParser<'a> {

last_item.nested_items = sub_items;
}
Some(Event::End(Tag::List(..))) => break,
Some(Event::End(TagEnd::List(..))) => break,
Some(_) => {}
None => break,
}
Expand All @@ -486,8 +499,8 @@ impl<'a> SummaryParser<'a> {
loop {
match self.next_event() {
Some(Event::Start(Tag::Paragraph)) => continue,
Some(Event::Start(Tag::Link(_type, href, _title))) => {
let mut link = self.parse_link(href.to_string());
Some(Event::Start(Tag::Link { dest_url, .. })) => {
let mut link = self.parse_link(dest_url.to_string());

let mut number = parent.clone();
number.0.push(num_existing_items as u32 + 1);
Expand Down Expand Up @@ -529,14 +542,18 @@ impl<'a> SummaryParser<'a> {
fn parse_title(&mut self) -> Option<String> {
loop {
match self.next_event() {
Some(Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => {
Some(Event::Start(Tag::Heading {
level: HeadingLevel::H1,
..
})) => {
debug!("Found a h1 in the SUMMARY");

let tags = collect_events!(self.stream, end Tag::Heading(HeadingLevel::H1, ..));
let tags = collect_events!(self.stream, end TagEnd::Heading(HeadingLevel::H1));
return Some(stringify_events(tags));
}
// Skip a HTML element such as a comment line.
Some(Event::Html(_)) => {}
Some(Event::Html(_) | Event::InlineHtml(_))
| Some(Event::Start(Tag::HtmlBlock) | Event::End(TagEnd::HtmlBlock)) => {}
// Otherwise, no title.
Some(ev) => {
self.back(ev);
Expand Down Expand Up @@ -744,7 +761,7 @@ mod tests {
let _ = parser.stream.next(); // Discard opening paragraph

let href = match parser.stream.next() {
Some((Event::Start(Tag::Link(_type, href, _title)), _range)) => href.to_string(),
Some((Event::Start(Tag::Link { dest_url, .. }), _range)) => dest_url.to_string(),
other => panic!("Unreachable, {:?}", other),
};

Expand Down
54 changes: 41 additions & 13 deletions src/renderer/html_handlebars/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,23 @@ fn add_doc(
index: &mut Index,
doc_urls: &mut Vec<String>,
anchor_base: &str,
section_id: &Option<String>,
heading: &str,
id_counter: &mut HashMap<String, usize>,
section_id: &Option<CowStr<'_>>,
items: &[&str],
) {
let url = if let Some(ref id) = *section_id {
// Either use the explicit section id the user specified, or generate one
// from the heading content.
let section_id = section_id.as_ref().map(|id| id.to_string()).or_else(|| {
if heading.is_empty() {
// In the case where a chapter has no heading, don't set a section id.
None
} else {
Some(utils::unique_id_from_content(heading, id_counter))
}
});

let url = if let Some(id) = section_id {
Cow::Owned(format!("{}#{}", anchor_base, id))
} else {
Cow::Borrowed(anchor_base)
Expand Down Expand Up @@ -119,30 +132,29 @@ fn render_item(
let mut id_counter = HashMap::new();
while let Some(event) = p.next() {
match event {
Event::Start(Tag::Heading(i, ..)) if i as u32 <= max_section_depth => {
Event::Start(Tag::Heading { level, id, .. }) if level as u32 <= max_section_depth => {
if !heading.is_empty() {
// Section finished, the next heading is following now
// Write the data to the index, and clear it for the next section
add_doc(
index,
doc_urls,
&anchor_base,
&heading,
&mut id_counter,
&section_id,
&[&heading, &body, &breadcrumbs.join(" » ")],
);
section_id = None;
heading.clear();
body.clear();
breadcrumbs.pop();
}

section_id = id;
in_heading = true;
}
Event::End(Tag::Heading(i, id, _classes)) if i as u32 <= max_section_depth => {
Event::End(TagEnd::Heading(level)) if level as u32 <= max_section_depth => {
in_heading = false;
section_id = id
.map(|id| id.to_string())
.or_else(|| Some(utils::unique_id_from_content(&heading, &mut id_counter)));
breadcrumbs.push(heading.clone());
}
Event::Start(Tag::FootnoteDefinition(name)) => {
Expand All @@ -159,9 +171,19 @@ fn render_item(
html_block.push_str(html);
p.next();
}

body.push_str(&clean_html(&html_block));
}
Event::InlineHtml(html) => {
// This is not capable of cleaning inline tags like
// `foo <script>…</script>`. The `<script>` tags show up as
// individual InlineHtml events, and the content inside is
// just a regular Text event. There isn't a very good way to
// know how to collect all the content in-between. I'm not
// sure if this is easily fixable. It should be extremely
// rare, since script and style tags should almost always be
// blocks, and worse case you have some noise in the index.
body.push_str(&clean_html(&html));
}
Event::Start(_) | Event::End(_) | Event::Rule | Event::SoftBreak | Event::HardBreak => {
// Insert spaces where HTML output would usually separate text
// to ensure words don't get merged together
Expand All @@ -188,18 +210,24 @@ fn render_item(
}

if !body.is_empty() || !heading.is_empty() {
if heading.is_empty() {
let title = if heading.is_empty() {
if let Some(chapter) = breadcrumbs.first() {
heading = chapter.clone();
chapter
} else {
""
}
}
} else {
&heading
};
// Make sure the last section is added to the index
add_doc(
index,
doc_urls,
&anchor_base,
&heading,
&mut id_counter,
&section_id,
&[&heading, &body, &breadcrumbs.join(" » ")],
&[title, &body, &breadcrumbs.join(" » ")],
);
}

Expand Down
35 changes: 26 additions & 9 deletions src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pub(crate) mod toml_ext;
use crate::errors::Error;
use log::error;
use once_cell::sync::Lazy;
use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag};
use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag, TagEnd};
use regex::Regex;

use std::borrow::Cow;
Expand Down Expand Up @@ -161,13 +161,30 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> {
}

match event {
Event::Start(Tag::Link(link_type, dest, title)) => {
Event::Start(Tag::Link(link_type, fix(dest, path), title))
}
Event::Start(Tag::Image(link_type, dest, title)) => {
Event::Start(Tag::Image(link_type, fix(dest, path), title))
}
Event::Start(Tag::Link {
link_type,
dest_url,
title,
id,
}) => Event::Start(Tag::Link {
link_type,
dest_url: fix(dest_url, path),
title,
id,
}),
Event::Start(Tag::Image {
link_type,
dest_url,
title,
id,
}) => Event::Start(Tag::Image {
link_type,
dest_url: fix(dest_url, path),
title,
id,
}),
Event::Html(html) => Event::Html(fix_html(html, path)),
Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path)),
_ => event,
}
}
Expand All @@ -177,7 +194,7 @@ pub fn render_markdown(text: &str, curly_quotes: bool) -> String {
render_markdown_with_path(text, curly_quotes, None)
}

pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_, '_> {
pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_> {
let mut opts = Options::empty();
opts.insert(Options::ENABLE_TABLES);
opts.insert(Options::ENABLE_FOOTNOTES);
Expand Down Expand Up @@ -212,7 +229,7 @@ fn wrap_tables(event: Event<'_>) -> (Option<Event<'_>>, Option<Event<'_>>) {
Some(Event::Html(r#"<div class="table-wrapper">"#.into())),
Some(event),
),
Event::End(Tag::Table(_)) => (Some(event), Some(Event::Html(r#"</div>"#.into()))),
Event::End(TagEnd::Table) => (Some(event), Some(Event::Html(r#"</div>"#.into()))),
_ => (Some(event), None),
}
}
Expand Down
4 changes: 4 additions & 0 deletions tests/dummy_book/src/conclusion.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,7 @@ css looks, like this {
}
*/
</style>

Sneaky inline event <script>alert("inline");</script>.

But regular <b>inline</b> is indexed.
Loading
Loading