Skip to content

Commit

Permalink
No longer resolve predefined entities in unescape_with
Browse files Browse the repository at this point in the history
  • Loading branch information
phdavis1027 authored and Mingun committed May 30, 2024
1 parent 7a4995e commit cf27e0d
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 24 deletions.
5 changes: 2 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,12 @@ async-tokio = ["tokio"]
## [#158]: https://github.com/tafia/quick-xml/issues/158
encoding = ["encoding_rs"]

## Enables support for recognizing all [HTML 5 entities] in [`unescape`] and
## [`unescape_with`] functions. The full list of entities also can be found in
## Enables support for recognizing all [HTML 5 entities] in the [`unescape`]
## function. The full list of entities can also be found in
## <https://html.spec.whatwg.org/entities.json>.
##
## [HTML 5 entities]: https://dev.w3.org/html5/html-author/charref
## [`unescape`]: crate::escape::unescape
## [`unescape_with`]: crate::escape::unescape_with
escape-html = []

## This feature is for the Serde deserializer that enables support for deserializing
Expand Down
8 changes: 8 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ The method of reporting positions of errors has changed - use `error_position()`
to get an offset of the error position. For `SyntaxError`s the range
`error_position()..buffer_position()` also will represent a span of error.

The way entities are resolved by `unescape_with` has changed. This family of methods no longer
resolves predefined entities.

### New Features

- [#513]: Allow to continue parsing after getting new `Error::IllFormed`.
Expand All @@ -39,6 +42,10 @@ to get an offset of the error position. For `SyntaxError`s the range
- `quick_xml::escape::resolve_predefined_entity`
- `quick_xml::escape::resolve_xml_entity`
- `quick_xml::escape::resolve_html5_entity`
- [#734]: Rename `NoEntityResolver` to `PredefinedEntityResolver`.
- [#734]: No longer resolve predefined entities (`lt`, `gt`, `apos`, `quot`, `amp`)
in the `unescape_with` family of methods. You should do that yourself using the functions
listed above.

### Bug Fixes

Expand Down Expand Up @@ -87,6 +94,7 @@ to get an offset of the error position. For `SyntaxError`s the range
[#704]: https://github.com/tafia/quick-xml/pull/704
[#705]: https://github.com/tafia/quick-xml/pull/705
[#722]: https://github.com/tafia/quick-xml/pull/722
[#734]: https://github.com/tafia/quick-xml/pull/734
[#738]: https://github.com/tafia/quick-xml/pull/738
[#743]: https://github.com/tafia/quick-xml/pull/743
[#748]: https://github.com/tafia/quick-xml/pull/748
Expand Down
8 changes: 6 additions & 2 deletions examples/custom_entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

use std::collections::HashMap;

use quick_xml::escape::resolve_predefined_entity;
use quick_xml::events::Event;
use quick_xml::reader::Reader;
use regex::bytes::Regex;
Expand Down Expand Up @@ -59,8 +60,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
Ok(Event::Text(ref e)) => {
println!(
"text value: {}",
e.unescape_with(|ent| custom_entities.get(ent).map(|s| s.as_str()))
.unwrap()
e.unescape_with(|ent| match custom_entities.get(ent) {
Some(s) => Some(s.as_str()),
None => resolve_predefined_entity(ent),
})
.unwrap()
);
}
Ok(Event::Eof) => break,
Expand Down
10 changes: 5 additions & 5 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1997,7 +1997,7 @@ mod text;
mod var;

pub use crate::errors::serialize::DeError;
pub use resolver::{EntityResolver, NoEntityResolver};
pub use resolver::{EntityResolver, PredefinedEntityResolver};

use crate::{
de::map::ElementMapAccess,
Expand Down Expand Up @@ -2125,7 +2125,7 @@ impl<'a> PayloadEvent<'a> {
/// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s.
/// [`PayloadEvent::Text`] events that are followed by any event except
/// [`PayloadEvent::Text`] or [`PayloadEvent::CData`] are trimmed from the end.
struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = NoEntityResolver> {
struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolver> {
/// A source of low-level XML events
reader: R,
/// Intermediate event, that could be returned by the next call to `next()`.
Expand Down Expand Up @@ -2356,7 +2356,7 @@ where
////////////////////////////////////////////////////////////////////////////////////////////////////

/// A structure that deserializes XML into Rust values.
pub struct Deserializer<'de, R, E: EntityResolver = NoEntityResolver>
pub struct Deserializer<'de, R, E: EntityResolver = PredefinedEntityResolver>
where
R: XmlRead<'de>,
{
Expand Down Expand Up @@ -2799,7 +2799,7 @@ impl<'de> Deserializer<'de, SliceReader<'de>> {
/// Deserializer created with this method will not resolve custom entities.
#[allow(clippy::should_implement_trait)]
pub fn from_str(source: &'de str) -> Self {
Self::from_str_with_resolver(source, NoEntityResolver)
Self::from_str_with_resolver(source, PredefinedEntityResolver)
}
}

Expand Down Expand Up @@ -2837,7 +2837,7 @@ where
///
/// Deserializer created with this method will not resolve custom entities.
pub fn from_reader(reader: R) -> Self {
Self::with_resolver(reader, NoEntityResolver)
Self::with_resolver(reader, PredefinedEntityResolver)
}
}

Expand Down
21 changes: 16 additions & 5 deletions src/de/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
use std::convert::Infallible;
use std::error::Error;

use crate::escape::resolve_predefined_entity;
use crate::events::BytesText;

/// Used to resolve unknown entities while parsing
Expand Down Expand Up @@ -87,18 +88,28 @@ pub trait EntityResolver {
fn resolve(&self, entity: &str) -> Option<&str>;
}

/// An `EntityResolver` that does nothing and always returns `None`.
/// An [`EntityResolver`] that resolves only predefined entities:
///
/// | Entity | Resolution
/// |--------|------------
/// |`&lt;` | `<`
/// |`&gt;` | `>`
/// |`&amp;` | `&`
/// |`&apos;`| `'`
/// |`&quot;`| `"`
#[derive(Default, Copy, Clone)]
pub struct NoEntityResolver;
pub struct PredefinedEntityResolver;

impl EntityResolver for NoEntityResolver {
impl EntityResolver for PredefinedEntityResolver {
type Error = Infallible;

#[inline]
fn capture(&mut self, _doctype: BytesText) -> Result<(), Self::Error> {
Ok(())
}

fn resolve(&self, _entity: &str) -> Option<&str> {
None
#[inline]
fn resolve(&self, entity: &str) -> Option<&str> {
resolve_predefined_entity(entity)
}
}
41 changes: 34 additions & 7 deletions src/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,16 +184,48 @@ pub(crate) fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str>
/// [`escape-html`]: ../index.html#escape-html
/// [HTML5 escapes]: https://dev.w3.org/html5/html-author/charref
pub fn unescape(raw: &str) -> Result<Cow<str>, EscapeError> {
unescape_with(raw, |_| None)
unescape_with(raw, resolve_predefined_entity)
}

/// Unescape an `&str` and replaces all xml escaped characters (`&...;`) into
/// their corresponding value, using a resolver function for custom entities.
///
/// If feature [`escape-html`] is enabled, then recognizes all [HTML5 escapes].
///
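/// Predefined entities are _not_ resolved automatically: `resolve_entity` is the only
/// source of named entities, which allows you to override the default behavior, as is
/// required in some XML dialects. Delegate to [`resolve_predefined_entity`] from your
/// resolver if the standard entities should still be recognized.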
///
/// Character references (`&#hh;`) cannot be overridden, they are resolved before
/// calling `resolve_entity`.
///
/// Note that entities will not be resolved recursively. In order to satisfy the
/// XML [requirements] you should unescape nested entities yourself.
///
/// # Example
///
/// ```
/// use quick_xml::escape::resolve_xml_entity;
/// # use quick_xml::escape::unescape_with;
/// # use pretty_assertions::assert_eq;
/// let override_named_entities = |entity: &str| match entity {
/// // Override standard entities
/// "lt" => Some("FOO"),
/// "gt" => Some("BAR"),
/// // Resolve custom entities
/// "baz" => Some("&lt;"),
/// // Delegate other entities to the default implementation
/// _ => resolve_xml_entity(entity),
/// };
///
/// assert_eq!(
/// unescape_with("&amp;&lt;test&gt;&baz;", override_named_entities).unwrap(),
/// "&FOOtestBAR&lt;"
/// );
/// ```
///
/// [`escape-html`]: ../index.html#escape-html
/// [HTML5 escapes]: https://dev.w3.org/html5/html-author/charref
/// [requirements]: https://www.w3.org/TR/xml11/#intern-replacement
pub fn unescape_with<'input, 'entity, F>(
raw: &'input str,
mut resolve_entity: F,
Expand Down Expand Up @@ -221,8 +253,6 @@ where
if let Some(entity) = pat.strip_prefix('#') {
let codepoint = parse_number(entity, start..end)?;
unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
} else if let Some(value) = resolve_predefined_entity(pat) {
unescaped.push_str(value);
} else if let Some(value) = resolve_entity(pat) {
unescaped.push_str(value);
} else {
Expand Down Expand Up @@ -1840,10 +1870,7 @@ fn test_unescape_with() {
assert_eq!(unchanged, Cow::Borrowed("test"));
assert!(matches!(unchanged, Cow::Borrowed(_)));

assert_eq!(
unescape_with("&lt;test&gt;", custom_entities).unwrap(),
"<test>"
);
assert!(unescape_with("&lt;", custom_entities).is_err());
assert_eq!(unescape_with("&#x30;", custom_entities).unwrap(), "0");
assert_eq!(unescape_with("&#48;", custom_entities).unwrap(), "0");
assert_eq!(unescape_with("&foo;", custom_entities).unwrap(), "BAR");
Expand Down
6 changes: 4 additions & 2 deletions src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ use std::str::from_utf8;

use crate::encoding::Decoder;
use crate::errors::{Error, IllFormedError, Result};
use crate::escape::{escape, minimal_escape, partial_escape, unescape_with};
use crate::escape::{
escape, minimal_escape, partial_escape, resolve_predefined_entity, unescape_with,
};
use crate::name::{LocalName, QName};
use crate::reader::is_whitespace;
use crate::utils::write_cow_string;
Expand Down Expand Up @@ -748,7 +750,7 @@ impl<'a> BytesText<'a> {
/// This will allocate if the value contains any escape sequences or is in
/// a non-UTF-8 encoding.
pub fn unescape(&self) -> Result<Cow<'a, str>> {
self.unescape_with(|_| None)
self.unescape_with(resolve_predefined_entity)
}

/// Decodes then unescapes the content of the event with custom entities.
Expand Down

0 comments on commit cf27e0d

Please sign in to comment.