From 9ba6dc7a955ed23884e8e25f8d503c4fe144c27a Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 8 Jul 2024 14:40:54 +1000 Subject: [PATCH] Make Entities.escape(string) suitable for both text and attributes Fixes #1278 --- CHANGES.md | 10 ++++-- src/main/java/org/jsoup/nodes/Attribute.java | 2 +- src/main/java/org/jsoup/nodes/Entities.java | 35 +++++++++++++------ src/main/java/org/jsoup/nodes/TextNode.java | 2 +- .../java/org/jsoup/nodes/XmlDeclaration.java | 2 +- .../java/org/jsoup/nodes/EntitiesTest.java | 19 ++++++---- 6 files changed, 48 insertions(+), 22 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 3758b9a417..bf985e0963 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -40,8 +40,14 @@ e.g.: `h1:has(+h2)`). [2137](https://github.com/jhy/jsoup/issues/2137) * The `:empty` selector incorrectly matched elements that started with a blank text node and were followed by non-empty nodes, due to an incorrect short-circuit. [2130](https://github.com/jhy/jsoup/issues/2130) -* `Element.cssSelector()` would fail with "Did not find balanced marker" when building a selector for elements that had a `(` or `[` in their class names. And selectors with those characters escaped would not match as expected. [2146](https://github.com/jhy/jsoup/issues/2146) -* Fuzz: a Stack Overflow exception could occur when resolving a crafted `` URL, in the normalizing regex. +* `Element.cssSelector()` would fail with "Did not find balanced marker" when building a selector for elements that had + a `(` or `[` in their class names. And selectors with those characters escaped would not match as + expected. [2146](https://github.com/jhy/jsoup/issues/2146) +* Updated `Entities.escape(string)` to make the escaped text suitable for both text nodes and attributes (previously was + only for text nodes). This does not impact the output of `Element.html()` which correctly applies a minimal escape + depending on if the use will be for text data or in a quoted + attribute. [1278](https://github.com/jhy/jsoup/issues/1278) +* Fuzz: a Stack Overflow exception could occur when resolving a crafted `` URL, in the normalizing regex. [2165](https://github.com/jhy/jsoup/issues/2165) --- diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java index 5f0e2d356a..bd7be7f748 100644 --- a/src/main/java/org/jsoup/nodes/Attribute.java +++ b/src/main/java/org/jsoup/nodes/Attribute.java @@ -168,7 +168,7 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D accum.append(key); if (!shouldCollapseAttribute(key, val, out)) { accum.append("=\""); - Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false, false); + Entities.escape(accum, Attributes.checkNotNull(val) , out, false, true, false, false, false); accum.append('"'); } } diff --git a/src/main/java/org/jsoup/nodes/Entities.java b/src/main/java/org/jsoup/nodes/Entities.java index 3fb39a8070..a5585780ee 100644 --- a/src/main/java/org/jsoup/nodes/Entities.java +++ b/src/main/java/org/jsoup/nodes/Entities.java @@ -131,18 +131,20 @@ public static int codepointsForName(final String name, final int[] codepoints) { } /** - * HTML escape an input string. That is, {@code <} is returned as {@code <} - * - * @param string the un-escaped string to escape - * @param out the output settings to use - * @return the escaped string + HTML escape an input string. That is, {@code <} is returned as {@code <}. The escaped string is suitable for use + both in attributes and in text data. + @param string the un-escaped string to escape + @param out the output settings to use. This configures the character set escaped against (that is, if a + character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML + settings. + @return the escaped string */ public static String escape(String string, OutputSettings out) { if (string == null) return ""; StringBuilder accum = StringUtil.borrowBuilder(); try { - escape(accum, string, out, false, false, false, false); + escape(accum, string, out, true, true, false, false, false); // for text and for attribute; preserve whitespaces } catch (IOException e) { throw new SerializationException(e); // doesn't happen } @@ -151,10 +153,11 @@ public static String escape(String string, OutputSettings out) { /** * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as - * {@code <} + * {@code <}. The escaped string is suitable for use both in attributes and in text data. * * @param string the un-escaped string to escape * @return the escaped string + * @see #escape(String, OutputSettings) */ public static String escape(String string) { if (DefaultOutput == null) @@ -165,7 +168,7 @@ public static String escape(String string) { // this method does a lot, but other breakups cause rescanning and stringbuilder generations static void escape(Appendable accum, String string, OutputSettings out, - boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException { + boolean forText, boolean forAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException { boolean lastWasWhite = false; boolean reachedNonWhite = false; @@ -215,23 +218,33 @@ static void escape(Appendable accum, String string, OutputSettings out, break; case '<': // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val - if (!inAttribute || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml) + if (forText || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml) accum.append("<"); else accum.append(c); break; case '>': - if (!inAttribute) + if (forText) accum.append(">"); else accum.append(c); break; case '"': - if (inAttribute) + if (forAttribute) accum.append("""); else accum.append(c); break; + case '\'': + if (forAttribute && forText) { // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape. + if (escapeMode == EscapeMode.xhtml) + accum.append("'"); + else + accum.append("'"); + } + else + accum.append(c); + break; // we escape ascii control