Skip to content

Commit

Permalink
Make Entities.escape(string) suitable for both text and attributes
Browse files Browse the repository at this point in the history
Fixes #1278
  • Loading branch information
jhy committed Jul 8, 2024
1 parent a0537c7 commit 9ba6dc7
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 22 deletions.
10 changes: 8 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,14 @@
e.g.: `h1:has(+h2)`). [2137](https://github.com/jhy/jsoup/issues/2137)
* The `:empty` selector incorrectly matched elements that started with a blank text node and were followed by
non-empty nodes, due to an incorrect short-circuit. [2130](https://github.com/jhy/jsoup/issues/2130)
* `Element.cssSelector()` would fail with "Did not find balanced marker" when building a selector for elements that had a `(` or `[` in their class names. And selectors with those characters escaped would not match as expected. [2146](https://github.com/jhy/jsoup/issues/2146)
* Fuzz: a Stack Overflow exception could occur when resolving a crafted `<base href>` URL, in the normalizing regex.
* `Element.cssSelector()` would fail with "Did not find balanced marker" when building a selector for elements that had
a `(` or `[` in their class names. And selectors with those characters escaped would not match as
expected. [2146](https://github.com/jhy/jsoup/issues/2146)
* Updated `Entities.escape(string)` to make the escaped text suitable for both text nodes and attributes (previously was
only for text nodes). This does not impact the output of `Element.html()` which correctly applies a minimal escape
depending on if the use will be for text data or in a quoted
attribute. [1278](https://github.com/jhy/jsoup/issues/1278)
* Fuzz: a Stack Overflow exception could occur when resolving a crafted `<base href>` URL, in the normalizing regex.
[2165](https://github.com/jhy/jsoup/issues/2165)

---
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/Attribute.java
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D
accum.append(key);
if (!shouldCollapseAttribute(key, val, out)) {
accum.append("=\"");
Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false, false);
Entities.escape(accum, Attributes.checkNotNull(val) , out, false, true, false, false, false);
accum.append('"');
}
}
Expand Down
35 changes: 24 additions & 11 deletions src/main/java/org/jsoup/nodes/Entities.java
Original file line number Diff line number Diff line change
Expand Up @@ -131,18 +131,20 @@ public static int codepointsForName(final String name, final int[] codepoints) {
}

/**
* HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
*
* @param string the un-escaped string to escape
* @param out the output settings to use
* @return the escaped string
HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
both in attributes and in text data.
@param string the un-escaped string to escape
@param out the output settings to use. This configures the character set escaped against (that is, if a
character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML
settings.
@return the escaped string
*/
public static String escape(String string, OutputSettings out) {
if (string == null)
return "";
StringBuilder accum = StringUtil.borrowBuilder();
try {
escape(accum, string, out, false, false, false, false);
escape(accum, string, out, true, true, false, false, false); // for text and for attribute; preserve whitespaces
} catch (IOException e) {
throw new SerializationException(e); // doesn't happen
}
Expand All @@ -151,10 +153,11 @@ public static String escape(String string, OutputSettings out) {

/**
* HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
* {@code &lt;}
* {@code &lt;}. The escaped string is suitable for use both in attributes and in text data.
*
* @param string the un-escaped string to escape
* @return the escaped string
* @see #escape(String, OutputSettings)
*/
public static String escape(String string) {
if (DefaultOutput == null)
Expand All @@ -165,7 +168,7 @@ public static String escape(String string) {

// this method does a lot, but other breakups cause rescanning and stringbuilder generations
static void escape(Appendable accum, String string, OutputSettings out,
boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException {
boolean forText, boolean forAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException {

boolean lastWasWhite = false;
boolean reachedNonWhite = false;
Expand Down Expand Up @@ -215,23 +218,33 @@ static void escape(Appendable accum, String string, OutputSettings out,
break;
case '<':
// escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
if (!inAttribute || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml)
if (forText || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml)
accum.append("&lt;");
else
accum.append(c);
break;
case '>':
if (!inAttribute)
if (forText)
accum.append("&gt;");
else
accum.append(c);
break;
case '"':
if (inAttribute)
if (forAttribute)
accum.append("&quot;");
else
accum.append(c);
break;
case '\'':
if (forAttribute && forText) { // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape.
if (escapeMode == EscapeMode.xhtml)
accum.append("&#x27;");
else
accum.append("&apos;");
}
else
accum.append(c);
break;
// we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
case 0x9:
case 0xA:
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/TextNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) thr
indent(accum, depth, out);
}

Entities.escape(accum, coreValue(), out, false, normaliseWhite, trimLeading, trimTrailing);
Entities.escape(accum, coreValue(), out, true, false, normaliseWhite, trimLeading, trimTrailing);
}

@Override
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/XmlDeclaration.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ private void getWholeDeclaration(Appendable accum, Document.OutputSettings out)
accum.append(key);
if (!val.isEmpty()) {
accum.append("=\"");
Entities.escape(accum, val, out, true, false, false, false);
Entities.escape(accum, val, out, false, true, false, false, false);
accum.append('"');
}
}
Expand Down
19 changes: 13 additions & 6 deletions src/test/java/org/jsoup/nodes/EntitiesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,19 @@

public class EntitiesTest {
@Test public void escape() {
String text = "Hello &<> Å å π 新 there ¾ © »";
// escape is maximal (as in the escapes cover use in both text and attributes; vs Element.html() which checks if attribute or text and minimises escapes
String text = "Hello &<> Å å π 新 there ¾ © » ' \"";
String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base));
String escapedAsciiFull = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(extended));
String escapedAsciiXhtml = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(xhtml));
String escapedUtfFull = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended));
String escapedUtfMin = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(xhtml));

assertEquals("Hello &amp;&lt;&gt; &Aring; &aring; &#x3c0; &#x65b0; there &frac34; &copy; &raquo;", escapedAscii);
assertEquals("Hello &amp;&lt;&gt; &angst; &aring; &pi; &#x65b0; there &frac34; &copy; &raquo;", escapedAsciiFull);
assertEquals("Hello &amp;&lt;&gt; &#xc5; &#xe5; &#x3c0; &#x65b0; there &#xbe; &#xa9; &#xbb;", escapedAsciiXhtml);
assertEquals("Hello &amp;&lt;&gt; Å å π 新 there ¾ © »", escapedUtfFull);
assertEquals("Hello &amp;&lt;&gt; Å å π 新 there ¾ © »", escapedUtfMin);
assertEquals("Hello &amp;&lt;&gt; &Aring; &aring; &#x3c0; &#x65b0; there &frac34; &copy; &raquo; &apos; &quot;", escapedAscii);
assertEquals("Hello &amp;&lt;&gt; &angst; &aring; &pi; &#x65b0; there &frac34; &copy; &raquo; &apos; &quot;", escapedAsciiFull);
assertEquals("Hello &amp;&lt;&gt; &#xc5; &#xe5; &#x3c0; &#x65b0; there &#xbe; &#xa9; &#xbb; &#x27; &quot;", escapedAsciiXhtml);
assertEquals("Hello &amp;&lt;&gt; Å å π 新 there ¾ © » &apos; &quot;", escapedUtfFull);
assertEquals("Hello &amp;&lt;&gt; Å å π 新 there ¾ © » &#x27; &quot;", escapedUtfMin);
// odd that it's defined as aring in base but angst in full

// round trip
Expand All @@ -33,6 +34,12 @@ public class EntitiesTest {
assertEquals(text, Entities.unescape(escapedUtfMin));
}

@Test public void escapeDefaults() {
String text = "Hello &<> Å å π 新 there ¾ © » ' \"";
String escaped = Entities.escape(text);
assertEquals("Hello &amp;&lt;&gt; Å å π 新 there ¾ © » &apos; &quot;", escaped);
}

@Test public void escapedSupplementary() {
String text = "\uD835\uDD59";
String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base));
Expand Down

0 comments on commit 9ba6dc7

Please sign in to comment.