Skip to content

Commit

Permalink
In Cleaner, use the normalized tag name in the tail hit
Browse files Browse the repository at this point in the history
And ensure that tags added to the safelist are normalized.

In copy safe nodes, head was using normalized but the tail was using nodename. So if the safelist and the input had different cases, the tail hit would not ascend the stack on an otherwise safe tag, leading to mis-nested children.

Also fixed an issue that the cleaned element did not preserve the self-closing style of the input document.

Fixes #2049
  • Loading branch information
jhy committed Nov 22, 2023
1 parent f638410 commit 94af4ec
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 5 deletions.
8 changes: 8 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ Release 1.17.1 [PENDING]
* Bugfix: in W3CDom, if the jsoup input document contained an empty doctype, the conversion would fail with a
DOMException. Now, said doctype is discarded, and the conversion continues.

* Bugfix: when cleaning a document containing SVG elements (or other foreign elements that have preserved case names),
the cleaned output would be incorrectly nested if the safelist had a different case than the input document.
<https://github.com/jhy/jsoup/issues/2049>

* Bugfix: when cleaning a document, the output style of unknown self-closing tags from the input was not preserved in
the output. (So a <foo /> in the input, if safe-listed, would be output as <foo></foo>.)
<https://github.com/jhy/jsoup/issues/2049>

* Build Improvement: added a local test proxy implementation, for proxy integration tests.
<https://github.com/jhy/jsoup/pull/2029>

Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/jsoup/safety/Cleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ public void head(Node source, int depth) {
TextNode sourceText = (TextNode) source;
TextNode destText = new TextNode(sourceText.getWholeText());
destination.appendChild(destText);
} else if (source instanceof DataNode && safelist.isSafeTag(source.parent().nodeName())) {
} else if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) {
DataNode sourceData = (DataNode) source;
DataNode destData = new DataNode(sourceData.getWholeData());
destination.appendChild(destData);
Expand All @@ -165,7 +165,7 @@ public void head(Node source, int depth) {
}

public void tail(Node source, int depth) {
if (source instanceof Element && safelist.isSafeTag(source.nodeName())) {
if (source instanceof Element && safelist.isSafeTag(source.normalName())) {
destination = destination.parent(); // would have descended, so pop destination stack
}
}
Expand All @@ -180,7 +180,7 @@ private int copySafeNodes(Element source, Element dest) {
private ElementMeta createSafeElement(Element sourceEl) {
String sourceTag = sourceEl.tagName();
Attributes destAttrs = new Attributes();
Element dest = new Element(Tag.valueOf(sourceTag, sourceEl.tag().namespace(), ParseSettings.preserveCase), sourceEl.baseUri(), destAttrs);
Element dest = new Element(sourceEl.tag(), sourceEl.baseUri(), destAttrs);
int numDiscarded = 0;

Attributes sourceAttrs = sourceEl.attributes();
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/org/jsoup/safety/Safelist.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/
*/

import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
Expand Down Expand Up @@ -296,8 +297,8 @@ public Safelist addAttributes(String tag, String... attributes) {
Validate.notNull(attributes);
Validate.isTrue(attributes.length > 0, "No attribute names supplied.");

addTags(tag);
TagName tagName = TagName.valueOf(tag);
tagNames.add(tagName);
Set<AttributeKey> attributeSet = new HashSet<>();
for (String key : attributes) {
Validate.notEmpty(key);
Expand Down Expand Up @@ -621,7 +622,7 @@ static class TagName extends TypedValue {
}

static TagName valueOf(String value) {
return new TagName(value);
return new TagName(Normalizer.lowerCase(value));
}
}

Expand Down
30 changes: 30 additions & 0 deletions src/test/java/org/jsoup/safety/CleanerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
import org.jsoup.nodes.Range;
import org.jsoup.parser.Parser;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;

import java.util.Arrays;
import java.util.Locale;

import static org.junit.jupiter.api.Assertions.*;
Expand Down Expand Up @@ -399,4 +403,30 @@ public void bailsIfRemovingProtocolThatsNotSet() {
assertEquals(cleanRange, origRange);
assertEquals(clean.endSourceRange(), orig.endSourceRange());
}

@ParameterizedTest @ValueSource(booleans = {true, false})
void cleansCaseSensitiveElements(boolean preserveCase) {
// https://github.com/jhy/jsoup/issues/2049
String html = "<svg><feMerge><feMergeNode kernelMatrix=1 /><feMergeNode><clipPath /></feMergeNode><feMergeNode />";
String[] tags = {"svg", "feMerge", "feMergeNode", "clipPath"};
String[] attrs = {"kernelMatrix"};

if (!preserveCase) {
tags = Arrays.stream(tags).map(String::toLowerCase).toArray(String[]::new);
}

Safelist safelist = Safelist.none().addTags(tags).addAttributes(":all", attrs);
String clean = Jsoup.clean(html, safelist);
String expected = "<svg>\n" +
" <feMerge>\n" +
" <feMergeNode kernelMatrix=\"1\" />\n" +
" <feMergeNode>\n" +
" <clipPath />\n" +
" </feMergeNode>\n" +
" <feMergeNode />\n" +
" </feMerge>\n" +
"</svg>";
assertEquals(expected, clean);
}

}

0 comments on commit 94af4ec

Please sign in to comment.