From 94af4ec25ded5c74b4a5748d59378fb955b5ecc1 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Thu, 23 Nov 2023 10:47:10 +1100 Subject: [PATCH] In Cleaner, use the normalized tag name in the tail hit And ensure that tags added to the safelist are normalized. In copy safe nodes, head was using normalized but the tail was using nodename. So if the safelist and the input had different cases, the tail hit would not ascend the stack on an otherwise safe tag, leading to mis-nested children. Also fixed an issue that the cleaned element did not preserve the self-closing style of the input document. Fixes #2049 --- CHANGES | 8 +++++ src/main/java/org/jsoup/safety/Cleaner.java | 6 ++-- src/main/java/org/jsoup/safety/Safelist.java | 5 ++-- .../java/org/jsoup/safety/CleanerTest.java | 30 +++++++++++++++++++ 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/CHANGES b/CHANGES index f4a2f08f9a..dc6a0ebb86 100644 --- a/CHANGES +++ b/CHANGES @@ -52,6 +52,14 @@ Release 1.17.1 [PENDING] * Bugfix: in W3CDom, if the jsoup input document contained an empty doctype, the conversion would fail with a DOMException. Now, said doctype is discarded, and the conversion continues. + * Bugfix: when cleaning a document containing SVG elements (or other foreign elements that have preserved case names), + the cleaned output would be incorrectly nested if the safelist had a different case than the input document. + + + * Bugfix: when cleaning a document, the output style of unknown self-closing tags from the input was not preserved in + the output. (So a in the input, if safe-listed, would be output as .) + + * Build Improvement: added a local test proxy implementation, for proxy integration tests. diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java index 7b9317ec29..05867a29c7 100644 --- a/src/main/java/org/jsoup/safety/Cleaner.java +++ b/src/main/java/org/jsoup/safety/Cleaner.java @@ -155,7 +155,7 @@ public void head(Node source, int depth) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText()); destination.appendChild(destText); - } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().nodeName())) { + } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData()); destination.appendChild(destData); @@ -165,7 +165,7 @@ public void head(Node source, int depth) { } public void tail(Node source, int depth) { - if (source instanceof Element && safelist.isSafeTag(source.nodeName())) { + if (source instanceof Element && safelist.isSafeTag(source.normalName())) { destination = destination.parent(); // would have descended, so pop destination stack } } @@ -180,7 +180,7 @@ private int copySafeNodes(Element source, Element dest) { private ElementMeta createSafeElement(Element sourceEl) { String sourceTag = sourceEl.tagName(); Attributes destAttrs = new Attributes(); - Element dest = new Element(Tag.valueOf(sourceTag, sourceEl.tag().namespace(), ParseSettings.preserveCase), sourceEl.baseUri(), destAttrs); + Element dest = new Element(sourceEl.tag(), sourceEl.baseUri(), destAttrs); int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index 73cb08a704..49747e9ebf 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -6,6 +6,7 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/ */ import org.jsoup.helper.Validate; +import org.jsoup.internal.Normalizer; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Element; @@ -296,8 +297,8 @@ public Safelist addAttributes(String tag, String... attributes) { Validate.notNull(attributes); Validate.isTrue(attributes.length > 0, "No attribute names supplied."); + addTags(tag); TagName tagName = TagName.valueOf(tag); - tagNames.add(tagName); Set attributeSet = new HashSet<>(); for (String key : attributes) { Validate.notEmpty(key); @@ -621,7 +622,7 @@ static class TagName extends TypedValue { } static TagName valueOf(String value) { - return new TagName(value); + return new TagName(Normalizer.lowerCase(value)); } } diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java index d7c6371cbc..83717457ba 100644 --- a/src/test/java/org/jsoup/safety/CleanerTest.java +++ b/src/test/java/org/jsoup/safety/CleanerTest.java @@ -9,7 +9,11 @@ import org.jsoup.nodes.Range; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; +import java.util.Arrays; import java.util.Locale; import static org.junit.jupiter.api.Assertions.*; @@ -399,4 +403,30 @@ public void bailsIfRemovingProtocolThatsNotSet() { assertEquals(cleanRange, origRange); assertEquals(clean.endSourceRange(), orig.endSourceRange()); } + + @ParameterizedTest @ValueSource(booleans = {true, false}) + void cleansCaseSensitiveElements(boolean preserveCase) { + // https://github.com/jhy/jsoup/issues/2049 + String html = ""; + String[] tags = {"svg", "feMerge", "feMergeNode", "clipPath"}; + String[] attrs = {"kernelMatrix"}; + + if (!preserveCase) { + tags = Arrays.stream(tags).map(String::toLowerCase).toArray(String[]::new); + } + + Safelist safelist = Safelist.none().addTags(tags).addAttributes(":all", attrs); + String clean = Jsoup.clean(html, safelist); + String expected = "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + ""; + assertEquals(expected, clean); + } + }