Updated AfterBody and AfterAfterBody to current spec

Don't pop stack to close on </body> or </html>, but leave them on the stack. Had to deviate from the spec slightly to allow whitespace to be added to the html or doc elements. (The goal of that is so that when pretty-printing is off, the output more closely resembles the input, by tracking newlines after </body> etc) Fixes #1851
jhy · Mar 27, 2023 · dea4969 · dea4969
1 parent 21aac91
commit dea4969
Show file tree

Hide file tree

Showing 6 changed files with 81 additions and 14 deletions.
diff --git a/CHANGES b/CHANGES
@@ -8,6 +8,11 @@ Release 1.16.1 [PENDING]
   * Improvement: Calling Node.remove() on a node with no parent is now a no-op, vs a validation error.
     <https://github.com/jhy/jsoup/issues/1898>
 
+  * Bugfix: aligned the HTML Tree Builder processing steps for AfterBody and AfterAfterBody to the updated WHATWG
+    standard, to not pop the stack to close <body> or <html> elements. This prevents an errant </html> closing preceding
+    structure. Also added appropriate error message outputs in this case.
+    <https://github.com/jhy/jsoup/issues/1851>
+
   * Bugfix: Corrected support for ruby elements (<ruby>, <rp>, <rt>, and <rtc>) to current spec.
     <https://github.com/jhy/jsoup/issues/1294>
 

diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
@@ -301,9 +301,14 @@ void insert(Token.Comment commentToken) {
         insertNode(comment, commentToken);
     }
 
+    /** Inserts the provided character token into the current element. */
     void insert(Token.Character characterToken) {
+        final Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
+        insert(characterToken, el);
+    }
+
+    void insert(Token.Character characterToken, Element el) {
         final Node node;
-        Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
         final String tagName = el.normalName();
         final String data = characterToken.getData();
 
@@ -317,6 +322,7 @@ else if (isContentForTagData(tagName))
         onNodeInserted(node, characterToken);
     }
 
+    /** Inserts the provided character token into the provided element. Use when not going onto stack element */
     private void insertNode(Node node, @Nullable Token token) {
         // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
         if (stack.isEmpty())
@@ -632,6 +638,20 @@ boolean inSelectScope(String targetName) {
         return false;
     }
 
+    /** Tests if there is some element on the stack that is not in the provided set. */
+    boolean onStackNot(String[] allowedTags) {
+        final int bottom = stack.size() -1;
+        final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
+        // don't walk too far up the tree
+
+        for (int pos = bottom; pos >= top; pos--) {
+            final String elName = stack.get(pos).normalName();
+            if (!inSorted(elName, allowedTags))
+                return true;
+        }
+        return false;
+    }
+
     void setHeadElement(Element headElement) {
         this.headElement = headElement;
     }

diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
@@ -310,7 +310,8 @@ boolean process(Token t, HtmlTreeBuilder tb) {
                 case EOF:
                     if (tb.templateModeSize() > 0)
                         return tb.process(t, InTemplate);
-                    // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html
+                    if (tb.onStackNot(InBodyEndOtherErrors))
+                        tb.error(this);
                     // stop parsing
                     break;
             }
@@ -726,16 +727,22 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) {
                         tb.error(this);
                         return false;
                     } else {
-                        // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html
-                        anyOtherEndTag(t, tb);
+                        if (tb.onStackNot(InBodyEndOtherErrors))
+                            tb.error(this);
                         tb.transition(AfterBody);
                     }
                     break;
                 case "html":
-                    boolean notIgnored = tb.processEndTag("body");
-                    if (notIgnored)
-                        return tb.process(endTag);
-                    break;
+                    if (!tb.onStack("body")) {
+                        tb.error(this);
+                        return false; // ignore
+                    } else {
+                        if (tb.onStackNot(InBodyEndOtherErrors))
+                            tb.error(this);
+                        tb.transition(AfterBody);
+                        return tb.process(t); // re-process
+                    }
+
                 case "form":
                     if (!tb.onStack("template")) {
                         Element currentForm = tb.getFormElement();
@@ -1594,7 +1601,12 @@ else if (name.equals("col")) {
     AfterBody {
         boolean process(Token t, HtmlTreeBuilder tb) {
             if (isWhitespace(t)) {
-                tb.insert(t.asCharacter()); // out of spec - include whitespace. spec would move into body
+                // spec deviation - currently body is still on stack, but we want this to go to the html node
+                Element html = tb.getFromStack("html");
+                if (html != null)
+                    tb.insert(t.asCharacter(), html);
+                else
+                    tb.process(t, InBody); // will get into body
             } else if (t.isComment()) {
                 tb.insert(t.asComment()); // into html node
             } else if (t.isDoctype()) {
@@ -1607,7 +1619,6 @@ boolean process(Token t, HtmlTreeBuilder tb) {
                     tb.error(this);
                     return false;
                 } else {
-                    if (tb.onStack("html")) tb.popStackToClose("html");
                     tb.transition(AfterAfterBody);
                 }
             } else if (t.isEOF()) {
@@ -1699,7 +1710,9 @@ boolean process(Token t, HtmlTreeBuilder tb) {
             } else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
                 return tb.process(t, InBody);
             } else if (isWhitespace(t)) {
-                tb.insert(t.asCharacter());
+                // spec deviation - body and html still on stack, but want this space to go after </html>
+                Element doc = tb.getDocument();
+                tb.insert(t.asCharacter(), doc);
             }else if (t.isEOF()) {
                 // nice work chuck
             } else {
@@ -1786,6 +1799,7 @@ static final class Constants {
         static final String[] InBodyEndClosers = new String[]{"address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div",
             "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu",
             "nav", "ol", "pre", "section", "summary", "ul"};
+        static final String[] InBodyEndOtherErrors = new String[] {"body", "dd", "dt", "html", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"};
         static final String[] InBodyEndAdoptionFormatters = new String[]{"a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"};
         static final String[] InBodyEndTableFosters = new String[]{"table", "tbody", "tfoot", "thead", "tr"};
         static final String[] InTableToBody = new String[]{"tbody", "tfoot", "thead"};

diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java
@@ -246,7 +246,8 @@ protected Tag clone() {
             "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup",
             "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
             "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track",
-            "data", "bdi", "s", "strike", "nobr"
+            "data", "bdi", "s", "strike", "nobr",
+            "rb" // deprecated but still known / special handling
     };
     private static final String[] emptyTags = {
             "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",

diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -1698,11 +1698,38 @@ private boolean didAddElements(String input) {
         parser.setTrackErrors(10);
         Document doc = Jsoup.parse(html, parser);
         ParseErrorList errors = parser.getErrors();
-        assertEquals(1, errors.size());
+        assertEquals(2, errors.size());
         Element ruby = doc.expectFirst("ruby");
         assertEquals(
             "<ruby><div><rp>Hello</rp></div></ruby>",
             TextUtil.stripNewlines(ruby.outerHtml()));
         assertEquals("<1:16>: Unexpected StartTag token [<rp>] when in state [InBody]", errors.get(0).toString());
     }
+
+    @Test void errorOnEofIfOpen() {
+        String html = "<div>";
+        Parser parser = Parser.htmlParser();
+        parser.setTrackErrors(10);
+        Document doc = Jsoup.parse(html, parser);
+        ParseErrorList errors = parser.getErrors();
+        assertEquals(1, errors.size());
+        assertEquals("Unexpected EOF token [] when in state [InBody]", errors.get(0).getErrorMessage());
+    }
+
+    @Test void NoErrorOnEofIfBodyOpen() {
+        String html = "<body>";
+        Parser parser = Parser.htmlParser();
+        parser.setTrackErrors(10);
+        Document doc = Jsoup.parse(html, parser);
+        ParseErrorList errors = parser.getErrors();
+        assertEquals(0, errors.size());
+    }
+
+    @Test void htmlClose() {
+        // https://github.com/jhy/jsoup/issues/1851
+        String html = "<body><div>One</html>Two</div></body>";
+        Document doc = Jsoup.parse(html);
+        //assertEquals("OneTwo", doc.expectFirst("body > div").text());
+        System.out.println(doc.html());
+    }
 }
diff --git a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java
@@ -45,7 +45,7 @@ static void ensureSorted(List<Object[]> constants) {
     public void ensureArraysAreSorted() {
         List<Object[]> constants = findConstantArrays(Constants.class);
         ensureSorted(constants);
-        assertEquals(38, constants.size());
+        assertEquals(39, constants.size());
     }
 
     @Test public void ensureTagSearchesAreKnownTags() {