diff --git a/CHANGES b/CHANGES index 5f09616908..13f0353171 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,10 @@ jsoup changelog memoizing the scan and reducing GC. + * Improvement: when parsing custom tags (in HTML or XML), added a flyweight cache on Tag.valueOf(name) to reduce + memory overhead when many tags are repeated. + + * Bugfix: when tracking errors or checking for validity in the Cleaner, errors were incorrectly raised for missing optional closing tags. diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index aaf16dbf55..1c82827713 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -131,7 +131,7 @@ List parseFragment(String inputFragment, @Nullable Element context, String default: tokeniser.transition(TokeniserState.Data); } - root = new Element(Tag.valueOf(contextTag, settings), baseUri); + root = new Element(tagFor(contextTag, settings), baseUri); doc.appendChild(root); stack.add(root); resetInsertionMode(); @@ -245,13 +245,13 @@ Element insert(final Token.StartTag startTag) { return el; } - Element el = new Element(Tag.valueOf(startTag.name(), settings), null, settings.normalizeAttributes(startTag.attributes)); + Element el = new Element(tagFor(startTag.name(), settings), null, settings.normalizeAttributes(startTag.attributes)); insert(el); return el; } Element insertStartTag(String startTagName) { - Element el = new Element(Tag.valueOf(startTagName, settings), null); + Element el = new Element(tagFor(startTagName, settings), null); insert(el); return el; } @@ -262,7 +262,7 @@ void insert(Element el) { } Element insertEmpty(Token.StartTag startTag) { - Tag tag = Tag.valueOf(startTag.name(), settings); + Tag tag = tagFor(startTag.name(), settings); Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); insertNode(el); if (startTag.isSelfClosing()) { @@ -277,7 +277,7 @@ Element 
insertEmpty(Token.StartTag startTag) { } FormElement insertForm(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { - Tag tag = Tag.valueOf(startTag.name(), settings); + Tag tag = tagFor(startTag.name(), settings); FormElement el = new FormElement(tag, null, settings.normalizeAttributes(startTag.attributes)); if (checkTemplateStack) { if(!onStack("template")) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index 4074cca349..e437cd9813 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -891,7 +891,7 @@ else if (!tb.onStack(formatEl)) { } else if (node == formatEl) break; - Element replacement = new Element(Tag.valueOf(node.nodeName(), ParseSettings.preserveCase), tb.getBaseUri()); + Element replacement = new Element(tb.tagFor(node.nodeName(), ParseSettings.preserveCase), tb.getBaseUri()); // case will follow the original node (so honours ParseSettings) tb.replaceActiveFormattingElement(node, replacement); tb.replaceOnStack(node, replacement); diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index e21d70c586..f895c9ed28 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -10,7 +10,9 @@ import javax.annotation.ParametersAreNonnullByDefault; import java.io.Reader; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** * @author Jonathan Hedley @@ -24,6 +26,7 @@ abstract class TreeBuilder { protected String baseUri; // current base uri, for creating new elements protected Token currentToken; // currentToken is used only for error tracking. protected ParseSettings settings; + protected Map<String, Tag> seenTags; // tags we've used in this parse; saves tag GC for custom tags.
private Token.StartTag start = new Token.StartTag(); // start tag to process private Token.EndTag end = new Token.EndTag(); @@ -44,6 +47,7 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { currentToken = null; tokeniser = new Tokeniser(reader, parser.getErrors()); stack = new ArrayList<>(32); + seenTags = new HashMap<>(); this.baseUri = baseUri; } @@ -57,6 +61,7 @@ Document parse(Reader input, String baseUri, Parser parser) { reader = null; tokeniser = null; stack = null; + seenTags = null; return doc; } @@ -159,4 +164,13 @@ protected void error(String msg, Object... args) { protected boolean isContentForTagData(String normalName) { return false; } + + protected Tag tagFor(String tagName, ParseSettings settings) { + Tag tag = seenTags.get(tagName); // note that we don't normalize the cache key. But tag via valueOf may be normalized. + if (tag == null) { + tag = Tag.valueOf(tagName, settings); + seenTags.put(tagName, tag); + } + return tag; + } } diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java index 5fad99e2c7..97ef372c37 100644 --- a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java @@ -83,7 +83,7 @@ private void insertNode(Node node) { } Element insert(Token.StartTag startTag) { - Tag tag = Tag.valueOf(startTag.name(), settings); + Tag tag = tagFor(startTag.name(), settings); // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html. if (startTag.hasAttributes()) startTag.attributes.deduplicate(settings); diff --git a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java index 8cd6bdd53c..c254e8e9cf 100644 --- a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java +++ b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java @@ -298,4 +298,20 @@ public void handlesLTinScript() { String out = doc.html(); assertEquals("
", out); } + + @Test void customTagsAreFlyweights() { + String xml = "<foo>Foo</foo><foo>Foo</foo><FOO>FOO</FOO><FOO>FOO</FOO>"; + Document doc = Jsoup.parse(xml, Parser.xmlParser()); + Elements els = doc.children(); + + Tag t1 = els.get(0).tag(); + Tag t2 = els.get(1).tag(); + Tag t3 = els.get(2).tag(); + Tag t4 = els.get(3).tag(); + assertEquals("foo", t1.getName()); + assertEquals("FOO", t3.getName()); + assertSame(t1, t2); + assertSame(t3, t4); + + } }