From 46dbc818cd21691b2a78b47db2cfbec48c93c850 Mon Sep 17 00:00:00 2001 From: nhant01 Date: Mon, 9 Nov 2020 22:11:01 -0500 Subject: [PATCH] Add AMP validation features to support literal attribute value, e.g []. --- CHANGES | 7 +- README.md | 2 +- core/pom.xml | 2 +- .../java/com/yahoo/tagchowder/Parser.java | 15 ++ .../com/yahoo/tagchowder/CustomHandler.java | 54 +++++++ .../com/yahoo/tagchowder/ParsedHtmlTag.java | 86 ++++++++++ .../java/com/yahoo/tagchowder/ParserTest.java | 148 ++++++++++++++++++ core/src/test/resources/html/amphtml.txt | 25 +++ pom.xml | 2 +- 9 files changed, 337 insertions(+), 4 deletions(-) create mode 100644 core/src/test/java/com/yahoo/tagchowder/CustomHandler.java create mode 100644 core/src/test/java/com/yahoo/tagchowder/ParsedHtmlTag.java create mode 100644 core/src/test/resources/html/amphtml.txt diff --git a/CHANGES b/CHANGES index 23fd876..fe3c1ba 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,8 @@ +Changes from tagsoup 2.0.18 +=========================== +- Add AMP validation features to support literal attribute value such as 'html ⚡4email' and [text] + + Changes from tagsoup 2.0.0 to tagchowder 2.0.14 ========================= - speed up parser performance. @@ -14,4 +19,4 @@ Rename the project to tagchowder. Replaced ant with maven, code check-style checks and code coverage checks. Restructured the folders, renamed the packages to lafaspot. Clean up the code, a bit. -Fork of tagsoup 1.2. available at http://vrici.lojban.org/~cowan/XML/tagsoup/ +Fork of tagsoup 1.2. available at http://vrici.lojban.org/~cowan/XML/tagsoup/ \ No newline at end of file diff --git a/README.md b/README.md index 3df559d..ccc7517 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ TagChowder uses maven as tool for building and managing project. Add following s <dependency> <groupId>com.github.lafa.tagchowder</groupId> <artifactId>tagchowder.core</artifactId> - <version>2.0.3</version> + <version>2.0.18</version> </dependency> ``` Here are the instructions to setup maven environment. 
diff --git a/core/pom.xml b/core/pom.xml index fd7fe4d..f894adf 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -7,7 +7,7 @@ <parent> <groupId>com.yahoo.tagchowder</groupId> <artifactId>tagchowder</artifactId> - <version>2.0.17</version> + <version>2.0.18</version> </parent> <artifactId>tagchowder.core</artifactId> <name>${project.artifactId}</name> diff --git a/core/src/main/java/com/yahoo/tagchowder/Parser.java b/core/src/main/java/com/yahoo/tagchowder/Parser.java index c47fb6c..9225069 100644 --- a/core/src/main/java/com/yahoo/tagchowder/Parser.java +++ b/core/src/main/java/com/yahoo/tagchowder/Parser.java @@ -84,6 +84,7 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le private static final boolean DEFAULT_RESTART_ELEMENTS = true; private static final boolean DEFAULT_IGNORABLE_WHITESPACE = false; private static final boolean DEFAULT_CDATA_ELEMENTS = true; + private static final boolean DEFAULT_AMP_VALIDATION = false; // Feature flags. @@ -96,6 +97,7 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le private boolean restartElements = DEFAULT_RESTART_ELEMENTS; private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE; private boolean cdataElements = DEFAULT_CDATA_ELEMENTS; + private boolean ampValidation = DEFAULT_AMP_VALIDATION; /** * A value of "true" indicates namespace URIs and unprefixed local names for element and attribute names will be available. @@ -166,6 +168,11 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le **/ public static final String VALIDATION_FEATURE = "http://xml.org/sax/features/validation"; + /** + * Controls whether the parser is reporting all validity errors for AMP contents. + */ + public static final String AMP_VALIDATION_FEATURE = "https://github.com/ampproject/validator-java"; + /** * Controls whether the parser reports Unicode normalization errors as described in section 2.13 and Appendix B of the XML 1.1 Recommendation. (We * don't normalize.) 
@@ -280,6 +287,7 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le theFeatures.put(RESTART_ELEMENTS_FEATURE, truthValue(DEFAULT_RESTART_ELEMENTS)); theFeatures.put(IGNORABLE_WHITESPACE_FEATURE, truthValue(DEFAULT_IGNORABLE_WHITESPACE)); theFeatures.put(CDATA_ELEMENTS_FEATURE, truthValue(DEFAULT_CDATA_ELEMENTS)); + theFeatures.put(AMP_VALIDATION_FEATURE, truthValue(DEFAULT_AMP_VALIDATION)); } // Private clone of Boolean.valueOf that is guaranteed to return @@ -346,6 +354,8 @@ public void setFeature(final String name, final boolean value) throws SAXNotReco cdataElements = value; } else if (name.equals(STRING_INTERNING_FEATURE)) { useIntern = value; + } else if (name.equals(AMP_VALIDATION_FEATURE)) { + ampValidation = value; } } @@ -1185,6 +1195,11 @@ private String makeName(final char[] buff, final int offset, final int length) { } start = true; dst.append(translateColons ? '_' : ch); + } else if (ampValidation) { + if (ch == '⚡' || ch == '[' || ch == ']' || ch == '{' || ch == '}') { + start = false; + dst.append(ch); + } } } int dstLength = dst.length(); diff --git a/core/src/test/java/com/yahoo/tagchowder/CustomHandler.java b/core/src/test/java/com/yahoo/tagchowder/CustomHandler.java new file mode 100644 index 0000000..6a88cc1 --- /dev/null +++ b/core/src/test/java/com/yahoo/tagchowder/CustomHandler.java @@ -0,0 +1,54 @@ +/* + * + * ==================================================================== + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + */ + +/* + * Changes to the original project are Copyright 2019 Oath Inc. + */ + +package com.yahoo.tagchowder; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import java.util.ArrayList; +import java.util.List; + +/** + * A simple CustomHandler class to scan HTML doc and return the list of parsed Html tags. + * + * @author nhant01 + */ +public class CustomHandler extends DefaultHandler { + @Override + public void startElement(final String uri, final String localName, + final String qName, final Attributes attributes) throws SAXException { + final ParsedHtmlTag parsedHtmlTag = new ParsedHtmlTag(localName, attributes); + parsedHtmlTagSet.add(parsedHtmlTag); + } + + /** + * Returns the list of parsed Html tags. + * @return the list parsed Html tags + */ + public List<ParsedHtmlTag> getParsedHtmlTags() { + return parsedHtmlTagSet; + } + + /** Set of parsed Html tags */ + private List<ParsedHtmlTag> parsedHtmlTagSet = new ArrayList<>(); +} \ No newline at end of file diff --git a/core/src/test/java/com/yahoo/tagchowder/ParsedHtmlTag.java b/core/src/test/java/com/yahoo/tagchowder/ParsedHtmlTag.java new file mode 100644 index 0000000..b050a03 --- /dev/null +++ b/core/src/test/java/com/yahoo/tagchowder/ParsedHtmlTag.java @@ -0,0 +1,86 @@ +/* + * + * ==================================================================== + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + */ + +/* + * Changes to the original project are Copyright 2019 Oath Inc. + */ + +package com.yahoo.tagchowder; + +import org.xml.sax.Attributes; + +import javax.annotation.Nonnull; + +/** + * The Html ParsedHtmlTag class. + * + * @author nhant01 + */ +public class ParsedHtmlTag { + /** + * Constructor. + * + * @param tagName the name of the underlying tag in html document. + * @param attributes the attributes attached to the element. If + * there are no attributes, it shall be an empty Attributes object. + */ + public ParsedHtmlTag(@Nonnull final String tagName, @Nonnull final Attributes attributes) { + this.tagName = tagName.toUpperCase(); + this.attrs = attributes; + } + + /** + * Lower-case tag name. + * @return returns a lower case tag name. + */ + public String lowerName() { + return this.tagName.toLowerCase(); + } + + /** + * Determine if an attribute name exists. Return true if found. + * @param attrName attribute name + * @return true if found. Otherwise false. + */ + public boolean hasAttribute(final String attrName) { + for (int i = 0; i < attrs().getLength(); i++) { + if (attrs.getQName(i).equalsIgnoreCase(attrName)) { + return true; + } + } + + return false; + } + + /** + * Returns an array of attributes. Each attribute has two fields: name and + * value. Name is always lower-case, value is the case from the original + * document. Values are unescaped. + * @return returns the attributes. 
+ */ + public Attributes attrs() { + return this.attrs; + } + + /** The parsed tag name. */ + @Nonnull + private String tagName; + + /** The attributes. */ + @Nonnull + private final Attributes attrs; +} \ No newline at end of file diff --git a/core/src/test/java/com/yahoo/tagchowder/ParserTest.java b/core/src/test/java/com/yahoo/tagchowder/ParserTest.java index d195dc2..c1e6c5a 100644 --- a/core/src/test/java/com/yahoo/tagchowder/ParserTest.java +++ b/core/src/test/java/com/yahoo/tagchowder/ParserTest.java @@ -26,7 +26,9 @@ import java.io.InputStreamReader; import java.io.StringReader; import java.nio.charset.StandardCharsets; +import java.util.List; +import org.testng.Assert; import org.testng.annotations.Test; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -51,6 +53,152 @@ public void testSampleHtml() throws IOException, SAXException { parser.parse(inSource); } + /** + * Parse a sample html.txt and verify the tags' attributes with AMP validation enabled. + * + * @throws IOException IOException + * @throws SAXException SAXException + */ + @Test + public void testSampleHtmlWithAMPValidationFeatureEnable() throws IOException, SAXException { + final String html = getSampleHtml("html.txt"); + final Parser parser = new Parser(); + final CustomHandler customHandler = new CustomHandler(); + parser.setContentHandler(customHandler); + parser.setErrorHandler(customHandler); + parser.setFeature(Parser.AMP_VALIDATION_FEATURE, true); + parser.setDefaultBufferSize(html.length()); + final InputSource inSource = new InputSource(new StringReader(html)); + parser.parse(inSource); + + List<ParsedHtmlTag> parsedHtmlTagList = customHandler.getParsedHtmlTags(); + Assert.assertEquals(parsedHtmlTagList.size(), 31); + + //Assert true for tags having attributes in 'html.txt' + ParsedHtmlTag parsedHtmlTag = parsedHtmlTagList.get(3); + Assert.assertEquals(parsedHtmlTag.lowerName(), "body"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("class")); + + parsedHtmlTag = 
parsedHtmlTagList.get(4); + Assert.assertEquals(parsedHtmlTag.lowerName(), "div"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("dir")); + + parsedHtmlTag = parsedHtmlTagList.get(10); + Assert.assertEquals(parsedHtmlTag.lowerName(), "div"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("id")); + + parsedHtmlTag = parsedHtmlTagList.get(11); + Assert.assertEquals(parsedHtmlTag.lowerName(), "hr"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("id")); + + parsedHtmlTag = parsedHtmlTagList.get(17); + Assert.assertEquals(parsedHtmlTag.lowerName(), "div"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("style")); + } + + /** + * Parse a sample AMP document (amphtml.txt) and verify the tags' attributes with AMP validation enabled. + * + * @throws IOException IOException + * @throws SAXException SAXException + */ + @Test + public void testSampleAMPHtml() throws IOException, SAXException { + final String html = getSampleHtml("amphtml.txt"); + final Parser parser = new Parser(); + final CustomHandler customHandler = new CustomHandler(); + parser.setContentHandler(customHandler); + parser.setErrorHandler(customHandler); + parser.setFeature(Parser.AMP_VALIDATION_FEATURE, true); + parser.setDefaultBufferSize(html.length()); + final InputSource inSource = new InputSource(new StringReader(html)); + parser.parse(inSource); + + List<ParsedHtmlTag> parsedHtmlTagList = customHandler.getParsedHtmlTags(); + Assert.assertEquals(parsedHtmlTagList.size(), 21); + + //Assert true for these cases + ParsedHtmlTag parsedHtmlTag = parsedHtmlTagList.get(0); + Assert.assertEquals(parsedHtmlTag.lowerName(), "html"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("⚡4email")); + + parsedHtmlTag = parsedHtmlTagList.get(6); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("[text]")); + + parsedHtmlTag = parsedHtmlTagList.get(7); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("[[text]]")); + + parsedHtmlTag = 
parsedHtmlTagList.get(8); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("[{text}]")); + + parsedHtmlTag = parsedHtmlTagList.get(9); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("{text}")); + + parsedHtmlTag = parsedHtmlTagList.get(10); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("{{text}}")); + + parsedHtmlTag = parsedHtmlTagList.get(11); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("{[text]}")); + + //Unbalanced or mismatched brackets are still kept as attribute names + parsedHtmlTag = parsedHtmlTagList.get(12); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("{[text}")); + + parsedHtmlTag = parsedHtmlTagList.get(13); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("{text]")); + + parsedHtmlTag = parsedHtmlTagList.get(14); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("{text")); + + parsedHtmlTag = parsedHtmlTagList.get(15); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("text}")); + + parsedHtmlTag = parsedHtmlTagList.get(16); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("[text")); + + parsedHtmlTag = parsedHtmlTagList.get(17); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("text]")); + + parsedHtmlTag = parsedHtmlTagList.get(18); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("⚡text]")); + + parsedHtmlTag = parsedHtmlTagList.get(19); + Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("⚡text}")); + + parsedHtmlTag = parsedHtmlTagList.get(20); + 
Assert.assertEquals(parsedHtmlTag.lowerName(), "p"); + Assert.assertTrue(parsedHtmlTag.hasAttribute("⚡text")); + } + + /** + * Test AMP feature. + * + * @throws IOException IOException + * @throws SAXException SAXException + */ + @Test + public void testAMPFeature() throws IOException, SAXException { + final Parser parser = new Parser(); + Assert.assertEquals(parser.getFeature(Parser.AMP_VALIDATION_FEATURE), false); + + parser.setFeature(Parser.AMP_VALIDATION_FEATURE, true); + Assert.assertEquals(parser.getFeature(Parser.AMP_VALIDATION_FEATURE), true); + } + /** * Parse an sample html with only double quote in public id inside !DOCTYPE. * diff --git a/core/src/test/resources/html/amphtml.txt b/core/src/test/resources/html/amphtml.txt new file mode 100644 index 0000000..90ab98a --- /dev/null +++ b/core/src/test/resources/html/amphtml.txt @@ -0,0 +1,25 @@ + + + + + + + + +

1

+

1

+

1

+

1

+

1

+

1

+

1

+

1

+

1

+

1

+

1

+

1

+

1

+

1

+

1

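+ + \ No newline at end of file diff --git a/pom.xml b/pom.xml index ca1a4e9..b80295a 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ <groupId>com.yahoo.tagchowder</groupId> <artifactId>tagchowder</artifactId> <packaging>pom</packaging> - <version>2.0.17</version> + <version>2.0.18</version> <name>${project.artifactId}</name> <url>https://github.com/yahoo/tagchowder</url> <description>Parent POM file for tagchowder project</description>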