Skip to content

Commit

Permalink
Add AMP validation features to support literal attribute value, e.g [].
Browse files Browse the repository at this point in the history
  • Loading branch information
nhant01 committed Nov 10, 2020
1 parent bb930af commit 46dbc81
Show file tree
Hide file tree
Showing 9 changed files with 337 additions and 4 deletions.
7 changes: 6 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
Changes from tagchowder 2.0.17 to 2.0.18
===========================
- Add AMP validation features to support literal attribute value such as 'html ⚡4email' and [text]


Changes from tagsoup 2.0.0 to tagchowder 2.0.14
=========================
- speed up parser performance.
Expand All @@ -14,4 +19,4 @@ Rename the project to tagchowder.
Replaced ant with maven, code check-style checks and code coverage checks.
Restructured the folders, renamed the packages to lafaspot.
Clean up the code, a bit.
Fork of tagsoup 1.2. available at http://vrici.lojban.org/~cowan/XML/tagsoup/
Fork of tagsoup 1.2. available at http://vrici.lojban.org/~cowan/XML/tagsoup/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ TagChowder uses maven as tool for building and managing project. Add following s
<dependency>
<groupId>com.github.lafa.tagchowder</groupId>
<artifactId>tagchowder.core</artifactId>
<version>2.0.3</version>
<version>2.0.18</version>
</dependency>
```
Here are the instructions to setup maven environment.
Expand Down
2 changes: 1 addition & 1 deletion core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<parent>
<groupId>com.yahoo.tagchowder</groupId>
<artifactId>tagchowder</artifactId>
<version>2.0.17</version>
<version>2.0.18</version>
</parent>
<artifactId>tagchowder.core</artifactId>
<name>${project.artifactId}</name>
Expand Down
15 changes: 15 additions & 0 deletions core/src/main/java/com/yahoo/tagchowder/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le
private static final boolean DEFAULT_RESTART_ELEMENTS = true;
private static final boolean DEFAULT_IGNORABLE_WHITESPACE = false;
private static final boolean DEFAULT_CDATA_ELEMENTS = true;
private static final boolean DEFAULT_AMP_VALIDATION = false;

// Feature flags.

Expand All @@ -96,6 +97,7 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le
private boolean restartElements = DEFAULT_RESTART_ELEMENTS;
private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE;
private boolean cdataElements = DEFAULT_CDATA_ELEMENTS;
private boolean ampValidation = DEFAULT_AMP_VALIDATION;

/**
* A value of "true" indicates namespace URIs and unprefixed local names for element and attribute names will be available.
Expand Down Expand Up @@ -166,6 +168,11 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le
**/
public static final String VALIDATION_FEATURE = "http://xml.org/sax/features/validation";

/**
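* Controls whether the parser keeps the AMP-specific characters ⚡, [, ], { and } when it builds names, so that AMP
* attribute names such as ⚡4email and [text] are reported instead of having those characters dropped.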
*/
public static final String AMP_VALIDATION_FEATURE = "https://github.com/ampproject/validator-java";

/**
* Controls whether the parser reports Unicode normalization errors as described in section 2.13 and Appendix B of the XML 1.1 Recommendation. (We
* don't normalize.)
Expand Down Expand Up @@ -280,6 +287,7 @@ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, Le
theFeatures.put(RESTART_ELEMENTS_FEATURE, truthValue(DEFAULT_RESTART_ELEMENTS));
theFeatures.put(IGNORABLE_WHITESPACE_FEATURE, truthValue(DEFAULT_IGNORABLE_WHITESPACE));
theFeatures.put(CDATA_ELEMENTS_FEATURE, truthValue(DEFAULT_CDATA_ELEMENTS));
theFeatures.put(AMP_VALIDATION_FEATURE, truthValue(DEFAULT_AMP_VALIDATION));
}

// Private clone of Boolean.valueOf that is guaranteed to return
Expand Down Expand Up @@ -346,6 +354,8 @@ public void setFeature(final String name, final boolean value) throws SAXNotReco
cdataElements = value;
} else if (name.equals(STRING_INTERNING_FEATURE)) {
useIntern = value;
} else if (name.equals(AMP_VALIDATION_FEATURE)) {
ampValidation = value;
}
}

Expand Down Expand Up @@ -1185,6 +1195,11 @@ private String makeName(final char[] buff, final int offset, final int length) {
}
start = true;
dst.append(translateColons ? '_' : ch);
} else if (ampValidation) {
if (ch == '⚡' || ch == '[' || ch == ']' || ch == '{' || ch == '}') {
start = false;
dst.append(ch);
}
}
}
int dstLength = dst.length();
Expand Down
54 changes: 54 additions & 0 deletions core/src/test/java/com/yahoo/tagchowder/CustomHandler.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
*
* ====================================================================
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*/

/*
* Changes to the original project are Copyright 2019 Oath Inc.
*/

package com.yahoo.tagchowder;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.ArrayList;
import java.util.List;

/**
 * SAX handler that records every start tag reported while an HTML document is parsed.
 *
 * @author nhant01
 */
public class CustomHandler extends DefaultHandler {
    /** Tags collected so far, in the order their start-element events arrived. */
    private final List<ParsedHtmlTag> parsedHtmlTags = new ArrayList<>();

    /**
     * Wraps the reported element in a {@link ParsedHtmlTag} and appends it to the collected tags.
     *
     * @param uri the namespace URI reported by the parser (unused)
     * @param localName the local name; used as the tag name of the recorded tag
     * @param qName the qualified name reported by the parser (unused)
     * @param attributes the attributes attached to the element
     * @throws SAXException declared by the overridden method; not thrown by this implementation itself
     */
    @Override
    public void startElement(final String uri, final String localName,
            final String qName, final Attributes attributes) throws SAXException {
        parsedHtmlTags.add(new ParsedHtmlTag(localName, attributes));
    }

    /**
     * Returns the tags collected so far.
     *
     * @return the live list of parsed Html tags, in document order
     */
    public List<ParsedHtmlTag> getParsedHtmlTags() {
        return parsedHtmlTags;
    }
}
86 changes: 86 additions & 0 deletions core/src/test/java/com/yahoo/tagchowder/ParsedHtmlTag.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/*
*
* ====================================================================
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*/

/*
* Changes to the original project are Copyright 2019 Oath Inc.
*/

package com.yahoo.tagchowder;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.AttributesImpl;

import javax.annotation.Nonnull;

import java.util.Locale;

/**
 * An HTML tag as reported by the parser: its name together with a snapshot of its attributes.
 *
 * @author nhant01
 */
public class ParsedHtmlTag {
    /** The parsed tag name, stored upper-cased. */
    @Nonnull
    private final String tagName;

    /** Private copy of the attributes that were attached to the tag. */
    @Nonnull
    private final Attributes attrs;

    /**
     * Constructor.
     *
     * @param tagName the name of the underlying tag in html document.
     * @param attributes the attributes attached to the element. If
     * there are no attributes, it shall be an empty Attributes object.
     * @throws NullPointerException if either argument is null
     */
    public ParsedHtmlTag(@Nonnull final String tagName, @Nonnull final Attributes attributes) {
        // Locale.ROOT keeps the case mapping independent of the JVM default locale
        // (e.g. under a Turkish locale "I" would otherwise lower-case to a dotless "ı").
        this.tagName = tagName.toUpperCase(Locale.ROOT);
        // SAX leaves the content of the Attributes object undefined once startElement returns,
        // so take a copy instead of keeping the parser's instance.
        this.attrs = new AttributesImpl(attributes);
    }

    /**
     * Lower-case tag name.
     *
     * @return the tag name converted to lower case.
     */
    public String lowerName() {
        return this.tagName.toLowerCase(Locale.ROOT);
    }

    /**
     * Determine if an attribute name exists. The qualified name of each attribute is compared
     * with {@code attrName}, ignoring case.
     *
     * @param attrName attribute name
     * @return true if found. Otherwise false.
     */
    public boolean hasAttribute(final String attrName) {
        final int count = attrs.getLength();
        for (int i = 0; i < count; i++) {
            if (attrs.getQName(i).equalsIgnoreCase(attrName)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns the attributes of this tag.
     *
     * @return the copy of the attributes taken when this tag was constructed.
     */
    public Attributes attrs() {
        return this.attrs;
    }
}
148 changes: 148 additions & 0 deletions core/src/test/java/com/yahoo/tagchowder/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.testng.Assert;
import org.testng.annotations.Test;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
Expand All @@ -51,6 +53,152 @@ public void testSampleHtml() throws IOException, SAXException {
parser.parse(inSource);
}

/**
 * Parses the sample document html.txt with the AMP validation feature switched on and checks that
 * selected tags are reported with their expected attributes.
 *
 * @throws IOException IOException
 * @throws SAXException SAXException
 */
@Test
public void testSampleHtmlWithAMPValidationFeatureEnable() throws IOException, SAXException {
    final String html = getSampleHtml("html.txt");
    final CustomHandler handler = new CustomHandler();
    final Parser parser = new Parser();
    parser.setContentHandler(handler);
    parser.setErrorHandler(handler);
    parser.setFeature(Parser.AMP_VALIDATION_FEATURE, true);
    parser.setDefaultBufferSize(html.length());
    parser.parse(new InputSource(new StringReader(html)));

    final List<ParsedHtmlTag> tags = handler.getParsedHtmlTags();
    Assert.assertEquals(tags.size(), 31);

    // Tags of 'html.txt' that carry attributes: position in the tag list, tag name, attribute name.
    final int[] positions = {3, 4, 10, 11, 17};
    final String[] tagNames = {"body", "div", "div", "hr", "div"};
    final String[] attributeNames = {"class", "dir", "id", "id", "style"};
    for (int i = 0; i < positions.length; i++) {
        final ParsedHtmlTag tag = tags.get(positions[i]);
        Assert.assertEquals(tag.lowerName(), tagNames[i]);
        Assert.assertTrue(tag.hasAttribute(attributeNames[i]));
    }
}

/**
 * Parses the sample AMP document amphtml.txt with the AMP validation feature enabled and verifies
 * that AMP-style attribute names (containing ⚡, [, ], { or }) are reported on the expected tags.
 *
 * @throws IOException IOException
 * @throws SAXException SAXException
 */
@Test
public void testSampleAMPHtml() throws IOException, SAXException {
    final String html = getSampleHtml("amphtml.txt");
    final Parser parser = new Parser();
    final CustomHandler customHandler = new CustomHandler();
    parser.setContentHandler(customHandler);
    parser.setErrorHandler(customHandler);
    parser.setFeature(Parser.AMP_VALIDATION_FEATURE, true);
    parser.setDefaultBufferSize(html.length());
    final InputSource inSource = new InputSource(new StringReader(html));
    parser.parse(inSource);

    final List<ParsedHtmlTag> parsedHtmlTagList = customHandler.getParsedHtmlTags();
    Assert.assertEquals(parsedHtmlTagList.size(), 21);

    // The root element carries the AMP4EMAIL marker attribute.
    final ParsedHtmlTag htmlTag = parsedHtmlTagList.get(0);
    Assert.assertEquals(htmlTag.lowerName(), "html");
    Assert.assertTrue(htmlTag.hasAttribute("⚡4email"));

    // Consecutive <p> tags starting at this position each carry one literal attribute name.
    final int firstParagraph = 6;
    final String[] expectedAttributes = {
        // Balanced bracket/brace forms.
        "[text]", "[[text]]", "[{text}]", "{text}", "{{text}}", "{[text]}",
        // Remaining variants, including unbalanced or mixed brackets. These are expected to be
        // reported verbatim as well, so they are asserted to be present, not absent.
        "{[text}", "{text]", "{text", "text}", "[text", "text]", "⚡text]", "⚡text}", "⚡text",
    };
    for (int i = 0; i < expectedAttributes.length; i++) {
        final int position = firstParagraph + i;
        final ParsedHtmlTag parsedHtmlTag = parsedHtmlTagList.get(position);
        Assert.assertEquals(parsedHtmlTag.lowerName(), "p", "tag name at position " + position);
        Assert.assertTrue(parsedHtmlTag.hasAttribute(expectedAttributes[i]),
                "attribute " + expectedAttributes[i] + " on tag at position " + position);
    }
}

/**
 * Checks that the AMP validation feature is off on a new parser and reads back as on once enabled.
 *
 * @throws IOException IOException
 * @throws SAXException SAXException
 */
@Test
public void testAMPFeature() throws IOException, SAXException {
    final Parser parser = new Parser();
    final boolean enabledByDefault = parser.getFeature(Parser.AMP_VALIDATION_FEATURE);
    Assert.assertFalse(enabledByDefault);

    parser.setFeature(Parser.AMP_VALIDATION_FEATURE, true);
    final boolean enabledAfterSet = parser.getFeature(Parser.AMP_VALIDATION_FEATURE);
    Assert.assertTrue(enabledAfterSet);
}

/**
* Parse an sample html with only double quote in public id inside !DOCTYPE.
*
Expand Down
Loading

0 comments on commit 46dbc81

Please sign in to comment.