Skip to content

Commit

Permalink
#57 Uncaught IllegalArgumentException due to malformed unicode entity ref
Browse files Browse the repository at this point in the history
- Added a more readable error message by means of an
XmlPullParserException.
- Improved validation of the numeric character reference, according to
XML 1.0 spec. (https://www.w3.org/TR/REC-xml/#NT-Char)
- Added tests for valid char references.
- Caught and fixed a parsing bug for decimal char refs to supplementary
characters (code points >= 0x10000).
  • Loading branch information
belingueres authored and hboutemy committed Mar 10, 2019
1 parent dd8d35a commit 1dafbae
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 15 deletions.
39 changes: 24 additions & 15 deletions src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -2664,13 +2664,16 @@ protected char[] parseEntityRef()
entityRefName = null;
posStart = pos;
char ch = more();
StringBuilder sb = new StringBuilder();
if ( ch == '#' )
{
// parse character reference

char charRef = 0;
ch = more();
if ( ch == 'x' )
StringBuilder sb = new StringBuilder();
boolean isHex = ( ch == 'x' );

if ( isHex )
{
// encoded in hex
while ( true )
Expand Down Expand Up @@ -2710,6 +2713,7 @@ else if ( ch >= 'A' && ch <= 'F' )
if ( ch >= '0' && ch <= '9' )
{
charRef = (char) ( charRef * 10 + ( ch - '0' ) );
sb.append( ch );
}
else if ( ch == ';' )
{
Expand All @@ -2724,20 +2728,19 @@ else if ( ch >= 'A' && ch <= 'F' )
}
}
posEnd = pos - 1;
if ( sb.length() > 0 )
try
{
char[] tmp = toChars( Integer.parseInt( sb.toString(), 16 ) );
charRefOneCharBuf = tmp;
if ( tokenize )
{
text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length );
}
return charRefOneCharBuf;
charRefOneCharBuf = toChars( Integer.parseInt( sb.toString(), isHex ? 16 : 10 ) );
}
charRefOneCharBuf[0] = charRef;
catch ( IllegalArgumentException e )
{
throw new XmlPullParserException( "character reference (with " + ( isHex ? "hex" : "decimal" )
+ " value " + sb.toString() + ") is invalid", this, null );
}

if ( tokenize )
{
text = newString( charRefOneCharBuf, 0, 1 );
text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length );
}
return charRefOneCharBuf;
}
Expand Down Expand Up @@ -3996,15 +3999,21 @@ private static boolean isHighSurrogate( char ch )
return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
}

private static final int MIN_CODE_POINT = 0x000000;

private static final int MAX_CODE_POINT = 0x10FFFF;

private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;

/**
 * Check if the provided parameter is a valid Char, according to the
 * <a href="https://www.w3.org/TR/REC-xml/#NT-Char">XML 1.0 Char production</a>.
 * <p>
 * Being inside the Unicode range is not enough: most control characters below 0x20, the surrogate block
 * (0xD800-0xDFFF) and the values 0xFFFE and 0xFFFF are rejected as well.
 *
 * @param codePoint the numeric value to check
 * @return true if it is a valid numeric character reference. False otherwise.
 */
private static boolean isValidCodePoint( int codePoint )
{
    // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
        || ( 0x20 <= codePoint && codePoint <= 0xD7FF )
        || ( 0xE000 <= codePoint && codePoint <= 0xFFFD )
        || ( 0x10000 <= codePoint && codePoint <= 0x10FFFF );
}

private static boolean isSupplementaryCodePoint( int codePoint )
Expand Down
129 changes: 129 additions & 0 deletions src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
*/

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.io.StringReader;
Expand Down Expand Up @@ -156,6 +158,133 @@ public void testUnicodeEntities()
assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
}

/**
 * A hexadecimal character reference just past the last Unicode code point (0x110000) must be rejected
 * with an {@link XmlPullParserException} whose message names the offending hex value.
 *
 * @throws Exception if reading the input fails for an unrelated reason
 */
@Test
public void testInvalidCharacterReferenceHexa()
    throws Exception
{
    MXParser xmlParser = new MXParser();
    xmlParser.setInput( new StringReader( "<root>&#x110000;</root>" ) );

    try
    {
        int startToken = xmlParser.nextToken();
        assertEquals( XmlPullParser.START_TAG, startToken );

        int entityToken = xmlParser.nextToken();
        assertEquals( XmlPullParser.ENTITY_REF, entityToken );

        // Reaching this point means the parser accepted the illegal reference.
        fail( "Should fail since &#x110000; is an illegal character reference" );
    }
    catch ( XmlPullParserException expected )
    {
        String message = expected.getMessage();
        assertTrue( message.contains( "character reference (with hex value 110000) is invalid" ) );
    }
}

/**
 * Every boundary value of the XML 1.0 Char production, written as a hexadecimal character reference,
 * must be reported as an ENTITY_REF token whose text starts with the referenced code point.
 * <p>
 * A parser exception is deliberately left to propagate, so a failure shows the parser's own message and
 * stack trace.
 *
 * @throws Exception if the parser rejects one of the references or the input cannot be read
 */
@Test
public void testValidCharacterReferenceHexa()
    throws Exception
{
    // Must list the same values, in the same order, as the references in the input below.
    int[] expectedCodePoints =
        { 0x9, 0xA, 0xD, 0x20, 0x200, 0xD7FF, 0xE000, 0xFFA2, 0xFFFD, 0x10000, 0x10FFFD, 0x10FFFF };

    MXParser parser = new MXParser();
    String input = "<root>&#x9;&#xA;&#xD;&#x20;&#x200;&#xD7FF;&#xE000;&#xFFA2;&#xFFFD;&#x10000;&#x10FFFD;&#x10FFFF;</root>";
    parser.setInput( new StringReader( input ) );

    assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
    for ( int expectedCodePoint : expectedCodePoints )
    {
        assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
        // codePointAt also decodes surrogate pairs, so supplementary characters compare correctly.
        assertEquals( expectedCodePoint, parser.getText().codePointAt( 0 ) );
    }
    assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
}

/**
 * A decimal character reference just past the last Unicode code point (1114112) must be rejected with
 * an {@link XmlPullParserException} whose message names the offending decimal value.
 *
 * @throws Exception if reading the input fails for an unrelated reason
 */
@Test
public void testInvalidCharacterReferenceDecimal()
    throws Exception
{
    MXParser xmlParser = new MXParser();
    xmlParser.setInput( new StringReader( "<root>&#1114112;</root>" ) );

    try
    {
        int startToken = xmlParser.nextToken();
        assertEquals( XmlPullParser.START_TAG, startToken );

        int entityToken = xmlParser.nextToken();
        assertEquals( XmlPullParser.ENTITY_REF, entityToken );

        // Reaching this point means the parser accepted the illegal reference.
        fail( "Should fail since &#1114112; is an illegal character reference" );
    }
    catch ( XmlPullParserException expected )
    {
        String message = expected.getMessage();
        assertTrue( message.contains( "character reference (with decimal value 1114112) is invalid" ) );
    }
}

/**
 * Every boundary value of the XML 1.0 Char production, written as a decimal character reference, must
 * be reported as an ENTITY_REF token whose text starts with the referenced code point. The last three
 * values are at or above 65536 and therefore do not fit in a single Java {@code char}.
 * <p>
 * A parser exception is deliberately left to propagate, so a failure shows the parser's own message and
 * stack trace.
 *
 * @throws Exception if the parser rejects one of the references or the input cannot be read
 */
@Test
public void testValidCharacterReferenceDecimal()
    throws Exception
{
    // Must list the same values, in the same order, as the references in the input below.
    int[] expectedCodePoints = { 9, 10, 13, 32, 512, 55295, 57344, 65442, 65533, 65536, 1114109, 1114111 };

    MXParser parser = new MXParser();
    String input =
        "<root>&#9;&#10;&#13;&#32;&#512;&#55295;&#57344;&#65442;&#65533;&#65536;&#1114109;&#1114111;</root>";
    parser.setInput( new StringReader( input ) );

    assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
    for ( int expectedCodePoint : expectedCodePoints )
    {
        assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
        // codePointAt also decodes surrogate pairs, so supplementary characters compare correctly.
        assertEquals( expectedCodePoint, parser.getText().codePointAt( 0 ) );
    }
    assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
}

@Test
public void testProcessingInstruction()
throws Exception
Expand Down

0 comments on commit 1dafbae

Please sign in to comment.