Skip to content

Commit

Permalink
libhtml: Fix edge case in entity parsing
Browse files Browse the repository at this point in the history
When matching a named entity is aborted partway through, we need to backtrack to
the longest matched prefix that is itself an entity which doesn't require a semicolon.
  • Loading branch information
lilyball committed May 19, 2014
1 parent 4ad5be4 commit 63d3b2f
Showing 1 changed file with 30 additions and 14 deletions.
44 changes: 30 additions & 14 deletions src/libhtml/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ pub struct UnescapeWriter<W> {
enum UnescapeState {
CharData,
Begin,
Named(uint, uint), // index into ENTITIES, and prefix len
Named(uint, uint, uint), // index into ENTITIES, prefix len, last non-semi index
HexStart(bool), // boolean indicates if x is lower or upper case
Hex(u32),
DecStart,
Expand Down Expand Up @@ -209,10 +209,14 @@ impl<W: Writer> UnescapeWriter<W> {
Begin => {
try!(self.inner.get_mut_ref().write_str("&"));
}
Named(cursor, plen) => {
Named(cursor, plen, lastcur) => {
let (name, chars, needs_semi) = ENTITIES[cursor];
if !needs_semi && name.len() == plen {
try!(self.inner.get_mut_ref().write_str(chars));
} else if lastcur != -1 {
let (lastname, chars, _) = ENTITIES[lastcur];
try!(self.inner.get_mut_ref().write_str(chars));
try!(self.inner.get_mut_ref().write_str(name.slice(lastname.len(), plen)));
} else {
try!(self.inner.get_mut_ref().write_str(name.slice_to(plen)));
}
Expand Down Expand Up @@ -349,13 +353,13 @@ impl<W:Writer> Writer for UnescapeWriter<W> {
// with our character as a prefix.
// There's at least one entity that starts with every letter, so we don't
// have to worry about not finding one.
self.state = Named(base, 2); // plen is 2 to include &
self.state = Named(base, 2, -1); // plen is 2 to include &
}
(Named(cursor, plen), ';') => {
(Named(cursor, plen, _), ';') => {
it.next(); // consume ;
let (name, chars, _) = ENTITIES[cursor];
if name.len() == plen {
// valid entity
it.next(); // consume ;
try!(self.inner_write_str(chars));
self.state = CharData;
cdata = i+1;
Expand All @@ -365,25 +369,32 @@ impl<W:Writer> Writer for UnescapeWriter<W> {
cdata = i;
}
}
(Named(cursor, plen), 'a'..'z') |
(Named(cursor, plen), 'A'..'Z') |
(Named(cursor, plen), '0'..'9') => {
(Named(cursor, plen, lastcur), 'a'..'z') |
(Named(cursor, plen, lastcur), 'A'..'Z') |
(Named(cursor, plen, lastcur), '0'..'9') => {
let mut cursor = cursor;
let (name, _, _) = ENTITIES[cursor];
it.next(); // consume character
let (mut name, _, mut needs_semi) = ENTITIES[cursor];
if name.len() > plen && name[plen] == b {
// existing cursor is still a match
} else {
// search forward to find the next entity with our prefix
let prefix = name.slice_to(plen);
for ix in range(cursor+1, ENTITIES.len()) {
let (name, _, _) = ENTITIES[ix];
if !name.starts_with(prefix) {
let (name_, _, needs_semi_) = ENTITIES[ix];
if !name_.starts_with(prefix) {
// no match
cursor = -1;
break;
}
if name.len() > plen && name[plen] == b {
if name_.len() > plen && name_[plen] == b {
cursor = ix;
name = name_;
needs_semi = needs_semi_;
if name_.len() == plen+1 {
name = name_;
needs_semi = needs_semi_;
}
break;
}
}
Expand All @@ -394,8 +405,13 @@ impl<W:Writer> Writer for UnescapeWriter<W> {
self.state = CharData;
cdata = i;
} else {
it.next(); // consume character
self.state = Named(cursor, plen+1);
let plen = plen+1;
let lastcur = if !needs_semi && name.len() == plen {
cursor
} else {
lastcur
};
self.state = Named(cursor, plen, lastcur);
}
}
(HexStart(_), 'a'..'f')|(HexStart(_), 'A'..'F')|(HexStart(_), '0'..'9') => {
Expand Down

0 comments on commit 63d3b2f

Please sign in to comment.