From 4ad5be46e9ee4f777434bbf4acdccdfa64196b2e Mon Sep 17 00:00:00 2001 From: Kevin Ballard Date: Fri, 2 May 2014 23:51:21 -0700 Subject: [PATCH] libhtml: Add tests, support &#X Add a bunch of tests taken from cpython's html module, along with a couple of other homegrown ones. Add support for &#X123; entities, with the capital X, which was forgotten before. --- src/libhtml/escape.rs | 15 ++- src/libhtml/lib.rs | 85 +---------------- src/libhtml/tests.rs | 217 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 231 insertions(+), 86 deletions(-) create mode 100644 src/libhtml/tests.rs diff --git a/src/libhtml/escape.rs b/src/libhtml/escape.rs index 81fa5e000f6aa..5918641c87c26 100644 --- a/src/libhtml/escape.rs +++ b/src/libhtml/escape.rs @@ -141,7 +141,7 @@ enum UnescapeState { CharData, Begin, Named(uint, uint), // index into ENTITIES, and prefix len - HexStart, + HexStart(bool), // boolean indicates if x is lower or upper case Hex(u32), DecStart, Dec(u32) @@ -220,9 +220,12 @@ impl UnescapeWriter { DecStart => { try!(self.inner.get_mut_ref().write_str("&#")); } - HexStart => { + HexStart(false) => { try!(self.inner.get_mut_ref().write_str("&#x")); } + HexStart(true) => { + try!(self.inner.get_mut_ref().write_str("&#X")); + } Hex(val) | Dec(val) => { let c = match char::from_u32(val) { None|Some('\0') => '\uFFFD', @@ -312,7 +315,11 @@ impl Writer for UnescapeWriter { self.state = match it.peek().map(|&(_,&b)| b as char) { Some('x') => { it.next(); // consume x - HexStart + HexStart(false) + } + Some('X') => { + it.next(); // consume X + HexStart(true) } _ => DecStart } @@ -391,7 +398,7 @@ impl Writer for UnescapeWriter { self.state = Named(cursor, plen+1); } } - (HexStart, 'a'..'f')|(HexStart, 'A'..'F')|(HexStart, '0'..'9') => { + (HexStart(_), 'a'..'f')|(HexStart(_), 'A'..'F')|(HexStart(_), '0'..'9') => { self.state = Hex(0); // don't consume, re-try this digit in the Hex state } diff --git a/src/libhtml/lib.rs b/src/libhtml/lib.rs index 
3fe40b9a0840d..16792974de0e6 100644 --- a/src/libhtml/lib.rs +++ b/src/libhtml/lib.rs @@ -55,6 +55,8 @@ #![crate_type = "dylib"] #![crate_type = "rlib"] +#![feature(macro_rules)] // used for tests + use std::fmt::Show; use fmt::{Escape, Unescape}; @@ -80,85 +82,4 @@ pub fn unescape(s: T) -> ~str { } #[cfg(test)] -mod tests { - extern crate test; - use std::fmt; - use super::{escape, unescape}; - - struct Test(StrBuf); - - impl fmt::Show for Test { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - let Test(ref s) = *self; - write!(fmt, "{}", s) - } - } - - struct UnTest(&'static str, &'static str); - - impl fmt::Show for UnTest { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - let UnTest(s1, s2) = *self; - try!(write!(fmt, "{}", s1)); - write!(fmt, "{}", s2) - } - } - - #[test] - fn test_escape() { - let s = r#"'"#).as_slice(), + "'<script>"&foo;"</script>'"); + let mut w = EscapeWriter::new(MemWriter::new(), EscapeText); + assert!(w.write_str(r#"''"#).is_ok()); + assert_eq!(w.unwrap().unwrap().as_slice(), + r#"'<script>"&foo;"</script>'"#.as_bytes()); + } + + #[test] + fn test_unescape() { + macro_rules! 
check{ + ($text:expr, $exp:expr) => { + assert_eq!(unescape($text).as_slice(), $exp.as_slice()); + }; + (num: $num:expr, $exp:expr) => {{ + let num = $num; + let exp = $exp; + check!(format!(r"&\#{}", num), exp); + check!(format!(r"&\#{};", num), exp); + check!(format!(r"&\#x{:x}", num), exp); + check!(format!(r"&\#x{:x};", num), exp); + }}; + } + + // check text with no character references + check!("no character references", "no character references"); + // check & followed by invalid chars + check!("&\n&\t& &&", "&\n&\t& &&"); + // check & followed by numbers and letters + check!("&0 &9 &a &0; &9; &a;", "&0 &9 &a &0; &9; &a;"); + // check incomplete entities at the end of the string + for x in ["&", "&#", "&#x", "&#X", "&#y", "&#xy", "&#Xy"].iter() { + check!(x, x); + check!(x+";", x+";"); + } + // check several combinations of numeric character references, + // possibly followed by different characters + // NB: no runtime formatting strings so the loop has been unrolled + for (&num, &c) in [65u32, 97, 34, 38, 0x2603, 0x101234].iter() + .zip(["A", "a", "\"", "&", "\u2603", "\U00101234"].iter()) { + let v = [format!(r"&\#{}",num), format!(r"&\#{:07}",num), + format!(r"&\#{};",num), format!(r"&\#{:07};",num), + format!(r"&\#x{:x}",num), format!(r"&\#x{:06x}",num), + format!(r"&\#x{:x};",num), format!(r"&\#x{:06x};",num), + format!(r"&\#x{:X}",num), format!(r"&\#x{:06X}",num), + format!(r"&\#X{:x};",num), format!(r"&\#X{:06x};",num)]; + for s in v.iter() { + check!(s.as_slice(), c); + for end in [" ", "X"].iter() { + check!(*s+*end, c+*end); + } + } + } + // check invalid codepoints + for &cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000].iter() { + check!(num: cp, "\uFFFD"); + } + // check more invalid codepoints + // this test is elided because it's wrong. I don't know why cpython thinks codepoints + // [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff] should return nothing. 
+ // check invalid numbers + for (&num, &c) in [0x0d, 0x80, 0x95, 0x9d].iter() + .zip(["\r", "\u20ac", "\u2022", "\x9d"].iter()) { + check!(num: num, c); + } + // check small numbers + check!(num: 0, "\uFFFD"); + check!(num: 9, "\t"); + // check a big number + check!(num: 1000000000000000000u64, "\uFFFD"); + // check that multiple trailing semicolons are handled correctly + for e in ["&quot;;", "&#34;;", "&#x22;;", "&#X22;;"].iter() { + check!(*e, "\";"); + } + // check that semicolons in the middle don't create problems + for e in ["&quot;quot;", "&#34;quot;", "&#x22;quot;", "&#X22;quot;"].iter() { + check!(*e, "\"quot;"); + } + // check triple adjacent charrefs + for e in ["&quot", "&#34", "&#x22", "&#X22"].iter() { + check!(e.repeat(3), r#"""""#); + check!((*e+";").repeat(3), r#"""""#); + } + // check that the case is respected + for e in ["&amp", "&amp;", "&AMP", "&AMP;"].iter() { + check!(*e, "&"); + } + for e in ["&Amp", "&Amp;"].iter() { + check!(*e, *e); + } + // check that non-existent named entities are returned unchanged + check!("&svadilfari;", "&svadilfari;"); + // the following examples are in the html5 specs + check!("&notit", "¬it"); + check!("&notit;", "¬it;"); + check!("&notin", "¬in"); + check!("&notin;", "∉"); + // a similar example with a long name + check!("&notReallyAnExistingNamedCharacterReference;", + "¬ReallyAnExistingNamedCharacterReference;"); + // longest valid name + check!("&CounterClockwiseContourIntegral;", "∳"); + // check a charref that maps to two unicode chars + check!("&acE;", "\u223E\u0333"); + check!("&acE", "&acE"); + // test a large number of entities + check!("&#123; ".repeat(1050), "{ ".repeat(1050)); + // check some html5 entities + check!("&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri", + "ÉricÉric&alphacentauriαcentauri"); + check!("&co;", "&co;"); + } +} + +#[bench] +fn bench_escape(b: &mut test::Bencher) { + let s = "