libhtml: Add tests, support &#X

Add a number of tests taken from CPython's html module test suite, along with a couple of homegrown ones. Add support for hexadecimal entities written with a capital X (e.g. &#X123;), which had previously been overlooked.
lilyball · May 19, 2014 · 4ad5be4 · 4ad5be4
1 parent 24b73d7
commit 4ad5be4
Show file tree

Hide file tree

Showing 3 changed files with 231 additions and 86 deletions.
diff --git a/src/libhtml/escape.rs b/src/libhtml/escape.rs
@@ -141,7 +141,7 @@ enum UnescapeState {
     CharData,
     Begin,
     Named(uint, uint), // index into ENTITIES, and prefix len
-    HexStart,
+    HexStart(bool), // true if the marker was an upper-case 'X', false for lower-case 'x'
     Hex(u32),
     DecStart,
     Dec(u32)
@@ -220,9 +220,12 @@ impl<W: Writer> UnescapeWriter<W> {
             DecStart => {
                 try!(self.inner.get_mut_ref().write_str("&#"));
             }
-            HexStart => {
+            HexStart(false) => {
                 try!(self.inner.get_mut_ref().write_str("&#x"));
             }
+            HexStart(true) => {
+                try!(self.inner.get_mut_ref().write_str("&#X"));
+            }
             Hex(val) | Dec(val) => {
                 let c = match char::from_u32(val) {
                     None|Some('\0') => '\uFFFD',
@@ -312,7 +315,11 @@ impl<W:Writer> Writer for UnescapeWriter<W> {
                     self.state = match it.peek().map(|&(_,&b)| b as char) {
                         Some('x') => {
                             it.next(); // consume x
-                            HexStart
+                            HexStart(false)
+                        }
+                        Some('X') => {
+                            it.next(); // consume X
+                            HexStart(true)
                         }
                         _ => DecStart
                     }
@@ -391,7 +398,7 @@ impl<W:Writer> Writer for UnescapeWriter<W> {
                         self.state = Named(cursor, plen+1);
                     }
                 }
-                (HexStart, 'a'..'f')|(HexStart, 'A'..'F')|(HexStart, '0'..'9') => {
+                (HexStart(_), 'a'..'f')|(HexStart(_), 'A'..'F')|(HexStart(_), '0'..'9') => {
                     self.state = Hex(0);
                     // don't consume, re-try this digit in the Hex state
                 }

diff --git a/src/libhtml/lib.rs b/src/libhtml/lib.rs
@@ -55,6 +55,8 @@
 #![crate_type = "dylib"]
 #![crate_type = "rlib"]
 
+#![feature(macro_rules)] // used for tests
+
 use std::fmt::Show;
 use fmt::{Escape, Unescape};
 
@@ -80,85 +82,4 @@ pub fn unescape<T: Show>(s: T) -> ~str {
 }
 
 #[cfg(test)]
-mod tests {
-    extern crate test;
-    use std::fmt;
-    use super::{escape, unescape};
-
-    struct Test(StrBuf);
-
-    impl fmt::Show for Test {
-        fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
-            let Test(ref s) = *self;
-            write!(fmt, "<Test>{}</Test>", s)
-        }
-    }
-
-    struct UnTest(&'static str, &'static str);
-
-    impl fmt::Show for UnTest {
-        fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
-            let UnTest(s1, s2) = *self;
-            try!(write!(fmt, "{}", s1));
-            write!(fmt, "{}", s2)
-        }
-    }
-
-    #[test]
-    fn test_escape() {
-        let s = r#"<script src="evil.domain?foo&" type='baz'>"#;
-        assert_eq!(escape(s).as_slice(), "&lt;script src=&quot;evil.domain?foo&amp;&quot; \
-                                          type=&apos;baz&apos;&gt;");
-
-        let t = Test("foo".to_strbuf());
-        assert_eq!(escape(t), "&lt;Test&gt;foo&lt;/Test&gt;".to_owned());
-    }
-
-    #[test]
-    fn test_unescape() {
-        let s = "&lt;script src=&quot;evil.domain?foo&amp;&quot; type=&#39;baz&#39;&gt;";
-        assert_eq!(unescape(s), r#"<script src="evil.domain?foo&" type='baz'>"#.to_owned());
-
-        assert_eq!(unescape("&rarr;"), "\u2192".to_owned());
-        assert_eq!(unescape("&&amp;amp;amp;"), "&&amp;amp;".to_owned());
-        assert_eq!(unescape("&CounterClockwiseContourIntegral;"), "\u2233".to_owned());
-        assert_eq!(unescape("&amp"), "&".to_owned());
-        assert_eq!(unescape(UnTest("&am", "p;")), "&".to_owned());
-        assert_eq!(unescape("&fakentity"), "&fakentity".to_owned());
-        assert_eq!(unescape("&fakentity;"), "&fakentity;".to_owned());
-        assert_eq!(unescape("&aeligtest"), "ætest".to_owned());
-        assert_eq!(unescape("&#0abc"), "\uFFFDabc".to_owned());
-        assert_eq!(unescape("&#abc"), "&#abc".to_owned());
-        assert_eq!(unescape("&#xgabc"), "&#xgabc".to_owned());
-    }
-
-    #[bench]
-    fn bench_escape(b: &mut test::Bencher) {
-        let s = "<script src=\"evil.domain?foo&\" type='baz'>";
-        b.iter(|| escape(s));
-    }
-
-    #[bench]
-    fn bench_unescape(b: &mut test::Bencher) {
-        let s = "&lt;script src=&quot;evil.domain?foo&amp;&quot; type=&#39;baz&#39;&gt;";
-        b.iter(|| unescape(s));
-    }
-
-    #[bench]
-    fn bench_longest_entity(b: &mut test::Bencher) {
-        let s = "&CounterClockwiseContourIntegral;";
-        b.iter(|| assert_eq!(unescape(s).as_slice(), "\u2233"));
-    }
-
-    #[bench]
-    fn bench_longest_non_entity(b: &mut test::Bencher) {
-        let s = "&CounterClockwiseContourIntegraX;";
-        b.iter(|| assert_eq!(unescape(s).as_slice(), "&CounterClockwiseContourIntegraX;"));
-    }
-
-    #[bench]
-    fn bench_short_entity_long_tail(b: &mut test::Bencher) {
-        let s = "&ampnterClockwiseContourIntegral";
-        b.iter(|| assert_eq!(unescape(s).as_slice(), "&nterClockwiseContourIntegral"));
-    }
-}
+mod tests;
diff --git a/src/libhtml/tests.rs b/src/libhtml/tests.rs
@@ -0,0 +1,217 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate test;
+use std::fmt;
+use super::{escape, unescape};
+
+struct Test(StrBuf);
+
+impl fmt::Show for Test {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        let Test(ref s) = *self;
+        write!(fmt, "<Test>{}</Test>", s)
+    }
+}
+
+struct UnTest(&'static str, &'static str);
+
+impl fmt::Show for UnTest {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        let UnTest(s1, s2) = *self;
+        try!(write!(fmt, "{}", s1));
+        write!(fmt, "{}", s2)
+    }
+}
+
+#[test]
+fn test_escape() {
+    let s = r#"<script src="evil.domain?foo&" type='baz'>"#;
+    assert_eq!(escape(s).as_slice(), "&lt;script src=&quot;evil.domain?foo&amp;&quot; \
+                                        type=&apos;baz&apos;&gt;");
+
+    let t = Test("foo".to_strbuf());
+    assert_eq!(escape(t), "&lt;Test&gt;foo&lt;/Test&gt;".to_owned());
+}
+
+#[test]
+fn test_unescape() {
+    let s = "&lt;script src=&quot;evil.domain?foo&amp;&quot; type=&#39;baz&#39;&gt;";
+    assert_eq!(unescape(s), r#"<script src="evil.domain?foo&" type='baz'>"#.to_owned());
+
+    assert_eq!(unescape("&rarr;"), "\u2192".to_owned());
+    assert_eq!(unescape("&&amp;amp;amp;"), "&&amp;amp;".to_owned());
+    assert_eq!(unescape("&CounterClockwiseContourIntegral;"), "\u2233".to_owned());
+    assert_eq!(unescape("&amp"), "&".to_owned());
+    assert_eq!(unescape(UnTest("&am", "p;")), "&".to_owned());
+    assert_eq!(unescape("&fakentity"), "&fakentity".to_owned());
+    assert_eq!(unescape("&fakentity;"), "&fakentity;".to_owned());
+    assert_eq!(unescape("&aeligtest"), "ætest".to_owned());
+    assert_eq!(unescape("&#0abc"), "\uFFFDabc".to_owned());
+    assert_eq!(unescape("&#abc"), "&#abc".to_owned());
+    assert_eq!(unescape("&#xgabc"), "&#xgabc".to_owned());
+    assert_eq!(unescape("&#X2022; &#XYZ;"), "\u2022 &#XYZ;".to_owned());
+    // the next escape's value overflows a u64, so it must decode to U+FFFD. WebKit incorrectly treats it as &#x2022;
+    assert_eq!(unescape("&#x100000000000000002022;"), "\uFFFD".to_owned());
+}
+
+// Tests from python's html module
+// See http://hg.python.org/cpython/file/82caec3865e3/Lib/test/test_html.py
+mod python {
+    use {escape, unescape};
+    use escape::{EscapeWriter, EscapeText};
+    use std::io::MemWriter;
+
+    #[test]
+    fn test_escape() {
+        // python escapes ' as &#x27; but we emit &apos; instead
+        assert_eq!(escape(r#"'<script>"&foo;"</script>'"#).as_slice(),
+                   "&apos;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&apos;");
+        let mut w = EscapeWriter::new(MemWriter::new(), EscapeText);
+        assert!(w.write_str(r#"'<script>"&foo;"</script>'"#).is_ok());
+        assert_eq!(w.unwrap().unwrap().as_slice(),
+                   r#"'&lt;script&gt;"&amp;foo;"&lt;/script&gt;'"#.as_bytes());
+    }
+
+    #[test]
+    fn test_unescape() {
+        macro_rules! check{
+            ($text:expr, $exp:expr) => {
+                assert_eq!(unescape($text).as_slice(), $exp.as_slice());
+            };
+            (num: $num:expr, $exp:expr) => {{
+                let num = $num;
+                let exp = $exp;
+                check!(format!(r"&\#{}", num), exp);
+                check!(format!(r"&\#{};", num), exp);
+                check!(format!(r"&\#x{:x}", num), exp);
+                check!(format!(r"&\#x{:x};", num), exp);
+            }};
+        }
+
+        // check text with no character references
+        check!("no character references", "no character references");
+        // check & followed by invalid chars
+        check!("&\n&\t& &&", "&\n&\t& &&");
+        // check & followed by numbers and letters
+        check!("&0 &9 &a &0; &9; &a;", "&0 &9 &a &0; &9; &a;");
+        // check incomplete entities at the end of the string
+        for x in ["&", "&#", "&#x", "&#X", "&#y", "&#xy", "&#Xy"].iter() {
+            check!(x, x);
+            check!(x+";", x+";");
+        }
+        // check several combinations of numeric character references,
+        // possibly followed by different characters
+        // NB: Rust has no runtime format strings, so python's loop over formats is unrolled below
+        for (&num, &c) in [65u32, 97, 34, 38, 0x2603, 0x101234].iter()
+                          .zip(["A", "a", "\"", "&", "\u2603", "\U00101234"].iter()) {
+            let v = [format!(r"&\#{}",num),     format!(r"&\#{:07}",num),
+                     format!(r"&\#{};",num),    format!(r"&\#{:07};",num),
+                     format!(r"&\#x{:x}",num),  format!(r"&\#x{:06x}",num),
+                     format!(r"&\#x{:x};",num), format!(r"&\#x{:06x};",num),
+                     format!(r"&\#x{:X}",num),  format!(r"&\#x{:06X}",num),
+                     format!(r"&\#X{:x};",num), format!(r"&\#X{:06x};",num)];
+            for s in v.iter() {
+                check!(s.as_slice(), c);
+                for end in [" ", "X"].iter() {
+                    check!(*s+*end, c+*end);
+                }
+            }
+        }
+        // check invalid codepoints
+        for &cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000].iter() {
+            check!(num: cp, "\uFFFD");
+        }
+        // check more invalid codepoints
+        // this test is elided because it's wrong. I don't know why cpython thinks codepoints
+        // [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff] should return nothing.
+        // check invalid numbers
+        for (&num, &c) in [0x0d, 0x80, 0x95, 0x9d].iter()
+                          .zip(["\r", "\u20ac", "\u2022", "\x9d"].iter()) {
+            check!(num: num, c);
+        }
+        // check small numbers
+        check!(num: 0, "\uFFFD");
+        check!(num: 9, "\t");
+        // check a big number
+        check!(num: 1000000000000000000u64, "\uFFFD");
+        // check that multiple trailing semicolons are handled correctly
+        for e in ["&quot;;", "&#34;;", "&#x22;;", "&#X22;;"].iter() {
+            check!(*e, "\";");
+        }
+        // check that semicolons in the middle don't create problems
+        for e in ["&quot;quot;", "&#34;quot;", "&#x22;quot;", "&#X22;quot;"].iter() {
+            check!(*e, "\"quot;");
+        }
+        // check triple adjacent charrefs
+        for e in ["&quot", "&#34", "&#x22", "&#X22"].iter() {
+            check!(e.repeat(3), r#"""""#);
+            check!((*e+";").repeat(3), r#"""""#);
+        }
+        // check that the case is respected
+        for e in ["&amp", "&amp;", "&AMP", "&AMP;"].iter() {
+            check!(*e, "&");
+        }
+        for e in ["&Amp", "&Amp;"].iter() {
+            check!(*e, *e);
+        }
+        // check that non-existent named entities are returned unchanged
+        check!("&svadilfari;", "&svadilfari;");
+        // the following examples are in the html5 specs
+        check!("&notit", "¬it");
+        check!("&notit;", "¬it;");
+        check!("&notin", "¬in");
+        check!("&notin;", "∉");
+        // a similar example with a long name
+        check!("&notReallyAnExistingNamedCharacterReference;",
+               "¬ReallyAnExistingNamedCharacterReference;");
+        // longest valid name
+        check!("&CounterClockwiseContourIntegral;", "∳");
+        // check a charref that maps to two unicode chars
+        check!("&acE;", "\u223E\u0333");
+        check!("&acE", "&acE");
+        // test a large number of entities
+        check!("&#123; ".repeat(1050), "{ ".repeat(1050));
+        // check some html5 entities
+        check!("&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri",
+               "ÉricÉric&alphacentauriαcentauri");
+        check!("&co;", "&co;");
+    }
+}
+
+#[bench]
+fn bench_escape(b: &mut test::Bencher) {
+    let s = "<script src=\"evil.domain?foo&\" type='baz'>";
+    b.iter(|| escape(s));
+}
+
+#[bench]
+fn bench_unescape(b: &mut test::Bencher) {
+    let s = "&lt;script src=&quot;evil.domain?foo&amp;&quot; type=&#39;baz&#39;&gt;";
+    b.iter(|| unescape(s));
+}
+
+#[bench]
+fn bench_longest_entity(b: &mut test::Bencher) {
+    let s = "&CounterClockwiseContourIntegral;";
+    b.iter(|| assert_eq!(unescape(s).as_slice(), "\u2233"));
+}
+
+#[bench]
+fn bench_longest_non_entity(b: &mut test::Bencher) {
+    let s = "&CounterClockwiseContourIntegraX;";
+    b.iter(|| assert_eq!(unescape(s).as_slice(), "&CounterClockwiseContourIntegraX;"));
+}
+
+#[bench]
+fn bench_short_entity_long_tail(b: &mut test::Bencher) {
+    let s = "&ampnterClockwiseContourIntegral";
+    b.iter(|| assert_eq!(unescape(s).as_slice(), "&nterClockwiseContourIntegral"));
+}