# Patch summary: extract URI-attribute scrubbing into a helper method.
#
# Hunk 1 (scrub_attributes): the inline block that validated attributes listed
# in SafeList::ATTR_VAL_IS_URI is replaced by a single call,
# `next if scrub_uri_attribute(attr_node)`.
#
# Hunk 2: adds scrub_uri_attribute(attr_node), which carries the same logic:
#   * the value is HTML-unescaped, stripped of CONTROL_CHARACTERS and downcased;
#   * if it starts with a scheme that is not in SafeList::ALLOWED_PROTOCOLS,
#     the attribute is removed and the method returns true;
#   * if the scheme is "data" and a mediatype is present that is not in
#     SafeList::ALLOWED_URI_DATA_MEDIATYPES, the attribute is removed and the
#     method returns true;
#   * otherwise the attribute is left in place and the method returns false.
# The caller uses the true return value to skip to the next attribute, which
# mirrors the `next` statements of the removed inline block.
#
# NOTE(review): line breaks and indentation below were reconstructed from a
# whitespace-collapsed copy of this patch (assuming the file's 2-space Ruby
# indentation); verify against the repository before applying.
diff --git a/lib/loofah/html5/scrub.rb b/lib/loofah/html5/scrub.rb
index 4b35807..55fba46 100644
--- a/lib/loofah/html5/scrub.rb
+++ b/lib/loofah/html5/scrub.rb
@@ -36,20 +36,7 @@ def scrub_attributes(node)
             end
 
             if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
-              # this block lifted nearly verbatim from HTML5 sanitization
-              val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
-              if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
-                attr_node.remove
-                next
-              elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
-                # permit only allowed data mediatypes
-                mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
-                mediatype, _ = mediatype.split(";")[0..1] if mediatype
-                if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
-                  attr_node.remove
-                  next
-                end
-              end
+              next if scrub_uri_attribute(attr_node)
             end
 
             if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
@@ -152,6 +139,24 @@ def scrub_attribute_that_allows_local_ref(attr_node)
           attr_node.value = values.join(" ")
         end
 
+        def scrub_uri_attribute(attr_node)
+          # this block lifted nearly verbatim from HTML5 sanitization
+          val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
+          if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+            attr_node.remove
+            return true
+          elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
+            # permit only allowed data mediatypes
+            mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
+            mediatype, _ = mediatype.split(";")[0..1] if mediatype
+            if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+              attr_node.remove
+              return true
+            end
+          end
+          false
+        end
+
         #
         #  libxml2 >= 2.9.2 fails to escape comments within some attributes.
         #