Skip to content

Commit

Permalink
fix: serializing an individual attribute in an HTML5 document (#3127)
Browse files Browse the repository at this point in the history
**What problem is this PR intended to solve?**

Previously an exception would be raised, either "Unexpected node" or
"Unsupported document node (2)" depending on the version of Nokogiri.

Fixes #3125

**Have you included adequate test coverage?**

Yes.

**Does this change affect the behavior of either the C or the Java
implementations?**

HTML5 is only available in CRuby.
  • Loading branch information
flavorjones authored Feb 8, 2024
2 parents e80c962 + 5ba752c commit e67310b
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 90 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
### Added

* [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. (@flavorjones)
* [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones


### Fixed
Expand Down
29 changes: 18 additions & 11 deletions ext/nokogiri/xml_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -1875,17 +1875,7 @@ output_node(
// Add attributes.
for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) {
output_char(out, ' ');
output_attr_name(out, attr);
if (attr->children) {
output_string(out, "=\"");
xmlChar *value = xmlNodeListGetString(attr->doc, attr->children, 1);
output_escaped_string(out, value, true);
xmlFree(value);
output_char(out, '"');
} else {
// Output name=""
output_string(out, "=\"\"");
}
output_node(out, (xmlNodePtr)attr, preserve_newline);
}
output_char(out, '>');

Expand All @@ -1903,6 +1893,23 @@ output_node(
}
break;

case XML_ATTRIBUTE_NODE:
{
xmlAttrPtr attr = (xmlAttrPtr)node;
output_attr_name(out, attr);
if (attr->children) {
output_string(out, "=\"");
xmlChar *value = xmlNodeListGetString(attr->doc, attr->children, 1);
output_escaped_string(out, value, true);
xmlFree(value);
output_char(out, '"');
} else {
// Output name=""
output_string(out, "=\"\"");
}
}
break;

case XML_TEXT_NODE:
if (node->parent
&& is_one_of(node->parent, UNESCAPED_TEXT_ELEMENTS,
Expand Down
175 changes: 96 additions & 79 deletions test/html4/test_attributes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,90 +2,107 @@

require "helper"

module Nokogiri
module HTML
class TestAttr < Nokogiri::TestCase
#
# libxml2 >= 2.9.2 fails to escape comments within some attributes. It
# wants to ensure these comments can be treated as "server-side includes",
# but as a result fails to ensure that serialization is well-formed,
# resulting in an opportunity for XSS injection of code into a final
# re-parsed document (presumably in a browser).
#
# the offending commit is:
#
# https://github.com/GNOME/libxml2/commit/960f0e2
#
# we'll test this by parsing the HTML, serializing it, then
# re-parsing it to ensure there isn't any ambiguity in the output
# that might allow code injection into a browser consuming
# "sanitized" output.
#
# complaints have been made upstream about this behavior, notably at
#
# https://bugzilla.gnome.org/show_bug.cgi?id=769760
#
# and multiple CVEs have been declared and fixed in downstream
# libraries as a result, a list is being kept up to date here:
#
# https://github.com/flavorjones/loofah/issues/144
#
[
#
# these tags and attributes are determined by the code at:
#
# https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
#
{ tag: "a", attr: "href" },
{ tag: "div", attr: "href" },
{ tag: "a", attr: "action" },
{ tag: "div", attr: "action" },
{ tag: "a", attr: "src" },
{ tag: "div", attr: "src" },
{ tag: "a", attr: "name" },
#
# note that div+name is _not_ affected by the libxml2 issue.
# but we test it anyway to ensure our logic isn't modifying
# attributes that don't need modifying.
#
{ tag: "div", attr: "name", unescaped: true },
].each do |config|
define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
skip if Nokogiri::VersionInfo.instance.libxml2? && Nokogiri::VersionInfo.instance.libxml2_using_system?
class TestHTML4Attributes < Nokogiri::TestCase
#
# libxml2 >= 2.9.2 fails to escape comments within some attributes. It
# wants to ensure these comments can be treated as "server-side includes",
# but as a result fails to ensure that serialization is well-formed,
# resulting in an opportunity for XSS injection of code into a final
# re-parsed document (presumably in a browser).
#
# the offending commit is:
#
# https://github.com/GNOME/libxml2/commit/960f0e2
#
# we'll test this by parsing the HTML, serializing it, then
# re-parsing it to ensure there isn't any ambiguity in the output
# that might allow code injection into a browser consuming
# "sanitized" output.
#
# complaints have been made upstream about this behavior, notably at
#
# https://bugzilla.gnome.org/show_bug.cgi?id=769760
#
# and multiple CVEs have been declared and fixed in downstream
# libraries as a result, a list is being kept up to date here:
#
# https://github.com/flavorjones/loofah/issues/144
#
[
#
# these tags and attributes are determined by the code at:
#
# https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
#
{ tag: "a", attr: "href" },
{ tag: "div", attr: "href" },
{ tag: "a", attr: "action" },
{ tag: "div", attr: "action" },
{ tag: "a", attr: "src" },
{ tag: "div", attr: "src" },
{ tag: "a", attr: "name" },
#
# note that div+name is _not_ affected by the libxml2 issue.
# but we test it anyway to ensure our logic isn't modifying
# attributes that don't need modifying.
#
{ tag: "div", attr: "name", unescaped: true },
].each do |config|
define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
skip if Nokogiri::VersionInfo.instance.libxml2? && Nokogiri::VersionInfo.instance.libxml2_using_system?

html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=unsafevalue()>-->le.com'>test</#{config[:tag]}>}
html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=unsafevalue()>-->le.com'>test</#{config[:tag]}>}

reparsed = Nokogiri::HTML4.fragment(Nokogiri::HTML4.fragment(html).to_html)
attributes = reparsed.at_css(config[:tag]).attribute_nodes
reparsed = Nokogiri::HTML4.fragment(Nokogiri::HTML4.fragment(html).to_html)
attributes = reparsed.at_css(config[:tag]).attribute_nodes

assert_equal [config[:attr]], attributes.collect(&:name)
if Nokogiri::VersionInfo.instance.libxml2?
if config[:unescaped]
#
# this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
# assert that this attribute's serialization is unaffected.
#
assert_equal %{examp<!--" unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
elsif Nokogiri.uses_libxml?(">= 2.11.0")
#
# as of upstream 76d6b0d768d4e60a2d2844d in v2.11.0 (dev), quotes are no longer escaped.
#
assert_equal %{examp<!--"%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
else
#
# let's match the behavior in libxml < 2.9.2.
# test that this attribute's serialization is well-formed and sanitized.
#
assert_equal %{examp<!--%22%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
else
#
# yay for consistency in javaland. move along, nothing to see here.
#
assert_equal %{examp<!--%22 unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
assert_equal [config[:attr]], attributes.collect(&:name)
if Nokogiri::VersionInfo.instance.libxml2?
if config[:unescaped]
#
# this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
# assert that this attribute's serialization is unaffected.
#
assert_equal %{examp<!--" unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
elsif Nokogiri.uses_libxml?(">= 2.11.0")
#
# as of upstream 76d6b0d768d4e60a2d2844d in v2.11.0 (dev), quotes are no longer escaped.
#
assert_equal %{examp<!--"%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
else
#
# let's match the behavior in libxml < 2.9.2.
# test that this attribute's serialization is well-formed and sanitized.
#
assert_equal %{examp<!--%22%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
else
#
# yay for consistency in javaland. move along, nothing to see here.
#
assert_equal %{examp<!--%22 unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
end
end

def test_serialize_attribute
html = <<~HTML
<div id='foo' class="bar baz"></div>
HTML
div = Nokogiri::HTML4::DocumentFragment.parse(html).at_css("div")
attrs = div.attribute_nodes
id_attr = attrs.find { |a| a.name == "id" }
class_attr = attrs.find { |a| a.name == "class" }

if Nokogiri.uses_libxml?
id_expected = ' id="foo"'
class_expected = ' class="bar baz"'
else
id_expected = 'id="foo"'
class_expected = 'class="bar baz"'
end

assert_equal(id_expected, id_attr.to_html)
assert_equal(class_expected, class_attr.to_html)
end
end
18 changes: 18 additions & 0 deletions test/html5/test_attributes.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# frozen_string_literal: true

require "helper"

class TestHtml5Attributes < Nokogiri::TestCase
def test_serialize_attribute
html = <<~HTML
<div id='foo' class="bar baz"></div>
HTML
div = Nokogiri::HTML5::DocumentFragment.parse(html).at_css("div")
attrs = div.attribute_nodes
id_attr = attrs.find { |a| a.name == "id" }
class_attr = attrs.find { |a| a.name == "class" }

assert_equal('id="foo"', id_attr.to_html)
assert_equal('class="bar baz"', class_attr.to_html)
end
end if Nokogiri.uses_gumbo?

0 comments on commit e67310b

Please sign in to comment.