Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: serializing an individual attribute in an HTML5 document #3127

Merged
merged 1 commit into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
### Added

* [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. (@flavorjones)
* [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones


### Fixed
Expand Down
29 changes: 18 additions & 11 deletions ext/nokogiri/xml_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -1875,17 +1875,7 @@ output_node(
// Add attributes.
for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) {
output_char(out, ' ');
output_attr_name(out, attr);
if (attr->children) {
output_string(out, "=\"");
xmlChar *value = xmlNodeListGetString(attr->doc, attr->children, 1);
output_escaped_string(out, value, true);
xmlFree(value);
output_char(out, '"');
} else {
// Output name=""
output_string(out, "=\"\"");
}
output_node(out, (xmlNodePtr)attr, preserve_newline);
}
output_char(out, '>');

Expand All @@ -1903,6 +1893,23 @@ output_node(
}
break;

case XML_ATTRIBUTE_NODE:
{
xmlAttrPtr attr = (xmlAttrPtr)node;
output_attr_name(out, attr);
if (attr->children) {
output_string(out, "=\"");
xmlChar *value = xmlNodeListGetString(attr->doc, attr->children, 1);
output_escaped_string(out, value, true);
xmlFree(value);
output_char(out, '"');
} else {
// Output name=""
output_string(out, "=\"\"");
}
}
break;

case XML_TEXT_NODE:
if (node->parent
&& is_one_of(node->parent, UNESCAPED_TEXT_ELEMENTS,
Expand Down
175 changes: 96 additions & 79 deletions test/html4/test_attributes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,90 +2,107 @@

require "helper"

module Nokogiri
module HTML
class TestAttr < Nokogiri::TestCase
#
# libxml2 >= 2.9.2 fails to escape comments within some attributes. It
# wants to ensure these comments can be treated as "server-side includes",
# but as a result fails to ensure that serialization is well-formed,
# resulting in an opportunity for XSS injection of code into a final
# re-parsed document (presumably in a browser).
#
# the offending commit is:
#
# https://github.com/GNOME/libxml2/commit/960f0e2
#
# we'll test this by parsing the HTML, serializing it, then
# re-parsing it to ensure there isn't any ambiguity in the output
# that might allow code injection into a browser consuming
# "sanitized" output.
#
# complaints have been made upstream about this behavior, notably at
#
# https://bugzilla.gnome.org/show_bug.cgi?id=769760
#
# and multiple CVEs have been declared and fixed in downstream
# libraries as a result, a list is being kept up to date here:
#
# https://github.com/flavorjones/loofah/issues/144
#
[
#
# these tags and attributes are determined by the code at:
#
# https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
#
{ tag: "a", attr: "href" },
{ tag: "div", attr: "href" },
{ tag: "a", attr: "action" },
{ tag: "div", attr: "action" },
{ tag: "a", attr: "src" },
{ tag: "div", attr: "src" },
{ tag: "a", attr: "name" },
#
# note that div+name is _not_ affected by the libxml2 issue.
# but we test it anyway to ensure our logic isn't modifying
# attributes that don't need modifying.
#
{ tag: "div", attr: "name", unescaped: true },
].each do |config|
define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
skip if Nokogiri::VersionInfo.instance.libxml2? && Nokogiri::VersionInfo.instance.libxml2_using_system?
class TestHTML4Attributes < Nokogiri::TestCase
#
# libxml2 >= 2.9.2 fails to escape comments within some attributes. It
# wants to ensure these comments can be treated as "server-side includes",
# but as a result fails to ensure that serialization is well-formed,
# resulting in an opportunity for XSS injection of code into a final
# re-parsed document (presumably in a browser).
#
# the offending commit is:
#
# https://github.com/GNOME/libxml2/commit/960f0e2
#
# we'll test this by parsing the HTML, serializing it, then
# re-parsing it to ensure there isn't any ambiguity in the output
# that might allow code injection into a browser consuming
# "sanitized" output.
#
# complaints have been made upstream about this behavior, notably at
#
# https://bugzilla.gnome.org/show_bug.cgi?id=769760
#
# and multiple CVEs have been declared and fixed in downstream
# libraries as a result, a list is being kept up to date here:
#
# https://github.com/flavorjones/loofah/issues/144
#
[
#
# these tags and attributes are determined by the code at:
#
# https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
#
{ tag: "a", attr: "href" },
{ tag: "div", attr: "href" },
{ tag: "a", attr: "action" },
{ tag: "div", attr: "action" },
{ tag: "a", attr: "src" },
{ tag: "div", attr: "src" },
{ tag: "a", attr: "name" },
#
# note that div+name is _not_ affected by the libxml2 issue.
# but we test it anyway to ensure our logic isn't modifying
# attributes that don't need modifying.
#
{ tag: "div", attr: "name", unescaped: true },
].each do |config|
define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
skip if Nokogiri::VersionInfo.instance.libxml2? && Nokogiri::VersionInfo.instance.libxml2_using_system?

html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=unsafevalue()>-->le.com'>test</#{config[:tag]}>}
html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=unsafevalue()>-->le.com'>test</#{config[:tag]}>}

reparsed = Nokogiri::HTML4.fragment(Nokogiri::HTML4.fragment(html).to_html)
attributes = reparsed.at_css(config[:tag]).attribute_nodes
reparsed = Nokogiri::HTML4.fragment(Nokogiri::HTML4.fragment(html).to_html)
attributes = reparsed.at_css(config[:tag]).attribute_nodes

assert_equal [config[:attr]], attributes.collect(&:name)
if Nokogiri::VersionInfo.instance.libxml2?
if config[:unescaped]
#
# this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
# assert that this attribute's serialization is unaffected.
#
assert_equal %{examp<!--" unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
elsif Nokogiri.uses_libxml?(">= 2.11.0")
#
# as of upstream 76d6b0d768d4e60a2d2844d in v2.11.0 (dev), quotes are no longer escaped.
#
assert_equal %{examp<!--"%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
else
#
# let's match the behavior in libxml < 2.9.2.
# test that this attribute's serialization is well-formed and sanitized.
#
assert_equal %{examp<!--%22%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
else
#
# yay for consistency in javaland. move along, nothing to see here.
#
assert_equal %{examp<!--%22 unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
assert_equal [config[:attr]], attributes.collect(&:name)
if Nokogiri::VersionInfo.instance.libxml2?
if config[:unescaped]
#
# this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
# assert that this attribute's serialization is unaffected.
#
assert_equal %{examp<!--" unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
elsif Nokogiri.uses_libxml?(">= 2.11.0")
#
# as of upstream 76d6b0d768d4e60a2d2844d in v2.11.0 (dev), quotes are no longer escaped.
#
assert_equal %{examp<!--"%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
else
#
# let's match the behavior in libxml < 2.9.2.
# test that this attribute's serialization is well-formed and sanitized.
#
assert_equal %{examp<!--%22%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
else
#
# yay for consistency in javaland. move along, nothing to see here.
#
assert_equal %{examp<!--%22 unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
end
end

def test_serialize_attribute
html = <<~HTML
<div id='foo' class="bar baz"></div>
HTML
div = Nokogiri::HTML4::DocumentFragment.parse(html).at_css("div")
attrs = div.attribute_nodes
id_attr = attrs.find { |a| a.name == "id" }
class_attr = attrs.find { |a| a.name == "class" }

if Nokogiri.uses_libxml?
id_expected = ' id="foo"'
class_expected = ' class="bar baz"'
else
id_expected = 'id="foo"'
class_expected = 'class="bar baz"'
end

assert_equal(id_expected, id_attr.to_html)
assert_equal(class_expected, class_attr.to_html)
end
end
18 changes: 18 additions & 0 deletions test/html5/test_attributes.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# frozen_string_literal: true

require "helper"

class TestHtml5Attributes < Nokogiri::TestCase
def test_serialize_attribute
html = <<~HTML
<div id='foo' class="bar baz"></div>
HTML
div = Nokogiri::HTML5::DocumentFragment.parse(html).at_css("div")
attrs = div.attribute_nodes
id_attr = attrs.find { |a| a.name == "id" }
class_attr = attrs.find { |a| a.name == "class" }

assert_equal('id="foo"', id_attr.to_html)
assert_equal('class="bar baz"', class_attr.to_html)
end
end if Nokogiri.uses_gumbo?