diff --git a/CHANGELOG.md b/CHANGELOG.md index fb7af2d2485..1436038201e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA * [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. @flavorjones * [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones * [CRuby] When compiling packaged libraries from source, allow users' `AR` and `LD` environment variables to set the archiver and linker commands, respectively. This augments the existing `CC` environment variable to set the compiler command. [#3165] @ziggythehamster +* [CRuby] The HTML5 parse methods accept a `:parse_noscript_content_as_text` keyword argument which will emulate the parsing behavior of a browser which has scripting enabled. [#3178, #3231] @stevecheckoway ### Fixed diff --git a/ext/nokogiri/gumbo.c b/ext/nokogiri/gumbo.c index c0e0176389f..096bdaab3f2 100644 --- a/ext/nokogiri/gumbo.c +++ b/ext/nokogiri/gumbo.c @@ -301,15 +301,19 @@ common_options(VALUE kwargs) // If this order is changed, then setting the options below must change as // well. ID keywords[] = { + // Required keywords. rb_intern_const("max_attributes"), rb_intern_const("max_errors"), rb_intern_const("max_tree_depth"), + + // Optional keywords. + rb_intern_const("parse_noscript_content_as_text"), }; VALUE values[sizeof keywords / sizeof keywords[0]]; // Extract the values coresponding to the required keywords. Raise an error // if required arguments are missing. - rb_get_kwargs(kwargs, keywords, 3, 0, values); + rb_get_kwargs(kwargs, keywords, 3, 1, values); GumboOptions options = kGumboDefaultOptions; options.max_attributes = NUM2INT(values[0]); @@ -319,6 +323,8 @@ common_options(VALUE kwargs) int depth = NUM2INT(values[2]); options.max_tree_depth = depth < 0 ? 
UINT_MAX : (unsigned int)depth; + options.parse_noscript_content_as_text = values[3] != Qundef && RTEST(values[3]); + return options; } diff --git a/gumbo-parser/src/nokogiri_gumbo.h b/gumbo-parser/src/nokogiri_gumbo.h index 15b2e989953..69555607ffb 100644 --- a/gumbo-parser/src/nokogiri_gumbo.h +++ b/gumbo-parser/src/nokogiri_gumbo.h @@ -780,6 +780,15 @@ typedef struct GumboInternalOptions { * Default: `false`. */ bool fragment_context_has_form_ancestor; + + /** + * Parse `noscript` elements as if scripting was enabled. This causes the + * contents of the `noscript` element to be parsed as raw text, rather + * than as HTML elements. + * + * Default: `false`. + */ + bool parse_noscript_content_as_text; } GumboOptions; /** Default options struct; use this with gumbo_parse_with_options. */ diff --git a/gumbo-parser/src/parser.c b/gumbo-parser/src/parser.c index 422c1f24d7e..07f31533485 100644 --- a/gumbo-parser/src/parser.c +++ b/gumbo-parser/src/parser.c @@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = { .fragment_encoding = NULL, .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS, .fragment_context_has_form_ancestor = false, + .parse_noscript_content_as_text = false, }; #define STRING(s) {.data = s, .length = sizeof(s) - 1} @@ -2614,6 +2615,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) { } if ( tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)}) + || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text) ) { run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); return; @@ -3319,7 +3321,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) { run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); return; } - if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) { + if ( + tag_is(token, kStartTag, GUMBO_TAG_NOEMBED) + || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text) + ) { 
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); return; } @@ -4633,12 +4638,20 @@ static void fragment_parser_init ( const char* fragment_encoding = options->fragment_encoding; GumboQuirksModeEnum quirks = options->quirks_mode; bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor; - GumboNode* root; - // 2. + + // 1. [Create a new Document node, and mark it as being an HTML document.] + // 2. [If the node document of the context element is in quirks mode, then + // let the Document be in quirks mode. Otherwise, the node document of + // the context element is in limited-quirks mode, then let the Document + // be in limited-quirks mode. Otherwise, leave the Document in no-quirks + // mode.] get_document_node(parser)->v.document.doc_type_quirks_mode = quirks; - // 3. + // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow + // declarative shadow roots to true.] + // 4. [Create a new HTML parser, and associate it with the just created Document node.] + // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:] parser->_parser_state->_fragment_ctx = create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding); GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag; @@ -4665,8 +4678,8 @@ static void fragment_parser_init ( break; case GUMBO_TAG_NOSCRIPT: - /* scripting is disabled in Gumbo, so leave the tokenizer - * in the default data state */ + if (options->parse_noscript_content_as_text) + gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT); break; case GUMBO_TAG_PLAINTEXT: diff --git a/lib/nokogiri/html5.rb b/lib/nokogiri/html5.rb index e251ee158bb..1465e265fe3 100644 --- a/lib/nokogiri/html5.rb +++ b/lib/nokogiri/html5.rb @@ -48,20 +48,20 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # The document and fragment parsing methods support options that are different from Nokogiri's. 
# - # - Nokogiri.HTML5(html, url = nil, encoding = nil, options = {}) - # - Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {}) - # - Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {}) - # - Nokogiri::HTML5.fragment(html, encoding = nil, options = {}) - # - Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {}) + # - Nokogiri.HTML5(html, url = nil, encoding = nil, **options) + # - Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options) + # - Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options) + # - Nokogiri::HTML5.fragment(html, encoding = nil, **options) + # - Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options) # - # The three currently supported options are +:max_errors+, +:max_tree_depth+ and - # +:max_attributes+, described below. + # The four currently supported options are +:max_errors+, +:max_tree_depth+, +:max_attributes+, + # and +:parse_noscript_content_as_text+, described below. # # === Error reporting # # Nokogiri contains an experimental HTML5 parse error reporting facility. By default, no parse # errors are reported but this can be configured by passing the +:max_errors+ option to - # {HTML5.parse} or {HTML5.fragment}. + # HTML5.parse or HTML5.fragment. # # For example, this script: # @@ -87,20 +87,21 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # Using max_errors: -1 results in an unlimited number of errors being returned. # - # The errors returned by {HTML5::Document#errors} are instances of {Nokogiri::XML::SyntaxError}. + # The errors returned by HTML5::Document#errors are instances of Nokogiri::XML::SyntaxError. # - # The {https://html.spec.whatwg.org/multipage/parsing.html#parse-errors HTML standard} defines a + # The {HTML standard}[https://html.spec.whatwg.org/multipage/parsing.html#parse-errors] defines a # number of standard parse error codes. 
These error codes only cover the "tokenization" stage of # parsing HTML. The parse errors in the "tree construction" stage do not have standardized error # codes (yet). # - # As a convenience to Nokogiri users, the defined error codes are available via - # {Nokogiri::XML::SyntaxError#str1} method. + # As a convenience to Nokogiri users, the defined error codes are available + # via the Nokogiri::XML::SyntaxError#str1 method. # # doc = Nokogiri::HTML5.parse('Hi there!', max_errors: 10) # doc.errors.each do |err| # puts("#{err.line}:#{err.column}: #{err.str1}") # end + # # # => 1:1: generic-parser # # 1:1: non-void-html-element-start-tag-with-trailing-solidus # # 1:17: end-tag-with-trailing-solidus @@ -117,10 +118,10 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # The maximum depth of the DOM tree parsed by the various parsing methods is configurable by the # +:max_tree_depth+ option. If the depth of the tree would exceed this limit, then an - # {::ArgumentError} is thrown. + # +ArgumentError+ is thrown. # - # This limit (which defaults to Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH = 400) can be - # removed by giving the option max_tree_depth: -1. + # This limit (which defaults to +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) can be removed by + # giving the option max_tree_depth: -1. # # html = '' + '
' * 1000 # doc = Nokogiri.HTML5(html) @@ -130,23 +131,58 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # === Attribute limit per element # # The maximum number of attributes per DOM element is configurable by the +:max_attributes+ - # option. If a given element would exceed this limit, then an {::ArgumentError} is thrown. + # option. If a given element would exceed this limit, then an +ArgumentError+ is thrown. # - # This limit (which defaults to Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES = 400) can be - # removed by giving the option max_attributes: -1. + # This limit (which defaults to +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+) can be removed by + # giving the option max_attributes: -1. # - # html = '
' + # html = '
' # # "
" # doc = Nokogiri.HTML5(html) # # raises ArgumentError: Attributes per element limit exceeded + # # doc = Nokogiri.HTML5(html, max_attributes: -1) + # # parses successfully + # + # === Parse +noscript+ elements' content as text + # + # By default, the content of +noscript+ elements is parsed as HTML elements. Browsers that + # support scripting parse the content of +noscript+ elements as raw text. + # + # The +:parse_noscript_content_as_text+ option causes Nokogiri to parse the content of +noscript+ + # elements as a single text node. + # + # html = "" + # doc = Nokogiri::HTML5.parse(html, parse_noscript_content_as_text: true) + # pp doc.at_xpath("/html/head/noscript") + # # => #(Element:0x878c { + # # name = "noscript", + # # children = [ #(Text "")] + # # }) + # + # In contrast, parse_noscript_content_as_text: false (the default) causes the +noscript+ + # element in the previous example to have two children, a +meta+ element and a +link+ element. + # + # doc = Nokogiri::HTML5.parse(html) + # puts doc.at_xpath("/html/head/noscript") + # # => #(Element:0x96b4 { + # # name = "noscript", + # # children = [ + # # #(Element:0x97e0 { name = "meta", attribute_nodes = [ #(Attr:0x990c { name = "charset", value = "UTF-8" })] }), + # # #(Element:0x9b00 { + # # name = "link", + # # attribute_nodes = [ + # # #(Attr:0x9c2c { name = "rel", value = "stylesheet" }), + # # #(Attr:0x9dd0 { name = "href", value = "!" })] + # # })] + # # }) # # == HTML Serialization # - # After parsing HTML, it may be serialized using any of the {Nokogiri::XML::Node} serialization - # methods. In particular, {XML::Node#serialize}, {XML::Node#to_html}, and {XML::Node#to_s} will + # After parsing HTML, it may be serialized using any of the Nokogiri::XML::Node serialization + # methods. In particular, XML::Node#serialize, XML::Node#to_html, and XML::Node#to_s will # serialize a given node and its children. (This is the equivalent of JavaScript's - # +Element.outerHTML+.) 
Similarly, {XML::Node#inner_html} will serialize the children of a given + # +Element.outerHTML+.) Similarly, XML::Node#inner_html will serialize the children of a given # node. (This is the equivalent of JavaScript's +Element.innerHTML+.) # # doc = Nokogiri::HTML5("Hello world!") @@ -154,12 +190,12 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # => Hello world! # # Due to quirks in how HTML is parsed and serialized, it's possible for a DOM tree to be - # serialized and then re-parsed, resulting in a different DOM. Mostly, this happens with DOMs + # serialized and then re-parsed, resulting in a different DOM. Mostly, this happens with DOMs # produced from invalid HTML. Unfortunately, even valid HTML may not survive serialization and # re-parsing. # - # In particular, a newline at the start of +pre+, +listing+, and +textarea+ elements is ignored by - # the parser. + # In particular, a newline at the start of +pre+, +listing+, and +textarea+ + # elements is ignored by the parser. # # doc = Nokogiri::HTML5(<<-EOF) # @@ -188,54 +224,54 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # == Encodings # - # Nokogiri always parses HTML5 using {https://en.wikipedia.org/wiki/UTF-8 UTF-8}; however, the + # Nokogiri always parses HTML5 using {UTF-8}[https://en.wikipedia.org/wiki/UTF-8]; however, the # encoding of the input can be explicitly selected via the optional +encoding+ parameter. This is # most useful when the input comes not from a string but from an IO object. # # When serializing a document or node, the encoding of the output string can be specified via the # +:encoding+ options. Characters that cannot be encoded in the selected encoding will be encoded - # as {https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references HTML numeric - # entities}. + # as {HTML numeric + # entities}[https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references]. 
# # frag = Nokogiri::HTML5.fragment('아는 길도 물어가라') # html = frag.serialize(encoding: 'US-ASCII') # puts html # # => 아는 길도 물어가라 + # # frag = Nokogiri::HTML5.fragment(html) # puts frag.serialize # # => 아는 길도 물어가라 # - # (There's a {https://bugs.ruby-lang.org/issues/15033 bug} in all current versions of Ruby that + # (There's a {bug}[https://bugs.ruby-lang.org/issues/15033] in all current versions of Ruby that # can cause the entity encoding to fail. Of the mandated supported encodings for HTML, the only # encoding I'm aware of that has this bug is 'ISO-2022-JP'. We recommend avoiding this # encoding.) # # == Notes # - # * The {Nokogiri::HTML5.fragment} function takes a string and parses it - # as a HTML5 document. The ++, ++, and ++ elements are - # removed from this document, and any children of these elements that remain - # are returned as a {Nokogiri::HTML5::DocumentFragment}. + # * The Nokogiri::HTML5.fragment function takes a string and parses it as an HTML5 document. The + # +html+, +head+, and +body+ elements are removed from this document, and any children of these + # elements that remain are returned as a Nokogiri::HTML5::DocumentFragment. # - # * The {Nokogiri::HTML5.parse} function takes a string and passes it to the - # gumbo_parse_with_options method, using the default options. - # The resulting Gumbo parse tree is then walked. + # * The Nokogiri::HTML5.parse function takes a string and passes it to the + # gumbo_parse_with_options method, using the default options. The resulting Gumbo + # parse tree is then walked. # # * Instead of uppercase element names, lowercase element names are produced. # - # * Instead of returning +unknown+ as the element name for unknown tags, the - # original tag name is returned verbatim. + # * Instead of returning +unknown+ as the element name for unknown tags, the original tag name is + # returned verbatim. # # Since v1.12.0 module HTML5 class << self - # Parse an HTML 5 document. 
Convenience method for {Nokogiri::HTML5::Document.parse} + # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse def parse(string, url = nil, encoding = nil, **options, &block) Document.parse(string, url, encoding, **options, &block) end # Parse a fragment from +string+. Convenience method for - # {Nokogiri::HTML5::DocumentFragment.parse}. + # Nokogiri::HTML5::DocumentFragment.parse. def fragment(string, encoding = nil, **options) DocumentFragment.parse(string, encoding, options) end @@ -268,11 +304,11 @@ def read_and_encode(string, encoding) private # Charset sniffing is a complex and controversial topic that understandably isn't done _by - # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for + # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and # the Gumbo parser *only* supports utf-8. # - # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following + # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow # the HTML5 standard. # diff --git a/test/html5/test_api.rb b/test/html5/test_api.rb index 926fa8a4dc0..93c5fb7783e 100644 --- a/test/html5/test_api.rb +++ b/test/html5/test_api.rb @@ -104,6 +104,75 @@ def test_serialization_encoding assert_match("ฉันไม่พูดภาษาไทย", html2) end + def test_parse_noscript_as_elements_in_head + # An `img` element isn't allowed in noscript so the noscript element is popped off + # the stack of open elements and the token is reprocessed in `head` + # which causes the `head` element to be popped off the stack of open + # elements and a `body` element to be inserted. Then the `img` element is + # inserted in the body. 
+ html = "" + doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: false, max_errors: 100) + noscript = doc.at("/html/head/noscript") + assert_equal(3, doc.errors.length, doc.errors.join("\n")) + # Start tag 'img' isn't allowed here + # End tag 'noscript' isn't allowed here + # End tag head isn't allowed here + assert_empty(noscript.children) + img = doc.at("/html/body/img") + refute_nil(img) + end + + def test_parse_noscript_as_text_in_head + # In contrast to the previous test, when the scripting flag is enabled, the content + # of the noscript element is parsed as raw text. + html = "" + doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: true, max_errors: 100) + noscript = doc.at("/html/head/noscript") + assert_empty(doc.errors) + assert_equal(1, noscript.children.length) + assert_kind_of(Nokogiri::XML::Text, noscript.children.first) + end + + def test_parse_noscript_as_elements_in_body + html = "" + doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: false, max_errors: 100) + assert_empty(doc.errors) + img = doc.at("/html/body/noscript/img") + refute_nil(img) + end + + def test_parse_noscript_as_text_in_body + html = "" + doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: true, max_errors: 100) + noscript = doc.at("/html/body/noscript") + assert_empty(doc.errors, doc.errors.join("\n")) + assert_equal(1, noscript.children.length) + assert_kind_of(Nokogiri::XML::Text, noscript.children.first) + end + + def test_parse_noscript_fragment_as_elements + html = "" + frag = Nokogiri::HTML5::DocumentFragment.new(Nokogiri::HTML5::Document.new, html, "noscript", parse_noscript_content_as_text: false, max_errors: 100) + assert_empty(frag.errors) + assert_equal(2, frag.children.length) + end + + def test_parse_noscript_fragment_as_text + html = "" + frag = Nokogiri::HTML5::DocumentFragment.new(Nokogiri::HTML5::Document.new, html, "noscript", parse_noscript_content_as_text: true, max_errors: 100) + assert_empty(frag.errors) + assert_equal(1, 
frag.children.length) + assert_kind_of(Nokogiri::XML::Text, frag.children.first) + end + + def test_parse_noscript_content_default + html = "" + doc = Nokogiri::HTML5(html, max_errors: 100) + assert_empty(doc.errors) + img = doc.at("/html/body/noscript/img") + refute_nil(img) + end + ["pre", "listing", "textarea"].each do |tag| define_method("test_serialize_preserve_newline_#{tag}".to_sym) do doc = Nokogiri::HTML5("<#{tag}>\n\nContent") diff --git a/test/html5/test_tree_construction.rb b/test/html5/test_tree_construction.rb index 7e01be866c8..1e321b3ecaa 100644 --- a/test/html5/test_tree_construction.rb +++ b/test/html5/test_tree_construction.rb @@ -70,6 +70,11 @@ def compare_nodes(node, ng_node) end def run_test + options = { + max_errors: -1, + parse_noscript_content_as_text: @test[:script] == :on, + } + if @test[:context] # this is a fragment test if @test_context_node @@ -85,15 +90,15 @@ def run_test doc = Nokogiri::HTML5::Document.new context_node = doc.create_element(@test[:context].first) end - doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], context_node, max_errors: @test[:errors].length + 10) + doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], context_node, **options) else # run the test using a tag name ctx = @test[:context].join(":") doc = Nokogiri::HTML5::Document.new - doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, max_errors: @test[:errors].length + 10) + doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, **options) end else - doc = Nokogiri::HTML5.parse(@test[:data], max_errors: @test[:errors].length + 10) + doc = Nokogiri::HTML5.parse(@test[:data], **options) end # Walk the tree. 
exp_nodes = [@test[:document]] @@ -161,7 +166,7 @@ module Html5libTestCaseParser class BadHtml5libFormat < RuntimeError; end def self.parse_test(test_data) - test = { script: :both } + test = { script: :off } index = /(?:^#errors\n|\n#errors\n)/ =~ test_data raise(BadHtml5libFormat, "Expected #errors in\n#{test_data}") if index.nil? @@ -323,8 +328,6 @@ def self.generate_tests klass = Class.new(TestHtml5TreeConstructionBase) do tests.each_with_index do |test, index| - next if test[:script] == :on - define_method "test_#{index}" do @test = test @index = index