diff --git a/CHANGELOG.md b/CHANGELOG.md index fb7af2d2485..1436038201e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA * [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. @flavorjones * [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones * [CRuby] When compiling packaged libraries from source, allow users' `AR` and `LD` environment variables to set the archiver and linker commands, respectively. This augments the existing `CC` environment variable to set the compiler command. [#3165] @ziggythehamster +* [CRuby] The HTML5 parse methods accept a `:parse_noscript_content_as_text` keyword argument which will emulate the parsing behavior of a browser which has scripting enabled. [#3178, #3231] @stevecheckoway ### Fixed diff --git a/ext/nokogiri/gumbo.c b/ext/nokogiri/gumbo.c index c0e0176389f..096bdaab3f2 100644 --- a/ext/nokogiri/gumbo.c +++ b/ext/nokogiri/gumbo.c @@ -301,15 +301,19 @@ common_options(VALUE kwargs) // If this order is changed, then setting the options below must change as // well. ID keywords[] = { + // Required keywords. rb_intern_const("max_attributes"), rb_intern_const("max_errors"), rb_intern_const("max_tree_depth"), + + // Optional keywords. + rb_intern_const("parse_noscript_content_as_text"), }; VALUE values[sizeof keywords / sizeof keywords[0]]; // Extract the values coresponding to the required keywords. Raise an error // if required arguments are missing. - rb_get_kwargs(kwargs, keywords, 3, 0, values); + rb_get_kwargs(kwargs, keywords, 3, 1, values); GumboOptions options = kGumboDefaultOptions; options.max_attributes = NUM2INT(values[0]); @@ -319,6 +323,8 @@ common_options(VALUE kwargs) int depth = NUM2INT(values[2]); options.max_tree_depth = depth < 0 ? 
UINT_MAX : (unsigned int)depth; + options.parse_noscript_content_as_text = values[3] != Qundef && RTEST(values[3]); + return options; } diff --git a/gumbo-parser/src/nokogiri_gumbo.h b/gumbo-parser/src/nokogiri_gumbo.h index 15b2e989953..69555607ffb 100644 --- a/gumbo-parser/src/nokogiri_gumbo.h +++ b/gumbo-parser/src/nokogiri_gumbo.h @@ -780,6 +780,15 @@ typedef struct GumboInternalOptions { * Default: `false`. */ bool fragment_context_has_form_ancestor; + + /** + * Parse `noscript` elements as if scripting was enabled. This causes the + * contents of the `noscript` element to be parsed as raw text, rather + * than as HTML elements. + * + * Default: `false`. + */ + bool parse_noscript_content_as_text; } GumboOptions; /** Default options struct; use this with gumbo_parse_with_options. */ diff --git a/gumbo-parser/src/parser.c b/gumbo-parser/src/parser.c index 422c1f24d7e..07f31533485 100644 --- a/gumbo-parser/src/parser.c +++ b/gumbo-parser/src/parser.c @@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = { .fragment_encoding = NULL, .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS, .fragment_context_has_form_ancestor = false, + .parse_noscript_content_as_text = false, }; #define STRING(s) {.data = s, .length = sizeof(s) - 1} @@ -2614,6 +2615,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) { } if ( tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)}) + || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text) ) { run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); return; @@ -3319,7 +3321,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) { run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); return; } - if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) { + if ( + tag_is(token, kStartTag, GUMBO_TAG_NOEMBED) + || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text) + ) { 
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); return; } @@ -4633,12 +4638,20 @@ static void fragment_parser_init ( const char* fragment_encoding = options->fragment_encoding; GumboQuirksModeEnum quirks = options->quirks_mode; bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor; - GumboNode* root; - // 2. + + // 1. [Create a new Document node, and mark it as being an HTML document.] + // 2. [If the node document of the context element is in quirks mode, then + // let the Document be in quirks mode. Otherwise, the node document of + // the context element is in limited-quirks mode, then let the Document + // be in limited-quirks mode. Otherwise, leave the Document in no-quirks + // mode.] get_document_node(parser)->v.document.doc_type_quirks_mode = quirks; - // 3. + // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow + // declarative shadow roots to true.] + // 4. [Create a new HTML parser, and associate it with the just created Document node.] + // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:] parser->_parser_state->_fragment_ctx = create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding); GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag; @@ -4665,8 +4678,8 @@ static void fragment_parser_init ( break; case GUMBO_TAG_NOSCRIPT: - /* scripting is disabled in Gumbo, so leave the tokenizer - * in the default data state */ + if (options->parse_noscript_content_as_text) + gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT); break; case GUMBO_TAG_PLAINTEXT: diff --git a/lib/nokogiri/html5.rb b/lib/nokogiri/html5.rb index e251ee158bb..1465e265fe3 100644 --- a/lib/nokogiri/html5.rb +++ b/lib/nokogiri/html5.rb @@ -48,20 +48,20 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # The document and fragment parsing methods support options that are different from Nokogiri's. 
# - # - Nokogiri.HTML5(html, url = nil, encoding = nil, options = {}) - # - Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {}) - # - Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {}) - # - Nokogiri::HTML5.fragment(html, encoding = nil, options = {}) - # - Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {}) + # - Nokogiri.HTML5(html, url = nil, encoding = nil, **options) + # - Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options) + # - Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options) + # - Nokogiri::HTML5.fragment(html, encoding = nil, **options) + # - Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options) # - # The three currently supported options are +:max_errors+, +:max_tree_depth+ and - # +:max_attributes+, described below. + # The four currently supported options are +:max_errors+, +:max_tree_depth+, +:max_attributes+, + # and +:parse_noscript_content_as_text+ described below. # # === Error reporting # # Nokogiri contains an experimental HTML5 parse error reporting facility. By default, no parse # errors are reported but this can be configured by passing the +:max_errors+ option to - # {HTML5.parse} or {HTML5.fragment}. + # HTML5.parse or HTML5.fragment. # # For example, this script: # @@ -87,20 +87,21 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # Using max_errors: -1 results in an unlimited number of errors being returned. # - # The errors returned by {HTML5::Document#errors} are instances of {Nokogiri::XML::SyntaxError}. + # The errors returned by HTML5::Document#errors are instances of Nokogiri::XML::SyntaxError. # - # The {https://html.spec.whatwg.org/multipage/parsing.html#parse-errors HTML standard} defines a + # The {HTML standard}[https://html.spec.whatwg.org/multipage/parsing.html#parse-errors] defines a # number of standard parse error codes. 
These error codes only cover the "tokenization" stage of # parsing HTML. The parse errors in the "tree construction" stage do not have standardized error # codes (yet). # - # As a convenience to Nokogiri users, the defined error codes are available via - # {Nokogiri::XML::SyntaxError#str1} method. + # As a convenience to Nokogiri users, the defined error codes are available + # via Nokogiri::XML::SyntaxError#str1 method. # # doc = Nokogiri::HTML5.parse('Hi there!', max_errors: 10) # doc.errors.each do |err| # puts("#{err.line}:#{err.column}: #{err.str1}") # end + # doc = Nokogiri::HTML5.parse('Hi there!', # # => 1:1: generic-parser # # 1:1: non-void-html-element-start-tag-with-trailing-solidus # # 1:17: end-tag-with-trailing-solidus @@ -117,10 +118,10 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # The maximum depth of the DOM tree parsed by the various parsing methods is configurable by the # +:max_tree_depth+ option. If the depth of the tree would exceed this limit, then an - # {::ArgumentError} is thrown. + # +ArgumentError+ is thrown. # - # This limit (which defaults to Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH = 400) can be - # removed by giving the option max_tree_depth: -1. + # This limit (which defaults to +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) can be removed by + # giving the option max_tree_depth: -1. # # html = '' + '
gumbo_parse_with_options
method, using the default options.
- # The resulting Gumbo parse tree is then walked.
+ # * The Nokogiri::HTML5.parse function takes a string and passes it to the
+ # <code>gumbo_parse_with_options</code> method, using the default options. The resulting Gumbo
+ # parse tree is then walked.
#
# * Instead of uppercase element names, lowercase element names are produced.
#
- # * Instead of returning +unknown+ as the element name for unknown tags, the
- # original tag name is returned verbatim.
+ # * Instead of returning +unknown+ as the element name for unknown tags, the original tag name is
+ # returned verbatim.
#
# Since v1.12.0
module HTML5
class << self
- # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
+ # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
# The positional arguments, the keyword +options+, and any block are forwarded unchanged.
def parse(string, url = nil, encoding = nil, **options, &block)
Document.parse(string, url, encoding, **options, &block)
end
# Parse a fragment from +string+. Convenience method for
- # {Nokogiri::HTML5::DocumentFragment.parse}.
+ # Nokogiri::HTML5::DocumentFragment.parse.
#
# +string+ and +encoding+ are passed through positionally. The keyword +options+ are forwarded
# as keywords, in the same way HTML5.parse forwards them to Document.parse, so they reach
# DocumentFragment.parse whether it declares an optional trailing options hash or a
# <code>**options</code> parameter.
def fragment(string, encoding = nil, **options)
  DocumentFragment.parse(string, encoding, **options)
end
@@ -268,11 +304,11 @@ def read_and_encode(string, encoding)
private
# Charset sniffing is a complex and controversial topic that understandably isn't done _by
- # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
+ # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
# consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
# the Gumbo parser *only* supports utf-8.
#
- # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
+ # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
# this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
# the HTML5 standard.
#
diff --git a/test/html5/test_api.rb b/test/html5/test_api.rb
index 926fa8a4dc0..93c5fb7783e 100644
--- a/test/html5/test_api.rb
+++ b/test/html5/test_api.rb
@@ -104,6 +104,75 @@ def test_serialization_encoding
assert_match("ฉันไม่พูดภาษาไทย", html2)
end
+ # With parse_noscript_content_as_text: false, checks that a noscript element in head ends up
+ # with no children, that an img element is found under body, and that three errors are reported.
+ def test_parse_noscript_as_elements_in_head
+ # An `img` start tag isn't allowed in noscript so the noscript element is popped off
+ # the stack of open elements and the token is reprocessed in `head`
+ # which causes the `head` element to be popped off the stack of open
+ # elements and a `body` element to be inserted. Then the `img` element is
+ # inserted in the body.
+ # NOTE(review): `html` is an empty string in this copy of the patch; the assertions below
+ # presume markup with a noscript element in head -- confirm against the upstream commit.
+ html = ""
+ doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: false, max_errors: 100)
+ noscript = doc.at("/html/head/noscript")
+ assert_equal(3, doc.errors.length, doc.errors.join("\n"))
+ # The three expected errors:
+ # Start tag 'img' isn't allowed here
+ # End tag 'noscript' isn't allowed here
+ # End tag head isn't allowed here
+ assert_empty(noscript.children)
+ img = doc.at("/html/body/img")
+ refute_nil(img)
+ end
+
+ # With parse_noscript_content_as_text: true, checks that the noscript element in head has
+ # exactly one child, a Text node, and that no errors are reported.
+ def test_parse_noscript_as_text_in_head
+ # In contrast to the previous test, when the scripting flag is enabled, the content
+ # of the noscript element is parsed as raw text.
+ # NOTE(review): `html` is an empty string in this copy of the patch; the assertions below
+ # presume markup with a noscript element in head -- confirm against the upstream commit.
+ html = ""
+ doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: true, max_errors: 100)
+ noscript = doc.at("/html/head/noscript")
+ assert_empty(doc.errors)
+ assert_equal(1, noscript.children.length)
+ assert_kind_of(Nokogiri::XML::Text, noscript.children.first)
+ end
+
+ # With parse_noscript_content_as_text: false, checks that an img element is found as a child
+ # of the noscript element in body and that no errors are reported.
+ def test_parse_noscript_as_elements_in_body
+ # NOTE(review): `html` is an empty string in this copy of the patch; the assertions below
+ # presume markup with a noscript element in body -- confirm against the upstream commit.
+ html = ""
+ doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: false, max_errors: 100)
+ assert_empty(doc.errors)
+ img = doc.at("/html/body/noscript/img")
+ refute_nil(img)
+ end
+
+ # With parse_noscript_content_as_text: true, checks that the noscript element in body has
+ # exactly one child, a Text node, and that no errors are reported.
+ def test_parse_noscript_as_text_in_body
+ # NOTE(review): `html` is an empty string in this copy of the patch; the assertions below
+ # presume markup with a noscript element in body -- confirm against the upstream commit.
+ html = ""
+ doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: true, max_errors: 100)
+ noscript = doc.at("/html/body/noscript")
+ # On failure, the joined error messages are shown as the assertion message.
+ assert_empty(doc.errors, doc.errors.join("\n"))
+ assert_equal(1, noscript.children.length)
+ assert_kind_of(Nokogiri::XML::Text, noscript.children.first)
+ end
+
+ # Builds a fragment with "noscript" as the context and parse_noscript_content_as_text: false,
+ # then checks that no errors are reported and that the fragment has two children.
+ def test_parse_noscript_fragment_as_elements
+ # NOTE(review): `html` is an empty string in this copy of the patch; the assertion on two
+ # children presumes non-empty markup -- confirm against the upstream commit.
+ html = ""
+ frag = Nokogiri::HTML5::DocumentFragment.new(Nokogiri::HTML5::Document.new, html, "noscript", parse_noscript_content_as_text: false, max_errors: 100)
+ assert_empty(frag.errors)
+ assert_equal(2, frag.children.length)
+ end
+
+ # Builds a fragment with "noscript" as the context and parse_noscript_content_as_text: true,
+ # then checks that no errors are reported and that the fragment has a single Text child.
+ def test_parse_noscript_fragment_as_text
+ # NOTE(review): `html` is an empty string in this copy of the patch; the assertion on one
+ # Text child presumes non-empty markup -- confirm against the upstream commit.
+ html = ""
+ frag = Nokogiri::HTML5::DocumentFragment.new(Nokogiri::HTML5::Document.new, html, "noscript", parse_noscript_content_as_text: true, max_errors: 100)
+ assert_empty(frag.errors)
+ assert_equal(1, frag.children.length)
+ assert_kind_of(Nokogiri::XML::Text, frag.children.first)
+ end
+
+ # With the parse_noscript_content_as_text option omitted, checks that an img element is found
+ # as a child of the noscript element in body and that no errors are reported.
+ def test_parse_noscript_content_default
+ # NOTE(review): `html` is an empty string in this copy of the patch; the assertions below
+ # presume markup with a noscript element in body -- confirm against the upstream commit.
+ html = ""
+ doc = Nokogiri::HTML5(html, max_errors: 100)
+ assert_empty(doc.errors)
+ img = doc.at("/html/body/noscript/img")
+ refute_nil(img)
+ end
+
["pre", "listing", "textarea"].each do |tag|
define_method("test_serialize_preserve_newline_#{tag}".to_sym) do
doc = Nokogiri::HTML5("<#{tag}>\n\nContent#{tag}>")
diff --git a/test/html5/test_tree_construction.rb b/test/html5/test_tree_construction.rb
index 7e01be866c8..1e321b3ecaa 100644
--- a/test/html5/test_tree_construction.rb
+++ b/test/html5/test_tree_construction.rb
@@ -70,6 +70,11 @@ def compare_nodes(node, ng_node)
end
def run_test
+ options = {
+ max_errors: -1,
+ parse_noscript_content_as_text: @test[:script] == :on,
+ }
+
if @test[:context]
# this is a fragment test
if @test_context_node
@@ -85,15 +90,15 @@ def run_test
doc = Nokogiri::HTML5::Document.new
context_node = doc.create_element(@test[:context].first)
end
- doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], context_node, max_errors: @test[:errors].length + 10)
+ doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], context_node, **options)
else
# run the test using a tag name
ctx = @test[:context].join(":")
doc = Nokogiri::HTML5::Document.new
- doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, max_errors: @test[:errors].length + 10)
+ doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, **options)
end
else
- doc = Nokogiri::HTML5.parse(@test[:data], max_errors: @test[:errors].length + 10)
+ doc = Nokogiri::HTML5.parse(@test[:data], **options)
end
# Walk the tree.
exp_nodes = [@test[:document]]
@@ -161,7 +166,7 @@ module Html5libTestCaseParser
class BadHtml5libFormat < RuntimeError; end
def self.parse_test(test_data)
- test = { script: :both }
+ test = { script: :off }
index = /(?:^#errors\n|\n#errors\n)/ =~ test_data
raise(BadHtml5libFormat, "Expected #errors in\n#{test_data}") if index.nil?
@@ -323,8 +328,6 @@ def self.generate_tests
klass = Class.new(TestHtml5TreeConstructionBase) do
tests.each_with_index do |test, index|
- next if test[:script] == :on
-
define_method "test_#{index}" do
@test = test
@index = index