sparklemotion · flavorjones · Jun 20, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024
diff --git a/ext/nokogiri/gumbo.c b/ext/nokogiri/gumbo.c
@@ -301,15 +301,19 @@ common_options(VALUE kwargs)
   // If this order is changed, then setting the options below must change as
   // well.
   ID keywords[] = {
+    // Required keywords.
     rb_intern_const("max_attributes"),
     rb_intern_const("max_errors"),
     rb_intern_const("max_tree_depth"),
+
+    // Optional keywords.
+    rb_intern_const("parse_noscript_content_as_text"),
   };
   VALUE values[sizeof keywords / sizeof keywords[0]];
 
   // Extract the values corresponding to the required keywords. Raise an error
   // if required arguments are missing.
-  rb_get_kwargs(kwargs, keywords, 3, 0, values);
+  rb_get_kwargs(kwargs, keywords, 3, 1, values);
 
   GumboOptions options = kGumboDefaultOptions;
   options.max_attributes = NUM2INT(values[0]);
@@ -319,6 +323,8 @@ common_options(VALUE kwargs)
   int depth = NUM2INT(values[2]);
   options.max_tree_depth = depth < 0 ? UINT_MAX : (unsigned int)depth;
 
+  options.parse_noscript_content_as_text = values[3] != Qundef && RTEST(values[3]);
+
   return options;
 }
 

diff --git a/gumbo-parser/src/nokogiri_gumbo.h b/gumbo-parser/src/nokogiri_gumbo.h
@@ -780,6 +780,15 @@ typedef struct GumboInternalOptions {
    * Default: `false`.
    */
   bool fragment_context_has_form_ancestor;
+
+  /**
+   * Parse `noscript` elements as if scripting were enabled. This causes the
+   * contents of the `noscript` element to be parsed as raw text, rather
+   * than as HTML elements.
+   *
+   * Default: `false`.
+   */
+  bool parse_noscript_content_as_text;
 } GumboOptions;
 
 /** Default options struct; use this with gumbo_parse_with_options. */

diff --git a/gumbo-parser/src/parser.c b/gumbo-parser/src/parser.c
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
   .fragment_encoding = NULL,
   .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
   .fragment_context_has_form_ancestor = false,
+  .parse_noscript_content_as_text = false,
 };
 
 #define STRING(s) {.data = s, .length = sizeof(s) - 1}
@@ -2614,6 +2615,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
   }
   if (
     tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
+    || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
   ) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
@@ -3319,7 +3321,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
   }
-  if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
+  if (
+    tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
+    || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
+  ) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
   }
@@ -4633,12 +4638,20 @@ static void fragment_parser_init (
   const char* fragment_encoding = options->fragment_encoding;
   GumboQuirksModeEnum quirks = options->quirks_mode;
   bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
-
   GumboNode* root;
-  // 2.
+
+  // 1. [Create a new Document node, and mark it as being an HTML document.]
+  // 2. [If the node document of the context element is in quirks mode, then
+  //    let the Document be in quirks mode. Otherwise, if the node document
+  //    of the context element is in limited-quirks mode, then let the
+  //    Document be in limited-quirks mode. Otherwise, leave the Document in
+  //    no-quirks mode.]
   get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
 
-  // 3.
+  // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
+  //    declarative shadow roots to true.]
+  // 4. [Create a new HTML parser, and associate it with the just created Document node.]
+  // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
   parser->_parser_state->_fragment_ctx =
     create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
   GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
@@ -4665,8 +4678,8 @@ static void fragment_parser_init (
         break;
 
       case GUMBO_TAG_NOSCRIPT:
-        /* scripting is disabled in Gumbo, so leave the tokenizer
-         * in the default data state */
+        if (options->parse_noscript_content_as_text)
+          gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
         break;
 
       case GUMBO_TAG_PLAINTEXT:

diff --git a/test/html5/test_api.rb b/test/html5/test_api.rb
@@ -104,6 +104,66 @@ def test_serialization_encoding
     assert_match("ฉันไม่พูดภาษาไทย", html2)
   end
 
+  def test_parse_noscript_as_elements_in_head
+    # <img> isn't allowed in noscript so the noscript element is popped off
+    # the stack of open elements and the <img> token is reprocessed in `head`
+    # which causes the `head` element to be popped off the stack of open
+    # elements and a `body` element to be inserted. Then the `img` element is
+    # inserted in the body.
+    html = "<!DOCTYPE html><head><noscript><img src=!></noscript></head>"
+    doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: false, max_errors: 100)
+    noscript = doc.at("/html/head/noscript")
+    assert_equal(3, doc.errors.length, doc.errors.join("\n"))
+    # Start tag 'img' isn't allowed here
+    # End tag 'noscript' isn't allowed here
+    # End tag head isn't allowed here
+    assert_empty(noscript.children)
+    img = doc.at("/html/body/img")
+    refute_nil(img)
+  end
+
+  def test_parse_noscript_as_text_in_head
+    # In contrast to the previous test, when the scripting flag is enabled, the content
+    # of the noscript element is parsed as raw text.
+    html = "<!DOCTYPE html><head><noscript><img src=!></noscript></head>"
+    doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: true, max_errors: 100)
+    noscript = doc.at("/html/head/noscript")
+    assert_equal(0, doc.errors.length, doc.errors.join("\n"))
+    assert_equal(1, noscript.children.length)
+    assert_kind_of(Nokogiri::XML::Text, noscript.children.first)
+  end
+
+  def test_parse_noscript_as_elements_in_body
+    html = "<!DOCTYPE html><body><noscript><img src=!></noscript></body>"
+    doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: false, max_errors: 100)
+    assert_equal(0, doc.errors.length, doc.errors.join("\n"))
+    img = doc.at("/html/body/noscript/img")
+    refute_nil(img)
+  end
+
+  def test_parse_noscript_as_text_in_body
+    html = "<!DOCTYPE html><body><noscript><img src=!></noscript></body>"
+    doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: true, max_errors: 100)
+    noscript = doc.at("/html/body/noscript")
+    assert_equal(0, doc.errors.length, doc.errors.join("\n"))
+    assert_kind_of(Nokogiri::XML::Text, noscript.children.first)
+  end
+
+  def test_parse_noscript_fragment_as_elements
+    html = "<meta charset='UTF-8'><link rel=stylesheet href=!>"
+    frag = Nokogiri::HTML5::DocumentFragment.new(Nokogiri::HTML5::Document.new, html, "noscript", parse_noscript_content_as_text: false, max_errors: 100)
+    assert_equal(0, frag.errors.length, frag.errors.join("\n"))
+    assert_equal(2, frag.children.length)
+  end
+
+  def test_parse_noscript_fragment_as_text
+    html = "<meta charset='UTF-8'><link rel=stylesheet href=!>"
+    frag = Nokogiri::HTML5::DocumentFragment.new(Nokogiri::HTML5::Document.new, html, "noscript", parse_noscript_content_as_text: true, max_errors: 100)
+    assert_equal(0, frag.errors.length, frag.errors.join("\n"))
+    assert_equal(1, frag.children.length)
+    assert_kind_of(Nokogiri::XML::Text, frag.children.first)
+  end
+
   ["pre", "listing", "textarea"].each do |tag|
     define_method("test_serialize_preserve_newline_#{tag}".to_sym) do
       doc = Nokogiri::HTML5("<!DOCTYPE html><#{tag}>\n\nContent</#{tag}>")

diff --git a/test/html5/test_tree_construction.rb b/test/html5/test_tree_construction.rb
@@ -70,6 +70,11 @@ def compare_nodes(node, ng_node)
   end
 
   def run_test
+    options = {
+      max_errors: -1,
+      parse_noscript_content_as_text: @test[:script] == :on,
+    }
+
     if @test[:context]
       # this is a fragment test
       if @test_context_node
@@ -85,15 +90,15 @@ def run_test
           doc = Nokogiri::HTML5::Document.new
           context_node = doc.create_element(@test[:context].first)
         end
-        doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], context_node, max_errors: @test[:errors].length + 10)
+        doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], context_node, **options)
       else
         # run the test using a tag name
         ctx = @test[:context].join(":")
         doc = Nokogiri::HTML5::Document.new
-        doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, max_errors: @test[:errors].length + 10)
+        doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, **options)
       end
     else
-      doc = Nokogiri::HTML5.parse(@test[:data], max_errors: @test[:errors].length + 10)
+      doc = Nokogiri::HTML5.parse(@test[:data], **options)
     end
     # Walk the tree.
     exp_nodes = [@test[:document]]
@@ -161,7 +166,7 @@ module Html5libTestCaseParser
   class BadHtml5libFormat < RuntimeError; end
 
   def self.parse_test(test_data)
-    test = { script: :both }
+    test = { script: :off }
     index = /(?:^#errors\n|\n#errors\n)/ =~ test_data
     raise(BadHtml5libFormat, "Expected #errors in\n#{test_data}") if index.nil?
 
@@ -323,8 +328,6 @@ def self.generate_tests
 
       klass = Class.new(TestHtml5TreeConstructionBase) do
         tests.each_with_index do |test, index|
-          next if test[:script] == :on
-
           define_method "test_#{index}" do
             @test = test
             @index = index