Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support parsing noscript content in script-enabled mode #3231

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion ext/nokogiri/gumbo.c
Original file line number Diff line number Diff line change
Expand Up @@ -301,15 +301,19 @@ common_options(VALUE kwargs)
// If this order is changed, then setting the options below must change as
// well.
ID keywords[] = {
// Required keywords.
rb_intern_const("max_attributes"),
rb_intern_const("max_errors"),
rb_intern_const("max_tree_depth"),

// Optional keywords.
rb_intern_const("parse_noscript_content_as_text"),
};
VALUE values[sizeof keywords / sizeof keywords[0]];

// Extract the values corresponding to the required keywords. Raise an error
// if required arguments are missing.
rb_get_kwargs(kwargs, keywords, 3, 0, values);
rb_get_kwargs(kwargs, keywords, 3, 1, values);

GumboOptions options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(values[0]);
Expand All @@ -319,6 +323,8 @@ common_options(VALUE kwargs)
int depth = NUM2INT(values[2]);
options.max_tree_depth = depth < 0 ? UINT_MAX : (unsigned int)depth;

options.parse_noscript_content_as_text = values[3] != Qundef && RTEST(values[3]);

return options;
}

Expand Down
9 changes: 9 additions & 0 deletions gumbo-parser/src/nokogiri_gumbo.h
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,15 @@ typedef struct GumboInternalOptions {
* Default: `false`.
*/
bool fragment_context_has_form_ancestor;

/**
* Parse `noscript` elements as if scripting was enabled. This causes the
* contents of the `noscript` element to be parsed as raw text, rather
* than as HTML elements.
*
* Default: `false`.
*/
bool parse_noscript_content_as_text;
} GumboOptions;

/** Default options struct; use this with gumbo_parse_with_options. */
Expand Down
25 changes: 19 additions & 6 deletions gumbo-parser/src/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
.fragment_encoding = NULL,
.quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
.fragment_context_has_form_ancestor = false,
.parse_noscript_content_as_text = false,
};

#define STRING(s) {.data = s, .length = sizeof(s) - 1}
Expand Down Expand Up @@ -2614,6 +2615,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
}
if (
tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
) {
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
return;
Expand Down Expand Up @@ -3319,7 +3321,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
return;
}
if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
if (
tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
) {
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
return;
}
Expand Down Expand Up @@ -4633,12 +4638,20 @@ static void fragment_parser_init (
const char* fragment_encoding = options->fragment_encoding;
GumboQuirksModeEnum quirks = options->quirks_mode;
bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;

GumboNode* root;
// 2.

// 1. [Create a new Document node, and mark it as being an HTML document.]
// 2. [If the node document of the context element is in quirks mode, then
// let the Document be in quirks mode. Otherwise, if the node document of
// the context element is in limited-quirks mode, then let the Document
// be in limited-quirks mode. Otherwise, leave the Document in no-quirks
// mode.]
get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;

// 3.
// 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
// declarative shadow roots to true.]
// 4. [Create a new HTML parser, and associate it with the just created Document node.]
// 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
parser->_parser_state->_fragment_ctx =
create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
Expand All @@ -4665,8 +4678,8 @@ static void fragment_parser_init (
break;

case GUMBO_TAG_NOSCRIPT:
/* scripting is disabled in Gumbo, so leave the tokenizer
* in the default data state */
if (options->parse_noscript_content_as_text)
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
break;

case GUMBO_TAG_PLAINTEXT:
Expand Down
60 changes: 60 additions & 0 deletions test/html5/test_api.rb
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,66 @@ def test_serialization_encoding
assert_match("ฉันไม่พูดภาษาไทย", html2)
end

def test_parse_noscript_as_elements_in_head
  # <img> isn't allowed in noscript so the noscript element is popped off
  # the stack of open elements and the <img> token is reprocessed in `head`
  # which causes the `head` element to be popped off the stack of open
  # elements and a `body` element to be inserted. Then the `img` element is
  # inserted in the body.
  html = "<!DOCTYPE html><head><noscript><img src=!></noscript></head>"
  doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: false, max_errors: 100)
  noscript = doc.at("/html/head/noscript")

  # Expected parse errors:
  #   Start tag 'img' isn't allowed here
  #   End tag 'noscript' isn't allowed here
  #   End tag head isn't allowed here
  assert_equal(3, doc.errors.length, doc.errors.join("\n"))

  # The noscript element ends up empty; the img lands in the body instead.
  assert_empty(noscript.children)
  img = doc.at("/html/body/img")
  refute_nil(img)
end

def test_parse_noscript_as_text_in_head
  # With parse_noscript_content_as_text: true the same markup produces no
  # parse errors, and the content of the noscript element in the head is
  # kept as a single text node rather than being parsed as elements.
  html = "<!DOCTYPE html><head><noscript><img src=!></noscript></head>"
  doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: true, max_errors: 100)
  noscript = doc.at("/html/head/noscript")
  assert_equal(0, doc.errors.length, doc.errors.join("\n"))
  assert_equal(1, noscript.children.length)
  assert_kind_of(Nokogiri::XML::Text, noscript.children.first)
end

def test_parse_noscript_as_elements_in_body
  # With parse_noscript_content_as_text: false, a noscript element in the
  # body is expected to parse without errors and to contain the img element
  # as a child.
  markup = "<!DOCTYPE html><body><noscript><img src=!></noscript></body>"
  parsed = Nokogiri::HTML5(
    markup,
    parse_noscript_content_as_text: false,
    max_errors: 100
  )
  assert_equal(0, parsed.errors.length, parsed.errors.join("\n"))
  refute_nil(parsed.at("/html/body/noscript/img"))
end

def test_parse_noscript_as_text_in_body
  # With parse_noscript_content_as_text: true, a noscript element in the
  # body is expected to parse without errors and its first child is expected
  # to be a text node rather than an img element.
  html = "<!DOCTYPE html><body><noscript><img src=!></noscript></body>"
  doc = Nokogiri::HTML5(html, parse_noscript_content_as_text: true, max_errors: 100)
  noscript = doc.at("/html/body/noscript")
  assert_equal(0, doc.errors.length, doc.errors.join("\n"))
  assert_kind_of(Nokogiri::XML::Text, noscript.children.first)
end

def test_parse_noscript_fragment_as_elements
  # Fragment parsed with a "noscript" context and
  # parse_noscript_content_as_text: false: the meta and link tags are
  # expected to yield two child nodes and no errors.
  markup = "<meta charset='UTF-8'><link rel=stylesheet href=!>"
  owner = Nokogiri::HTML5::Document.new
  fragment = Nokogiri::HTML5::DocumentFragment.new(
    owner,
    markup,
    "noscript",
    parse_noscript_content_as_text: false,
    max_errors: 100
  )
  assert_equal(0, fragment.errors.length, fragment.errors.join("\n"))
  assert_equal(2, fragment.children.length)
end

def test_parse_noscript_fragment_as_text
  # Fragment parsed with a "noscript" context and
  # parse_noscript_content_as_text: true: the whole input is expected to
  # come back as a single text node, with no errors.
  markup = "<meta charset='UTF-8'><link rel=stylesheet href=!>"
  owner = Nokogiri::HTML5::Document.new
  fragment = Nokogiri::HTML5::DocumentFragment.new(
    owner,
    markup,
    "noscript",
    parse_noscript_content_as_text: true,
    max_errors: 100
  )
  assert_equal(0, fragment.errors.length, fragment.errors.join("\n"))
  nodes = fragment.children
  assert_equal(1, nodes.length)
  assert_kind_of(Nokogiri::XML::Text, nodes.first)
end

["pre", "listing", "textarea"].each do |tag|
define_method("test_serialize_preserve_newline_#{tag}".to_sym) do
doc = Nokogiri::HTML5("<!DOCTYPE html><#{tag}>\n\nContent</#{tag}>")
Expand Down
15 changes: 9 additions & 6 deletions test/html5/test_tree_construction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ def compare_nodes(node, ng_node)
end

def run_test
options = {
max_errors: -1,
parse_noscript_content_as_text: @test[:script] == :on,
}

if @test[:context]
# this is a fragment test
if @test_context_node
Expand All @@ -85,15 +90,15 @@ def run_test
doc = Nokogiri::HTML5::Document.new
context_node = doc.create_element(@test[:context].first)
end
doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], context_node, max_errors: @test[:errors].length + 10)
doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], context_node, **options)
else
# run the test using a tag name
ctx = @test[:context].join(":")
doc = Nokogiri::HTML5::Document.new
doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, max_errors: @test[:errors].length + 10)
doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, **options)
end
else
doc = Nokogiri::HTML5.parse(@test[:data], max_errors: @test[:errors].length + 10)
doc = Nokogiri::HTML5.parse(@test[:data], **options)
end
# Walk the tree.
exp_nodes = [@test[:document]]
Expand Down Expand Up @@ -161,7 +166,7 @@ module Html5libTestCaseParser
class BadHtml5libFormat < RuntimeError; end

def self.parse_test(test_data)
test = { script: :both }
test = { script: :off }
flavorjones marked this conversation as resolved.
Show resolved Hide resolved
index = /(?:^#errors\n|\n#errors\n)/ =~ test_data
raise(BadHtml5libFormat, "Expected #errors in\n#{test_data}") if index.nil?

Expand Down Expand Up @@ -323,8 +328,6 @@ def self.generate_tests

klass = Class.new(TestHtml5TreeConstructionBase) do
tests.each_with_index do |test, index|
next if test[:script] == :on

define_method "test_#{index}" do
@test = test
@index = index
Expand Down