diff --git a/lib/nokogiri/html5.rb b/lib/nokogiri/html5.rb index b9009f10cc4..1465e265fe3 100644 --- a/lib/nokogiri/html5.rb +++ b/lib/nokogiri/html5.rb @@ -46,142 +46,152 @@ def self.HTML5(input, url = nil, encoding = nil, **options, &block) # # == Parsing options # - # The document and fragment parsing methods support options that are - # different from Nokogiri's. + # The document and fragment parsing methods support options that are different from Nokogiri's. # - # - Nokogiri.HTML5(html, url = nil, encoding = nil, options = {}) - # - Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = - # {}) - # - Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, - # options = {}) - # - Nokogiri::HTML5.fragment(html, encoding = nil, options = {}) - # - Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, - # options = {}) + # - Nokogiri.HTML5(html, url = nil, encoding = nil, **options) + # - Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options) + # - Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options) + # - Nokogiri::HTML5.fragment(html, encoding = nil, **options) + # - Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options) # - # The four currently supported options are +:max_errors+, +:max_tree_depth+, - # +:max_attributes+, and +parse_noscript_content_as_text+ described below. + # The four currently supported options are +:max_errors+, +:max_tree_depth+, +:max_attributes+, + # and +:parse_noscript_content_as_text+ described below. # # === Error reporting # - # Nokogiri contains an experimental HTML5 parse error reporting facility. By - # default, no parse errors are reported but this can be configured by - # passing the +:max_errors+ option to {HTML5.parse} or {HTML5.fragment}. + # Nokogiri contains an experimental HTML5 parse error reporting facility. 
By default, no parse + # errors are reported but this can be configured by passing the +:max_errors+ option to + # HTML5.parse or HTML5.fragment. # # For example, this script: # - # doc = Nokogiri::HTML5.parse('Hi there!', - # max_errors: 10) doc.errors.each do |err| puts(err) end + # doc = Nokogiri::HTML5.parse('Hi there!', max_errors: 10) + # doc.errors.each do |err| + # puts(err) + # end # # Emits: # - # 1:1: ERROR: Expected a doctype token Hi there! ^ + # 1:1: ERROR: Expected a doctype token + # Hi there! + # ^ # 1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'. - # Hi there! ^ 1:17: ERROR: End tag ends with '/>', - # use '>'. Hi there! ^ 1:17: ERROR: End tag - # contains attributes. Hi there! ^ - # - # Using max_errors: -1 results in an unlimited number of errors - # being returned. - # - # The errors returned by {HTML5::Document#errors} are instances of - # {Nokogiri::XML::SyntaxError}. - # - # The {https://html.spec.whatwg.org/multipage/parsing.html#parse-errors HTML - # standard} defines a number of standard parse error codes. These error - # codes only cover the "tokenization" stage of parsing HTML. The parse - # errors in the "tree construction" stage do not have standardized error + # Hi there! + # ^ + # 1:17: ERROR: End tag ends with '/>', use '>'. + # Hi there! + # ^ + # 1:17: ERROR: End tag contains attributes. + # Hi there! + # ^ + # + # Using max_errors: -1 results in an unlimited number of errors being returned. + # + # The errors returned by HTML5::Document#errors are instances of Nokogiri::XML::SyntaxError. + # + # The {HTML standard}[https://html.spec.whatwg.org/multipage/parsing.html#parse-errors] defines a + # number of standard parse error codes. These error codes only cover the "tokenization" stage of + # parsing HTML. The parse errors in the "tree construction" stage do not have standardized error # codes (yet). 
# # As a convenience to Nokogiri users, the defined error codes are available - # via {Nokogiri::XML::SyntaxError#str1} method. + # via Nokogiri::XML::SyntaxError#str1 method. # + # doc = Nokogiri::HTML5.parse('Hi there!', max_errors: 10) + # doc.errors.each do |err| + # puts("#{err.line}:#{err.column}: #{err.str1}") + # end # doc = Nokogiri::HTML5.parse('Hi there!', - # max_errors: 10) doc.errors.each do |err| - # puts("#{err.line}:#{err.column}: #{err.str1}") end # # => 1:1: generic-parser # # 1:1: non-void-html-element-start-tag-with-trailing-solidus # # 1:17: end-tag-with-trailing-solidus # # 1:17: end-tag-with-attributes # - # Note that the first error is +generic-parser+ because it's an error from - # the tree construction stage and doesn't have a standardized error code. + # Note that the first error is +generic-parser+ because it's an error from the tree construction + # stage and doesn't have a standardized error code. # - # For the purposes of semantic versioning, the error messages, error - # locations, and error codes are not part of Nokogiri's public API. That is, - # these are subject to change without Nokogiri's major version number - # changing. These may be stabilized in the future. + # For the purposes of semantic versioning, the error messages, error locations, and error codes + # are not part of Nokogiri's public API. That is, these are subject to change without Nokogiri's + # major version number changing. These may be stabilized in the future. # # === Maximum tree depth # - # The maximum depth of the DOM tree parsed by the various parsing methods is - # configurable by the +:max_tree_depth+ option. If the depth of the tree - # would exceed this limit, then an {::ArgumentError} is thrown. + # The maximum depth of the DOM tree parsed by the various parsing methods is configurable by the + # +:max_tree_depth+ option. If the depth of the tree would exceed this limit, then an + # +ArgumentError+ is thrown. 
# - # This limit (which defaults to Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH - # = 400) can be removed by giving the option max_tree_depth: - # -1. + # This limit (which defaults to +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) can be removed by + # giving the option max_tree_depth: -1. # - # html = '' + '
# Content- # EOF puts doc.at('/html/body/pre').serialize + # EOF + # puts doc.at('/html/body/pre').serialize # # =>
Content# - # In this case, the original HTML is semantically equivalent to the - # serialized version. If the +pre+, +listing+, or +textarea+ content starts - # with two newlines, the first newline will be stripped on the first parse - # and the second newline will be stripped on the second, leading to - # semantically different DOMs. Passing the parameter preserve_newline: - # true will cause two or more newlines to be preserved. (A single - # leading newline will still be removed.) + # In this case, the original HTML is semantically equivalent to the serialized version. If the + # +pre+, +listing+, or +textarea+ content starts with two newlines, the first newline will be + # stripped on the first parse and the second newline will be stripped on the second, leading to + # semantically different DOMs. Passing the parameter preserve_newline: true will cause + # two or more newlines to be preserved. (A single leading newline will still be removed.) # # doc = Nokogiri::HTML5(<<-EOF) # #
- # <code>gumbo_parse_with_options</code> method, using the default options.
- # The resulting Gumbo parse tree is then walked.
+ # * The Nokogiri::HTML5.parse function takes a string and passes it to the
+ # <code>gumbo_parse_with_options</code> method, using the default options. The resulting Gumbo
+ # parse tree is then walked.
#
- # * Instead of uppercase element names, lowercase element names are
- # produced.
+ # * Instead of uppercase element names, lowercase element names are produced.
#
- # * Instead of returning +unknown+ as the element name for unknown tags, the
- # original tag name is returned verbatim.
+ # * Instead of returning +unknown+ as the element name for unknown tags, the original tag name is
+ # returned verbatim.
#
# Since v1.12.0
module HTML5
class << self
- # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse}
+ # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
#
# +string+, +url+, +encoding+, the keyword +options+, and the optional block are all forwarded
# unchanged; the value returned is whatever Document.parse returns.
def parse(string, url = nil, encoding = nil, **options, &block)
# Pure delegation: keywords are re-splatted and the block is passed through as-is.
Document.parse(string, url, encoding, **options, &block)
end
# Parse a fragment from +string+. Convenience method for
- # {Nokogiri::HTML5::DocumentFragment.parse}.
+ # Nokogiri::HTML5::DocumentFragment.parse.
#
# +string+ and +encoding+ are forwarded unchanged, and the keyword +options+ are collected into
# a Hash. The value returned is whatever DocumentFragment.parse returns. This method does not
# accept or forward a block.
def fragment(string, encoding = nil, **options)
# The collected keywords are handed over as a single positional Hash (third argument), not
# re-splatted with **.
# NOTE(review): presumably DocumentFragment.parse declares a positional options parameter;
# confirm its signature before changing this call to **options.
DocumentFragment.parse(string, encoding, options)
end
@@ -296,11 +304,11 @@ def read_and_encode(string, encoding)
private
# Charset sniffing is a complex and controversial topic that understandably isn't done _by
- # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
+ # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
# consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
# the Gumbo parser *only* supports utf-8.
#
- # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
+ # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
# this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
# the HTML5 standard.
#