Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{XML,HTML4,HTML5}::{Document,DocumentFragment}{.parse,#initialize} take keyword arguments #3355

Merged
merged 10 commits into from
Dec 8, 2024
1 change: 0 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ We've resolved many long-standing bugs in the various schema classes, validation

* The undocumented and unused method `Nokogiri::CSS.parse` is now deprecated and will generate a warning. The AST returned by this method is private and subject to change and removal in future versions of Nokogiri. This method will be removed in a future version of Nokogiri.
* Passing an options hash to `CSS.xpath_for` is now deprecated and will generate a warning. Use keyword arguments instead. This will become an error in a future version of Nokogiri.
* Passing an options hash to `HTML5::DocumentFragment.parse` is now deprecated and will generate a warning. Use keyword arguments instead. This will become an error in a future version of Nokogiri.
* Passing libxml2 encoding IDs to `SAX::ParserContext` methods is now deprecated and will generate a warning. The use of `SAX::Parser::ENCODINGS` is also deprecated. Use `Encoding` objects or encoding names instead.


Expand Down
96 changes: 4 additions & 92 deletions ext/java/nokogiri/XmlDocumentFragment.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,112 +48,24 @@ public class XmlDocumentFragment extends XmlNode
super(ruby, klazz);
}

@JRubyMethod(name = "new", meta = true, required = 1, optional = 3)
@JRubyMethod(name = "native_new", meta = true)
public static IRubyObject
rbNew(ThreadContext context, IRubyObject cls, IRubyObject[] args, Block block)
rbNew(ThreadContext context, IRubyObject cls, IRubyObject value)
{
if (args.length < 1) {
throw context.runtime.newArgumentError(args.length, 1);
}

if (!(args[0] instanceof XmlDocument)) {
if (!(value instanceof XmlDocument)) {
throw context.runtime.newArgumentError("first parameter must be a Nokogiri::XML::Document instance");
}

XmlDocument doc = (XmlDocument) args[0];

// make well-formed fragment, ignore invalid namespace, or add appropriate namespace to parse
if (args.length > 1 && args[1] instanceof RubyString) {
final RubyString arg1 = (RubyString) args[1];
if (XmlDocumentFragment.isTag(arg1)) {
args[1] = RubyString.newString(context.runtime, addNamespaceDeclIfNeeded(doc, rubyStringToString(arg1)));
}
}
XmlDocument doc = (XmlDocument) value;

XmlDocumentFragment fragment = (XmlDocumentFragment) NokogiriService.XML_DOCUMENT_FRAGMENT_ALLOCATOR.allocate(
context.runtime, (RubyClass)cls);
fragment.setDocument(context, doc);
fragment.setNode(context.runtime, doc.getDocument().createDocumentFragment());

Helpers.invoke(context, fragment, "initialize", args, block);
return fragment;
}

private static final ByteList TAG_BEG = ByteList.create("<");
private static final ByteList TAG_END = ByteList.create(">");

private static boolean
isTag(final RubyString str)
{
return str.getByteList().startsWith(TAG_BEG) && str.getByteList().endsWith(TAG_END);
}

private static boolean
isNamespaceDefined(String qName, NamedNodeMap nodeMap)
{
if (isNamespace(qName.intern())) { return true; }
for (int i = 0; i < nodeMap.getLength(); i++) {
Attr attr = (Attr)nodeMap.item(i);
if (isNamespace(attr.getNodeName())) {
String localPart = getLocalNameForNamespace(attr.getNodeName(), null);
if (getPrefix(qName).equals(localPart)) {
return true;
}
}
}
return false;
}

private static final Pattern QNAME_RE = Pattern.compile("[^</:>\\s]+:[^</:>=\\s]+");
private static final Pattern START_TAG_RE = Pattern.compile("<[^</>]+>");

private static String
addNamespaceDeclIfNeeded(XmlDocument doc, String tags)
{
if (doc.getDocument() == null) { return tags; }
if (doc.getDocument().getDocumentElement() == null) { return tags; }
Matcher matcher = START_TAG_RE.matcher(tags);
Map<CharSequence, CharSequence> rewriteTable = null;
while (matcher.find()) {
String start_tag = matcher.group();
Matcher matcher2 = QNAME_RE.matcher(start_tag);
while (matcher2.find()) {
String qName = matcher2.group();
NamedNodeMap nodeMap = doc.getDocument().getDocumentElement().getAttributes();
if (isNamespaceDefined(qName, nodeMap)) {
CharSequence namespaceDecl = getNamespaceDecl(getPrefix(qName), nodeMap);
if (namespaceDecl != null) {
if (rewriteTable == null) { rewriteTable = new HashMap<CharSequence, CharSequence>(8, 1); }
StringBuilder str = new StringBuilder(qName.length() + namespaceDecl.length() + 3);
String key = str.append('<').append(qName).append('>').toString();
str.setCharAt(key.length() - 1, ' '); // (last) '>' -> ' '
rewriteTable.put(key, str.append(namespaceDecl).append('>'));
}
}
}
}
if (rewriteTable != null) {
for (Map.Entry<CharSequence, CharSequence> e : rewriteTable.entrySet()) {
tags = tags.replace(e.getKey(), e.getValue());
}
}

return tags;
}

private static CharSequence
getNamespaceDecl(final String prefix, NamedNodeMap nodeMap)
{
for (int i = 0; i < nodeMap.getLength(); i++) {
Attr attr = (Attr) nodeMap.item(i);
if (prefix.equals(attr.getLocalName())) {
return new StringBuilder().
append(attr.getName()).append('=').append('"').append(attr.getValue()).append('"');
}
}
return null;
}

@Override
public void
relink_namespace(ThreadContext context)
Expand Down
10 changes: 8 additions & 2 deletions ext/nokogiri/html4_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ static ID id_to_s;

/*
* call-seq:
* new
* new(uri=nil, external_id=nil) → HTML4::Document
*
* Create a new document
* Create a new empty document with base URI +uri+ and external ID +external_id+.
*/
static VALUE
rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
Expand Down Expand Up @@ -151,6 +151,12 @@ rb_html_document_type(VALUE self)
void
noko_init_html_document(void)
{
/* this is here so that rdoc doesn't ignore this file. */
/*
mNokogiri = rb_define_module("Nokogiri");
mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
*/

assert(cNokogiriXmlDocument);
cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);

Expand Down
4 changes: 2 additions & 2 deletions ext/nokogiri/nokogiri.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,8 @@ Init_nokogiri(void)
{
mNokogiri = rb_define_module("Nokogiri");
mNokogiriGumbo = rb_define_module_under(mNokogiri, "Gumbo");
mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
mNokogiriHtml4Sax = rb_define_module_under(mNokogiriHtml4, "SAX");
mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
mNokogiriHtml4Sax = rb_define_module_under(mNokogiriHtml4, "SAX");
mNokogiriHtml5 = rb_define_module_under(mNokogiri, "HTML5");
mNokogiriXml = rb_define_module_under(mNokogiri, "XML");
mNokogiriXmlSax = rb_define_module_under(mNokogiriXml, "SAX");
Expand Down
12 changes: 7 additions & 5 deletions ext/nokogiri/xml_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,8 @@ noko_xml_document_s_read_io(VALUE rb_class,
VALUE rb_encoding,
VALUE rb_options)
{
/* TODO: deprecate this method, parse should be the preferred entry point. then we can make this
private. */
libxmlStructuredErrorHandlerState handler_state;
VALUE rb_errors = rb_ary_new();

Expand Down Expand Up @@ -417,6 +419,8 @@ noko_xml_document_s_read_memory(VALUE rb_class,
VALUE rb_encoding,
VALUE rb_options)
{
/* TODO: deprecate this method, parse should be the preferred entry point. then we can make this
private. */
VALUE rb_errors = rb_ary_new();
xmlSetStructuredErrorFunc((void *)rb_errors, noko__error_array_pusher);

Expand Down Expand Up @@ -444,9 +448,9 @@ noko_xml_document_s_read_memory(VALUE rb_class,

/*
* call-seq:
* new(version = default)
* new(version = "1.0")
*
* Create a new document with +version+ (defaults to "1.0")
* Create a new empty document declaring XML version +version+.
*/
static VALUE
new (int argc, VALUE *argv, VALUE klass)
Expand Down Expand Up @@ -756,9 +760,7 @@ void
noko_init_xml_document(void)
{
assert(cNokogiriXmlNode);
/*
* Nokogiri::XML::Document wraps an xml document.
*/

cNokogiriXmlDocument = rb_define_class_under(mNokogiriXml, "Document", cNokogiriXmlNode);

rb_define_alloc_func(cNokogiriXmlDocument, _xml_document_alloc);
Expand Down
35 changes: 10 additions & 25 deletions ext/nokogiri/xml_document_fragment.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,18 @@

VALUE cNokogiriXmlDocumentFragment;

/*
* call-seq:
* new(document)
*
* Create a new DocumentFragment element on the +document+
*/
/* :nodoc: */
static VALUE
new (int argc, VALUE *argv, VALUE klass)
noko_xml_document_fragment_s_native_new(VALUE klass, VALUE rb_doc)
{
xmlDocPtr xml_doc;
xmlNodePtr node;
VALUE document;
VALUE rest;
xmlDocPtr c_doc;
xmlNodePtr c_node;
VALUE rb_node;

rb_scan_args(argc, argv, "1*", &document, &rest);

xml_doc = noko_xml_document_unwrap(document);

node = xmlNewDocFragment(xml_doc->doc);

noko_xml_document_pin_node(node);

rb_node = noko_xml_node_wrap(klass, node);
rb_obj_call_init(rb_node, argc, argv);
c_doc = noko_xml_document_unwrap(rb_doc);
c_node = xmlNewDocFragment(c_doc->doc);
noko_xml_document_pin_node(c_node);
rb_node = noko_xml_node_wrap(klass, c_node);

return rb_node;
}
Expand All @@ -35,10 +22,8 @@ void
noko_init_xml_document_fragment(void)
{
assert(cNokogiriXmlNode);
/*
* DocumentFragment represents a DocumentFragment node in an xml document.
*/

cNokogiriXmlDocumentFragment = rb_define_class_under(mNokogiriXml, "DocumentFragment", cNokogiriXmlNode);

rb_define_singleton_method(cNokogiriXmlDocumentFragment, "new", new, -1);
rb_define_singleton_method(cNokogiriXmlDocumentFragment, "native_new", noko_xml_document_fragment_s_native_new, 1);
}
67 changes: 44 additions & 23 deletions lib/nokogiri/html4/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -161,52 +161,73 @@ def xpath_doctype
end

class << self
###
# Parse HTML. +string_or_io+ may be a String, or any object that
# responds to _read_ and _close_ such as an IO, or StringIO.
# +url+ is resource where this document is located. +encoding+ is the
# encoding that should be used when processing the document. +options+
# is a number that sets options in the parser, such as
# Nokogiri::XML::ParseOptions::RECOVER. See the constants in
# Nokogiri::XML::ParseOptions.
def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
# :call-seq:
# parse(input) { |options| ... } => Nokogiri::HTML4::Document
# parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document
#
# Parse \HTML4 input from a String or IO object, and return a new HTML4::Document.
#
# [Required Parameters]
# - +input+ (String | IO) The content to be parsed.
#
# [Optional Keyword Arguments]
# - +url:+ (String) The base URI for this document.
#
# - +encoding:+ (String) The name of the encoding that should be used when processing the
# document. When not provided, the encoding will be determined based on the document
# content.
#
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
# behaviors during parsing. See ParseOptions for more information. The default value is
# +ParseOptions::DEFAULT_HTML+.
#
# [Yields]
# If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
# can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
#
# [Returns] Nokogiri::HTML4::Document
def parse(
input,
url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
url: url_, encoding: encoding_, options: options_
)
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
yield options if block_given?

url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
url ||= input.respond_to?(:path) ? input.path : nil

if string_or_io.respond_to?(:encoding)
unless string_or_io.encoding == Encoding::ASCII_8BIT
encoding ||= string_or_io.encoding.name
if input.respond_to?(:encoding)
unless input.encoding == Encoding::ASCII_8BIT
encoding ||= input.encoding.name
end
end

if string_or_io.respond_to?(:read)
if string_or_io.is_a?(Pathname)
if input.respond_to?(:read)
if input.is_a?(Pathname)
# resolve the Pathname to the file and open it as an IO object, see #2110
string_or_io = string_or_io.expand_path.open
url ||= string_or_io.path
input = input.expand_path.open
url ||= input.path
end

unless encoding
string_or_io = EncodingReader.new(string_or_io)
input = EncodingReader.new(input)
begin
return read_io(string_or_io, url, encoding, options.to_i)
return read_io(input, url, encoding, options.to_i)
rescue EncodingReader::EncodingFound => e
encoding = e.found_encoding
end
end
return read_io(string_or_io, url, encoding, options.to_i)
return read_io(input, url, encoding, options.to_i)
end

# read_memory pukes on empty docs
if string_or_io.nil? || string_or_io.empty?
if input.nil? || input.empty?
return encoding ? new.tap { |i| i.encoding = encoding } : new
end

encoding ||= EncodingReader.detect_encoding(string_or_io)
encoding ||= EncodingReader.detect_encoding(input)

read_memory(string_or_io, url, encoding, options.to_i)
read_memory(input, url, encoding, options.to_i)
end
end
end
Expand Down
Loading
Loading