From 8cb8a6d526bf2bc6a271b0dce497ddc4775b7dfe Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Wed, 13 Dec 2023 10:46:40 -0500 Subject: [PATCH 1/2] fix: protect ourselves from xmlCreateIOParserCtxt returning NULL I've opened an upstream merge request to fix the underlying problem at https://gitlab.gnome.org/GNOME/libxml2/-/merge_requests/231 Without this check, we'd segfault. So a failing test is preferable. --- ext/nokogiri/xml_sax_parser_context.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ext/nokogiri/xml_sax_parser_context.c b/ext/nokogiri/xml_sax_parser_context.c index be784b5e88a..9ac2844fd38 100644 --- a/ext/nokogiri/xml_sax_parser_context.c +++ b/ext/nokogiri/xml_sax_parser_context.c @@ -59,6 +59,10 @@ parse_io(VALUE klass, VALUE io, VALUE encoding) (xmlInputReadCallback)noko_io_read, (xmlInputCloseCallback)noko_io_close, (void *)io, enc); + if (!ctxt) { + rb_raise(rb_eRuntimeError, "failed to create xml sax parser context"); + } + if (ctxt->sax) { xmlFree(ctxt->sax); ctxt->sax = NULL; From c50d60b112958b2f7017d0b11b01c1b189437a02 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Wed, 13 Dec 2023 10:49:02 -0500 Subject: [PATCH 2/2] fix: upstream libxml (2.13.dev) fixes in-context parsing recovery Fixed in abd74186 and 1c106edf. See https://gitlab.gnome.org/GNOME/libxml2/-/issues/645 --- lib/nokogiri/xml/node.rb | 44 +++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index a0a2770e7a7..8b1b19bcd03 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -1049,29 +1049,35 @@ def parse(string_or_io, options = nil) return Nokogiri::XML::NodeSet.new(document) if contents.empty? - # libxml2 does not obey the +recover+ option after encountering errors during +in_context+ - # parsing, and so this horrible hack is here to try to emulate recovery behavior. - # - # Unfortunately, this means we're no longer parsing "in context" and so namespaces that - # would have been inherited from the context node won't be handled correctly. This hack was - # written in 2010, and I regret it, because it's silently degrading functionality in a way - # that's not easily prevented (or even detected). - # - # I think preferable behavior would be to either: - # - # a. add an error noting that we "fell back" and pointing the user to turning off the +recover+ option - # b. don't recover, but raise a sensible exception - # - # For context and background: https://github.com/sparklemotion/nokogiri/issues/313 - # FIXME bug report: https://github.com/sparklemotion/nokogiri/issues/2092 error_count = document.errors.length node_set = in_context(contents, options.to_i) - if node_set.empty? && (document.errors.length > error_count) - if options.recover? + if document.errors.length > error_count + raise document.errors[error_count] unless options.recover? + + if node_set.empty? + # libxml2 < 2.13 does not obey the +recover+ option after encountering errors during + # +in_context+ parsing, and so this horrible hack is here to try to emulate recovery + # behavior. + # + # (Note that HTML4 fragment parsing seems to have been fixed in abd74186, and XML + # fragment parsing is fixed in 1c106edf. Both are in 2.13.) + # + # Unfortunately, this means we're no longer parsing "in context" and so namespaces that + # would have been inherited from the context node won't be handled correctly. This hack + # was written in 2010, and I regret it, because it's silently degrading functionality in + # a way that's not easily prevented (or even detected). + # + # I think preferable behavior would be to either: + # + # a. add an error noting that we "fell back" and pointing the user to turning off the + # +recover+ option + # b. don't recover, but raise a sensible exception + # + # For context and background: + # - https://github.com/sparklemotion/nokogiri/issues/313 + # - https://github.com/sparklemotion/nokogiri/issues/2092 fragment = document.related_class("DocumentFragment").parse(contents) node_set = fragment.children - else - raise document.errors[error_count] end end node_set