diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index d1747dd4..86bcf755 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -284,6 +284,8 @@ def version # Returns the XMLDecl encoding of the document, # if it has been set, otherwise the default encoding: # + # d = REXML::Document.new('') + # d.encoding # => "UTF-32" # d = REXML::Document.new('') # d.encoding # => "UTF-16" # d = REXML::Document.new('') diff --git a/lib/rexml/output.rb b/lib/rexml/output.rb index 88a5fb37..40f2e5b9 100644 --- a/lib/rexml/output.rb +++ b/lib/rexml/output.rb @@ -13,7 +13,10 @@ def initialize real_IO, encd="iso-8859-1" @to_utf = encoding != 'UTF-8' - if encoding == "UTF-16" + if encoding == "UTF-32" + @output << "\ufeff".encode("UTF-32BE") + self.encoding = "UTF-32BE" + elsif encoding == "UTF-16" @output << "\ufeff".encode("UTF-16BE") self.encoding = "UTF-16BE" end diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 7bd8adf8..693138cb 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -640,7 +640,7 @@ def record_entity_expansion(delta=1) def need_source_encoding_update?(xml_declaration_encoding) return false if xml_declaration_encoding.nil? - return false if /\AUTF-16\z/i =~ xml_declaration_encoding + return false if /\AUTF-(32|16)\z/i =~ xml_declaration_encoding true end @@ -748,8 +748,12 @@ def process_instruction if need_source_encoding_update?(encoding) @source.encoding = encoding end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" + if encoding.nil? + if /\AUTF-32(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-32" + elsif encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-16" + end end standalone = STANDALONE.match(content) standalone = standalone[1] unless standalone.nil? diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index dc0b5323..116fcba2 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -154,7 +154,11 @@ def detect_encoding detected_encoding = "UTF-8" begin @scanner.string.force_encoding("ASCII-8BIT") - if @scanner.scan(/\xfe\xff/n) + if @scanner.scan(/\x00\x00\xfe\xff/n) + detected_encoding = "UTF-32BE" + elsif @scanner.scan(/\xff\xfe\x00\x00/n) + detected_encoding = "UTF-32LE" + elsif @scanner.scan(/\xfe\xff/n) detected_encoding = "UTF-16BE" elsif @scanner.scan(/\xff\xfe/n) detected_encoding = "UTF-16LE" @@ -192,7 +196,7 @@ def initialize(arg, block_size=500, encoding=nil) if encoding super("", encoding) else - super(@source.read(3) || "") + super(@source.read(4) || "") end if !@to_utf and @@ -321,7 +325,7 @@ def readline(term = nil) def encoding_updated case @encoding - when "UTF-16BE", "UTF-16LE" + when "UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE" @source.binmode @source.set_encoding(@encoding, @encoding) end diff --git a/test/data/utf32.xml b/test/data/utf32.xml new file mode 100644 index 00000000..322a19cf Binary files /dev/null and b/test/data/utf32.xml differ diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb index bb208d47..ebb10461 100644 --- a/test/parse/test_text.rb +++ b/test/parse/test_text.rb @@ -32,9 +32,9 @@ def test_before_root assert_equal(<<~DETAIL.chomp, exception.to_s) Malformed XML: Content at the start of the document (got 'b') Line: 1 - Position: 4 + Position: 8 Last 80 unconsumed characters: - + DETAIL end diff --git a/test/test_core.rb b/test/test_core.rb index 48666c86..0298e8f9 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -488,6 +488,11 @@ def test_xmldecl_utf_16be_encoding_name XMLDecl.new("1.0", "UTF-16").to_s) end + def test_xmldecl_utf_32be_encoding_name + assert_equal("", + XMLDecl.new("1.0", "UTF-32").to_s) + end + def each_test( element, xpath, num_children ) count = 0 element.each_element( xpath ) { |child| diff --git a/test/test_document.rb b/test/test_document.rb index 609aeba2..691d892c 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -351,6 +351,26 @@ def test_utf_16be document = REXML::Document.new(bom + xml) assert_equal("UTF-16", document.encoding) end + + def test_utf_32le + xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-32", document.encoding) + end + + def test_utf_32be + xml = <<-EOX.encode("UTF-32BE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-32BE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-32", document.encoding) + end end class NoEncodingTest < self @@ -383,6 +403,26 @@ def test_utf_16be document = REXML::Document.new(bom + xml) assert_equal("UTF-16", document.encoding) end + + def test_utf_32le + xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-32", document.encoding) + end + + def test_utf_32be + xml = <<-EOX.encode("UTF-32BE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-32BE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-32", document.encoding) + end end class WriteTest < self @@ -399,13 +439,30 @@ def test_utf_16 expected_xml = <<-EOX.chomp.encode("UTF-16BE") \ufeff Hello world! +EOX + assert_equal(expected_xml, actual_xml) + end + + def test_utf_32 + xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + + actual_xml = "" + document.write(actual_xml) + expected_xml = <<-EOX.chomp.encode("UTF-32BE") +\ufeff +Hello world! EOX assert_equal(expected_xml, actual_xml) end end class ReadUntilTest < Test::Unit::TestCase - def test_utf_8 + def test_utf_8 xml = <<-EOX.force_encoding("ASCII-8BIT") Hello world! @@ -436,6 +493,28 @@ def test_utf_16be assert_equal("UTF-16", document.encoding) assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value) end + + def test_utf_32le + xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-32", document.encoding) + assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value) + end + + def test_utf_32be + xml = <<-EOX.encode("UTF-32BE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-32BE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-32", document.encoding) + assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value) + end end end end diff --git a/test/test_encoding.rb b/test/test_encoding.rb index 6887ffbe..09366c65 100644 --- a/test/test_encoding.rb +++ b/test/test_encoding.rb @@ -92,7 +92,15 @@ def test_parse_utf16 REXML::Document.new(f) end assert_equal("UTF-16", utf16.encoding) - assert( utf16[0].kind_of?(REXML::XMLDecl)) + assert(utf16[0].kind_of?(REXML::XMLDecl)) + end + + def test_parse_utf32 + utf32 = File.open(fixture_path("utf32.xml")) do |f| + REXML::Document.new(f) + end + assert_equal("UTF-32", utf32.encoding) + assert(utf32[0].kind_of?(REXML::XMLDecl)) end def test_parse_utf16_with_utf8_default_internal @@ -103,5 +111,14 @@ def test_parse_utf16_with_utf8_default_internal assert_equal("UTF-16", utf16.encoding) end end + + def test_parse_utf32_with_utf8_default_internal + with_default_internal("UTF-8") do + utf32 = File.open(fixture_path("utf32.xml")) do |f| + REXML::Document.new(f) + end + assert_equal("UTF-32", utf32.encoding) + end + end end end