diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb
index d1747dd4..86bcf755 100644
--- a/lib/rexml/document.rb
+++ b/lib/rexml/document.rb
@@ -284,6 +284,8 @@ def version
# Returns the XMLDecl encoding of the document,
# if it has been set, otherwise the default encoding:
#
+ # d = REXML::Document.new('<?xml version="1.0" encoding="UTF-32"?>')
+ # d.encoding # => "UTF-32"
# d = REXML::Document.new('<?xml version="1.0" encoding="UTF-16"?>')
# d.encoding # => "UTF-16"
# d = REXML::Document.new('')
diff --git a/lib/rexml/output.rb b/lib/rexml/output.rb
index 88a5fb37..40f2e5b9 100644
--- a/lib/rexml/output.rb
+++ b/lib/rexml/output.rb
@@ -13,7 +13,10 @@ def initialize real_IO, encd="iso-8859-1"
@to_utf = encoding != 'UTF-8'
- if encoding == "UTF-16"
+ if encoding == "UTF-32"
+ @output << "\ufeff".encode("UTF-32BE")
+ self.encoding = "UTF-32BE"
+ elsif encoding == "UTF-16"
@output << "\ufeff".encode("UTF-16BE")
self.encoding = "UTF-16BE"
end
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 7bd8adf8..693138cb 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -640,7 +640,7 @@ def record_entity_expansion(delta=1)
def need_source_encoding_update?(xml_declaration_encoding)
return false if xml_declaration_encoding.nil?
- return false if /\AUTF-16\z/i =~ xml_declaration_encoding
+ return false if /\AUTF-(32|16)\z/i =~ xml_declaration_encoding
true
end
@@ -748,8 +748,12 @@ def process_instruction
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
- encoding = "UTF-16"
+ if encoding.nil?
+ if /\AUTF-32(?:BE|LE)\z/i =~ @source.encoding
+ encoding = "UTF-32"
+ elsif encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
+ encoding = "UTF-16"
+ end
end
standalone = STANDALONE.match(content)
standalone = standalone[1] unless standalone.nil?
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index dc0b5323..116fcba2 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -154,7 +154,11 @@ def detect_encoding
detected_encoding = "UTF-8"
begin
@scanner.string.force_encoding("ASCII-8BIT")
- if @scanner.scan(/\xfe\xff/n)
+ if @scanner.scan(/\x00\x00\xfe\xff/n)
+ detected_encoding = "UTF-32BE"
+ elsif @scanner.scan(/\xff\xfe\x00\x00/n)
+ detected_encoding = "UTF-32LE"
+ elsif @scanner.scan(/\xfe\xff/n)
detected_encoding = "UTF-16BE"
elsif @scanner.scan(/\xff\xfe/n)
detected_encoding = "UTF-16LE"
@@ -192,7 +196,7 @@ def initialize(arg, block_size=500, encoding=nil)
if encoding
super("", encoding)
else
- super(@source.read(3) || "")
+ super(@source.read(4) || "")
end
if !@to_utf and
@@ -321,7 +325,7 @@ def readline(term = nil)
def encoding_updated
case @encoding
- when "UTF-16BE", "UTF-16LE"
+ when "UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE"
@source.binmode
@source.set_encoding(@encoding, @encoding)
end
diff --git a/test/data/utf32.xml b/test/data/utf32.xml
new file mode 100644
index 00000000..322a19cf
Binary files /dev/null and b/test/data/utf32.xml differ
diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb
index bb208d47..ebb10461 100644
--- a/test/parse/test_text.rb
+++ b/test/parse/test_text.rb
@@ -32,9 +32,9 @@ def test_before_root
assert_equal(<<~DETAIL.chomp, exception.to_s)
Malformed XML: Content at the start of the document (got 'b')
Line: 1
- Position: 4
+ Position: 8
Last 80 unconsumed characters:
-
+
DETAIL
end
diff --git a/test/test_core.rb b/test/test_core.rb
index 48666c86..0298e8f9 100644
--- a/test/test_core.rb
+++ b/test/test_core.rb
@@ -488,6 +488,11 @@ def test_xmldecl_utf_16be_encoding_name
XMLDecl.new("1.0", "UTF-16").to_s)
end
+ def test_xmldecl_utf_32be_encoding_name
+ assert_equal("",
+ XMLDecl.new("1.0", "UTF-32").to_s)
+ end
+
def each_test( element, xpath, num_children )
count = 0
element.each_element( xpath ) { |child|
diff --git a/test/test_document.rb b/test/test_document.rb
index 609aeba2..691d892c 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -351,6 +351,26 @@ def test_utf_16be
document = REXML::Document.new(bom + xml)
assert_equal("UTF-16", document.encoding)
end
+
+ def test_utf_32le
+ xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT")
+
+Hello world!
+EOX
+ bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT")
+ document = REXML::Document.new(bom + xml)
+ assert_equal("UTF-32", document.encoding)
+ end
+
+ def test_utf_32be
+ xml = <<-EOX.encode("UTF-32BE").force_encoding("ASCII-8BIT")
+
+Hello world!
+EOX
+ bom = "\ufeff".encode("UTF-32BE").force_encoding("ASCII-8BIT")
+ document = REXML::Document.new(bom + xml)
+ assert_equal("UTF-32", document.encoding)
+ end
end
class NoEncodingTest < self
@@ -383,6 +403,26 @@ def test_utf_16be
document = REXML::Document.new(bom + xml)
assert_equal("UTF-16", document.encoding)
end
+
+ def test_utf_32le
+ xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT")
+
+Hello world!
+EOX
+ bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT")
+ document = REXML::Document.new(bom + xml)
+ assert_equal("UTF-32", document.encoding)
+ end
+
+ def test_utf_32be
+ xml = <<-EOX.encode("UTF-32BE").force_encoding("ASCII-8BIT")
+
+Hello world!
+EOX
+ bom = "\ufeff".encode("UTF-32BE").force_encoding("ASCII-8BIT")
+ document = REXML::Document.new(bom + xml)
+ assert_equal("UTF-32", document.encoding)
+ end
end
class WriteTest < self
@@ -399,13 +439,30 @@ def test_utf_16
expected_xml = <<-EOX.chomp.encode("UTF-16BE")
\ufeff
Hello world!
+EOX
+ assert_equal(expected_xml, actual_xml)
+ end
+
+ def test_utf_32
+ xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT")
+
+Hello world!
+EOX
+ bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT")
+ document = REXML::Document.new(bom + xml)
+
+ actual_xml = ""
+ document.write(actual_xml)
+ expected_xml = <<-EOX.chomp.encode("UTF-32BE")
+\ufeff
+Hello world!
EOX
assert_equal(expected_xml, actual_xml)
end
end
class ReadUntilTest < Test::Unit::TestCase
- def test_utf_8
+ def test_utf_8
xml = <<-EOX.force_encoding("ASCII-8BIT")
Hello world!
@@ -436,6 +493,28 @@ def test_utf_16be
assert_equal("UTF-16", document.encoding)
assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value)
end
+
+ def test_utf_32le
+ xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT")
+
+Hello world!
+EOX
+ bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT")
+ document = REXML::Document.new(bom + xml)
+ assert_equal("UTF-32", document.encoding)
+ assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value)
+ end
+
+ def test_utf_32be
+ xml = <<-EOX.encode("UTF-32BE").force_encoding("ASCII-8BIT")
+
+Hello world!
+EOX
+ bom = "\ufeff".encode("UTF-32BE").force_encoding("ASCII-8BIT")
+ document = REXML::Document.new(bom + xml)
+ assert_equal("UTF-32", document.encoding)
+ assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value)
+ end
end
end
end
diff --git a/test/test_encoding.rb b/test/test_encoding.rb
index 6887ffbe..09366c65 100644
--- a/test/test_encoding.rb
+++ b/test/test_encoding.rb
@@ -92,7 +92,15 @@ def test_parse_utf16
REXML::Document.new(f)
end
assert_equal("UTF-16", utf16.encoding)
- assert( utf16[0].kind_of?(REXML::XMLDecl))
+ assert(utf16[0].kind_of?(REXML::XMLDecl))
+ end
+
+ def test_parse_utf32
+ utf32 = File.open(fixture_path("utf32.xml")) do |f|
+ REXML::Document.new(f)
+ end
+ assert_equal("UTF-32", utf32.encoding)
+ assert(utf32[0].kind_of?(REXML::XMLDecl))
end
def test_parse_utf16_with_utf8_default_internal
@@ -103,5 +111,14 @@ def test_parse_utf16_with_utf8_default_internal
assert_equal("UTF-16", utf16.encoding)
end
end
+
+ def test_parse_utf32_with_utf8_default_internal
+ with_default_internal("UTF-8") do
+ utf32 = File.open(fixture_path("utf32.xml")) do |f|
+ REXML::Document.new(f)
+ end
+ assert_equal("UTF-32", utf32.encoding)
+ end
+ end
end
end