Use string scanner with baseparser (#105)

Using StringScanner reduces the string copying process and speeds up the process. And I removed unnecessary methods. https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105 ``` ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux] Calculating ------------------------------------- rexml 3.2.6 master 3.2.6(YJIT) master(YJIT) dom 4.868 5.077 8.137 8.303 i/s - 100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s sax 13.597 13.953 19.206 20.948 i/s - 100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s pull 15.641 16.918 22.266 25.378 i/s - 100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s stream 14.339 15.844 19.810 22.206 i/s - 100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s Comparison: dom master(YJIT): 8.3 i/s 3.2.6(YJIT): 8.1 i/s - 1.02x slower master: 5.1 i/s - 1.64x slower rexml 3.2.6: 4.9 i/s - 1.71x slower sax master(YJIT): 20.9 i/s 3.2.6(YJIT): 19.2 i/s - 1.09x slower master: 14.0 i/s - 1.50x slower rexml 3.2.6: 13.6 i/s - 1.54x slower pull master(YJIT): 25.4 i/s 3.2.6(YJIT): 22.3 i/s - 1.14x slower master: 16.9 i/s - 1.50x slower rexml 3.2.6: 15.6 i/s - 1.62x slower stream master(YJIT): 22.2 i/s 3.2.6(YJIT): 19.8 i/s - 1.12x slower master: 15.8 i/s - 1.40x slower rexml 3.2.6: 14.3 i/s - 1.55x slower ``` - YJIT=ON : 1.02x - 1.14x faster - YJIT=OFF : 1.02x - 1.10x faster --------- Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
ruby · Jan 17, 2024 · 810d228 · 810d228
1 parent 72a26d6
commit 810d228
Show file tree

Hide file tree

Showing 6 changed files with 99 additions and 115 deletions.
diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml
@@ -5,6 +5,8 @@ contexts:
     require: false
     prelude: require 'rexml'
   - name: master
+    gems:
+      strscan: 3.0.8
     prelude: |
       $LOAD_PATH.unshift(File.expand_path("lib"))
       require 'rexml'
@@ -16,6 +18,8 @@ contexts:
       require 'rexml'
       RubyVM::YJIT.enable
   - name: master(YJIT)
+    gems:
+      strscan: 3.0.8
     prelude: |
       $LOAD_PATH.unshift(File.expand_path("lib"))
       require 'rexml'

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
@@ -96,7 +96,7 @@ class BaseParser
       ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
       PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
       GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
-      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
+      ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
 
       NOTATIONDECL_START = /\A\s*<!NOTATION/um
       EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -259,7 +259,7 @@ def pull_event
           else
             @document_status = :after_doctype
             if @source.encoding == "UTF-8"
-              @source.buffer.force_encoding(::Encoding::UTF_8)
+              @source.buffer_encoding = ::Encoding::UTF_8
             end
           end
         end
@@ -274,8 +274,7 @@ def pull_event
             return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
 
           when ENTITY_START
-            match = @source.match( ENTITYDECL, true ).to_a.compact
-            match[0] = :entitydecl
+            match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
             ref = false
             if match[1] == '%'
               ref = true
@@ -392,6 +391,7 @@ def pull_event
               unless md
                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
               end
+              tag = md[1]
               @document_status = :in_element
               prefixes = Set.new
               prefixes << md[2] if md[2]
@@ -405,23 +405,20 @@ def pull_event
               end
 
               if closed
-                @closed = md[1]
+                @closed = tag
                 @nsstack.shift
               else
-                @tags.push( md[1] )
+                @tags.push( tag )
               end
-              return [ :start_element, md[1], attributes ]
+              return [ :start_element, tag, attributes ]
             end
           else
             md = @source.match( TEXT_PATTERN, true )
+            text = md[1]
             if md[0].length == 0
               @source.match( /(\s+)/, true )
             end
-            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
-            #return [ :text, "" ] if md[0].length == 0
-            # unnormalized = Text::unnormalize( md[1], self )
-            # return PullEvent.new( :text, md[1], unnormalized )
-            return [ :text, md[1] ]
+            return [ :text, text ]
           end
         rescue REXML::UndefinedNamespaceException
           raise

diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
@@ -30,8 +30,6 @@ def SourceFactory::create_from(arg)
   # objects and provides consumption of text
   class Source
     include Encoding
-    # The current buffer (what we're going to read next)
-    attr_reader :buffer
     # The line number of the last consumed text
     attr_reader :line
     attr_reader :encoding
@@ -41,7 +39,8 @@ class Source
     # @param encoding if non-null, sets the encoding of the source to this
     # value, overriding all encoding detection
     def initialize(arg, encoding=nil)
-      @orig = @buffer = arg
+      @orig = arg
+      @scanner = StringScanner.new(@orig)
       if encoding
         self.encoding = encoding
       else
@@ -50,6 +49,14 @@ def initialize(arg, encoding=nil)
       @line = 0
     end
 
+    # The current buffer (what we're going to read next)
+    def buffer
+      @scanner.rest
+    end
+
+    def buffer_encoding=(encoding)
+      @scanner.string.force_encoding(encoding)
+    end
 
     # Inherited from Encoding
     # Overridden to support optimized en/decoding
@@ -58,98 +65,57 @@ def encoding=(enc)
       encoding_updated
     end
 
-    # Scans the source for a given pattern.  Note, that this is not your
-    # usual scan() method.  For one thing, the pattern argument has some
-    # requirements; for another, the source can be consumed.  You can easily
-    # confuse this method.  Originally, the patterns were easier
-    # to construct and this method more robust, because this method
-    # generated search regexps on the fly; however, this was
-    # computationally expensive and slowed down the entire REXML package
-    # considerably, since this is by far the most commonly called method.
-    # @param pattern must be a Regexp, and must be in the form of
-    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
-    # will be returned; the second group is used if the consume flag is
-    # set.
-    # @param consume if true, the pattern returned will be consumed, leaving
-    # everything after it in the Source.
-    # @return the pattern, if found, or nil if the Source is empty or the
-    # pattern is not found.
-    def scan(pattern, cons=false)
-      return nil if @buffer.nil?
-      rv = @buffer.scan(pattern)
-      @buffer = $' if cons and rv.size>0
-      rv
-    end
-
     def read
     end
 
-    def consume( pattern )
-      @buffer = $' if pattern.match( @buffer )
-    end
-
-    def match_to( char, pattern )
-      return pattern.match(@buffer)
-    end
-
-    def match_to_consume( char, pattern )
-      md = pattern.match(@buffer)
-      @buffer = $'
-      return md
-    end
-
     def match(pattern, cons=false)
-      md = pattern.match(@buffer)
-      @buffer = $' if cons and md
-      return md
+      if cons
+        @scanner.scan(pattern).nil? ? nil : @scanner
+      else
+        @scanner.check(pattern).nil? ? nil : @scanner
+      end
     end
 
     # @return true if the Source is exhausted
     def empty?
-      @buffer == ""
-    end
-
-    def position
-      @orig.index( @buffer )
+      @scanner.eos?
     end
 
     # @return the current line in the source
     def current_line
       lines = @orig.split
-      res = lines.grep @buffer[0..30]
+      res = lines.grep @scanner.rest[0..30]
       res = res[-1] if res.kind_of? Array
       lines.index( res ) if res
     end
 
     private
+
     def detect_encoding
-      buffer_encoding = @buffer.encoding
+      scanner_encoding = @scanner.rest.encoding
       detected_encoding = "UTF-8"
       begin
-        @buffer.force_encoding("ASCII-8BIT")
-        if @buffer[0, 2] == "\xfe\xff"
-          @buffer[0, 2] = ""
+        @scanner.string.force_encoding("ASCII-8BIT")
+        if @scanner.scan(/\xfe\xff/n)
           detected_encoding = "UTF-16BE"
-        elsif @buffer[0, 2] == "\xff\xfe"
-          @buffer[0, 2] = ""
+        elsif @scanner.scan(/\xff\xfe/n)
           detected_encoding = "UTF-16LE"
-        elsif @buffer[0, 3] == "\xef\xbb\xbf"
-          @buffer[0, 3] = ""
+        elsif @scanner.scan(/\xef\xbb\xbf/n)
           detected_encoding = "UTF-8"
         end
       ensure
-        @buffer.force_encoding(buffer_encoding)
+        @scanner.string.force_encoding(scanner_encoding)
       end
       self.encoding = detected_encoding
     end
 
     def encoding_updated
       if @encoding != 'UTF-8'
-        @buffer = decode(@buffer)
+        @scanner.string = decode(@scanner.rest)
         @to_utf = true
       else
         @to_utf = false
-        @buffer.force_encoding ::Encoding::UTF_8
+        @scanner.string.force_encoding(::Encoding::UTF_8)
       end
     end
   end
@@ -172,7 +138,7 @@ def initialize(arg, block_size=500, encoding=nil)
       end
 
       if !@to_utf and
-          @buffer.respond_to?(:force_encoding) and
+          @orig.respond_to?(:force_encoding) and
           @source.respond_to?(:external_encoding) and
           @source.external_encoding != ::Encoding::UTF_8
         @force_utf8 = true
@@ -181,65 +147,44 @@ def initialize(arg, block_size=500, encoding=nil)
       end
     end
 
-    def scan(pattern, cons=false)
-      rv = super
-      # You'll notice that this next section is very similar to the same
-      # section in match(), but just a liiittle different.  This is
-      # because it is a touch faster to do it this way with scan()
-      # than the way match() does it; enough faster to warrant duplicating
-      # some code
-      if rv.size == 0
-        until @buffer =~ pattern or @source.nil?
-          begin
-            @buffer << readline
-          rescue Iconv::IllegalSequence
-            raise
-          rescue
-            @source = nil
-          end
-        end
-        rv = super
-      end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
-    end
-
     def read
       begin
-        @buffer << readline
+        # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
+        # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
+        # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
+        # and avoids this problem.
+        @scanner.string = @scanner.rest + readline
       rescue Exception, NameError
         @source = nil
       end
     end
 
-    def consume( pattern )
-      match( pattern, true )
-    end
-
     def match( pattern, cons=false )
-      rv = pattern.match(@buffer)
-      @buffer = $' if cons and rv
-      while !rv and @source
+      if cons
+        md = @scanner.scan(pattern)
+      else
+        md = @scanner.check(pattern)
+      end
+      while md.nil? and @source
         begin
-          @buffer << readline
-          rv = pattern.match(@buffer)
-          @buffer = $' if cons and rv
+          @scanner << readline
+          if cons
+            md = @scanner.scan(pattern)
+          else
+            md = @scanner.check(pattern)
+          end
         rescue
           @source = nil
         end
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
+
+      md.nil? ? nil : @scanner
     end
 
     def empty?
       super and ( @source.nil? || @source.eof? )
     end
 
-    def position
-      @er_source.pos rescue 0
-    end
-
     # @return the current line in the source
     def current_line
       begin
@@ -290,7 +235,7 @@ def encoding_updated
         @source.set_encoding(@encoding, @encoding)
       end
       @line_break = encode(">")
-      @pending_buffer, @buffer = @buffer, ""
+      @pending_buffer, @scanner.string = @scanner.rest, ""
       @pending_buffer.force_encoding(@encoding)
       super
     end

diff --git a/rexml.gemspec b/rexml.gemspec
@@ -55,6 +55,8 @@ Gem::Specification.new do |spec|
 
   spec.required_ruby_version = '>= 2.5.0'
 
+  spec.add_runtime_dependency("strscan", ">= 3.0.8")
+
   spec.add_development_dependency "benchmark_driver"
   spec.add_development_dependency "bundler"
   spec.add_development_dependency "rake"

diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: false
+require 'test/unit'
+require 'rexml/document'
+
+module REXMLTests
+  class TestParseEntityDeclaration < Test::Unit::TestCase
+    private
+    def xml(internal_subset)
+      <<-XML
+<!DOCTYPE r SYSTEM "urn:x-henrikmartensson:test" [
+#{internal_subset}
+]>
+<r/>
+      XML
+    end
+
+    def parse(internal_subset)
+      REXML::Document.new(xml(internal_subset)).doctype
+    end
+
+    def test_empty
+      exception = assert_raise(REXML::ParseException) do
+        parse(<<-INTERNAL_SUBSET)
+<!ENTITY>
+        INTERNAL_SUBSET
+      end
+      assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed notation declaration: name is missing
+Line: 5
+Position: 72
+Last 80 unconsumed characters:
+ <!ENTITY>  ]> <r/>
+      DETAIL
+    end
+  end
+end
diff --git a/test/test_core.rb b/test/test_core.rb
@@ -727,7 +727,7 @@ def test_iso_8859_1_output_function
       koln_iso_8859_1 = "K\xF6ln"
       koln_utf8 = "K\xc3\xb6ln"
       source = Source.new( koln_iso_8859_1, 'iso-8859-1' )
-      results = source.scan(/.*/)[0]
+      results = source.match(/.*/)[0]
       koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding)
       assert_equal koln_utf8, results
       output << results