Skip to content

Commit

Permalink
Fix char, codepoint for single digit hex escapes
Browse files Browse the repository at this point in the history
  • Loading branch information
jaynetics committed Dec 25, 2024
1 parent b359448 commit 79a351d
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 90 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed

- fixed `#char` & `#codepoint` errors for single-digit hex escapes
* e.g. `\xA`

## [2.9.3] - 2024-11-29 - Janosch Müller

### Fixed
Expand Down
2 changes: 2 additions & 0 deletions lib/regexp_parser/expression.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
require_relative 'expression/classes/unicode_property'

require_relative 'expression/methods/construct'
require_relative 'expression/methods/escape_sequence_char'
require_relative 'expression/methods/escape_sequence_codepoint'
require_relative 'expression/methods/human_name'
require_relative 'expression/methods/match'
require_relative 'expression/methods/match_length'
Expand Down
108 changes: 18 additions & 90 deletions lib/regexp_parser/expression/classes/escape_sequence.rb
Original file line number Diff line number Diff line change
@@ -1,100 +1,28 @@
module Regexp::Expression
module EscapeSequence
class Base < Regexp::Expression::Base
def codepoint
char.ord
end
Base = Class.new(Regexp::Expression::Base)

if ''.respond_to?(:undump)
def char
%("#{text}").undump
end
else
# poor man's unescape without using eval
require 'yaml'
def char
YAML.load(%Q(---\n"#{text}"\n))
end
end
end
AsciiEscape = Class.new(Base) # \e
Backspace = Class.new(Base) # \b
Bell = Class.new(Base) # \a
FormFeed = Class.new(Base) # \f
Newline = Class.new(Base) # \n
Return = Class.new(Base) # \r
Tab = Class.new(Base) # \t
VerticalTab = Class.new(Base) # \v

class Literal < EscapeSequence::Base
def char
text[1..-1]
end
end
Literal = Class.new(Base) # e.g. \j, \@, \😀 (ineffectual escapes)

class AsciiEscape < EscapeSequence::Base; end
class Backspace < EscapeSequence::Base; end
class Bell < EscapeSequence::Base; end
class FormFeed < EscapeSequence::Base; end
class Newline < EscapeSequence::Base; end
class Return < EscapeSequence::Base; end
class Tab < EscapeSequence::Base; end
class VerticalTab < EscapeSequence::Base; end
Octal = Class.new(Base) # e.g. \012
Hex = Class.new(Base) # e.g. \x0A
Codepoint = Class.new(Base) # e.g. \u000A

class Hex < EscapeSequence::Base; end
class Codepoint < EscapeSequence::Base; end
CodepointList = Class.new(Base) # e.g. \u{A B}

class CodepointList < EscapeSequence::Base
def char
raise NoMethodError, 'CodepointList responds only to #chars'
end

def codepoint
raise NoMethodError, 'CodepointList responds only to #codepoints'
end

def chars
codepoints.map { |cp| cp.chr('utf-8') }
end

def codepoints
text.scan(/\h+/).map(&:hex)
end
end

class Octal < EscapeSequence::Base
def char
text[1..-1].to_i(8).chr('utf-8')
end
end

class AbstractMetaControlSequence < EscapeSequence::Base
def char
codepoint.chr('utf-8')
end

private

def control_sequence_to_s(control_sequence)
five_lsb = control_sequence.unpack('B*').first[-5..-1]
["000#{five_lsb}"].pack('B*')
end

def meta_char_to_codepoint(meta_char)
byte_value = meta_char.ord
byte_value < 128 ? byte_value + 128 : byte_value
end
end

class Control < AbstractMetaControlSequence
def codepoint
control_sequence_to_s(text).ord
end
end

class Meta < AbstractMetaControlSequence
def codepoint
meta_char_to_codepoint(text[-1])
end
end

class MetaControl < AbstractMetaControlSequence
def codepoint
meta_char_to_codepoint(control_sequence_to_s(text))
end
end
AbstractMetaControlSequence = Class.new(Base)
Control = Class.new(AbstractMetaControlSequence) # e.g. \cB
Meta = Class.new(AbstractMetaControlSequence) # e.g. \M-Z
MetaControl = Class.new(AbstractMetaControlSequence) # e.g. \M-\cX
end

# alias for symmetry between Token::* and Expression::*
Expand Down
5 changes: 5 additions & 0 deletions lib/regexp_parser/expression/methods/escape_sequence_char.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Regexp::Expression::EscapeSequence::Base.class_eval do
  # Returns the character this escape sequence stands for, as a
  # UTF-8 encoded String.
  #
  # The character is derived from #codepoint, so any class that provides
  # a #codepoint gets a matching #char through this method.
  #
  # @return [String] one-character UTF-8 string for the codepoint
  def char
    code = codepoint
    code.chr('utf-8')
  end
end
68 changes: 68 additions & 0 deletions lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Defines #codepoint (the Integer codepoint an escape sequence stands for)
# on each escape sequence expression class.
module Regexp::Expression::EscapeSequence
  # --- Escapes that always denote the same, fixed codepoint ---------------

  AsciiEscape.class_eval do
    def codepoint
      0x1B # ESC
    end
  end

  Backspace.class_eval do
    def codepoint
      0x8 # BS
    end
  end

  Bell.class_eval do
    def codepoint
      0x7 # BEL
    end
  end

  FormFeed.class_eval do
    def codepoint
      0xC # FF
    end
  end

  Newline.class_eval do
    def codepoint
      0xA # LF
    end
  end

  Return.class_eval do
    def codepoint
      0xD # CR
    end
  end

  Tab.class_eval do
    def codepoint
      0x9 # HT
    end
  end

  VerticalTab.class_eval do
    def codepoint
      0xB # VT
    end
  end

  # --- Escapes whose codepoint is read from the expression text -----------

  Literal.class_eval do
    # The codepoint of the character at index 1 of the text, i.e. the one
    # right after the leading character.
    def codepoint
      escaped_char = text[1]
      escaped_char.ord
    end
  end

  Octal.class_eval do
    # Interprets the first run of digits in the text as a base-8 number.
    def codepoint
      octal_digits = text[/\d+/]
      octal_digits.to_i(8)
    end
  end

  Hex.class_eval do
    # Interprets the first run of hex digits in the text as a base-16
    # number; the run may be of any length, including a single digit.
    def codepoint
      hex_digits = text[/\h+/]
      hex_digits.hex
    end
  end

  Codepoint.class_eval do
    # Interprets the first run of hex digits in the text as a base-16 number.
    def codepoint
      hex_digits = text[/\h+/]
      hex_digits.hex
    end
  end

  # --- Codepoint lists: plural accessors only -----------------------------

  CodepointList.class_eval do
    # Open question carried over from the author: maybe this should be a
    # unique top-level expression class?

    # Always raises; a list has no single character.
    #
    # @raise [NoMethodError]
    def char
      raise NoMethodError, 'CodepointList responds only to #chars'
    end

    # Always raises; a list has no single codepoint.
    #
    # @raise [NoMethodError]
    def codepoint
      raise NoMethodError, 'CodepointList responds only to #codepoints'
    end

    # @return [Array<String>] one UTF-8 character per entry of #codepoints
    def chars
      codepoints.map { |number| number.chr('utf-8') }
    end

    # @return [Array<Integer>] every run of hex digits in the text, read as
    #   a base-16 number, in order of appearance
    def codepoints
      hex_runs = text.scan(/\h+/)
      hex_runs.map { |digits| digits.hex }
    end
  end

  # --- Meta / control sequences -------------------------------------------

  AbstractMetaControlSequence.class_eval do
    # Builds a one-byte string whose top three bits are zero and whose low
    # five bits are the last five bits of +control_sequence+.
    def control_sequence_to_s(control_sequence)
      all_bits = control_sequence.unpack('B*').first
      low_five = all_bits[-5..-1]
      ["000#{low_five}"].pack('B*')
    end

    # Returns the ordinal of +meta_char+, raised by 128 when it is below 128.
    def meta_char_to_codepoint(meta_char)
      ordinal = meta_char.ord
      return ordinal unless ordinal < 128

      ordinal + 128
    end

    # Both helpers are implementation details of the subclasses.
    private :control_sequence_to_s, :meta_char_to_codepoint
  end

  Control.class_eval do
    def codepoint
      control_char = control_sequence_to_s(text)
      control_char.ord
    end
  end

  Meta.class_eval do
    def codepoint
      last_char = text[-1]
      meta_char_to_codepoint(last_char)
    end
  end

  MetaControl.class_eval do
    def codepoint
      control_char = control_sequence_to_s(text)
      meta_char_to_codepoint(control_char)
    end
  end
end
1 change: 1 addition & 0 deletions spec/parser/escapes_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
include_examples 'parse', /\?/, 0 => [char: '?', codepoint: 63 ]
include_examples 'parse', /\101/, 0 => [char: 'A', codepoint: 65 ]
include_examples 'parse', /\x42/, 0 => [char: 'B', codepoint: 66 ]
include_examples 'parse', /\xA/, 0 => [char: "\n", codepoint: 10 ]
include_examples 'parse', /\u0043/, 0 => [char: 'C', codepoint: 67 ]
include_examples 'parse', /\u{44 45}/, 0 => [chars: %w[D E], codepoints: [68, 69]]

Expand Down

0 comments on commit 79a351d

Please sign in to comment.