diff --git a/hy/lex/lexer.py b/hy/lex/lexer.py index 9fe70467c..8716c8dc1 100755 --- a/hy/lex/lexer.py +++ b/hy/lex/lexer.py @@ -2,12 +2,23 @@ # This file is part of Hy, which is free software licensed under the Expat # license. See the LICENSE. -import re +import regex from rply import LexerGenerator +import rply.lexergenerator lg = LexerGenerator() +class RegexRule(rply.lexergenerator.Rule): + """Like rply.lexergenerator.Rule, but uses the `regex` module instead + of Python's `re`.""" + def __init__(self, name, pattern, flags=0): + self.name = name + self.re = regex.compile(pattern, flags=flags) +def add(name, pattern, flags=0): + lg.rules.append(RegexRule(name, pattern, flags=flags)) +def ignore(pattern, flags=0): + lg.ignore_rules.append(RegexRule("", pattern, flags=flags)) # A regexp for something that should end a quoting/unquoting operator # i.e. a space or a closing brace/paren/curly @@ -21,26 +32,26 @@ # BidiBrackets.txt ("(", ")"), ("[", "]"), ("{", "}"), ("༺", "༻"), ("༼", "༽"), ("᚛", "᚜"), ("⁅", "⁆"), ("⁽", "⁾"), ("₍", "₎"), ("⌈", "⌉"), ("⌊", "⌋"), ("〈", "〉"), ("❨", "❩"), ("❪", "❫"), ("❬", "❭"), ("❮", "❯"), ("❰", "❱"), ("❲", "❳"), ("❴", "❵"), ("⟅", "⟆"), ("⟦", "⟧"), ("⟨", "⟩"), ("⟪", "⟫"), ("⟬", "⟭"), ("⟮", "⟯"), ("⦃", "⦄"), ("⦅", "⦆"), ("⦇", "⦈"), ("⦉", "⦊"), ("⦋", "⦌"), ("⦍", "⦐"), ("⦏", "⦎"), ("⦑", "⦒"), ("⦓", "⦔"), ("⦕", "⦖"), ("⦗", "⦘"), ("⧘", "⧙"), ("⧚", "⧛"), ("⧼", "⧽"), ("⸢", "⸣"), ("⸤", "⸥"), ("⸦", "⸧"), ("⸨", "⸩"), ("〈", "〉"), ("《", "》"), ("「", "」"), ("『", "』"), ("【", "】"), ("〔", "〕"), ("〖", "〗"), ("〘", "〙"), ("〚", "〛"), ("﹙", "﹚"), ("﹛", "﹜"), ("﹝", "﹞"), ("(", ")"), ("[", "]"), ("{", "}"), ("⦅", "⦆"), ("「", "」")) # noqa -lg.add('LPAREN', r'\(') -lg.add('RPAREN', r'\)') -lg.add('LBRACKET', r'\[') -lg.add('RBRACKET', r'\]') -lg.add('LCURLY', r'\{') -lg.add('RCURLY', r'\}') -lg.add('HLCURLY', r'#\{') -lg.add('QUOTE', r'\'%s' % end_quote) -lg.add('QUASIQUOTE', r'`%s' % end_quote) -lg.add('UNQUOTESPLICE', r'~@%s' % end_quote) -lg.add('UNQUOTE', r'~%s' % end_quote) -lg.add('HASHBANG', r'#!.*[^\r\n]') +add('LPAREN', r'\(') +add('RPAREN', r'\)') +add('LBRACKET', r'\[') +add('RBRACKET', r'\]') +add('LCURLY', r'\{') +add('RCURLY', r'\}') +add('HLCURLY', r'#\{') +add('QUOTE', r'\'%s' % end_quote) +add('QUASIQUOTE', r'`%s' % end_quote) +add('UNQUOTESPLICE', r'~@%s' % end_quote) +add('UNQUOTE', r'~%s' % end_quote) +add('HASHBANG', r'#!.*[^\r\n]') for opener, closer in hashstring_paired_delims: - lg.add('HASHSTRING', r'#q{}(?:.|\n)*?{}'.format( - re.escape(opener), re.escape(closer))) -lg.add('HASHSTRING', r'#q([^{}])(?:.|\n)*?\1'.format(''.join( - re.escape(opener) for opener, _ in hashstring_paired_delims))) + add('HASHSTRING', r'#q{}(?:.|\n)*?{}'.format( + regex.escape(opener), regex.escape(closer))) +add('HASHSTRING', r'#q([^{}])(?:.|\n)*?\1'.format(''.join( + regex.escape(opener) for opener, _ in hashstring_paired_delims))) -lg.add('HASHOTHER', r'#%s' % identifier) +add('HASHOTHER', r'#%s' % identifier) # A regexp which matches incomplete strings, used to support # multi-line strings in the interpreter @@ -56,14 +67,14 @@ )* # one or more times ''' -lg.add('STRING', r'%s"' % partial_string) -lg.add('PARTIAL_STRING', partial_string) +add('STRING', r'%s"' % partial_string) +add('PARTIAL_STRING', partial_string) -lg.add('IDENTIFIER', identifier) +add('IDENTIFIER', identifier) -lg.ignore(r';.*(?=\r|\n|$)') -lg.ignore(r'\s+') +ignore(r';.*(?=\r|\n|$)') +ignore(r'\s+') lexer = lg.build()