From 594405d81b242bcec926247ac2ed8b012f30f2df Mon Sep 17 00:00:00 2001
From: Pawel Lampe
Date: Tue, 30 Apr 2024 21:55:24 +0200
Subject: [PATCH] Extend custom post-lexer to yield newlines after dedents

Move the Indenter post-lexer out of parser.py into its own module,
gdscript_indenter.py, as GDScriptIndenter, and override handle_NL() and
_process() there.

handle_NL() now re-emits the newline token after every _DEDENT it
produces, and the grammar's _func_suite rule is updated to expect that
trailing _NL after _func_body.

The inconsistent-dedent error is raised as indenter.DedentError, because
only the `indenter` module (not the bare DedentError name) is imported in
the new file.

---
 gdtoolkit/parser/gdscript.lark        |  2 +-
 gdtoolkit/parser/gdscript_indenter.py | 63 +++++++++++++++++++++++++++
 gdtoolkit/parser/parser.py            | 12 +----
 3 files changed, 66 insertions(+), 11 deletions(-)
 create mode 100644 gdtoolkit/parser/gdscript_indenter.py

diff --git a/gdtoolkit/parser/gdscript.lark b/gdtoolkit/parser/gdscript.lark
index b2908ac2..73d68ad8 100644
--- a/gdtoolkit/parser/gdscript.lark
+++ b/gdtoolkit/parser/gdscript.lark
@@ -79,7 +79,7 @@ func_args: "(" [func_arg ("," func_arg)* [trailing_comma]] ")"
 func_arg_regular: NAME ["=" expr]
 func_arg_inf: NAME ":" "=" expr
 func_arg_typed: NAME ":" TYPE_HINT ["=" expr]
-_func_suite: _func_body
+_func_suite: _func_body _NL
            | _func_stmt
 _func_body: _NL _INDENT (_func_stmt+ | _func_stmt* (_simple_func_stmt | annotation+)) _DEDENT
 _func_stmt: _simple_func_stmt _NL
diff --git a/gdtoolkit/parser/gdscript_indenter.py b/gdtoolkit/parser/gdscript_indenter.py
new file mode 100644
index 00000000..e8762eba
--- /dev/null
+++ b/gdtoolkit/parser/gdscript_indenter.py
@@ -0,0 +1,63 @@
+from typing import Iterator
+
+from lark import Token, indenter
+
+
+class GDScriptIndenter(indenter.Indenter):
+    NL_type = "_NL"
+    OPEN_PAREN_types = ["LPAR", "LSQB", "LBRACE"]
+    CLOSE_PAREN_types = ["RPAR", "RSQB", "RBRACE"]
+    INDENT_type = "_INDENT"
+    DEDENT_type = "_DEDENT"
+    # TODO: guess tab length
+    tab_len = 4
+
+    def handle_NL(self, token: Token) -> Iterator[Token]:
+        if self.paren_level > 0:
+            return  # TODO: special handling for lambdas
+
+        yield token
+
+        indent_str = token.rsplit("\n", 1)[1]  # Tabs and spaces
+        indent = indent_str.count(" ") + indent_str.count("\t") * self.tab_len
+
+        if indent > self.indent_level[-1]:
+            self.indent_level.append(indent)
+            yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
+        else:
+            while indent < self.indent_level[-1]:
+                self.indent_level.pop()
+                yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
+                # produce extra newline after dedent to simplify grammar:
+                yield token
+
+            if indent != self.indent_level[-1]:
+                raise indenter.DedentError(
+                    "Unexpected dedent to column %s. Expected dedent to %s"
+                    % (indent, self.indent_level[-1])
+                )
+
+    def _process(self, stream):
+        for token in stream:
+            if token.type == self.NL_type:
+                yield from self.handle_NL(token)
+            else:
+                yield token
+
+            if token.type in self.OPEN_PAREN_types:
+                self.paren_level += 1
+            elif token.type in self.CLOSE_PAREN_types:
+                self.paren_level -= 1
+                assert self.paren_level >= 0
+
+        while len(self.indent_level) > 1:
+            self.indent_level.pop()
+            yield Token(self.DEDENT_type, "")
+
+        assert self.indent_level == [0], self.indent_level
+
+    # def process(self, stream):
+    #     import pdb;pdb.set_trace()
+    #     self.paren_level = 0
+    #     self.indent_level = [0]
+    #     return self._process(stream)
diff --git a/gdtoolkit/parser/parser.py b/gdtoolkit/parser/parser.py
index 71cef1ff..333ef7cf 100644
--- a/gdtoolkit/parser/parser.py
+++ b/gdtoolkit/parser/parser.py
@@ -9,15 +9,7 @@ from lark import Lark, Tree, indenter
 
 
 
-
-class Indenter(indenter.Indenter):
-    NL_type = "_NL"
-    OPEN_PAREN_types = ["LPAR", "LSQB", "LBRACE"]
-    CLOSE_PAREN_types = ["RPAR", "RSQB", "RBRACE"]
-    INDENT_type = "_INDENT"
-    DEDENT_type = "_DEDENT"
-    # TODO: guess tab length
-    tab_len = 4
+from .gdscript_indenter import GDScriptIndenter
 
 
 # TODO: when upgrading to Python 3.8, replace with functools.cached_property
@@ -92,7 +84,7 @@ def _get_parser(
             grammar_filepath,
             parser="lalr",
             start="start",
-            postlex=Indenter(),  # type: ignore
+            postlex=GDScriptIndenter(),  # type: ignore
             propagate_positions=add_metadata,
             maybe_placeholders=False,
             cache=cache_filepath,