Display nice errors from Python tokenizer exceptions
jmanuel1 committed Nov 6, 2024
1 parent 849f5af commit 3998da7
Showing 9 changed files with 264 additions and 106 deletions.
147 changes: 91 additions & 56 deletions concat/__main__.py
@@ -2,7 +2,12 @@

import argparse
from concat.transpile import parse, transpile_ast, typecheck
from concat.error_reporting import get_line_at, create_parsing_failure_message
from concat.error_reporting import (
get_line_at,
create_indentation_error_message,
create_lexical_error_message,
create_parsing_failure_message,
)
import concat.execute
import concat.lex
import concat.parser_combinators
@@ -11,7 +16,7 @@
import json
import os.path
import sys
from typing import Callable, IO, AnyStr
from typing import Callable, IO, AnyStr, assert_never


filename = '<stdin>'
@@ -55,69 +60,99 @@ def func(name: str) -> IO[AnyStr]:
help='tokenize input from the given file and print the tokens as a JSON array',
)


def main():
# interactive mode
if args.file.isatty():
concat.stdlib.repl.repl([], [], args.debug)
else:
try:
token_results = concat.lex.tokenize(args.file.read())
tokens = list[concat.lex.Token]()
for r in token_results:
if r.type == 'token':
tokens.append(r.token)
elif r.type == 'indent-err':
position = (r.err.lineno or 1, r.err.offset or 0)
message = r.err.msg
print('Indentation error:')
print(
create_indentation_error_message(
args.file, position, message
)
)
elif r.type == 'token-err':
position = r.location
message = str(r.err)
print('Lexical error:')
print(
create_lexical_error_message(
args.file, position, message
)
)
else:
assert_never(r)
concat_ast = parse(tokens)
recovered_parsing_failures = concat_ast.parsing_failures
for failure in recovered_parsing_failures:
print('Parse Error:')
print(
create_parsing_failure_message(args.file, tokens, failure)
)
source_dir = os.path.dirname(filename)
typecheck(concat_ast, source_dir)
python_ast = transpile_ast(concat_ast)
except concat.typecheck.StaticAnalysisError as e:
if e.path is None:
in_path = ''
else:
in_path = ' in file ' + str(e.path)
print(f'Static Analysis Error{in_path}:\n')
print(e, 'in line:')
if e.location:
if e.path is not None:
with e.path.open() as f:
print(get_line_at(f, e.location), end='')
else:
print(get_line_at(args.file, e.location), end='')
print(' ' * e.location[1] + '^')
if args.verbose:
raise
except concat.parser_combinators.ParseError as e:
print('Parse Error:')
print(
create_parsing_failure_message(
args.file, tokens, e.args[0].failures
)
)
except Exception:
print('An internal error has occurred.')
print('This is a bug in Concat.')
raise
else:
concat.execute.execute(
filename,
python_ast,
{},
should_log_stacks=args.debug,
import_resolution_start_directory=source_dir,
)
if list(concat_ast.parsing_failures):
sys.exit(1)
finally:
args.file.close()


# We should pass any unknown args onto the program we're about to run.
# FIXME: There might be a better way to go about this, but I think this is fine
# for now.
args, rest = arg_parser.parse_known_args()
sys.argv = [sys.argv[0], *rest]


if args.tokenize:
code = args.file.read()
tokens = concat.lex.tokenize(code, should_preserve_comments=True)
json.dump(tokens, sys.stdout, cls=concat.lex.TokenEncoder)
sys.exit()

# interactive mode
if args.file.isatty():
concat.stdlib.repl.repl([], [], args.debug)
else:
try:
tokens = concat.lex.tokenize(args.file.read())
concat_ast = parse(tokens)
recovered_parsing_failures = concat_ast.parsing_failures
for failure in recovered_parsing_failures:
print('Parse Error:')
print(create_parsing_failure_message(args.file, tokens, failure))
source_dir = os.path.dirname(filename)
typecheck(concat_ast, source_dir)
python_ast = transpile_ast(concat_ast)
except concat.typecheck.StaticAnalysisError as e:
if e.path is None:
in_path = ''
else:
in_path = ' in file ' + str(e.path)
print(f'Static Analysis Error{in_path}:\n')
print(e, 'in line:')
if e.location:
if e.path is not None:
with e.path.open() as f:
print(get_line_at(f, e.location), end='')
else:
print(get_line_at(args.file, e.location), end='')
print(' ' * e.location[1] + '^')
if args.verbose:
raise
except concat.parser_combinators.ParseError as e:
print('Parse Error:')
print(
create_parsing_failure_message(
args.file, tokens, e.args[0].failures
)
)
except Exception:
print('An internal error has occurred.')
print('This is a bug in Concat.')
raise
else:
concat.execute.execute(
filename,
python_ast,
{},
should_log_stacks=args.debug,
import_resolution_start_directory=source_dir,
)
if list(concat_ast.parsing_failures):
sys.exit(1)
finally:
args.file.close()
main()
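
For context outside the CLI, here is a minimal consumption sketch that mirrors the loop added to concat/__main__.py above. The collect_tokens helper and its path parameter are illustrative only (not part of this commit), and the sketch assumes that get_line_at, used inside the error-message helpers, can re-read the already-read file object.

import concat.lex
from concat.error_reporting import (
    create_indentation_error_message,
    create_lexical_error_message,
)


def collect_tokens(path: str) -> list[concat.lex.Token]:
    """Tokenize a Concat source file, printing friendly lexer diagnostics."""
    with open(path) as file:
        tokens: list[concat.lex.Token] = []
        for r in concat.lex.tokenize(file.read()):
            if r.type == 'token':
                tokens.append(r.token)
            elif r.type == 'indent-err':
                # Fall back to line 1, column 0 when the exception has no position.
                position = (r.err.lineno or 1, r.err.offset or 0)
                print('Indentation error:')
                print(create_indentation_error_message(file, position, r.err.msg))
            else:  # r.type == 'token-err'
                print('Lexical error:')
                print(create_lexical_error_message(file, r.location, str(r.err)))
        return tokens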
21 changes: 20 additions & 1 deletion concat/error_reporting.py
@@ -16,7 +16,10 @@ def create_parsing_failure_message(
stream: Sequence[concat.lex.Token],
failure: concat.parser_combinators.FailureTree,
) -> str:
location = stream[failure.furthest_index].start
if failure.furthest_index < len(stream):
location = stream[failure.furthest_index].start
else:
location = stream[-1].start
line = get_line_at(file, location)
message = f'Expected {failure.expected} at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{" " * location[1] + "^"}'
if failure.children:
@@ -26,3 +29,19 @@
create_parsing_failure_message(file, stream, f), ' '
)
return message


def create_lexical_error_message(
file: TextIO, location: concat.astutils.Location, message: str
) -> str:
line = get_line_at(file, location)
message = f'Cannot tokenize file at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{' ' * location[1] + '^'}\n'
return message


def create_indentation_error_message(
file: TextIO, location: concat.astutils.Location, message: str
) -> str:
line = get_line_at(file, location)
message = f'Malformed indentation at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n'
return message
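
To illustrate the shape of these messages, a small hypothetical usage sketch follows. It assumes that get_line_at accepts any TextIO (such as an io.StringIO) and returns the text of the requested line; the sample source, location, and message string are made up. Note that, as written, create_lexical_error_message does not interpolate its message argument into the returned string.

import io

from concat.error_reporting import create_lexical_error_message

# Hypothetical source with an unterminated string literal on line 1.
source = io.StringIO("x = 'unterminated\n")

# Locations are (line, column) pairs with a zero-based column, so column
# index 4 is reported to the user as column 5.
print(create_lexical_error_message(source, (1, 4), 'EOF in multi-line string'))
# Cannot tokenize file at line 1, column 5:
# x = 'unterminated
#     ^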
85 changes: 64 additions & 21 deletions concat/lex.py
@@ -1,9 +1,10 @@
import concat.astutils
from __future__ import annotations
from concat.astutils import Location, are_on_same_line_and_offset_by
import dataclasses
import io
import json
import tokenize as py_tokenize
from typing import Iterator, List, Optional, Tuple, Union
from typing import Iterator, List, Literal, Optional, Tuple, Union


@dataclasses.dataclass
@@ -19,8 +20,8 @@ class Token:

type: str = ''
value: str = ''
start: 'concat.astutils.Location' = (0, 0)
end: 'concat.astutils.Location' = (0, 0)
start: Location = (0, 0)
end: Location = (0, 0)
is_keyword: bool = False


@@ -33,7 +34,10 @@ def default(self, obj):
return super().default(obj)


def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]:
def tokenize(
code: str,
should_preserve_comments: bool = False,
) -> List[Result]:
lexer = Lexer()
lexer.input(code, should_preserve_comments)
tokens = []
@@ -46,10 +50,8 @@ def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]:


TokenTuple = Union[
Tuple[str, str, 'concat.astutils.Location', 'concat.astutils.Location'],
Tuple[
str, str, 'concat.astutils.Location', 'concat.astutils.Location', bool
],
Tuple[str, str, Location, Location],
Tuple[str, str, Location, Location, bool],
]


@@ -64,7 +66,7 @@ def __init__(self) -> None:
self.tokens: Optional[Iterator[py_tokenize.TokenInfo]]
self.lineno: int
self.lexpos: int
self._concat_token_iterator: Iterator['Token']
self._concat_token_iterator: Iterator[Result]
self._should_preserve_comments: bool

def input(self, data: str, should_preserve_comments: bool = False) -> None:
@@ -76,20 +78,28 @@ def input(self, data: str, should_preserve_comments: bool = False) -> None:
self._concat_token_iterator = self._tokens()
self._should_preserve_comments = should_preserve_comments

def token(self) -> Optional['Token']:
def token(self) -> Optional[Result]:
"""Return the next token as a Token object."""
return next(self._concat_token_iterator, None)

def _tokens(self) -> Iterator['Token']:
def _tokens(self) -> Iterator[Result]:
import token

if self.tokens is None:
self.tokens = py_tokenize.tokenize(
io.BytesIO(self.data.encode('utf-8')).readline
)

glued_token_prefix = None
for token_ in self.tokens:
glued_token_prefix: Token | None = None
while True:
try:
token_ = next(self.tokens)
except StopIteration:
return
except IndentationError as e:
yield IndentationErrorResult(e)
except py_tokenize.TokenError as e:
yield TokenErrorResult(e, (self.lineno, self.lexpos))
tok = Token()
_, tok.value, tok.start, tok.end, _ = token_
tok.type = token.tok_name[token_.exact_type]
@@ -98,15 +108,15 @@ def _tokens(self) -> Iterator['Token']:
if (
glued_token_prefix.value == '-'
and tok.value == '-'
and concat.astutils.are_on_same_line_and_offset_by(
and are_on_same_line_and_offset_by(
glued_token_prefix.start, tok.start, 1
)
):
glued_token_prefix.value = '--'
glued_token_prefix.type = 'MINUSMINUS'
glued_token_prefix.end = tok.end
self._update_position(glued_token_prefix)
yield glued_token_prefix
yield TokenResult(glued_token_prefix)
glued_token_prefix = None
continue
else:
@@ -119,7 +129,7 @@
self._should_preserve_comments
and tok.type == 'COMMENT'
):
yield tok
yield TokenResult(tok)
continue
elif tok.type == 'ERRORTOKEN':
if tok.value == ' ':
@@ -182,16 +192,49 @@
elif tok.type == 'EXCLAMATION':
tok.type = 'EXCLAMATIONMARK'

yield tok
yield TokenResult(tok)

def _update_position(self, tok: 'Token') -> None:
self.lexpos += len(tok.value)
if tok.type in {'NEWLINE', 'NL'}:
self.lineno += 1
self.lineno, self.lexpos = tok.start

def __is_bytes_literal(self, literal: str) -> bool:
return isinstance(eval(literal), bytes)


@dataclasses.dataclass
class TokenResult:
type: Literal['token']
token: Token

def __init__(self, token: Token) -> None:
self.type = 'token'
self.token = token


@dataclasses.dataclass
class IndentationErrorResult:
type: Literal['indent-err']
err: IndentationError

def __init__(self, err: IndentationError) -> None:
self.type = 'indent-err'
self.err = err


@dataclasses.dataclass
class TokenErrorResult:
type: Literal['token-err']
err: py_tokenize.TokenError
location: Location

def __init__(self, err: py_tokenize.TokenError, loc: Location) -> None:
self.type = 'token-err'
self.err = err
self.location = loc


type Result = TokenResult | IndentationErrorResult | TokenErrorResult


def to_tokens(*tokTuples: TokenTuple) -> List[Token]:
return [Token(*tuple) for tuple in tokTuples]
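
The three result dataclasses form a tagged union keyed on their type literal, and they also work with structural pattern matching. Below is a hypothetical describe helper (not part of the commit) that assumes the names exported by concat/lex.py as they appear in this diff and a Python version new enough for match statements and typing.assert_never; the input string is made up.

from typing import assert_never

from concat.lex import (
    IndentationErrorResult,
    Result,
    TokenErrorResult,
    TokenResult,
    tokenize,
)


def describe(result: Result) -> str:
    """Render one lexer result as a log line (hypothetical helper)."""
    match result:
        case TokenResult(token=tok):
            return f'token {tok.type} {tok.value!r} at {tok.start}'
        case IndentationErrorResult(err=err):
            return f'indentation error: {err.msg}'
        case TokenErrorResult(err=err, location=loc):
            return f'token error at {loc}: {err}'
        case _:
            assert_never(result)


for result in tokenize('1 2 +\n'):
    print(describe(result))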
(Diffs for the remaining 6 changed files are not shown here.)
