Display nice errors from Python tokenizer exceptions
jmanuel1 committed Nov 6, 2024
1 parent 849f5af commit 3998da7
Showing 9 changed files with 264 additions and 106 deletions.
147 changes: 91 additions & 56 deletions concat/__main__.py
@@ -2,7 +2,12 @@

import argparse
from concat.transpile import parse, transpile_ast, typecheck
from concat.error_reporting import get_line_at, create_parsing_failure_message
from concat.error_reporting import (
get_line_at,
create_indentation_error_message,
create_lexical_error_message,
create_parsing_failure_message,
)
import concat.execute
import concat.lex
import concat.parser_combinators
@@ -11,7 +16,7 @@
import json
import os.path
import sys
from typing import Callable, IO, AnyStr
from typing import Callable, IO, AnyStr, assert_never


filename = '<stdin>'
@@ -55,69 +60,99 @@ def func(name: str) -> IO[AnyStr]:
help='tokenize input from the given file and print the tokens as a JSON array',
)


def main():
# interactive mode
if args.file.isatty():
concat.stdlib.repl.repl([], [], args.debug)
else:
try:
token_results = concat.lex.tokenize(args.file.read())
tokens = list[concat.lex.Token]()
for r in token_results:
if r.type == 'token':
tokens.append(r.token)
elif r.type == 'indent-err':
position = (r.err.lineno or 1, r.err.offset or 0)
message = r.err.msg
print('Indentation error:')
print(
create_indentation_error_message(
args.file, position, message
)
)
elif r.type == 'token-err':
position = r.location
message = str(r.err)
print('Lexical error:')
print(
create_lexical_error_message(
args.file, position, message
)
)
else:
assert_never(r)
concat_ast = parse(tokens)
recovered_parsing_failures = concat_ast.parsing_failures
for failure in recovered_parsing_failures:
print('Parse Error:')
print(
create_parsing_failure_message(args.file, tokens, failure)
)
source_dir = os.path.dirname(filename)
typecheck(concat_ast, source_dir)
python_ast = transpile_ast(concat_ast)
except concat.typecheck.StaticAnalysisError as e:
if e.path is None:
in_path = ''
else:
in_path = ' in file ' + str(e.path)
print(f'Static Analysis Error{in_path}:\n')
print(e, 'in line:')
if e.location:
if e.path is not None:
with e.path.open() as f:
print(get_line_at(f, e.location), end='')
else:
print(get_line_at(args.file, e.location), end='')
print(' ' * e.location[1] + '^')
if args.verbose:
raise
except concat.parser_combinators.ParseError as e:
print('Parse Error:')
print(
create_parsing_failure_message(
args.file, tokens, e.args[0].failures
)
)
except Exception:
print('An internal error has occurred.')
print('This is a bug in Concat.')
raise
else:
concat.execute.execute(
filename,
python_ast,
{},
should_log_stacks=args.debug,
import_resolution_start_directory=source_dir,
)
if list(concat_ast.parsing_failures):
sys.exit(1)
finally:
args.file.close()


# We should pass any unknown args onto the program we're about to run.
# FIXME: There might be a better way to go about this, but I think this is fine
# for now.
args, rest = arg_parser.parse_known_args()
sys.argv = [sys.argv[0], *rest]


if args.tokenize:
code = args.file.read()
tokens = concat.lex.tokenize(code, should_preserve_comments=True)
json.dump(tokens, sys.stdout, cls=concat.lex.TokenEncoder)
sys.exit()

# interactive mode
if args.file.isatty():
concat.stdlib.repl.repl([], [], args.debug)
else:
try:
tokens = concat.lex.tokenize(args.file.read())
concat_ast = parse(tokens)
recovered_parsing_failures = concat_ast.parsing_failures
for failure in recovered_parsing_failures:
print('Parse Error:')
print(create_parsing_failure_message(args.file, tokens, failure))
source_dir = os.path.dirname(filename)
typecheck(concat_ast, source_dir)
python_ast = transpile_ast(concat_ast)
except concat.typecheck.StaticAnalysisError as e:
if e.path is None:
in_path = ''
else:
in_path = ' in file ' + str(e.path)
print(f'Static Analysis Error{in_path}:\n')
print(e, 'in line:')
if e.location:
if e.path is not None:
with e.path.open() as f:
print(get_line_at(f, e.location), end='')
else:
print(get_line_at(args.file, e.location), end='')
print(' ' * e.location[1] + '^')
if args.verbose:
raise
except concat.parser_combinators.ParseError as e:
print('Parse Error:')
print(
create_parsing_failure_message(
args.file, tokens, e.args[0].failures
)
)
except Exception:
print('An internal error has occurred.')
print('This is a bug in Concat.')
raise
else:
concat.execute.execute(
filename,
python_ast,
{},
should_log_stacks=args.debug,
import_resolution_start_directory=source_dir,
)
if list(concat_ast.parsing_failures):
sys.exit(1)
finally:
args.file.close()
main()
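
For context outside the CLI, here is a minimal consumption sketch that mirrors the loop added to concat/__main__.py above. The collect_tokens helper and its path parameter are illustrative only (not part of this commit), and the sketch assumes that get_line_at, used inside the error-message helpers, can re-read the already-read file object.

import concat.lex
from concat.error_reporting import (
    create_indentation_error_message,
    create_lexical_error_message,
)


def collect_tokens(path: str) -> list[concat.lex.Token]:
    """Tokenize a Concat source file, printing friendly lexer diagnostics."""
    with open(path) as file:
        tokens: list[concat.lex.Token] = []
        for r in concat.lex.tokenize(file.read()):
            if r.type == 'token':
                tokens.append(r.token)
            elif r.type == 'indent-err':
                # Fall back to line 1, column 0 when the exception has no position.
                position = (r.err.lineno or 1, r.err.offset or 0)
                print('Indentation error:')
                print(create_indentation_error_message(file, position, r.err.msg))
            else:  # r.type == 'token-err'
                print('Lexical error:')
                print(create_lexical_error_message(file, r.location, str(r.err)))
        return tokens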
21 changes: 20 additions & 1 deletion concat/error_reporting.py
@@ -16,7 +16,10 @@ def create_parsing_failure_message(
stream: Sequence[concat.lex.Token],
failure: concat.parser_combinators.FailureTree,
) -> str:
location = stream[failure.furthest_index].start
if failure.furthest_index < len(stream):
location = stream[failure.furthest_index].start
else:
location = stream[-1].start
line = get_line_at(file, location)
message = f'Expected {failure.expected} at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{" " * location[1] + "^"}'
if failure.children:
@@ -26,3 +29,19 @@
create_parsing_failure_message(file, stream, f), ' '
)
return message


def create_lexical_error_message(
file: TextIO, location: concat.astutils.Location, message: str
) -> str:
line = get_line_at(file, location)
message = f'Cannot tokenize file at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{' ' * location[1] + '^'}\n'
return message


def create_indentation_error_message(
file: TextIO, location: concat.astutils.Location, message: str
) -> str:
line = get_line_at(file, location)
message = f'Malformed indentation at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n'
return message
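
To illustrate the shape of these messages, a small hypothetical usage sketch follows. It assumes that get_line_at accepts any TextIO (such as an io.StringIO) and returns the text of the requested line; the sample source, location, and message string are made up. Note that, as written, create_lexical_error_message does not interpolate its message argument into the returned string.

import io

from concat.error_reporting import create_lexical_error_message

# Hypothetical source with an unterminated string literal on line 1.
source = io.StringIO("x = 'unterminated\n")

# Locations are (line, column) pairs with a zero-based column, so column
# index 4 is reported to the user as column 5.
print(create_lexical_error_message(source, (1, 4), 'EOF in multi-line string'))
# Cannot tokenize file at line 1, column 5:
# x = 'unterminated
#     ^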
85 changes: 64 additions & 21 deletions concat/lex.py
@@ -1,9 +1,10 @@
import concat.astutils
from __future__ import annotations
from concat.astutils import Location, are_on_same_line_and_offset_by
import dataclasses
import io
import json
import tokenize as py_tokenize
from typing import Iterator, List, Optional, Tuple, Union
from typing import Iterator, List, Literal, Optional, Tuple, Union


@dataclasses.dataclass
@@ -19,8 +20,8 @@ class Token:

type: str = ''
value: str = ''
start: 'concat.astutils.Location' = (0, 0)
end: 'concat.astutils.Location' = (0, 0)
start: Location = (0, 0)
end: Location = (0, 0)
is_keyword: bool = False


@@ -33,7 +34,10 @@ def default(self, obj):
return super().default(obj)


def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]:
def tokenize(
code: str,
should_preserve_comments: bool = False,
) -> List[Result]:
lexer = Lexer()
lexer.input(code, should_preserve_comments)
tokens = []
@@ -46,10 +50,8 @@ def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]:


TokenTuple = Union[
Tuple[str, str, 'concat.astutils.Location', 'concat.astutils.Location'],
Tuple[
str, str, 'concat.astutils.Location', 'concat.astutils.Location', bool
],
Tuple[str, str, Location, Location],
Tuple[str, str, Location, Location, bool],
]


@@ -64,7 +66,7 @@ def __init__(self) -> None:
self.tokens: Optional[Iterator[py_tokenize.TokenInfo]]
self.lineno: int
self.lexpos: int
self._concat_token_iterator: Iterator['Token']
self._concat_token_iterator: Iterator[Result]
self._should_preserve_comments: bool

def input(self, data: str, should_preserve_comments: bool = False) -> None:
@@ -76,20 +78,28 @@ def input(self, data: str, should_preserve_comments: bool = False) -> None:
self._concat_token_iterator = self._tokens()
self._should_preserve_comments = should_preserve_comments

def token(self) -> Optional['Token']:
def token(self) -> Optional[Result]:
"""Return the next token as a Token object."""
return next(self._concat_token_iterator, None)

def _tokens(self) -> Iterator['Token']:
def _tokens(self) -> Iterator[Result]:
import token

if self.tokens is None:
self.tokens = py_tokenize.tokenize(
io.BytesIO(self.data.encode('utf-8')).readline
)

glued_token_prefix = None
for token_ in self.tokens:
glued_token_prefix: Token | None = None
while True:
try:
token_ = next(self.tokens)
except StopIteration:
return
except IndentationError as e:
yield IndentationErrorResult(e)
except py_tokenize.TokenError as e:
yield TokenErrorResult(e, (self.lineno, self.lexpos))
tok = Token()
_, tok.value, tok.start, tok.end, _ = token_
tok.type = token.tok_name[token_.exact_type]
@@ -98,15 +108,15 @@ def _tokens(self) -> Iterator['Token']:
if (
glued_token_prefix.value == '-'
and tok.value == '-'
and concat.astutils.are_on_same_line_and_offset_by(
and are_on_same_line_and_offset_by(
glued_token_prefix.start, tok.start, 1
)
):
glued_token_prefix.value = '--'
glued_token_prefix.type = 'MINUSMINUS'
glued_token_prefix.end = tok.end
self._update_position(glued_token_prefix)
yield glued_token_prefix
yield TokenResult(glued_token_prefix)
glued_token_prefix = None
continue
else:
@@ -119,7 +129,7 @@
self._should_preserve_comments
and tok.type == 'COMMENT'
):
yield tok
yield TokenResult(tok)
continue
elif tok.type == 'ERRORTOKEN':
if tok.value == ' ':
@@ -182,16 +192,49 @@
elif tok.type == 'EXCLAMATION':
tok.type = 'EXCLAMATIONMARK'

yield tok
yield TokenResult(tok)

def _update_position(self, tok: 'Token') -> None:
self.lexpos += len(tok.value)
if tok.type in {'NEWLINE', 'NL'}:
self.lineno += 1
self.lineno, self.lexpos = tok.start

def __is_bytes_literal(self, literal: str) -> bool:
return isinstance(eval(literal), bytes)


@dataclasses.dataclass
class TokenResult:
type: Literal['token']
token: Token

def __init__(self, token: Token) -> None:
self.type = 'token'
self.token = token


@dataclasses.dataclass
class IndentationErrorResult:
type: Literal['indent-err']
err: IndentationError

def __init__(self, err: IndentationError) -> None:
self.type = 'indent-err'
self.err = err


@dataclasses.dataclass
class TokenErrorResult:
type: Literal['token-err']
err: py_tokenize.TokenError
location: Location

def __init__(self, err: py_tokenize.TokenError, loc: Location) -> None:
self.type = 'token-err'
self.err = err
self.location = loc


type Result = TokenResult | IndentationErrorResult | TokenErrorResult


def to_tokens(*tokTuples: TokenTuple) -> List[Token]:
return [Token(*tuple) for tuple in tokTuples]
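
The three result dataclasses form a tagged union keyed on their type literal, and they also work with structural pattern matching. Below is a hypothetical describe helper (not part of the commit) that assumes the names exported by concat/lex.py as they appear in this diff and a Python version new enough for match statements and typing.assert_never; the input string is made up.

from typing import assert_never

from concat.lex import (
    IndentationErrorResult,
    Result,
    TokenErrorResult,
    TokenResult,
    tokenize,
)


def describe(result: Result) -> str:
    """Render one lexer result as a log line (hypothetical helper)."""
    match result:
        case TokenResult(token=tok):
            return f'token {tok.type} {tok.value!r} at {tok.start}'
        case IndentationErrorResult(err=err):
            return f'indentation error: {err.msg}'
        case TokenErrorResult(err=err, location=loc):
            return f'token error at {loc}: {err}'
        case _:
            assert_never(result)


for result in tokenize('1 2 +\n'):
    print(describe(result))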
(Diffs for the remaining 6 changed files are not shown here.)
