Skip to content

Commit

Permalink
gh-105390: Correctly raise TokenError instead of SyntaxError for toke…
Browse files Browse the repository at this point in the history
…nize errors
  • Loading branch information
pablogsal committed Jun 6, 2023
1 parent 0cb6b9b commit 06debb5
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 24 deletions.
5 changes: 0 additions & 5 deletions Doc/library/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,6 @@ function it uses to do this is available:
2,
3

Note that unclosed single-quoted strings do not cause an error to be
raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
tokenization of their contents.


.. _tokenize-cli:

Command-Line Usage
Expand Down
11 changes: 6 additions & 5 deletions Doc/whatsnew/3.12.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1490,13 +1490,14 @@ Changes in the Python API
Additionally, there may be some minor behavioral changes as a consequence of the
changes required to support :pep:`701`. Some of these changes include:

* Some final ``DEDENT`` tokens are now emitted within the bounds of the
input. This means that for a file containing 3 lines, the old version of the
tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
the token in line 3.

* The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.

* Incomplete single-line strings now also raise :exc:`TokenError` as incomplete
multiline strings do.

* Some incomplete or invalid Python code now raises :exc:`TokenError` instead of
returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.

Build Changes
=============
Expand Down
11 changes: 6 additions & 5 deletions Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open, Untokenizer, generate_tokens,
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
TokenError)
from io import BytesIO, StringIO
import unittest
from textwrap import dedent
Expand Down Expand Up @@ -286,7 +287,7 @@ def number_token(s):
for lit in INVALID_UNDERSCORE_LITERALS:
try:
number_token(lit)
except SyntaxError:
except TokenError:
continue
self.assertNotEqual(number_token(lit), lit)

Expand Down Expand Up @@ -1379,7 +1380,7 @@ def test_latin1_normalization(self):
self.assertEqual(found, "iso-8859-1")

def test_syntaxerror_latin1(self):
# Issue 14629: need to raise SyntaxError if the first
# Issue 14629: need to raise TokenError if the first
# line(s) have non-UTF-8 characters
lines = (
b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
Expand Down Expand Up @@ -2754,7 +2755,7 @@ def get_tokens(string):
"]",
]:
with self.subTest(case=case):
self.assertRaises(SyntaxError, get_tokens, case)
self.assertRaises(TokenError, get_tokens, case)

def test_max_indent(self):
MAXINDENT = 100
Expand All @@ -2773,7 +2774,7 @@ def generate_source(indents):

invalid = generate_source(MAXINDENT)
the_input = StringIO(invalid)
self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
self.assertRaises(
IndentationError, compile, invalid, "<string>", "exec"
)
Expand Down
20 changes: 18 additions & 2 deletions Lib/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,14 +517,30 @@ def error(message, filename=None, location=None):
perror("unexpected error: %s" % err)
raise

def _transform_msg(msg):
    """Translate a C tokenizer error message into its Python tokenize form.

    The C tokenizer is more picky than the Python one, so we need to massage
    the error messages a bit for backwards compatibility.

    Any message containing "unterminated triple-quoted string literal" is
    replaced by "EOF in multi-line string"; every other message is returned
    unchanged.
    """
    if "unterminated triple-quoted string literal" in msg:
        return "EOF in multi-line string"
    return msg

def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer.

    *source*, *encoding* (only when not None) and *extra_tokens* are passed
    through to ``_tokenize.TokenizerIter``.  Each item it produces is yielded
    as a ``TokenInfo``.

    Raises TokenError, carrying the (lineno, offset) of the failure, when the
    C tokenizer raises a plain SyntaxError.  Subclasses of SyntaxError are
    re-raised unchanged.
    """
    if encoding is None:
        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
    else:
        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
    # The iteration must happen inside the try block: the C tokenizer reports
    # errors lazily, while tokens are being produced.
    try:
        for info in it:
            yield TokenInfo._make(info)
    except SyntaxError as e:
        # Exact type check on purpose: only a plain SyntaxError is converted,
        # subclasses keep their own type.
        if type(e) is not SyntaxError:
            raise e from None
        msg = _transform_msg(e.msg)
        raise TokenError(msg, (e.lineno, e.offset)) from None


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Correctly raise :exc:`tokenize.TokenError` exceptions instead of
:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
Pablo Galindo.
9 changes: 2 additions & 7 deletions Python/Python-tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,8 @@ _tokenizer_error(struct tok_state *tok)
msg = "invalid token";
break;
case E_EOF:
if (tok->level > 0) {
PyErr_Format(PyExc_SyntaxError,
"parenthesis '%c' was never closed",
tok->parenstack[tok->level-1]);
} else {
PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
}
PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
return -1;
case E_DEDENT:
msg = "unindent does not match any outer indentation level";
Expand Down

0 comments on commit 06debb5

Please sign in to comment.