gh-105390: Correctly raise TokenError instead of SyntaxError for tokenize errors #105399

Merged · 2 commits · Jun 7, 2023
5 changes: 0 additions & 5 deletions Doc/library/tokenize.rst
@@ -139,11 +139,6 @@ function it uses to do this is available:
     2,
     3

-Note that unclosed single-quoted strings do not cause an error to be
-raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
-tokenization of their contents.
-
-
.. _tokenize-cli:

Command-Line Usage
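The note deleted above described pre-3.12 behavior. A minimal sketch of the new behavior on 3.12 (the exact message text may vary across versions):

    import io
    import tokenize

    # An unclosed single-quoted string no longer tokenizes as ERRORTOKEN
    # followed by its contents; tokenization now fails with TokenError.
    try:
        list(tokenize.generate_tokens(io.StringIO("'unclosed").readline))
    except tokenize.TokenError as exc:
        print(exc.args)  # (message, (lineno, offset))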
11 changes: 6 additions & 5 deletions Doc/whatsnew/3.12.rst
@@ -1490,14 +1490,15 @@ Changes in the Python API
Additionally, there may be some minor behavioral changes as a consequence of the
changes required to support :pep:`701`. Some of these changes include:

* Some final ``DEDENT`` tokens are now emitted within the bounds of the
  input. This means that for a file containing 3 lines, the old version of the
  tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
  the token in line 3.

* The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
  characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.

-* Incomplete single-line strings now also raise :exc:`tokenize.TokenError` as incomplete
-  multiline strings do.

+* Some incomplete or invalid Python code now raises :exc:`tokenize.TokenError` instead of
+  returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.

Build Changes
=============

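The DEDENT bullet above can be observed directly. A small sketch; the printed position depends on the interpreter version, which is exactly the point of the note:

    import io
    import tokenize

    # Per the "What's New" note above, on 3.12 the final DEDENT is reported
    # within the two input lines; older versions placed it on the line after.
    source = "if x:\n    pass\n"
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == tokenize.DEDENT:
            print(tok.start)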
11 changes: 6 additions & 5 deletions Lib/test/test_tokenize.py
Expand Up @@ -3,7 +3,8 @@
from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open, Untokenizer, generate_tokens,
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
TokenError)
from io import BytesIO, StringIO
import unittest
from textwrap import dedent
@@ -286,7 +287,7 @@ def number_token(s):
        for lit in INVALID_UNDERSCORE_LITERALS:
            try:
                number_token(lit)
-            except SyntaxError:
+            except TokenError:
                continue
            self.assertNotEqual(number_token(lit), lit)

@@ -1379,7 +1380,7 @@ def test_latin1_normalization(self):
        self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
-        # Issue 14629: need to raise SyntaxError if the first
+        # Issue 14629: need to raise TokenError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
@@ -2754,7 +2755,7 @@ def get_tokens(string):
            "]",
        ]:
            with self.subTest(case=case):
-                self.assertRaises(SyntaxError, get_tokens, case)
+                self.assertRaises(TokenError, get_tokens, case)

    def test_max_indent(self):
        MAXINDENT = 100

@@ -2773,7 +2774,7 @@ def generate_source(indents):

        invalid = generate_source(MAXINDENT)
        the_input = StringIO(invalid)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
+        self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
        self.assertRaises(
            IndentationError, compile, invalid, "<string>", "exec"
        )
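The updated assertions are easy to reproduce outside the test suite with the public API, which sits on the same C tokenizer as the private _generate_tokens_from_c_tokenizer helper used above. A sketch, assuming the tokenizer's indentation limit is 100 levels as MAXINDENT in the test suggests (the unclosed-bracket cases are sketched after the C changes below):

    import io
    import tokenize

    # Source nesting 100 levels deep: `if x:` blocks indented one space more
    # on each line, then a final `pass` at depth 100.
    deep = "".join(f"{' ' * i}if x:\n" for i in range(100)) + " " * 100 + "pass\n"
    try:
        list(tokenize.generate_tokens(io.StringIO(deep).readline))
    except IndentationError as exc:
        print("IndentationError:", exc)  # e.g. "too many levels of indentation"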
20 changes: 18 additions & 2 deletions Lib/tokenize.py
@@ -517,14 +517,30 @@ def error(message, filename=None, location=None):
        perror("unexpected error: %s" % err)
        raise

+def _transform_msg(msg):
+    """Transform error messages from the C tokenizer into the Python tokenize ones.
+
+    The C tokenizer is more picky than the Python one, so we need to massage
+    the error messages a bit for backwards compatibility.
+    """
+    if "unterminated triple-quoted string literal" in msg:
+        return "EOF in multi-line string"
+    return msg
+
def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
    if encoding is None:
        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
    else:
        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
-    for info in it:
-        yield TokenInfo._make(info)
+    try:
+        for info in it:
+            yield TokenInfo._make(info)
+    except SyntaxError as e:
+        if type(e) != SyntaxError:
+            raise e from None
+        msg = _transform_msg(e.msg)
+        raise TokenError(msg, (e.lineno, e.offset)) from None


if __name__ == "__main__":
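A sketch of the wrapping added above, as seen from the public API: a plain SyntaxError coming out of the C tokenizer is re-raised as tokenize.TokenError carrying (lineno, offset), with _transform_msg restoring the legacy wording for unterminated triple-quoted strings, while SyntaxError subclasses such as IndentationError propagate unchanged:

    import io
    import tokenize

    # The C tokenizer reports "unterminated triple-quoted string literal ...";
    # _transform_msg maps that to the legacy "EOF in multi-line string".
    try:
        list(tokenize.generate_tokens(io.StringIO('"""never closed').readline))
    except tokenize.TokenError as exc:
        msg, (lineno, offset) = exc.args
        print(msg, (lineno, offset))  # expected msg: "EOF in multi-line string"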
@@ -0,0 +1,3 @@
+Correctly raise :exc:`tokenize.TokenError` exceptions instead of
+:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
+Pablo Galindo.
9 changes: 2 additions & 7 deletions Python/Python-tokenize.c
@@ -84,13 +84,8 @@ _tokenizer_error(struct tok_state *tok)
        msg = "invalid token";
        break;
    case E_EOF:
-        if (tok->level > 0) {
-            PyErr_Format(PyExc_SyntaxError,
-                         "parenthesis '%c' was never closed",
-                         tok->parenstack[tok->level-1]);
-        } else {
-            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
-        }
+        PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+        PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
        return -1;
    case E_DEDENT:
        msg = "unindent does not match any outer indentation level";
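At the Python level, the unified E_EOF message above is what tokenize users now see, wrapped into TokenError by the Lib/tokenize.py change earlier in this diff. A sketch (message text taken from this patch):

    import io
    import tokenize

    # Hitting EOF while a parenthesis is still open takes the E_EOF branch above.
    try:
        list(tokenize.generate_tokens(io.StringIO("(1 +\n2").readline))
    except tokenize.TokenError as exc:
        print(exc.args[0])  # "unexpected EOF in multi-line statement"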