diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst
index 11f569df2e7cde..41222a771d1b47 100644
--- a/Doc/library/tokenize.rst
+++ b/Doc/library/tokenize.rst
@@ -139,11 +139,6 @@ function it uses to do this is available:
         2,
         3
 
-Note that unclosed single-quoted strings do not cause an error to be
-raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
-tokenization of their contents.
-
-
 .. _tokenize-cli:
 
 Command-Line Usage
diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst
index 7e7942550a797b..13afaf1ab28255 100644
--- a/Doc/whatsnew/3.12.rst
+++ b/Doc/whatsnew/3.12.rst
@@ -1490,13 +1490,14 @@ Changes in the Python API
   Additionally, there may be some minor behavioral changes as a consequence of the
   changes required to support :pep:`701`. Some of these changes include:
 
-  * Some final ``DEDENT`` tokens are now emitted within the bounds of the
-    input. This means that for a file containing 3 lines, the old version of the
-    tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
-    the token in line 3.
-
   * The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
     characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.
+
+  * Incomplete single-line strings now also raise :exc:`TokenError` as incomplete
+    multiline strings do.
+
+  * Some incomplete or invalid Python code now raises :exc:`TokenError` instead of
+    returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.
 
 Build Changes
 =============
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 5ac17095b185f5..f2847b2fb327f8 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3,7 +3,8 @@
 from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
-                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
+                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
+                     TokenError)
 from io import BytesIO, StringIO
 import unittest
 from textwrap import dedent
@@ -286,7 +287,7 @@ def number_token(s):
         for lit in INVALID_UNDERSCORE_LITERALS:
             try:
                 number_token(lit)
-            except SyntaxError:
+            except TokenError:
                 continue
             self.assertNotEqual(number_token(lit), lit)
 
@@ -1379,7 +1380,7 @@ def test_latin1_normalization(self):
         self.assertEqual(found, "iso-8859-1")
 
     def test_syntaxerror_latin1(self):
-        # Issue 14629: need to raise TokenError if the first
+        # Issue 14629: need to raise TokenError if the first
         # line(s) have non-UTF-8 characters
         lines = (
             b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
@@ -2754,7 +2755,7 @@ def get_tokens(string):
             "]",
         ]:
             with self.subTest(case=case):
-                self.assertRaises(SyntaxError, get_tokens, case)
+                self.assertRaises(TokenError, get_tokens, case)
 
     def test_max_indent(self):
         MAXINDENT = 100
@@ -2773,7 +2774,7 @@ def generate_source(indents):
 
         invalid = generate_source(MAXINDENT)
         the_input = StringIO(invalid)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
+        self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index a07a8bf45891ac..49e8144edddab7 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -517,14 +517,30 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise
 
+def _transform_msg(msg):
+    """Transform error messages from the C tokenizer into the Python tokenize
+
+    The C tokenizer is more picky than the Python one, so we need to massage
+    the error messages a bit for backwards compatibility.
+    """
+    if "unterminated triple-quoted string literal" in msg:
+        return "EOF in multi-line string"
+    return msg
+
 def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     if encoding is None:
         it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
     else:
         it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
-    for info in it:
-        yield TokenInfo._make(info)
+    try:
+        for info in it:
+            yield TokenInfo._make(info)
+    except SyntaxError as e:
+        if type(e) != SyntaxError:
+            raise e from None
+        msg = _transform_msg(e.msg)
+        raise TokenError(msg, (e.lineno, e.offset)) from None
 
 
 if __name__ == "__main__":
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-06-06-17-10-42.gh-issue-105390.DvqI-e.rst b/Misc/NEWS.d/next/Core and Builtins/2023-06-06-17-10-42.gh-issue-105390.DvqI-e.rst
new file mode 100644
index 00000000000000..de59b54d8f6053
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-06-06-17-10-42.gh-issue-105390.DvqI-e.rst
@@ -0,0 +1,3 @@
+Correctly raise :exc:`tokenize.TokenError` exceptions instead of
+:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
+Pablo Galindo.
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 223de54d658507..4d2179348eed20 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -84,13 +84,8 @@ _tokenizer_error(struct tok_state *tok)
             msg = "invalid token";
             break;
         case E_EOF:
-            if (tok->level > 0) {
-                PyErr_Format(PyExc_SyntaxError,
-                             "parenthesis '%c' was never closed",
-                             tok->parenstack[tok->level-1]);
-            } else {
-                PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
-            }
+            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+            PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
             return -1;
         case E_DEDENT:
             msg = "unindent does not match any outer indentation level";