From 6db032a759ebbcf99761989f29637c22384bb1f4 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 26 May 2023 15:14:00 +0100 Subject: [PATCH] gh-104976: Ensure trailing dedent tokens are emitted as the previous tokenizer Signed-off-by: Pablo Galindo --- Lib/test/test_tokenize.py | 17 +++++++++-------- Lib/tokenize.py | 5 ----- ...23-05-26-15-16-11.gh-issue-104976.6dLitD.rst | 3 +++ Python/Python-tokenize.c | 5 ++++- 4 files changed, 16 insertions(+), 14 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 251ce2b864a9d8c..c2be99f00c557a1 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -82,7 +82,7 @@ def test_basic(self): NAME 'False' (4, 11) (4, 16) COMMENT '# NEWLINE' (4, 17) (4, 26) NEWLINE '\\n' (4, 26) (4, 27) - DEDENT '' (4, 27) (4, 27) + DEDENT '' (5, 0) (5, 0) """) indent_error_file = b"""\ def k(x): @@ -755,8 +755,8 @@ def test_tabs(self): NEWLINE '\\n' (2, 5) (2, 6) INDENT ' \\t' (3, 0) (3, 9) NAME 'pass' (3, 9) (3, 13) - DEDENT '' (3, 14) (3, 14) - DEDENT '' (3, 14) (3, 14) + DEDENT '' (4, 0) (4, 0) + DEDENT '' (4, 0) (4, 0) """) def test_non_ascii_identifiers(self): @@ -968,7 +968,7 @@ async def foo(): NUMBER '1' (2, 17) (2, 18) OP ':' (2, 18) (2, 19) NAME 'pass' (2, 20) (2, 24) - DEDENT '' (2, 25) (2, 25) + DEDENT '' (3, 0) (3, 0) """) self.check_tokenize('''async def foo(async): await''', """\ @@ -1016,7 +1016,7 @@ async def bar(): pass NAME 'await' (6, 2) (6, 7) OP '=' (6, 8) (6, 9) NUMBER '2' (6, 10) (6, 11) - DEDENT '' (6, 12) (6, 12) + DEDENT '' (7, 0) (7, 0) """) self.check_tokenize('''\ @@ -1054,7 +1054,7 @@ async def bar(): pass NAME 'await' (6, 2) (6, 7) OP '=' (6, 8) (6, 9) NUMBER '2' (6, 10) (6, 11) - DEDENT '' (6, 12) (6, 12) + DEDENT '' (7, 0) (7, 0) """) def test_newline_after_parenthesized_block_with_comment(self): @@ -1174,7 +1174,7 @@ def readline(): # skip the initial 
encoding token and the end tokens tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2] - expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')] self.assertEqual(tokens, expected_tokens, "bytes not decoded with encoding") @@ -2669,7 +2669,8 @@ def generate_source(indents): valid = generate_source(MAXINDENT - 1) tokens = list(_generate_tokens_from_c_tokenizer(valid)) - self.assertEqual(tokens[-1].type, DEDENT) + self.assertEqual(tokens[-2].type, DEDENT) + self.assertEqual(tokens[-1].type, ENDMARKER) compile(valid, "", "exec") invalid = generate_source(MAXINDENT) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 911f0f12f9bb7e8..4895e94d1dfda7b 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -447,13 +447,8 @@ def tokenize(readline): def _tokenize(rl_gen, encoding): source = b"".join(rl_gen).decode(encoding) - token = None for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True): yield token - if token is not None: - last_line, _ = token.start - yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '') - def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst new file mode 100644 index 000000000000000..377e8e76362687a --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst @@ -0,0 +1,3 @@ +Ensure that trailing ``DEDENT`` :class:`tokenize.TokenInfo` objects emitted +by the :mod:`tokenize` module are reported as in Python 3.11. 
Patch by Pablo +Galindo. diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 0023e303b96e836..68606497a5c21e3 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -30,6 +30,7 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t typedef struct { PyObject_HEAD struct tok_state *tok; + int done; } tokenizeriterobject; /*[clinic input] @@ -63,6 +64,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source, if (extra_tokens) { self->tok->tok_extra_tokens = 1; } + self->done = 0; return (PyObject *)self; } @@ -179,8 +181,9 @@ tokenizeriter_next(tokenizeriterobject *it) } goto exit; } - if (type == ERRORTOKEN || type == ENDMARKER) { + if (it->done || type == ERRORTOKEN) { PyErr_SetString(PyExc_StopIteration, "EOF"); + it->done = 1; goto exit; } PyObject *str = NULL;