diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 251ce2b864a9d8c..c2be99f00c557a1 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -82,7 +82,7 @@ def test_basic(self):
     NAME 'False' (4, 11) (4, 16)
     COMMENT '# NEWLINE' (4, 17) (4, 26)
     NEWLINE '\\n' (4, 26) (4, 27)
-    DEDENT '' (4, 27) (4, 27)
+    DEDENT '' (5, 0) (5, 0)
     """)
         indent_error_file = b"""\
 def k(x):
@@ -755,8 +755,8 @@ def test_tabs(self):
     NEWLINE '\\n' (2, 5) (2, 6)
     INDENT ' \\t' (3, 0) (3, 9)
     NAME 'pass' (3, 9) (3, 13)
-    DEDENT '' (3, 14) (3, 14)
-    DEDENT '' (3, 14) (3, 14)
+    DEDENT '' (4, 0) (4, 0)
+    DEDENT '' (4, 0) (4, 0)
     """)

     def test_non_ascii_identifiers(self):
@@ -968,7 +968,7 @@ async def foo():
     NUMBER '1' (2, 17) (2, 18)
     OP ':' (2, 18) (2, 19)
     NAME 'pass' (2, 20) (2, 24)
-    DEDENT '' (2, 25) (2, 25)
+    DEDENT '' (3, 0) (3, 0)
     """)

         self.check_tokenize('''async def foo(async): await''', """\
@@ -1016,7 +1016,7 @@ async def bar(): pass
     NAME 'await' (6, 2) (6, 7)
     OP '=' (6, 8) (6, 9)
     NUMBER '2' (6, 10) (6, 11)
-    DEDENT '' (6, 12) (6, 12)
+    DEDENT '' (7, 0) (7, 0)
     """)

         self.check_tokenize('''\
@@ -1054,7 +1054,7 @@ async def bar(): pass
     NAME 'await' (6, 2) (6, 7)
     OP '=' (6, 8) (6, 9)
     NUMBER '2' (6, 10) (6, 11)
-    DEDENT '' (6, 12) (6, 12)
+    DEDENT '' (7, 0) (7, 0)
     """)

     def test_newline_after_parenthesized_block_with_comment(self):
@@ -1174,7 +1174,7 @@ def readline():

         # skip the initial encoding token and the end tokens
         tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
-        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")

@@ -2669,7 +2669,8 @@ def generate_source(indents):

         valid = generate_source(MAXINDENT - 1)
         tokens = list(_generate_tokens_from_c_tokenizer(valid))
-        self.assertEqual(tokens[-1].type, DEDENT)
+        self.assertEqual(tokens[-2].type, DEDENT)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "", "exec")

         invalid = generate_source(MAXINDENT)
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 911f0f12f9bb7e8..4895e94d1dfda7b 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -447,13 +447,8 @@ def tokenize(readline):

 def _tokenize(rl_gen, encoding):
     source = b"".join(rl_gen).decode(encoding)
-    token = None
     for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
         yield token
-    if token is not None:
-        last_line, _ = token.start
-        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
-

 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst
new file mode 100644
index 000000000000000..377e8e76362687a
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst
@@ -0,0 +1,3 @@
+Ensure that trailing ``DEDENT`` :class:`tokenize.TokenInfo` objects emitted
+by the :mod:`tokenize` module are reported as in Python 3.11. Patch by Pablo
+Galindo
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 0023e303b96e836..ef29091f1c78245 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -30,6 +30,7 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t
 typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
+    int done;
 } tokenizeriterobject;

 /*[clinic input]
@@ -63,6 +64,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (extra_tokens) {
         self->tok->tok_extra_tokens = 1;
     }
+    self->done = 0;
     return (PyObject *)self;
 }

@@ -179,8 +181,9 @@ tokenizeriter_next(tokenizeriterobject *it)
         }
         goto exit;
     }
-    if (type == ERRORTOKEN || type == ENDMARKER) {
+    if (it->done || type == ERRORTOKEN) {
         PyErr_SetString(PyExc_StopIteration, "EOF");
+        it->done = 1;
         goto exit;
     }
     PyObject *str = NULL;
@@ -194,15 +197,14 @@ tokenizeriter_next(tokenizeriterobject *it)
         goto exit;
     }

-    Py_ssize_t size = it->tok->inp - it->tok->buf;
-    assert(it->tok->buf[size-1] == '\n');
-    size -= 1; // Remove the newline character from the end of the line
-    PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+    Py_ssize_t size = it->tok->inp - line_start;
+    PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
     if (line == NULL) {
         Py_DECREF(str);
         goto exit;
     }
-    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
     Py_ssize_t col_offset = -1;
@@ -215,6 +217,10 @@ tokenizeriter_next(tokenizeriterobject *it)
     }

     if (it->tok->tok_extra_tokens) {
+        if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+            lineno = end_lineno = lineno + 1;
+            col_offset = end_col_offset = 0;
+        }
         // Necessary adjustments to match the original Python tokenize
         // implementation
         if (type > DEDENT && type < OP) {
@@ -232,6 +238,9 @@ tokenizeriter_next(tokenizeriterobject *it)
     result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
+    if (type == ENDMARKER) {
+        it->done = 1;
+    }
     return result;
 }
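
A minimal sketch (not part of the patch) of the behaviour the updated tests expect: with this change applied, the trailing DEDENT token and the ENDMARKER reported by tokenize.tokenize() start and end at column 0 of the line after the last source line, matching Python 3.11.

    # Illustration only; assumes a CPython build that includes this patch.
    import io
    import tokenize

    source = b"if False:\n    x = 1  # NEWLINE\n"   # two physical source lines
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))

    # The last two tokens are the trailing DEDENT and the ENDMARKER.
    for tok in tokens[-2:]:
        print(tokenize.tok_name[tok.type], tok.start, tok.end)

    # Expected output, matching the updated expectations in test_tokenize.py:
    #   DEDENT (3, 0) (3, 0)
    #   ENDMARKER (3, 0) (3, 0)

Before this patch, Lib/tokenize.py synthesized the ENDMARKER itself (the removed block in _tokenize) and the C iterator reported trailing DEDENT tokens at the end of the last line; now the C tokenizer iterator emits both with the adjusted positions and stops via the new done flag.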