diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index cd11dddd0fe51ab..c40d227fe1a3fe8 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1827,9 +1827,10 @@ class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
+        f = StringIO(s)
         with self.subTest(source=s):
             result = stringify_tokens_from_source(
-                _generate_tokens_from_c_tokenizer(s), s
+                _generate_tokens_from_c_tokenizer(f.readline), s
             )
             self.assertEqual(result, expected.rstrip().splitlines())
 
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 4895e94d1dfda7b..c91fa0c3eea8651 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -446,8 +446,14 @@ def tokenize(readline):
     yield from _tokenize(rl_gen, encoding)
 
 def _tokenize(rl_gen, encoding):
-    source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+    def gen(rl_gen):
+        while True:
+            try:
+                yield next(rl_gen).decode(encoding)
+            except StopIteration:
+                return
+    g = gen(rl_gen)
+    for token in _generate_tokens_from_c_tokenizer(g.__next__, extra_tokens=True):
         yield token
 
 def generate_tokens(readline):
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 59c817293fbfcd9..28a335bc85016a2 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -542,7 +542,13 @@ tok_readline_recode(struct tok_state *tok) {
     if (line == NULL) {
         line = PyObject_CallNoArgs(tok->decoding_readline);
         if (line == NULL) {
-            error_ret(tok);
+            if (!PyErr_ExceptionMatches(PyExc_StopIteration)) {
+                error_ret(tok);
+            } else {
+                PyErr_Clear();
+                tok->inp = tok->cur;
+                tok->done = E_EOF;
+            }
             goto error;
         }
     }
@@ -569,6 +575,7 @@ tok_readline_recode(struct tok_state *tok) {
         goto error;
     }
     Py_DECREF(line);
+exit:
     return 1;
 error:
     Py_XDECREF(line);
@@ -900,6 +907,30 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
     return tok;
 }
 
+struct tok_state *
+_PyTokenizer_FromUTF8Readline(PyObject* readline, int exec_input, int preserve_crlf)
+{
+    struct tok_state *tok = tok_new();
+    if (tok == NULL)
+        return NULL;
+    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
+        _PyTokenizer_Free(tok);
+        return NULL;
+    }
+    tok->cur = tok->inp = tok->buf;
+    tok->end = tok->buf + BUFSIZ;
+    tok->fp = NULL;
+
+    tok->enc = NULL;
+    tok->encoding = new_string("utf-8", 5, tok);
+    Py_INCREF(readline);
+    tok->decoding_readline = readline;
+    tok->decoding_state = STATE_NORMAL;
+    return tok;
+}
+
+
+
 /* Set up tokenizer for UTF-8 string */
 
 struct tok_state *
@@ -1238,7 +1269,10 @@ tok_nextc(struct tok_state *tok)
         if (tok->done != E_OK) {
             return EOF;
         }
-        if (tok->fp == NULL) {
+        if (tok->decoding_readline != NULL) {
+            rc = tok_underflow_file(tok);
+        }
+        else if (tok->fp == NULL) {
             rc = tok_underflow_string(tok);
         }
         else if (tok->prompt != NULL) {
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 02749e355da8124..e173584bc9278dd 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -137,6 +137,7 @@ struct tok_state {
 
 extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromUTF8Readline(PyObject*, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                                const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 4eced66b6177085..dc27408f96702df 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -37,15 +37,15 @@ typedef struct
 @classmethod
 _tokenizer.tokenizeriter.__new__ as tokenizeriter_new
 
-    source: str
+    source: object
     *
     extra_tokens: bool
 [clinic start generated code]*/
 
 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *source,
                        int extra_tokens)
-/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
+/*[clinic end generated code: output=f174f61e34b2c306 input=32ddfe6d52575938]*/
 {
     tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
     if (self == NULL) {
@@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (filename == NULL) {
         return NULL;
     }
-    self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
+    self->tok = _PyTokenizer_FromUTF8Readline(source, 1, 1);
     if (self->tok == NULL) {
         Py_DECREF(filename);
         return NULL;
     }
diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h
index 7e779388a92dbf3..25ed3427210a21e 100644
--- a/Python/clinic/Python-tokenize.c.h
+++ b/Python/clinic/Python-tokenize.c.h
@@ -9,7 +9,7 @@ preserve
 
 
 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *source,
                        int extra_tokens);
 
 static PyObject *
@@ -44,26 +44,14 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
     PyObject *argsbuf[2];
     PyObject * const *fastargs;
     Py_ssize_t nargs = PyTuple_GET_SIZE(args);
-    const char *source;
+    PyObject *source;
     int extra_tokens;
 
     fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf);
     if (!fastargs) {
         goto exit;
     }
-    if (!PyUnicode_Check(fastargs[0])) {
-        _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
-        goto exit;
-    }
-    Py_ssize_t source_length;
-    source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
-    if (source == NULL) {
-        goto exit;
-    }
-    if (strlen(source) != (size_t)source_length) {
-        PyErr_SetString(PyExc_ValueError, "embedded null character");
-        goto exit;
-    }
+    source = fastargs[0];
     extra_tokens = PyObject_IsTrue(fastargs[1]);
     if (extra_tokens < 0) {
         goto exit;
@@ -73,4 +61,4 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=71b499214c6a9d7d input=a9049054013a1b77]*/
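
For reference, a minimal usage sketch (not part of the patch) of the Python-level entry points this change touches. The sample source string is invented; the calls simply mirror the Lib/test/test_tokenize.py and Lib/tokenize.py hunks above, where the C tokenizer is now driven by a readline-style callable instead of a pre-joined source string.

    # Illustrative only -- exercises the readline-based paths added by this patch.
    from io import BytesIO, StringIO
    from tokenize import tokenize, _generate_tokens_from_c_tokenizer

    src = "x = 1\n"  # made-up sample input

    # Public API (signature unchanged): tokenize() takes a bytes readline;
    # with this patch each line is decoded and fed to the C tokenizer lazily
    # instead of the whole input being joined up front.
    for tok in tokenize(BytesIO(src.encode("utf-8")).readline):
        print(tok)

    # Private helper (changed here): it now expects a callable returning one
    # str line per call, as in the test_tokenize.py hunk above.
    for tok in _generate_tokens_from_c_tokenizer(StringIO(src).readline):
        print(tok)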