
Commit b3bae11: WIP

pablogsal committed May 29, 2023
1 parent 1668b41

Showing 6 changed files with 55 additions and 25 deletions.
3 changes: 2 additions & 1 deletion Lib/test/test_tokenize.py
@@ -1827,9 +1827,10 @@ class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
+        f = StringIO(s)
         with self.subTest(source=s):
             result = stringify_tokens_from_source(
-                _generate_tokens_from_c_tokenizer(s), s
+                _generate_tokens_from_c_tokenizer(f.readline), s
             )
             self.assertEqual(result, expected.rstrip().splitlines())

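The test change above replaces the plain string argument with a readline-style callable. A minimal usage sketch, assuming the private _generate_tokens_from_c_tokenizer helper in Lib/tokenize.py now accepts any zero-argument callable that returns one source line per call, as the diff indicates:

    # Sketch only: _generate_tokens_from_c_tokenizer is a private helper
    # whose signature may change; per this commit it takes a readline
    # callable instead of the whole source string.
    from io import StringIO
    from tokenize import _generate_tokens_from_c_tokenizer

    source = "x = 1\nprint(x)\n"
    f = StringIO(source)

    # Each call to f.readline returns the next line ('' at EOF).
    for token in _generate_tokens_from_c_tokenizer(f.readline):
        print(token)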
10 changes: 8 additions & 2 deletions Lib/tokenize.py
@@ -446,8 +446,14 @@ def tokenize(readline):
     yield from _tokenize(rl_gen, encoding)

 def _tokenize(rl_gen, encoding):
-    source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+    def gen(rl_gen):
+        while True:
+            try:
+                yield next(rl_gen).decode(encoding)
+            except StopIteration:
+                return
+    g = gen(rl_gen)
+    for token in _generate_tokens_from_c_tokenizer(g.__next__, extra_tokens=True):
         yield token

 def generate_tokens(readline):
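The rewritten _tokenize no longer joins the whole input into a single decoded string; it wraps the bytes-line generator so that each line is decoded on demand and handed to the C tokenizer one call at a time. A standalone sketch of that adapter pattern (make_str_readline is an illustrative name, not part of the patch):

    # Turn an iterator of bytes lines into a zero-argument callable that
    # returns decoded str lines, mirroring the gen() wrapper above.
    # Exhaustion raises StopIteration, which the C tokenizer now treats
    # as end-of-file.
    def make_str_readline(byte_lines, encoding):
        def gen():
            for line in byte_lines:
                yield line.decode(encoding)
        return gen().__next__

    readline = make_str_readline(iter([b"a = 1\n", b"b = 2\n"]), "utf-8")
    print(readline())  # 'a = 1\n'
    print(readline())  # 'b = 2\n'
    # A third call raises StopIteration, signalling EOF to the caller.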
38 changes: 36 additions & 2 deletions Parser/tokenizer.c
@@ -542,7 +542,13 @@ tok_readline_recode(struct tok_state *tok) {
     if (line == NULL) {
         line = PyObject_CallNoArgs(tok->decoding_readline);
         if (line == NULL) {
-            error_ret(tok);
+            if (!PyErr_ExceptionMatches(PyExc_StopIteration)) {
+                error_ret(tok);
+            } else {
+                PyErr_Clear();
+                tok->inp = tok->cur;
+                tok->done = E_EOF;
+            }
             goto error;
         }
     }
@@ -569,6 +575,7 @@
         goto error;
     }
     Py_DECREF(line);
+  exit:
     return 1;
   error:
     Py_XDECREF(line);
@@ -900,6 +907,30 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
     return tok;
 }

+struct tok_state *
+_PyTokenizer_FromUTF8Readline(PyObject* readline, int exec_input, int preserve_crlf)
+{
+    struct tok_state *tok = tok_new();
+    if (tok == NULL)
+        return NULL;
+    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
+        _PyTokenizer_Free(tok);
+        return NULL;
+    }
+    tok->cur = tok->inp = tok->buf;
+    tok->end = tok->buf + BUFSIZ;
+    tok->fp = NULL;
+
+    tok->enc = NULL;
+    tok->encoding = new_string("utf-8", 5, tok);
+    Py_INCREF(readline);
+    tok->decoding_readline = readline;
+    tok->decoding_state = STATE_NORMAL;
+    return tok;
+}
+
+
 /* Set up tokenizer for UTF-8 string */

 struct tok_state *
@@ -1238,7 +1269,10 @@ tok_nextc(struct tok_state *tok)
         if (tok->done != E_OK) {
             return EOF;
         }
-        if (tok->fp == NULL) {
+        if (tok->decoding_readline != NULL) {
+            rc = tok_underflow_file(tok);
+        }
+        else if (tok->fp == NULL) {
             rc = tok_underflow_string(tok);
         }
         else if (tok->prompt != NULL) {
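Two details in this file carry most of the change: tok_readline_recode now distinguishes StopIteration (normal end of input) from genuine errors, and tok_nextc refills its buffer through tok_underflow_file whenever a readline callable is attached, before considering the string and prompt paths. A rough Python model of the new error handling, not the C implementation itself:

    # Python model of tok_readline_recode's new behaviour (simplified;
    # the real code also manages the tok->cur/tok->inp buffer pointers).
    E_OK, E_EOF, E_ERROR = range(3)

    def refill(readline):
        """Fetch one decoded line from the readline callable."""
        try:
            line = readline()
        except StopIteration:   # C: PyErr_Clear(); tok->done = E_EOF;
            return E_EOF, None
        except Exception:       # C: error_ret(tok); goto error;
            return E_ERROR, None
        return E_OK, line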
1 change: 1 addition & 0 deletions Parser/tokenizer.h
@@ -137,6 +137,7 @@ struct tok_state {

 extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromUTF8Readline(PyObject*, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                                const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);
8 changes: 4 additions & 4 deletions Python/Python-tokenize.c
@@ -37,15 +37,15 @@ typedef struct
 @classmethod
 _tokenizer.tokenizeriter.__new__ as tokenizeriter_new
-    source: str
+    source: object
     *
     extra_tokens: bool
 [clinic start generated code]*/

 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *source,
                        int extra_tokens)
-/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
+/*[clinic end generated code: output=f174f61e34b2c306 input=32ddfe6d52575938]*/
 {
     tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
     if (self == NULL) {
@@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (filename == NULL) {
         return NULL;
     }
-    self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
+    self->tok = _PyTokenizer_FromUTF8Readline(source, 1, 1);
     if (self->tok == NULL) {
         Py_DECREF(filename);
         return NULL;
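With tokenizeriter now taking an arbitrary object and handing it to _PyTokenizer_FromUTF8Readline, readline callables drive the pipeline end to end. The public entry points already accept one, so exercising the new path requires nothing unusual; a quick check, assuming the commit leaves the public API untouched:

    import io
    import tokenize

    # tokenize.tokenize() takes a readline callable returning bytes; with
    # this change the lines reach the C tokenizer lazily instead of being
    # joined into one decoded string up front.
    data = b"def f():\n    return 42\n"
    for tok in tokenize.tokenize(io.BytesIO(data).readline):
        print(tok.type, repr(tok.string))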
20 changes: 4 additions & 16 deletions Python/clinic/Python-tokenize.c.h

Some generated files are not rendered by default.
