From 9089ece4dade1a3c56ffa452679aab43331cd8e8 Mon Sep 17 00:00:00 2001
From: Pablo Galindo
Date: Mon, 29 May 2023 19:57:08 +0100
Subject: [PATCH] Blech

---
 Lib/test/test_tokenize.py | 85 +++++++++++++++++++++------------------
 Parser/tokenizer.c        | 69 ++++++++++++++++++++++++-------
 Parser/tokenizer.h        |  1 +
 3 files changed, 100 insertions(+), 55 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index c40d227fe1a3fe8..4aab8df4fd8031b 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -2669,43 +2669,44 @@ def test_unicode(self):
 
     def test_invalid_syntax(self):
         def get_tokens(string):
-            return list(_generate_tokens_from_c_tokenizer(string))
-
-        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
-        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
-        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
-
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "\xa0")
-        self.assertRaises(SyntaxError, get_tokens, "€")
-
-        self.assertRaises(SyntaxError, get_tokens, "0b12")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
-        self.assertRaises(SyntaxError, get_tokens, "0b2")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_")
-        self.assertRaises(SyntaxError, get_tokens, "0b")
-        self.assertRaises(SyntaxError, get_tokens, "0o18")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
-        self.assertRaises(SyntaxError, get_tokens, "0o8")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_")
-        self.assertRaises(SyntaxError, get_tokens, "0o")
-        self.assertRaises(SyntaxError, get_tokens, "0x1_")
-        self.assertRaises(SyntaxError, get_tokens, "0x")
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "012")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
-        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
-
-        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
-        self.assertRaises(SyntaxError, get_tokens, "]")
+            the_string = StringIO(string)
+            return list(_generate_tokens_from_c_tokenizer(the_string.readline))
+
+        for case in [
+            "(1+2]",
+            "(1+2}",
+            "{1+2]",
+            "1_",
+            "1.2_",
+            "1e2_",
+            "1e+",
+
+            "\xa0",
+            "€",
+            "0b12",
+            "0b1_2",
+            "0b2",
+            "0b1_",
+            "0b",
+            "0o18",
+            "0o1_8",
+            "0o8",
+            "0o1_",
+            "0o",
+            "0x1_",
+            "0x",
+            "1_",
+            "012",
+            "1.2_",
+            "1e2_",
+            "1e+",
+            "'sdfsdf",
+            "'''sdfsdf''",
+            "("*1000+"a"+")"*1000,
+            "]",
+        ]:
+            with self.subTest(case=case):
+                self.assertRaises(SyntaxError, get_tokens, case)
 
     def test_max_indent(self):
         MAXINDENT = 100
@@ -2716,20 +2717,24 @@ def generate_source(indents):
             return source
 
         valid = generate_source(MAXINDENT - 1)
-        tokens = list(_generate_tokens_from_c_tokenizer(valid))
+        the_input = StringIO(valid)
+        tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
         self.assertEqual(tokens[-2].type, DEDENT)
         self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "<string>", "exec")
 
         invalid = generate_source(MAXINDENT)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
+        the_input = StringIO(invalid)
+        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
 
     def test_continuation_lines_indentation(self):
         def get_tokens(string):
-            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+            the_string = StringIO(string)
+            return [(kind, string) for (kind, string, *_)
+                    in _generate_tokens_from_c_tokenizer(the_string.readline)]
 
         code = dedent("""
             def fib(n):
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 28a335bc85016a2..9605fcb3d8abcce 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -103,6 +103,7 @@ tok_new(void)
     tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
+    tok->readline = NULL;
     tok->type_comments = 0;
     tok->async_hacks = 0;
     tok->async_def = 0;
@@ -542,13 +543,7 @@ tok_readline_recode(struct tok_state *tok) {
     if (line == NULL) {
         line = PyObject_CallNoArgs(tok->decoding_readline);
         if (line == NULL) {
-            if (!PyErr_ExceptionMatches(PyExc_StopIteration)) {
-                error_ret(tok);
-            } else {
-                PyErr_Clear();
-                tok->inp = tok->cur;
-                tok->done = E_EOF;
-            }
+            error_ret(tok);
             goto error;
         }
     }
@@ -575,7 +570,6 @@ tok_readline_recode(struct tok_state *tok) {
         goto error;
     }
     Py_DECREF(line);
-exit:
     return 1;
 error:
     Py_XDECREF(line);
@@ -924,13 +918,11 @@ _PyTokenizer_FromUTF8Readline(PyObject* readline, int exec_input, int preserve_c
     tok->enc = NULL;
     tok->encoding = new_string("utf-8", 5, tok);
     Py_INCREF(readline);
-    tok->decoding_readline = readline;
+    tok->readline = readline;
     tok->decoding_state = STATE_NORMAL;
     return tok;
 }
-
-
 
 /* Set up tokenizer for UTF-8 string */
 
 struct tok_state *
@@ -1000,6 +992,7 @@ _PyTokenizer_Free(struct tok_state *tok)
     }
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->readline);
     Py_XDECREF(tok->filename);
     if (tok->fp != NULL && tok->buf != NULL) {
         PyMem_Free(tok->buf);
@@ -1052,6 +1045,47 @@ tok_readline_raw(struct tok_state *tok)
     return 1;
 }
 
+static int
+tok_readline_string(struct tok_state* tok) {
+    PyObject* line = PyObject_CallNoArgs(tok->readline);
+    if (line == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+            PyErr_Clear();
+            return 1;
+        }
+        error_ret(tok);
+        goto error;
+    }
+    Py_ssize_t buflen;
+    const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
+    if (buf == NULL) {
+        error_ret(tok);
+        goto error;
+    }
+
+    // Make room for the null terminator *and* potentially
+    // an extra newline character that we may need to artificially
+    // add.
+    size_t buffer_size = buflen + 2;
+    if (!tok_reserve_buf(tok, buffer_size)) {
+        goto error;
+    }
+    memcpy(tok->inp, buf, buflen);
+    tok->inp += buflen;
+    *tok->inp = '\0';
+
+    if (tok->start == NULL) {
+        tok->buf = tok->cur;
+    }
+    tok->line_start = tok->cur;
+
+    Py_DECREF(line);
+    return 1;
+error:
+    Py_XDECREF(line);
+    return 0;
+}
+
 static int
 tok_underflow_string(struct tok_state *tok) {
     char *end = strchr(tok->inp, '\n');
@@ -1167,7 +1201,7 @@ tok_underflow_interactive(struct tok_state *tok) {
 }
 
 static int
-tok_underflow_file(struct tok_state *tok) {
+tok_underflow_file(struct tok_state *tok, int use_readline) {
     if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
         tok->cur = tok->inp = tok->buf;
    }
@@ -1188,6 +1222,11 @@ tok_underflow_file(struct tok_state *tok) {
             return 0;
         }
     }
+    else if(use_readline) {
+        if (!tok_readline_string(tok)) {
+            return 0;
+        }
+    }
     else {
         /* We want a 'raw' read. */
         if (!tok_readline_raw(tok)) {
@@ -1269,8 +1308,8 @@ tok_nextc(struct tok_state *tok)
         if (tok->done != E_OK) {
             return EOF;
         }
-        if (tok->decoding_readline != NULL) {
-            rc = tok_underflow_file(tok);
+        if (tok->readline) {
+            rc = tok_underflow_file(tok, 1);
         }
         else if (tok->fp == NULL) {
             rc = tok_underflow_string(tok);
@@ -1279,7 +1318,7 @@ tok_nextc(struct tok_state *tok)
             rc = tok_underflow_interactive(tok);
         }
         else {
-            rc = tok_underflow_file(tok);
+            rc = tok_underflow_file(tok, 0);
        }
 #if defined(Py_DEBUG)
         if (tok->debug) {
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index e173584bc9278dd..435442b8019ebaa 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -109,6 +109,7 @@ struct tok_state {
                                      expression (cf. issue 16806) */
     PyObject *decoding_readline; /* open(...).readline */
     PyObject *decoding_buffer;
+    PyObject *readline;
     const char* enc;        /* Encoding for the current str. */
     char* str;              /* Source string being tokenized (if tokenizing from a string)*/
     char* input;            /* Tokenizer's newline translated copy of the string. */
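
The updated tests drive the C tokenizer through a readline callable (StringIO(...).readline) rather than a raw source string. A minimal usage sketch of that interface, assuming the private helper tokenize._generate_tokens_from_c_tokenizer accepts the callable as its first argument, exactly as the tests above use it:

    from io import StringIO
    from tokenize import _generate_tokens_from_c_tokenizer

    source = "a = 1 + 2\n"
    readline = StringIO(source).readline

    # Each call to readline() feeds one line into tok_readline_string();
    # an empty string (or StopIteration, which the C code clears) ends the input.
    for token in _generate_tokens_from_c_tokenizer(readline):
        print(token)

Dispatching on the new use_readline flag in tok_underflow_file() keeps file-backed and readline-backed input on the same underflow path, which is why the tests only need to swap the source string for a readline callable.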