From 709d7e1130e7c8b706dee49d45c172be2faab0e6 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 6 Sep 2022 16:19:53 -0400 Subject: [PATCH 1/3] gh-96611: Fix error message for invalid UTF-8 in mid-multiline string --- Lib/test/test_source_encoding.py | 10 ++++++++++ Parser/tokenizer.c | 2 ++ 2 files changed, 12 insertions(+) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 8e68b4eae33003..8fc93161e8ae81 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -147,6 +147,16 @@ def test_error_from_string(self): self.assertTrue(c.exception.args[0].startswith(expected), msg=c.exception.args[0]) + def test_file_parse_error_multiline(self): + # gh96611: + with open(TESTFN, "wb") as fd: + fd.write(b'print("""\n\xb1""")\n') + + retcode, stdout, stderr = script_helper.assert_python_failure(TESTFN) + + self.assertGreater(retcode, 0) + self.assertIn(b"Non-UTF-8 code starting with '\\xb1'", stderr) + class AbstractSourceEncodingTest: diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index f2606f17d14630..6d08db5ebd5498 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1936,6 +1936,8 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Get rest of string */ while (end_quote_size != quote_size) { c = tok_nextc(tok); + if (tok->done == E_DECODE) + break; if (c == EOF || (quote_size == 1 && c == '\n')) { assert(tok->multi_line_start != NULL); // shift the tok_state's location into From 420d6ff31809e3cbaaeb5d05317b64b5ca9dd7f2 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 6 Sep 2022 16:22:19 -0400 Subject: [PATCH 2/3] Add blurb --- .../2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst b/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst new file mode 100644 index 00000000000000..08bd409bc9f997 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst @@ -0,0 +1,2 @@ +When loading a file with invalid UTF-8 inside a multi-line string, a correct +SyntaxError is emitted. From 80aaa5ee56238e6d6fba4d1050cfb4738a2ced0c Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 6 Sep 2022 18:13:41 -0400 Subject: [PATCH 3/3] Fix MacOS --- Lib/test/test_source_encoding.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 8fc93161e8ae81..feaff4770f755b 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -152,11 +152,13 @@ def test_file_parse_error_multiline(self): with open(TESTFN, "wb") as fd: fd.write(b'print("""\n\xb1""")\n') - retcode, stdout, stderr = script_helper.assert_python_failure(TESTFN) - - self.assertGreater(retcode, 0) - self.assertIn(b"Non-UTF-8 code starting with '\\xb1'", stderr) + try: + retcode, stdout, stderr = script_helper.assert_python_failure(TESTFN) + self.assertGreater(retcode, 0) + self.assertIn(b"Non-UTF-8 code starting with '\\xb1'", stderr) + finally: + os.unlink(TESTFN) class AbstractSourceEncodingTest: