Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-40612: Fix SyntaxError edge cases in traceback formatting #20072

Merged
merged 8 commits into from
May 15, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Lib/test/test_cmd_line_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,7 @@ def test_syntaxerror_multi_line_fstring(self):
stderr.splitlines()[-3:],
[
b' foo"""',
b' ^',
b' ^',
b'SyntaxError: f-string: empty expression not allowed',
],
)
Expand Down
34 changes: 26 additions & 8 deletions Lib/test/test_traceback.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ def test_caret(self):
SyntaxError)
self.assertIn("^", err[2]) # third line has caret
self.assertEqual(err[2].count('\n'), 1) # and no additional newline
self.assertEqual(err[1].find("+"), err[2].find("^")) # in the right place
self.assertEqual(err[1].find("+") + 1, err[2].find("^")) # in the right place

err = self.get_exception_format(self.syntax_error_with_caret_non_ascii,
SyntaxError)
self.assertIn("^", err[2]) # third line has caret
self.assertEqual(err[2].count('\n'), 1) # and no additional newline
self.assertEqual(err[1].find("+"), err[2].find("^")) # in the right place
self.assertEqual(err[1].find("+") + 1, err[2].find("^")) # in the right place

def test_nocaret(self):
exc = SyntaxError("error", ("x.py", 23, None, "bad syntax"))
Expand All @@ -78,14 +78,13 @@ def test_bad_indentation(self):
self.assertEqual(len(err), 4)
self.assertEqual(err[1].strip(), "print(2)")
self.assertIn("^", err[2])
self.assertEqual(err[1].find(")"), err[2].find("^"))
self.assertEqual(err[1].find(")") + 1, err[2].find("^"))

# No caret for "unexpected indent"
err = self.get_exception_format(self.syntax_error_bad_indentation2,
IndentationError)
self.assertEqual(len(err), 4)
self.assertEqual(len(err), 3)
self.assertEqual(err[1].strip(), "print(2)")
self.assertIn("^", err[2])
self.assertEqual(err[1].find("p"), err[2].find("^"))

def test_base_exception(self):
# Test that exceptions derived from BaseException are formatted right
Expand Down Expand Up @@ -656,7 +655,7 @@ def outer_raise():
self.assertIn('inner_raise() # Marker', blocks[2])
self.check_zero_div(blocks[2])

@support.skip_if_new_parser("Pegen is arguably better here, so no need to fix this")
@unittest.skipIf(support.use_old_parser(), "Pegen is arguably better here, so no need to fix this")
def test_syntax_error_offset_at_eol(self):
# See #10186.
def e():
Expand All @@ -666,7 +665,7 @@ def e():
def e():
exec("x = 5 | 4 |")
msg = self.get_report(e).splitlines()
self.assertEqual(msg[-2], ' ^')
self.assertEqual(msg[-2], ' ^')

def test_message_none(self):
# A message that looks like "None" should not be treated specially
Expand All @@ -679,6 +678,25 @@ def test_message_none(self):
err = self.get_report(Exception(''))
self.assertIn('Exception\n', err)

def test_syntax_error_various_offsets(self):
for offset in range(-5, 10):
for add in [0, 2]:
text = " "*add + "text%d" % offset
expected = [' File "file.py", line 1']
if offset < 1:
expected.append(" %s" % text.lstrip())
elif offset <= 6:
expected.append(" %s" % text.lstrip())
expected.append(" %s^" % (" "*(offset-1)))
else:
expected.append(" %s" % text.lstrip())
expected.append(" %s^" % (" "*5))
expected.append("SyntaxError: msg")
expected.append("")
err = self.get_report(SyntaxError("msg", ("file.py", 1, offset+add, text)))
exp = "\n".join(expected)
self.assertEqual(exp, err)


class PyExcReportingTests(BaseExceptionReportingTests, unittest.TestCase):
#
Expand Down
29 changes: 18 additions & 11 deletions Lib/traceback.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,23 +569,30 @@ def format_exception_only(self):

if not issubclass(self.exc_type, SyntaxError):
yield _format_final_exc_line(stype, self._str)
return
else:
yield from self._format_syntax_error(stype)

# It was a syntax error; show exactly where the problem was found.
def _format_syntax_error(self, stype):
"""Format SyntaxError exceptions (internal helper)."""
# Show exactly where the problem was found.
filename = self.filename or "<string>"
lineno = str(self.lineno) or '?'
yield ' File "{}", line {}\n'.format(filename, lineno)

badline = self.text
offset = self.offset
if badline is not None:
yield ' {}\n'.format(badline.strip())
if offset is not None:
caretspace = badline.rstrip('\n')
offset = min(len(caretspace), offset) - 1
caretspace = caretspace[:offset].lstrip()
text = self.text
if text is not None:
# text = " foo\n"
# rtext = " foo"
# ltext = "foo"
rtext = text.rstrip('\n')
ltext = rtext.lstrip(' \n\f')
spaces = len(rtext) - len(ltext)
yield ' {}\n'.format(ltext)
# Convert 1-based column offset to 0-based index into stripped text
caret = (self.offset or 0) - 1 - spaces
if caret >= 0:
# non-space whitespace (like tabs) must be kept for alignment
caretspace = ((c.isspace() and c or ' ') for c in caretspace)
caretspace = ((c if c.isspace() else ' ') for c in ltext[:caret])
yield ' {}^\n'.format(''.join(caretspace))
msg = self.msg or "<no detail available>"
yield "{}: {}\n".format(stype, msg)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix edge cases in SyntaxError formatting. If the offset is <= 0, no caret is printed.
If the offset is > line length, the caret is printed pointing just after the last character.
64 changes: 43 additions & 21 deletions Python/pythonrun.c
Original file line number Diff line number Diff line change
Expand Up @@ -554,36 +554,58 @@ parse_syntax_error(PyObject *err, PyObject **message, PyObject **filename,
static void
print_error_text(PyObject *f, int offset, PyObject *text_obj)
{
const char *text;
const char *nl;

text = PyUnicode_AsUTF8(text_obj);
/* Convert text to a char pointer; return if error */
const char *text = PyUnicode_AsUTF8(text_obj);
if (text == NULL)
return;

if (offset >= 0) {
if (offset > 0 && (size_t)offset == strlen(text) && text[offset - 1] == '\n')
offset--;
for (;;) {
nl = strchr(text, '\n');
if (nl == NULL || nl-text >= offset)
break;
offset -= (int)(nl+1-text);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just FYI, this code right here is the one that I mentioned in we-like-parsers#121 (comment).

It is what makes a SyntaxError with an offset relative to the start of the file end up pointing to the right place. I'm tempted to say we should eventually just remove it, since the new parser will always provide line-relative offsets.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm... Why would the offset in the SyntaxError object ever end up being file-relative? Do you know of any code that produces such SyntaxErrors?

Copy link
Member

@ammaraskar ammaraskar May 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the old parser it was this code:

cpython/Parser/parsetok.c

Lines 425 to 431 in 4a12d12

err_ret->offset = col_offset != -1 ? col_offset + 1 : ((int)(tok->cur - tok->buf));
len = tok->inp - tok->buf;
err_ret->text = (char *) PyObject_MALLOC(len + 1);
if (err_ret->text != NULL) {
if (len > 0)
strncpy(err_ret->text, tok->buf, len);
err_ret->text[len] = '\0';

tok->cur is the current read index of the tokenizer and tok->buf is the start of the file. Also note how it copies the entire file up to the point of the error into the SyntaxError.text field.

Try this out with the old parser as a quick example:

code = """\
a = \\
    \\
    \\?"""

try:
  compile(code, '<stdin>', 'exec')
except SyntaxError as e:
  print(e)
  print(e.lineno, e.offset)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, you mean the opposite of file. :-) It occurs when it's not read from a file.

Also it doesn't occur all the time -- perhaps only when there's a continuation line? E.g. here all is good:

>>> try: compile("def f():\n 1+\n", "", "exec")
... except SyntaxError as e: e
... 
SyntaxError('invalid syntax', ('', 2, 4, ' 1+\n'))
>>> 

Copy link
Member Author

@gvanrossum gvanrossum May 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, your example does show up weird in the old parser:

>>> code = """\
a = \\
    \\
    \\?"""
... ... ... >>> code
'a = \\\n    \\\n    \\?'
>>> try: compile(code, "", "exec")
... except SyntaxError as e: e
... 
SyntaxError('unexpected character after line continuation character', ('', 3, 19, 'a = \\\n    \\\n    \\?\n'))
>>> 

The new parser seems to solve the dilemma by suppressing the source text (also the offset is set to zero, meaning unknown):

>>> code = """\
a = \\
    \\
    \\?"""
... ... ... >>> 
>>> try: compile(code, "", "exec")
... except Exception as e: e; e.lineno, e.offset, e.text
... 
SyntaxError('unexpected character after line continuation character')
(3, 0, None)
>>> 

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aah yes, it seems like it's just an issue with line continuations. I thought I had another example but it must have been a misunderstanding because tok->buf gets advanced with newlines.

So I guess this code here in pythonrun.c is just for this one case?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, so I have a serious question here. Is there a regression due to this PR for people using -X oldparser? I have tried to research this and cannot find a regression. The C code in pythonrun.c still skips through newlines. The traceback.py code doesn't, but it never did.

Here is what I did for research. First Python 3.8:

>>> import traceback; traceback.print_exception(None, SyntaxError("msg", ("f.py", 3, 10, "aaa\nbbb\nccc\n")), None)
  File "f.py", line 3
    aaa
bbb
ccc
       
   
 ^
SyntaxError: msg
>>> raise SyntaxError("msg", ("f.py", 3, 10, "aaa\nbbb\nccc\n"))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "f.py", line 3
    ccc
     ^
SyntaxError: msg
>>> 

Then the master branch:

>>> import traceback; traceback.print_exception(None, SyntaxError("msg", ("f.py", 3, 10, "aaa\nbbb\nccc\n")), None)
  File "f.py", line 3
    aaa
bbb
ccc
       
   
 ^
SyntaxError: msg
>>> raise SyntaxError("msg", ("f.py", 3, 10, "aaa\nbbb\nccc\n"))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "f.py", line 3
    ccc
     ^
SyntaxError: msg
>>> 

To me it looks like, in both versions, the C formatter (invoked by raise) does the right thing, while traceback.py messes up the output.

text = nl+1;
}
while (*text == ' ' || *text == '\t' || *text == '\f') {
text++;
offset--;
}
/* Convert offset from 1-based to 0-based */
offset--;

/* Strip leading whitespace from text, adjusting offset as we go */
while (*text == ' ' || *text == '\t' || *text == '\f') {
text++;
offset--;
}

/* Calculate text length excluding trailing newline */
Py_ssize_t len = strlen(text);
if (len > 0 && text[len-1] == '\n')
len--;
gvanrossum marked this conversation as resolved.
Show resolved Hide resolved

/* Clip offset to at most len */
if (offset > len)
offset = len;
gvanrossum marked this conversation as resolved.
Show resolved Hide resolved

/* Skip past newlines embedded in text */
for (;;) {
const char *nl = strchr(text, '\n');
if (nl == NULL)
break;
gvanrossum marked this conversation as resolved.
Show resolved Hide resolved
Py_ssize_t inl = nl - text;
if (inl >= (Py_ssize_t)offset)
break;
gvanrossum marked this conversation as resolved.
Show resolved Hide resolved
inl += 1;
text += inl;
len -= inl;
offset -= (int)inl;
}

/* Print text */
PyFile_WriteString(" ", f);
PyFile_WriteString(text, f);
if (*text == '\0' || text[strlen(text)-1] != '\n')

/* Make sure there's a newline at the end */
if (text[len] != '\n')
PyFile_WriteString("\n", f);
gvanrossum marked this conversation as resolved.
Show resolved Hide resolved
if (offset == -1)

/* Don't print caret if it points to the left of the text */
if (offset < 0)
return;

/* Write caret line */
PyFile_WriteString(" ", f);
while (--offset > 0)
while (--offset >= 0)
PyFile_WriteString(" ", f);
gvanrossum marked this conversation as resolved.
Show resolved Hide resolved
PyFile_WriteString("^\n", f);
}
Expand Down