Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-37764: Fix infinite loop when parsing unstructured email headers. #15239

Merged
merged 17 commits into from
Aug 31, 2019
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1039,7 +1039,10 @@ def get_encoded_word(value):
raise errors.HeaderParseError(
"expected encoded word but found {}".format(value))
remstr = ''.join(remainder)
if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
if (len(remstr) > 1 and
remstr[0] in hexdigits and
remstr[1] in hexdigits and
tok.count('?') < 2):
# The ? after the CTE was followed by an encoded word escape (=XX).
rest, *remainder = remstr.split('?=', 1)
tok = tok + '?=' + rest
Expand All @@ -1051,7 +1054,7 @@ def get_encoded_word(value):
try:
text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
except ValueError:
raise errors.HeaderParseError(
raise errors.InvalidEwError(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's move this exception out of errors.py and into this module as _InvalidEwError (and then it also probably doesn't need to inherit from HeaderParseError). It's just an internal implementation exception, so let's not expose it, even if it's not in __all__ or documented.

Copy link
Contributor Author

@epicfaace epicfaace Aug 30, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. I've kept it inheriting from HeaderParseError, though, because there are some tests that rely on a HeaderParseError to be raised (when an _InvalidEwError is actually raised) and it didn't seem right to have a test that tests to see if an internal implementation exception is raised.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree the inheriting from HeaderParseError makes sense because of the way it is used internally as both an implementation detail and an external error.

"encoded word format invalid: '{}'".format(ew.cte))
ew.charset = charset
ew.lang = lang
Expand Down Expand Up @@ -1101,9 +1104,12 @@ def get_unstructured(value):
token, value = get_fws(value)
unstructured.append(token)
continue
valid_ew = True
if value.startswith('=?'):
try:
token, value = get_encoded_word(value)
except errors.InvalidEwError:
valid_ew = False
except errors.HeaderParseError:
# XXX: Need to figure out how to register defects when
# appropriate here.
Expand All @@ -1125,7 +1131,10 @@ def get_unstructured(value):
# Split in the middle of an atom if there is a rfc2047 encoded word
# which does not have WSP on both sides. The defect will be registered
# the next time through the loop.
if rfc2047_matcher.search(tok):
# This needs to only be performed when the encoded word is valid;
# otherwise, performing it on an invalid encoded word can cause
# the parser to go in an infinite loop.
if valid_ew and rfc2047_matcher.search(tok):
tok, *remainder = value.partition('=?')
vtext = ValueTerminal(tok, 'vtext')
_validate_xtext(vtext)
Expand Down
4 changes: 4 additions & 0 deletions Lib/email/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ class HeaderParseError(MessageParseError):
"""Error while parsing headers."""


class InvalidEwError(HeaderParseError):
"""Invalid encoded word found while parsing headers."""


class BoundaryError(MessageParseError):
"""Couldn't find terminating boundary."""

Expand Down
16 changes: 16 additions & 0 deletions Lib/test/test_email/test__header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,22 @@ def test_get_unstructured_ew_without_trailing_whitespace(self):
[errors.InvalidHeaderDefect],
'')

def test_get_unstructured_without_trailing_whitespace_hang_case(self):
self._test_get_x(self._get_unst,
'=?utf-8?q?somevalue?=aa',
'somevalueaa',
'somevalueaa',
[errors.InvalidHeaderDefect],
'')

def test_get_unstructured_invalid_ew(self):
self._test_get_x(self._get_unst,
'=?utf-8?q?=somevalue?=',
'=?utf-8?q?=somevalue?=',
'=?utf-8?q?=somevalue?=',
[],
'')

# get_qp_ctext

def test_get_qp_ctext_only(self):
Expand Down
21 changes: 21 additions & 0 deletions Lib/test/test_email/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -5381,6 +5381,27 @@ def test_rfc2231_unencoded_then_encoded_segments(self):
eq(language, 'en-us')
eq(s, 'My Document For You')

def test_should_not_hang_on_invalid_ew_messages(self):
messages = ["""From: user@host.com
To: user@host.com
Bad-Header:
=?us-ascii?Q?LCSwrV11+IB0rSbSker+M9vWR7wEDSuGqmHD89Gt=ea0nJFSaiz4vX3XMJPT4vrE?=
=?us-ascii?Q?xGUZeOnp0o22pLBB7CYLH74Js=wOlK6Tfru2U47qR?=
=?us-ascii?Q?72OfyEY2p2=2FrA9xNFyvH+fBTCmazxwzF8nGkK6D?=
Hello!
""", """From: ����� �������� <xxx@xxx>
To: "xxx" <xxx@xxx>
Subject: ��� ���������� ����� ����� � ��������� �� ����
MIME-Version: 1.0
Content-Type: text/plain; charset="windows-1251";
Content-Transfer-Encoding: 8bit
�� ����� � ���� ������ ��� ��������
"""]
for m in messages:
with self.subTest(m=m):
msg = email.message_from_string(m)


# Tests to ensure that signed parts of an email are completely preserved, as
Expand Down
1 change: 1 addition & 0 deletions Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,7 @@ Burton Radons
Abhilash Raj
Shorya Raj
Dhushyanth Ramasamy
Ashwin Ramaswami
Jeff Ramnani
Bayard Randel
Varpu Rantala
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixes email._header_value_parser.get_unstructured going into an infinite loop for a specific case in which the email header does not have trailing whitespace, and the case in which it contains an invalid encoded word. Patch by Ashwin Ramaswami.