Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-37764: Fix infinite loop when parsing unstructured email headers. #15239

Merged
merged 17 commits into from
Aug 31, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,10 @@ def __str__(self):
return ''


class _InvalidEwError(errors.HeaderParseError):
"""Invalid encoded word found while parsing headers."""


# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees. Maybe should have tests for that, too.
Expand Down Expand Up @@ -1039,7 +1043,10 @@ def get_encoded_word(value):
raise errors.HeaderParseError(
"expected encoded word but found {}".format(value))
remstr = ''.join(remainder)
if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
if (len(remstr) > 1 and
remstr[0] in hexdigits and
remstr[1] in hexdigits and
tok.count('?') < 2):
# The ? after the CTE was followed by an encoded word escape (=XX).
rest, *remainder = remstr.split('?=', 1)
tok = tok + '?=' + rest
Expand All @@ -1051,7 +1058,7 @@ def get_encoded_word(value):
try:
text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
except ValueError:
raise errors.HeaderParseError(
raise _InvalidEwError(
"encoded word format invalid: '{}'".format(ew.cte))
ew.charset = charset
ew.lang = lang
Expand Down Expand Up @@ -1101,9 +1108,12 @@ def get_unstructured(value):
token, value = get_fws(value)
unstructured.append(token)
continue
valid_ew = True
if value.startswith('=?'):
try:
token, value = get_encoded_word(value)
except _InvalidEwError:
valid_ew = False
except errors.HeaderParseError:
# XXX: Need to figure out how to register defects when
# appropriate here.
Expand All @@ -1125,7 +1135,10 @@ def get_unstructured(value):
# Split in the middle of an atom if there is a rfc2047 encoded word
# which does not have WSP on both sides. The defect will be registered
# the next time through the loop.
if rfc2047_matcher.search(tok):
# This needs to only be performed when the encoded word is valid;
# otherwise, performing it on an invalid encoded word can cause
# the parser to go in an infinite loop.
if valid_ew and rfc2047_matcher.search(tok):
tok, *remainder = value.partition('=?')
vtext = ValueTerminal(tok, 'vtext')
_validate_xtext(vtext)
Expand Down
16 changes: 16 additions & 0 deletions Lib/test/test_email/test__header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,22 @@ def test_get_unstructured_ew_without_trailing_whitespace(self):
[errors.InvalidHeaderDefect],
'')

def test_get_unstructured_without_trailing_whitespace_hang_case(self):
self._test_get_x(self._get_unst,
'=?utf-8?q?somevalue?=aa',
'somevalueaa',
'somevalueaa',
[errors.InvalidHeaderDefect],
'')

def test_get_unstructured_invalid_ew(self):
self._test_get_x(self._get_unst,
'=?utf-8?q?=somevalue?=',
'=?utf-8?q?=somevalue?=',
'=?utf-8?q?=somevalue?=',
[],
'')

# get_qp_ctext

def test_get_qp_ctext_only(self):
Expand Down
21 changes: 21 additions & 0 deletions Lib/test/test_email/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -5381,6 +5381,27 @@ def test_rfc2231_unencoded_then_encoded_segments(self):
eq(language, 'en-us')
eq(s, 'My Document For You')

def test_should_not_hang_on_invalid_ew_messages(self):
messages = ["""From: user@host.com
To: user@host.com
Bad-Header:
=?us-ascii?Q?LCSwrV11+IB0rSbSker+M9vWR7wEDSuGqmHD89Gt=ea0nJFSaiz4vX3XMJPT4vrE?=
=?us-ascii?Q?xGUZeOnp0o22pLBB7CYLH74Js=wOlK6Tfru2U47qR?=
=?us-ascii?Q?72OfyEY2p2=2FrA9xNFyvH+fBTCmazxwzF8nGkK6D?=

Hello!
""", """From: ����� �������� <xxx@xxx>
To: "xxx" <xxx@xxx>
Subject: ��� ���������� ����� ����� � ��������� �� ����
MIME-Version: 1.0
Content-Type: text/plain; charset="windows-1251";
Content-Transfer-Encoding: 8bit

�� ����� � ���� ������ ��� ��������
"""]
for m in messages:
with self.subTest(m=m):
msg = email.message_from_string(m)


# Tests to ensure that signed parts of an email are completely preserved, as
Expand Down
1 change: 1 addition & 0 deletions Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,7 @@ Burton Radons
Abhilash Raj
Shorya Raj
Dhushyanth Ramasamy
Ashwin Ramaswami
Jeff Ramnani
Bayard Randel
Varpu Rantala
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixes email._header_value_parser.get_unstructured going into an infinite loop for a specific case in which the email header does not have trailing whitespace, and the case in which it contains an invalid encoded word. Patch by Ashwin Ramaswami.