From c5b242f87f31286ad38991bc3868cf4cfbf2b681 Mon Sep 17 00:00:00 2001 From: Ashwin Ramaswami Date: Sat, 31 Aug 2019 10:25:35 -0500 Subject: [PATCH] bpo-37764: Fix infinite loop when parsing unstructured email headers. (GH-15239) Fixes a case in which email._header_value_parser.get_unstructured hangs the system for some invalid headers. This covers the cases in which the header contains either: - a case without trailing whitespace - an invalid encoded word https://bugs.python.org/issue37764 This fix should also be backported to 3.7 and 3.8 https://bugs.python.org/issue37764 --- Lib/email/_header_value_parser.py | 19 ++++++++++++++--- .../test_email/test__header_value_parser.py | 16 ++++++++++++++ Lib/test/test_email/test_email.py | 21 +++++++++++++++++++ Misc/ACKS | 1 + .../2019-08-27-01-13-05.bpo-37764.qv67PQ.rst | 1 + 5 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2019-08-27-01-13-05.bpo-37764.qv67PQ.rst diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index b5003943ab0d97..16c19907d68d59 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -935,6 +935,10 @@ def __str__(self): return '' +class _InvalidEwError(errors.HeaderParseError): + """Invalid encoded word found while parsing headers.""" + + # XXX these need to become classes and used as instances so # that a program can't change them in a parse tree and screw # up other parse trees. Maybe should have tests for that, too. @@ -1039,7 +1043,10 @@ def get_encoded_word(value): raise errors.HeaderParseError( "expected encoded word but found {}".format(value)) remstr = ''.join(remainder) - if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits: + if (len(remstr) > 1 and + remstr[0] in hexdigits and + remstr[1] in hexdigits and + tok.count('?') < 2): # The ? after the CTE was followed by an encoded word escape (=XX). rest, *remainder = remstr.split('?=', 1) tok = tok + '?=' + rest @@ -1051,7 +1058,7 @@ def get_encoded_word(value): try: text, charset, lang, defects = _ew.decode('=?' + tok + '?=') except ValueError: - raise errors.HeaderParseError( + raise _InvalidEwError( "encoded word format invalid: '{}'".format(ew.cte)) ew.charset = charset ew.lang = lang @@ -1101,9 +1108,12 @@ def get_unstructured(value): token, value = get_fws(value) unstructured.append(token) continue + valid_ew = True if value.startswith('=?'): try: token, value = get_encoded_word(value) + except _InvalidEwError: + valid_ew = False except errors.HeaderParseError: # XXX: Need to figure out how to register defects when # appropriate here. @@ -1125,7 +1135,10 @@ def get_unstructured(value): # Split in the middle of an atom if there is a rfc2047 encoded word # which does not have WSP on both sides. The defect will be registered # the next time through the loop. - if rfc2047_matcher.search(tok): + # This needs to only be performed when the encoded word is valid; + # otherwise, performing it on an invalid encoded word can cause + # the parser to go in an infinite loop. + if valid_ew and rfc2047_matcher.search(tok): tok, *remainder = value.partition('=?') vtext = ValueTerminal(tok, 'vtext') _validate_xtext(vtext) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index b3e6b2661524e9..058d902459b602 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -383,6 +383,22 @@ def test_get_unstructured_ew_without_trailing_whitespace(self): [errors.InvalidHeaderDefect], '') + def test_get_unstructured_without_trailing_whitespace_hang_case(self): + self._test_get_x(self._get_unst, + '=?utf-8?q?somevalue?=aa', + 'somevalueaa', + 'somevalueaa', + [errors.InvalidHeaderDefect], + '') + + def test_get_unstructured_invalid_ew(self): + self._test_get_x(self._get_unst, + '=?utf-8?q?=somevalue?=', + '=?utf-8?q?=somevalue?=', + '=?utf-8?q?=somevalue?=', + [], + '') + # get_qp_ctext def test_get_qp_ctext_only(self): diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index ae962584564656..8ec39190ea8da4 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -5381,6 +5381,27 @@ def test_rfc2231_unencoded_then_encoded_segments(self): eq(language, 'en-us') eq(s, 'My Document For You') + def test_should_not_hang_on_invalid_ew_messages(self): + messages = ["""From: user@host.com +To: user@host.com +Bad-Header: + =?us-ascii?Q?LCSwrV11+IB0rSbSker+M9vWR7wEDSuGqmHD89Gt=ea0nJFSaiz4vX3XMJPT4vrE?= + =?us-ascii?Q?xGUZeOnp0o22pLBB7CYLH74Js=wOlK6Tfru2U47qR?= + =?us-ascii?Q?72OfyEY2p2=2FrA9xNFyvH+fBTCmazxwzF8nGkK6D?= + +Hello! +""", """From: ����� �������� +To: "xxx" +Subject: ��� ���������� ����� ����� � ��������� �� ���� +MIME-Version: 1.0 +Content-Type: text/plain; charset="windows-1251"; +Content-Transfer-Encoding: 8bit + +�� ����� � ���� ������ ��� �������� +"""] + for m in messages: + with self.subTest(m=m): + msg = email.message_from_string(m) # Tests to ensure that signed parts of an email are completely preserved, as diff --git a/Misc/ACKS b/Misc/ACKS index e9ae0ed56b0deb..ce8b144900ebc0 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1336,6 +1336,7 @@ Burton Radons Abhilash Raj Shorya Raj Dhushyanth Ramasamy +Ashwin Ramaswami Jeff Ramnani Bayard Randel Varpu Rantala diff --git a/Misc/NEWS.d/next/Security/2019-08-27-01-13-05.bpo-37764.qv67PQ.rst b/Misc/NEWS.d/next/Security/2019-08-27-01-13-05.bpo-37764.qv67PQ.rst new file mode 100644 index 00000000000000..27fa8e192f0c07 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2019-08-27-01-13-05.bpo-37764.qv67PQ.rst @@ -0,0 +1 @@ +Fixes email._header_value_parser.get_unstructured going into an infinite loop for a specific case in which the email header does not have trailing whitespace, and the case in which it contains an invalid encoded word. Patch by Ashwin Ramaswami. \ No newline at end of file