Skip to content

Commit

Permalink
bpo-21315: Fix parsing of encoded words with missing leading ws. (pyt…
Browse files Browse the repository at this point in the history
…honGH-13425) (python#13846)

* bpo-21315: Fix parsing of encoded words with missing leading ws.

Because of missing leading whitespace, an encoded word would get parsed as
an unstructured token. This patch fixes that by looking for encoded words when
splitting tokens on whitespace.

Missing trailing whitespace after an encoded word now registers a defect
instead.

Original patch suggestion by David R. Murray on bpo-21315.
(cherry picked from commit 66c4f3f)

Co-authored-by: Abhilash Raj <maxking@users.noreply.github.com>
  • Loading branch information
2 people authored and warsaw committed Jun 6, 2019
1 parent 28be388 commit dc20fc4
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 3 deletions.
21 changes: 21 additions & 0 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,18 @@
def quote_string(value):
    """Return ``str(value)`` wrapped in double quotes.

    Backslashes are doubled first, and only then are embedded double
    quotes backslash-escaped, so the backslashes introduced for the
    quotes are not doubled a second time.
    """
    return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'

# Match an RFC 2047 encoded word, which looks like =?utf-8?q?someword?=
# (that is, =?charset?encoding?payload?=).  The payload is matched
# non-greedily, so a match ends at the first "?=" that follows the
# encoding marker.  re.VERBOSE lets the pattern carry the inline comments.
# NOTE(review): re.MULTILINE only changes how ^ and $ match and this
# pattern uses neither, so the flag looks redundant -- confirm before
# removing it.
rfc2047_matcher = re.compile(r'''
=\? # literal =?
[^?]* # charset
\? # literal ?
[qQbB] # literal 'q' or 'b', case insensitive
\? # literal ?
.*? # encoded word
\?= # literal ?=
''', re.VERBOSE | re.MULTILINE)


#
# TokenList and its subclasses
#
Expand Down Expand Up @@ -1050,6 +1062,10 @@ def get_encoded_word(value):
_validate_xtext(vtext)
ew.append(vtext)
text = ''.join(remainder)
# Encoded words should be followed by a WS
if value and value[0] not in WSP:
ew.defects.append(errors.InvalidHeaderDefect(
"missing trailing whitespace after encoded-word"))
return ew, value

def get_unstructured(value):
Expand Down Expand Up @@ -1102,6 +1118,11 @@ def get_unstructured(value):
unstructured.append(token)
continue
tok, *remainder = _wsp_splitter(value, 1)
# Split in the middle of an atom if there is a rfc2047 encoded word
# which does not have WSP on both sides. The defect will be registered
# the next time through the loop.
if rfc2047_matcher.search(tok):
tok, *remainder = value.partition('=?')
vtext = ValueTerminal(tok, 'vtext')
_validate_xtext(vtext)
unstructured.append(vtext)
Expand Down
24 changes: 22 additions & 2 deletions Lib/test/test_email/test__header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def test_get_encoded_word_gets_first_even_if_no_space(self):
'=?us-ascii?q?first?==?utf-8?q?second?=',
'first',
'first',
[],
[errors.InvalidHeaderDefect],
'=?utf-8?q?second?=')

def test_get_encoded_word_sets_extra_attributes(self):
Expand Down Expand Up @@ -361,6 +361,25 @@ def test_get_unstructured_no_whitespace_between_ews(self):
'=?utf-8?q?foo?==?utf-8?q?bar?=',
'foobar',
'foobar',
[errors.InvalidHeaderDefect,
errors.InvalidHeaderDefect],
'')

def test_get_unstructured_ew_without_leading_whitespace(self):
    # The encoded word is glued directly onto the preceding text.  The
    # expected strings show it is still decoded ("somevalue") and joined
    # to that text, with a single InvalidHeaderDefect recorded.
    # NOTE(review): the positional arguments appear to be (parser, input,
    # expected str, expected value, expected defects, expected remainder)
    # -- confirm against _test_get_x.
    self._test_get_x(
        self._get_unst,
        'nowhitespace=?utf-8?q?somevalue?=',
        'nowhitespacesomevalue',
        'nowhitespacesomevalue',
        [errors.InvalidHeaderDefect],
        '')

def test_get_unstructured_ew_without_trailing_whitespace(self):
    # Plain text follows the encoded word with no separating whitespace.
    # The expected strings show the word is decoded ("somevalue") and the
    # trailing text kept, with a single InvalidHeaderDefect recorded.
    # NOTE(review): the positional arguments appear to be (parser, input,
    # expected str, expected value, expected defects, expected remainder)
    # -- confirm against _test_get_x.
    self._test_get_x(
        self._get_unst,
        '=?utf-8?q?somevalue?=nowhitespace',
        'somevaluenowhitespace',
        'somevaluenowhitespace',
        [errors.InvalidHeaderDefect],
        '')

Expand Down Expand Up @@ -546,7 +565,8 @@ def test_encoded_word_inside_quotes(self):
'"=?utf-8?Q?not_really_valid?="',
'"not really valid"',
'not really valid',
[errors.InvalidHeaderDefect],
[errors.InvalidHeaderDefect,
errors.InvalidHeaderDefect],
'')

# get_comment
Expand Down
3 changes: 2 additions & 1 deletion Lib/test/test_email/test_headerregistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):

'rfc2047_atom_in_quoted_string_is_decoded':
('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
[errors.InvalidHeaderDefect],
[errors.InvalidHeaderDefect,
errors.InvalidHeaderDefect],
'Éric <foo@example.com>',
'Éric',
'foo@example.com',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Email headers containing RFC2047 encoded words are now parsed even when the
whitespace before them is missing, and a defect is registered. Missing trailing
whitespace after encoded words is also registered as a defect.

0 comments on commit dc20fc4

Please sign in to comment.