Skip to content

Commit

Permalink
Merge pull request #3387 from bdarnell/chunked-parsing
Browse files Browse the repository at this point in the history
http1connection: Stricter handling of transfer-encoding and whitespace
  • Loading branch information
bdarnell authored Jun 6, 2024
2 parents 7786f09 + 8d721a8 commit d65f6e7
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 22 deletions.
57 changes: 38 additions & 19 deletions tornado/http1connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,14 +391,11 @@ def write_headers(
self._request_start_line = start_line
lines.append(utf8("%s %s HTTP/1.1" % (start_line[0], start_line[1])))
# Client requests with a non-empty body must have either a
# Content-Length or a Transfer-Encoding.
# Content-Length or a Transfer-Encoding. If Content-Length is not
# present we'll add our Transfer-Encoding below.
self._chunking_output = (
start_line.method in ("POST", "PUT", "PATCH")
and "Content-Length" not in headers
and (
"Transfer-Encoding" not in headers
or headers["Transfer-Encoding"] == "chunked"
)
)
else:
assert isinstance(start_line, httputil.ResponseStartLine)
Expand All @@ -420,9 +417,6 @@ def write_headers(
and (start_line.code < 100 or start_line.code >= 200)
# No need to chunk the output if a Content-Length is specified.
and "Content-Length" not in headers
# Applications are discouraged from touching Transfer-Encoding,
# but if they do, leave it alone.
and "Transfer-Encoding" not in headers
)
# If connection to a 1.1 client will be closed, inform client
if (
Expand Down Expand Up @@ -562,7 +556,7 @@ def _can_keep_alive(
return connection_header != "close"
elif (
"Content-Length" in headers
or headers.get("Transfer-Encoding", "").lower() == "chunked"
or is_transfer_encoding_chunked(headers)
or getattr(start_line, "method", None) in ("HEAD", "GET")
):
# start_line may be a request or response start line; only
Expand Down Expand Up @@ -600,13 +594,6 @@ def _read_body(
delegate: httputil.HTTPMessageDelegate,
) -> Optional[Awaitable[None]]:
if "Content-Length" in headers:
if "Transfer-Encoding" in headers:
# Response cannot contain both Content-Length and
# Transfer-Encoding headers.
# http://tools.ietf.org/html/rfc7230#section-3.3.3
raise httputil.HTTPInputError(
"Response with both Transfer-Encoding and Content-Length"
)
if "," in headers["Content-Length"]:
# Proxies sometimes cause Content-Length headers to get
# duplicated. If all the values are identical then we can
Expand All @@ -633,20 +620,22 @@ def _read_body(
else:
content_length = None

is_chunked = is_transfer_encoding_chunked(headers)

if code == 204:
# This response code is not allowed to have a non-empty body,
# and has an implicit length of zero instead of read-until-close.
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.3
if "Transfer-Encoding" in headers or content_length not in (None, 0):
if is_chunked or content_length not in (None, 0):
raise httputil.HTTPInputError(
"Response with code %d should not have body" % code
)
content_length = 0

if is_chunked:
return self._read_chunked_body(delegate)
if content_length is not None:
return self._read_fixed_body(content_length, delegate)
if headers.get("Transfer-Encoding", "").lower() == "chunked":
return self._read_chunked_body(delegate)
if self.is_client:
return self._read_body_until_close(delegate)
return None
Expand Down Expand Up @@ -865,3 +854,33 @@ def parse_hex_int(s: str) -> int:
if HEXDIGITS.fullmatch(s) is None:
raise ValueError("not a hexadecimal integer: %r" % s)
return int(s, 16)


def is_transfer_encoding_chunked(headers: httputil.HTTPHeaders) -> bool:
"""Returns true if the headers specify Transfer-Encoding: chunked.
Raise httputil.HTTPInputError if any other transfer encoding is used.
"""
# Note that transfer-encoding is an area in which postel's law can lead
# us astray. If a proxy and a backend server are liberal in what they accept,
# but accept slightly different things, this can lead to mismatched framing
# and request smuggling issues. Therefore we are as strict as possible here
# (even technically going beyond the requirements of the RFCs: a value of
# ",chunked" is legal but doesn't appear in practice for legitimate traffic)
if "Transfer-Encoding" not in headers:
return False
if "Content-Length" in headers:
# Message cannot contain both Content-Length and
# Transfer-Encoding headers.
# http://tools.ietf.org/html/rfc7230#section-3.3.3
raise httputil.HTTPInputError(
"Message with both Transfer-Encoding and Content-Length"
)
if headers["Transfer-Encoding"].lower() == "chunked":
return True
# We do not support any transfer-encodings other than chunked, and we do not
# expect to add any support because the concept of transfer-encoding has
# been removed in HTTP/2.
raise httputil.HTTPInputError(
"Unsupported Transfer-Encoding %s" % headers["Transfer-Encoding"]
)
7 changes: 5 additions & 2 deletions tornado/httputil.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@
from asyncio import Future # noqa: F401
import unittest # noqa: F401

# To be used with str.strip() and related methods.
HTTP_WHITESPACE = " \t"


@lru_cache(1000)
def _normalize_header(name: str) -> str:
Expand Down Expand Up @@ -171,15 +174,15 @@ def parse_line(self, line: str) -> None:
# continuation of a multi-line header
if self._last_key is None:
raise HTTPInputError("first header line cannot start with whitespace")
new_part = " " + line.lstrip()
new_part = " " + line.lstrip(HTTP_WHITESPACE)
self._as_list[self._last_key][-1] += new_part
self._dict[self._last_key] += new_part
else:
try:
name, value = line.split(":", 1)
except ValueError:
raise HTTPInputError("no colon in header line")
self.add(name, value.strip())
self.add(name, value.strip(HTTP_WHITESPACE))

@classmethod
def parse(cls, headers: str) -> "HTTPHeaders":
Expand Down
70 changes: 70 additions & 0 deletions tornado/test/httpserver_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,76 @@ def test_chunked_request_body_invalid_size(self):
)
self.assertEqual(400, start_line.code)

def test_chunked_request_body_duplicate_header(self):
# Repeated Transfer-Encoding headers should be an error (and not confuse
# the chunked-encoding detection to mess up framing).
self.stream.write(
b"""\
POST /echo HTTP/1.1
Transfer-Encoding: chunked
Transfer-encoding: chunked
2
ok
0
"""
)
with ExpectLog(
gen_log,
".*Unsupported Transfer-Encoding chunked,chunked",
level=logging.INFO,
):
start_line, headers, response = self.io_loop.run_sync(
lambda: read_stream_body(self.stream)
)
self.assertEqual(400, start_line.code)

def test_chunked_request_body_unsupported_transfer_encoding(self):
# We don't support transfer-encodings other than chunked.
self.stream.write(
b"""\
POST /echo HTTP/1.1
Transfer-Encoding: gzip, chunked
2
ok
0
"""
)
with ExpectLog(
gen_log, ".*Unsupported Transfer-Encoding gzip, chunked", level=logging.INFO
):
start_line, headers, response = self.io_loop.run_sync(
lambda: read_stream_body(self.stream)
)
self.assertEqual(400, start_line.code)

def test_chunked_request_body_transfer_encoding_and_content_length(self):
# Transfer-encoding and content-length are mutually exclusive
self.stream.write(
b"""\
POST /echo HTTP/1.1
Transfer-Encoding: chunked
Content-Length: 2
2
ok
0
"""
)
with ExpectLog(
gen_log,
".*Message with both Transfer-Encoding and Content-Length",
level=logging.INFO,
):
start_line, headers, response = self.io_loop.run_sync(
lambda: read_stream_body(self.stream)
)
self.assertEqual(400, start_line.code)

@gen_test
def test_invalid_content_length(self):
# HTTP only allows decimal digits in content-length. Make sure we don't
Expand Down
19 changes: 19 additions & 0 deletions tornado/test/httputil_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,25 @@ def test_unicode_newlines(self):
gen_log.warning("failed while trying %r in %s", newline, encoding)
raise

def test_unicode_whitespace(self):
# Only tabs and spaces are to be stripped according to the HTTP standard.
# Other unicode whitespace is to be left as-is. In the context of headers,
# this specifically means the whitespace characters falling within the
# latin1 charset.
whitespace = [
(" ", True), # SPACE
("\t", True), # TAB
("\u00a0", False), # NON-BREAKING SPACE
("\u0085", False), # NEXT LINE
]
for c, stripped in whitespace:
headers = HTTPHeaders.parse("Transfer-Encoding: %schunked" % c)
if stripped:
expected = [("Transfer-Encoding", "chunked")]
else:
expected = [("Transfer-Encoding", "%schunked" % c)]
self.assertEqual(expected, list(headers.get_all()))

def test_optional_cr(self):
# Both CRLF and LF should be accepted as separators. CR should not be
# part of the data when followed by LF, but it is a normal char
Expand Down
2 changes: 1 addition & 1 deletion tornado/test/simple_httpclient_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,7 +828,7 @@ def test_chunked_with_content_length(self):
with ExpectLog(
gen_log,
(
"Malformed HTTP message from None: Response "
"Malformed HTTP message from None: Message "
"with both Transfer-Encoding and Content-Length"
),
level=logging.INFO,
Expand Down

0 comments on commit d65f6e7

Please sign in to comment.