diff --git a/CHANGES/7663.feature b/CHANGES/7663.feature new file mode 100644 index 00000000000..509a7ad7e2a --- /dev/null +++ b/CHANGES/7663.feature @@ -0,0 +1 @@ +Updated Python parser to comply with latest HTTP specs and allow lax response parsing -- by :user:`Dreamsorcerer` diff --git a/Makefile b/Makefile index 103de1ec745..9bfea9122bc 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ aiohttp/_find_header.c: $(call to-hash,aiohttp/hdrs.py ./tools/gen.py) # _find_headers generator creates _headers.pyi as well aiohttp/%.c: aiohttp/%.pyx $(call to-hash,$(CYS)) aiohttp/_find_header.c - cython -3 -o $@ $< -I aiohttp + cython -3 -o $@ $< -I aiohttp -Werror vendor/llhttp/node_modules: vendor/llhttp/package.json cd vendor/llhttp; npm install diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py index 6d4261e337a..24be6a28bdd 100644 --- a/aiohttp/http_parser.py +++ b/aiohttp/http_parser.py @@ -5,9 +5,12 @@ from contextlib import suppress from enum import IntEnum from typing import ( + Any, + ClassVar, Final, Generic, List, + Literal, NamedTuple, Optional, Pattern, @@ -24,7 +27,7 @@ from . 
import hdrs from .base_protocol import BaseProtocol from .compression_utils import HAS_BROTLI, BrotliDecompressor, ZLibDecompressor -from .helpers import NO_EXTENSIONS, BaseTimerContext +from .helpers import DEBUG, NO_EXTENSIONS, BaseTimerContext from .http_exceptions import ( BadHttpMessage, BadStatusLine, @@ -48,6 +51,8 @@ "RawResponseMessage", ) +_SEP = Literal[b"\r\n", b"\n"] + ASCIISET: Final[Set[str]] = set(string.printable) # See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview @@ -60,6 +65,7 @@ METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+") VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)") HDRRE: Final[Pattern[bytes]] = re.compile(rb"[\x00-\x1F\x7F()<>@,;:\[\]={} \t\"\\]") +HEXDIGIT = re.compile(rb"[0-9a-fA-F]+") class RawRequestMessage(NamedTuple): @@ -206,6 +212,8 @@ def parse_headers( class HttpParser(abc.ABC, Generic[_MsgT]): + lax: ClassVar[bool] = False + def __init__( self, protocol: BaseProtocol, @@ -266,7 +274,7 @@ def feed_eof(self) -> Optional[_MsgT]: def feed_data( self, data: bytes, - SEP: bytes = b"\r\n", + SEP: _SEP = b"\r\n", EMPTY: bytes = b"", CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH, METH_CONNECT: str = hdrs.METH_CONNECT, @@ -288,13 +296,16 @@ def feed_data( pos = data.find(SEP, start_pos) # consume \r\n if pos == start_pos and not self._lines: - start_pos = pos + 2 + start_pos = pos + len(SEP) continue if pos >= start_pos: # line found - self._lines.append(data[start_pos:pos]) - start_pos = pos + 2 + line = data[start_pos:pos] + if SEP == b"\n": # For lax response parsing + line = line.rstrip(b"\r") + self._lines.append(line) + start_pos = pos + len(SEP) # \r\n\r\n found if self._lines[-1] == EMPTY: @@ -311,7 +322,7 @@ def get_content_length() -> Optional[int]: # Shouldn't allow +/- or other number formats. 
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2 - if not length_hdr.strip(" \t").isdigit(): + if not length_hdr.strip(" \t").isdecimal(): raise InvalidHeader(CONTENT_LENGTH) return int(length_hdr) @@ -348,6 +359,7 @@ def get_content_length() -> Optional[int]: readall=self.readall, response_with_body=self.response_with_body, auto_decompress=self._auto_decompress, + lax=self.lax, ) if not payload_parser.done: self._payload_parser = payload_parser @@ -366,6 +378,7 @@ def get_content_length() -> Optional[int]: compression=msg.compression, readall=True, auto_decompress=self._auto_decompress, + lax=self.lax, ) else: if ( @@ -389,6 +402,7 @@ def get_content_length() -> Optional[int]: readall=True, response_with_body=self.response_with_body, auto_decompress=self._auto_decompress, + lax=self.lax, ) if not payload_parser.done: self._payload_parser = payload_parser @@ -411,7 +425,7 @@ def get_content_length() -> Optional[int]: assert not self._lines assert self._payload_parser is not None try: - eof, data = self._payload_parser.feed_data(data[start_pos:]) + eof, data = self._payload_parser.feed_data(data[start_pos:], SEP) except BaseException as exc: if self.payload_exception is not None: self._payload_parser.payload.set_exception( @@ -456,12 +470,21 @@ def parse_headers( # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-6 # https://www.rfc-editor.org/rfc/rfc9110.html#name-collected-abnf - singletons = (hdrs.CONTENT_LENGTH, hdrs.CONTENT_LOCATION, hdrs.CONTENT_RANGE, - hdrs.CONTENT_TYPE, hdrs.ETAG, hdrs.HOST, hdrs.MAX_FORWARDS, - hdrs.SERVER, hdrs.TRANSFER_ENCODING, hdrs.USER_AGENT) + singletons = ( + hdrs.CONTENT_LENGTH, + hdrs.CONTENT_LOCATION, + hdrs.CONTENT_RANGE, + hdrs.CONTENT_TYPE, + hdrs.ETAG, + hdrs.HOST, + hdrs.MAX_FORWARDS, + hdrs.SERVER, + hdrs.TRANSFER_ENCODING, + hdrs.USER_AGENT, + ) bad_hdr = next((h for h in singletons if len(headers.getall(h, ())) > 1), None) if bad_hdr is not None: - raise BadHttpMessage("Duplicate '{}' header 
found.".format(bad_hdr)) + raise BadHttpMessage(f"Duplicate '{bad_hdr}' header found.") # keep-alive conn = headers.get(hdrs.CONNECTION) @@ -597,6 +620,20 @@ class HttpResponseParser(HttpParser[RawResponseMessage]): Returns RawResponseMessage. """ + # Lax mode should only be enabled on response parser. + lax = not DEBUG + + def feed_data( + self, + data: bytes, + SEP: Optional[_SEP] = None, + *args: Any, + **kwargs: Any, + ) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]: + if SEP is None: + SEP = b"\r\n" if DEBUG else b"\n" + return super().feed_data(data, SEP, *args, **kwargs) + def parse_message(self, lines: List[bytes]) -> RawResponseMessage: line = lines[0].decode("utf-8", "surrogateescape") try: @@ -621,7 +658,7 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage: version_o = HttpVersion(int(match.group(1)), int(match.group(2))) # The status code is a three-digit number - if len(status) != 3 or not status.isdigit(): + if len(status) != 3 or not status.isdecimal(): raise BadStatusLine(line) status_i = int(status) @@ -663,6 +700,7 @@ def __init__( readall: bool = False, response_with_body: bool = True, auto_decompress: bool = True, + lax: bool = False, ) -> None: self._length = 0 self._type = ParseState.PARSE_NONE @@ -670,6 +708,7 @@ def __init__( self._chunk_size = 0 self._chunk_tail = b"" self._auto_decompress = auto_decompress + self._lax = lax self.done = False # payload decompression wrapper @@ -721,7 +760,7 @@ def feed_eof(self) -> None: ) def feed_data( - self, chunk: bytes, SEP: bytes = b"\r\n", CHUNK_EXT: bytes = b";" + self, chunk: bytes, SEP: _SEP = b"\r\n", CHUNK_EXT: bytes = b";" ) -> Tuple[bool, bytes]: # Read specified amount of bytes if self._type == ParseState.PARSE_LENGTH: @@ -757,7 +796,10 @@ def feed_data( else: size_b = chunk[:pos] - if not size_b.isdigit(): + if self._lax: # Allow whitespace in lax mode. 
+ size_b = size_b.strip() + + if not re.fullmatch(HEXDIGIT, size_b): exc = TransferEncodingError( chunk[:pos].decode("ascii", "surrogateescape") ) @@ -765,9 +807,11 @@ def feed_data( raise exc size = int(bytes(size_b), 16) - chunk = chunk[pos + 2 :] + chunk = chunk[pos + len(SEP) :] if size == 0: # eof marker self._chunk = ChunkState.PARSE_MAYBE_TRAILERS + if self._lax and chunk.startswith(b"\r"): + chunk = chunk[1:] else: self._chunk = ChunkState.PARSE_CHUNKED_CHUNK self._chunk_size = size @@ -789,13 +833,15 @@ def feed_data( self._chunk_size = 0 self.payload.feed_data(chunk[:required], required) chunk = chunk[required:] + if self._lax and chunk.startswith(b"\r"): + chunk = chunk[1:] self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF self.payload.end_http_chunk_receiving() # toss the CRLF at the end of the chunk if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF: - if chunk[:2] == SEP: - chunk = chunk[2:] + if chunk[: len(SEP)] == SEP: + chunk = chunk[len(SEP) :] self._chunk = ChunkState.PARSE_CHUNKED_SIZE else: self._chunk_tail = chunk @@ -805,11 +851,11 @@ def feed_data( # we should get another \r\n otherwise # trailers needs to be skiped until \r\n\r\n if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS: - head = chunk[:2] + head = chunk[: len(SEP)] if head == SEP: # end of stream self.payload.feed_eof() - return True, chunk[2:] + return True, chunk[len(SEP) :] # Both CR and LF, or only LF may not be received yet. It is # expected that CRLF or LF will be shown at the very first # byte next time, otherwise trailers should come. 
The last @@ -827,7 +873,7 @@ def feed_data( if self._chunk == ChunkState.PARSE_TRAILERS: pos = chunk.find(SEP) if pos >= 0: - chunk = chunk[pos + 2 :] + chunk = chunk[pos + len(SEP) :] self._chunk = ChunkState.PARSE_MAYBE_TRAILERS else: self._chunk_tail = chunk diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index 89d936211da..bfdd10389c8 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -161,6 +161,83 @@ def test_invalid_linebreak(loop: Any, protocol: Any, request: Any) -> None: parser.feed_data(text) +def test_cve_2023_37276(parser: Any) -> None: + text = b"""POST / HTTP/1.1\r\nHost: localhost:8080\r\nX-Abc: \rxTransfer-Encoding: chunked\r\n\r\n""" + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + +@pytest.mark.parametrize( + "hdr", + ( + "Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length + "Content-Length: +256", + "Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5 + "Bar: abc\ndef", + "Baz: abc\x00def", + "Foo : bar", # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2 + "Foo\t: bar", + ), +) +def test_bad_headers(parser: Any, hdr: str) -> None: + text = f"POST / HTTP/1.1\r\n{hdr}\r\n\r\n".encode() + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + +def test_content_length_transfer_encoding(parser: Any) -> None: + text = ( + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\nTransfer-Encoding: a\r\n\r\n" + + b"apple\r\n" + ) + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + +def test_bad_chunked_py(loop: Any, protocol: Any) -> None: + """Test that invalid chunked encoding doesn't allow content-length to be used.""" + parser = HttpRequestParserPy( + protocol, + loop, + 2**16, + max_line_size=8190, + max_field_size=8190, + ) + text = ( + b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n" + + b"GET / HTTP/1.1\r\nHost: 
a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n" + ) + messages, upgrade, tail = parser.feed_data(text) + assert isinstance(messages[0][1].exception(), http_exceptions.TransferEncodingError) + + +@pytest.mark.skipif( + "HttpRequestParserC" not in dir(aiohttp.http_parser), + reason="C based HTTP parser not available", +) +def test_bad_chunked_c(loop: Any, protocol: Any) -> None: + """C parser behaves differently. Maybe we should align them later.""" + parser = HttpRequestParserC( + protocol, + loop, + 2**16, + max_line_size=8190, + max_field_size=8190, + ) + text = ( + b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n" + + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n" + ) + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + +def test_whitespace_before_header(parser: Any) -> None: + text = b"GET / HTTP/1.1\r\n\tContent-Length: 1\r\n\r\nX" + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + def test_parse_headers_longline(parser: Any) -> None: invalid_unicode_byte = b"\xd9" header_name = b"Test" + invalid_unicode_byte + b"Header" + b"A" * 8192 @@ -475,71 +552,6 @@ def test_invalid_name(parser: Any) -> None: parser.feed_data(text) -def test_cve_2023_37276(parser: Any) -> None: - text = b"""POST / HTTP/1.1\r\nHost: localhost:8080\r\nX-Abc: \rxTransfer-Encoding: chunked\r\n\r\n""" - with pytest.raises(http_exceptions.BadHttpMessage): - parser.feed_data(text) - - -@pytest.mark.parametrize( - "hdr", - ( - "Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length - "Content-Length: +256", - "Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5 - "Bar: abc\ndef", - "Baz: abc\x00def", - "Foo : bar", # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2 - "Foo\t: bar", - ) -) -def test_bad_headers(parser: Any, hdr: str) -> None: - text = "POST / HTTP/1.1\r\n{}\r\n\r\n".format(hdr).encode() - with 
pytest.raises(http_exceptions.InvalidHeader): - parser.feed_data(text) - - -def test_bad_chunked_py(loop: Any, protocol: Any) -> None: - """Test that invalid chunked encoding doesn't allow content-length to be used.""" - parser = HttpRequestParserPy( - protocol, - loop, - 2**16, - max_line_size=8190, - max_field_size=8190, - ) - text = (b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n" - + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n") - messages, upgrade, tail = parser.feed_data(text) - assert isinstance(messages[0][1].exception(), http_exceptions.TransferEncodingError) - - -@pytest.mark.skipif( - "HttpRequestParserC" not in dir(aiohttp.http_parser), - reason="C based HTTP parser not available", -) -def test_bad_chunked_c(loop: Any, protocol: Any) -> None: - """C parser behaves differently. Maybe we should align them later.""" - payload = b"GET1 /test HTTP/1.1\r\n\r\n" - parser = HttpRequestParserC( - protocol, - loop, - 2**16, - max_line_size=8190, - max_field_size=8190, - ) - text = (b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n" - + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n") - with pytest.raises(http_exceptions.BadHttpMessage): - parser.feed_data(text) - - -def test_whitespace_before_header(parser: Any) -> None: - text = b"GET / HTTP/1.1\r\n\tContent-Length: 1\r\n\r\nX" - with pytest.raises(http_exceptions.BadHttpMessage): - parser.feed_data(text) - - @pytest.mark.parametrize("size", [40960, 8191]) def test_max_header_field_size(parser: Any, size: Any) -> None: name = b"t" * size @@ -825,6 +837,66 @@ def test_http_response_parser_strict_headers(response: Any) -> None: response.feed_data(b"HTTP/1.1 200 test\r\nFoo: abc\x01def\r\n\r\n") +def test_http_response_parser_bad_crlf(response: Any) -> None: + """Still a lot of dodgy servers sending bad requests like this.""" + messages, upgrade, tail = response.feed_data( + b"HTTP/1.0 200 OK\nFoo: abc\nBar: 
def\n\nBODY\n" + ) + msg = messages[0][0] + + assert msg.headers["Foo"] == "abc" + assert msg.headers["Bar"] == "def" + + +async def test_http_response_parser_bad_chunked_lax(response: Any) -> None: + text = ( + b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n5 \r\nabcde\r\n0\r\n\r\n" + ) + messages, upgrade, tail = response.feed_data(text) + + assert await messages[0][1].read(5) == b"abcde" + + +@pytest.mark.dev_mode +async def test_http_response_parser_bad_chunked_strict_py( + loop: Any, protocol: Any +) -> None: + response = HttpResponseParserPy( + protocol, + loop, + 2**16, + max_line_size=8190, + max_field_size=8190, + ) + text = ( + b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n5 \r\nabcde\r\n0\r\n\r\n" + ) + messages, upgrade, tail = response.feed_data(text) + assert isinstance(messages[0][1].exception(), http_exceptions.TransferEncodingError) + + +@pytest.mark.dev_mode +@pytest.mark.skipif( + "HttpRequestParserC" not in dir(aiohttp.http_parser), + reason="C based HTTP parser not available", +) +async def test_http_response_parser_bad_chunked_strict_c( + loop: Any, protocol: Any +) -> None: + response = HttpResponseParserC( + protocol, + loop, + 2**16, + max_line_size=8190, + max_field_size=8190, + ) + text = ( + b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n5 \r\nabcde\r\n0\r\n\r\n" + ) + with pytest.raises(http_exceptions.BadHttpMessage): + response.feed_data(text) + + def test_http_response_parser_bad(response: Any) -> None: with pytest.raises(http_exceptions.BadHttpMessage): response.feed_data(b"HTT/1\r\n\r\n")