diff --git a/CHANGES/8074.bugfix.rst b/CHANGES/8074.bugfix.rst new file mode 100644 index 00000000000..16c71445476 --- /dev/null +++ b/CHANGES/8074.bugfix.rst @@ -0,0 +1,5 @@ +Fixed an unhandled exception in the Python HTTP parser on header lines starting with a colon -- by :user:`pajod`. + +Invalid request lines with anything but a dot between the HTTP major and minor version are now rejected. Invalid header field names containing question mark or slash are now rejected. Such requests are incompatible with :rfc:`9110#section-5.6.2` and are not known to be of any legitimate use. + +(BACKWARD INCOMPATIBLE) diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index e94381dcf28..3062bf5811e 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -255,6 +255,7 @@ Pankaj Pandey Parag Jain Pau Freixes Paul Colomiets +Paul J. Dorn Paulius Šileikis Paulus Schoutsen Pavel Kamaev diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py index 85499177701..1877f558308 100644 --- a/aiohttp/http_parser.py +++ b/aiohttp/http_parser.py @@ -69,12 +69,11 @@ # tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / # "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA # token = 1*tchar -METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+") -VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)") -HDRRE: Final[Pattern[bytes]] = re.compile( - rb"[\x00-\x1F\x7F-\xFF()<>@,;:\[\]={} \t\"\\]" -) -HEXDIGIT = re.compile(rb"[0-9a-fA-F]+") +_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~") +TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+") +VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII) +DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII) +HEXDIGITS: Final[Pattern[bytes]] = re.compile(rb"[0-9a-fA-F]+") class RawRequestMessage(NamedTuple): @@ -136,6 +135,7 @@ def parse_headers( self, lines: List[bytes] ) -> Tuple["CIMultiDictProxy[str]", RawHeaders]: headers: CIMultiDict[str] = CIMultiDict() + # note: "raw" does not mean inclusion of OWS before/after the field value raw_headers = [] lines_idx = 1 @@ -149,13 +149,14 @@ def parse_headers( except ValueError: raise InvalidHeader(line) from None + if len(bname) == 0: + raise InvalidHeader(bname) + # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2 if {bname[0], bname[-1]} & {32, 9}: # {" ", "\t"} raise InvalidHeader(line) bvalue = bvalue.lstrip(b" \t") - if HDRRE.search(bname): - raise InvalidHeader(bname) if len(bname) > self.max_field_size: raise LineTooLong( "request header name {}".format( @@ -164,6 +165,9 @@ def parse_headers( str(self.max_field_size), str(len(bname)), ) + name = bname.decode("utf-8", "surrogateescape") + if not TOKENRE.fullmatch(name): + raise InvalidHeader(bname) header_length = len(bvalue) @@ -210,7 +214,6 @@ def parse_headers( ) bvalue = bvalue.strip(b" \t") - name = bname.decode("utf-8", "surrogateescape") value = bvalue.decode("utf-8", "surrogateescape") # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5 @@ -338,7 +341,8 @@ def get_content_length() -> Optional[int]: # Shouldn't allow +/- or other number formats. # https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2 - if not length_hdr.strip(" \t").isdecimal(): + # msg.headers is already stripped of leading/trailing wsp + if not DIGITS.fullmatch(length_hdr): raise InvalidHeader(CONTENT_LENGTH) return int(length_hdr) @@ -566,7 +570,7 @@ def parse_message(self, lines: List[bytes]) -> RawRequestMessage: ) # method - if not METHRE.fullmatch(method): + if not TOKENRE.fullmatch(method): raise BadStatusLine(method) # version @@ -683,8 +687,8 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage: raise BadStatusLine(line) version_o = HttpVersion(int(match.group(1)), int(match.group(2))) - # The status code is a three-digit number - if len(status) != 3 or not status.isdecimal(): + # The status code is a three-digit ASCII number, no padding + if len(status) != 3 or not DIGITS.fullmatch(status): raise BadStatusLine(line) status_i = int(status) @@ -826,7 +830,7 @@ def feed_data( if self._lax: # Allow whitespace in lax mode. size_b = size_b.strip() - if not re.fullmatch(HEXDIGIT, size_b): + if not re.fullmatch(HEXDIGITS, size_b): exc = TransferEncodingError( chunk[:pos].decode("ascii", "surrogateescape") ) diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index 820a76cb821..b931730529d 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -2,7 +2,8 @@ import asyncio import re -from typing import Any, List +from contextlib import nullcontext +from typing import Any, Dict, List from unittest import mock from urllib.parse import quote @@ -169,11 +170,27 @@ def test_cve_2023_37276(parser: Any) -> None: parser.feed_data(text) +@pytest.mark.parametrize( + "rfc9110_5_6_2_token_delim", + r'"(),/:;<=>?@[\]{}', +) +def test_bad_header_name(parser: Any, rfc9110_5_6_2_token_delim: str) -> None: + text = f"POST / HTTP/1.1\r\nhead{rfc9110_5_6_2_token_delim}er: val\r\n\r\n".encode() + expectation = pytest.raises(http_exceptions.BadHttpMessage) + if rfc9110_5_6_2_token_delim == ":": + # Inserting colon into header just splits name/value earlier. + expectation = nullcontext() + with expectation: + parser.feed_data(text) + + @pytest.mark.parametrize( "hdr", ( "Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length "Content-Length: +256", + "Content-Length: \N{superscript one}", + "Content-Length: \N{mathematical double-struck digit one}", "Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5 "Bar: abc\ndef", "Baz: abc\x00def", @@ -266,6 +283,20 @@ def test_parse_headers_longline(parser: Any) -> None: parser.feed_data(text) +def test_parse_unusual_request_line(parser) -> None: + if not isinstance(response, HttpResponseParserPy): + pytest.xfail("Regression test for Py parser. May match C behaviour later.") + text = b"#smol //a HTTP/1.3\r\n\r\n" + messages, upgrade, tail = parser.feed_data(text) + assert len(messages) == 1 + msg, _ = messages[0] + assert msg.compression is None + assert not msg.upgrade + assert msg.method == "#smol" + assert msg.path == "//a" + assert msg.version == (1, 3) + + def test_parse(parser) -> None: text = b"GET /test HTTP/1.1\r\n\r\n" messages, upgrade, tail = parser.feed_data(text) @@ -568,6 +599,43 @@ def test_headers_content_length_err_2(parser) -> None: parser.feed_data(text) +_pad: Dict[bytes, str] = { + b"": "empty", + # not a typo. Python likes triple zero + b"\000": "NUL", + b" ": "SP", + b" ": "SPSP", + # not a typo: both 0xa0 and 0x0a in case of 8-bit fun + b"\n": "LF", + b"\xa0": "NBSP", + b"\t ": "TABSP", +} + + +@pytest.mark.parametrize("hdr", [b"", b"foo"], ids=["name-empty", "with-name"]) +@pytest.mark.parametrize("pad2", _pad.keys(), ids=["post-" + n for n in _pad.values()]) +@pytest.mark.parametrize("pad1", _pad.keys(), ids=["pre-" + n for n in _pad.values()]) +def test_invalid_header_spacing(parser, pad1: bytes, pad2: bytes, hdr: bytes) -> None: + text = b"GET /test HTTP/1.1\r\n" b"%s%s%s: value\r\n\r\n" % (pad1, hdr, pad2) + expectation = pytest.raises(http_exceptions.BadHttpMessage) + if pad1 == pad2 == b"" and hdr != b"": + # one entry in param matrix is correct: non-empty name, not padded + expectation = nullcontext() + if pad1 == pad2 == hdr == b"": + if not isinstance(response, HttpResponseParserPy): + pytest.xfail("Regression test for Py parser. May match C behaviour later.") + with expectation: + parser.feed_data(text) + + +def test_empty_header_name(parser) -> None: + if not isinstance(response, HttpResponseParserPy): + pytest.xfail("Regression test for Py parser. May match C behaviour later.") + text = b"GET /test HTTP/1.1\r\n" b":test\r\n\r\n" + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + def test_invalid_header(parser) -> None: text = b"GET /test HTTP/1.1\r\n" b"test line\r\n\r\n" with pytest.raises(http_exceptions.BadHttpMessage): @@ -690,6 +758,34 @@ def test_http_request_bad_status_line(parser) -> None: assert r"\n" not in exc_info.value.message +_num: Dict[bytes, str] = { + # dangerous: accepted by Python int() + # unicodedata.category("\U0001D7D9") == 'Nd' + "\N{mathematical double-struck digit one}".encode(): "utf8digit", + # only added for interop tests, refused by Python int() + # unicodedata.category("\U000000B9") == 'No' + "\N{superscript one}".encode(): "utf8number", + "\N{superscript one}".encode("latin-1"): "latin1number", +} + + +@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values()) +def test_http_request_bad_status_line_number( + parser: Any, nonascii_digit: bytes +) -> None: + text = b"GET /digit HTTP/1." + nonascii_digit + b"\r\n\r\n" + with pytest.raises(http_exceptions.BadStatusLine): + parser.feed_data(text) + + +def test_http_request_bad_status_line_separator(parser: Any) -> None: + # single code point, old, multibyte NFKC, multibyte NFKD + utf8sep = "\N{arabic ligature sallallahou alayhe wasallam}".encode() + text = b"GET /ligature HTTP/1" + utf8sep + b"1\r\n\r\n" + with pytest.raises(http_exceptions.BadStatusLine): + parser.feed_data(text) + + def test_http_request_bad_status_line_whitespace(parser: Any) -> None: text = b"GET\n/path\fHTTP/1.1\r\n\r\n" with pytest.raises(http_exceptions.BadStatusLine): @@ -711,6 +807,31 @@ def test_http_request_upgrade(parser: Any) -> None: assert tail == b"some raw data" +def test_http_request_parser_utf8_request_line(parser) -> None: + if not isinstance(response, HttpResponseParserPy): + pytest.xfail("Regression test for Py parser. May match C behaviour later.") + messages, upgrade, tail = parser.feed_data( + # note the truncated unicode sequence + b"GET /P\xc3\xbcnktchen\xa0\xef\xb7 HTTP/1.1\r\n" + + # for easier grep: ASCII 0xA0 more commonly known as non-breaking space + # note the leading and trailing spaces + "sTeP: \N{latin small letter sharp s}nek\t\N{no-break space} " + "\r\n\r\n".encode() + ) + msg = messages[0][0] + + assert msg.method == "GET" + assert msg.path == "/Pünktchen\udca0\udcef\udcb7" + assert msg.version == (1, 1) + assert msg.headers == CIMultiDict([("STEP", "ßnek\t\xa0")]) + assert msg.raw_headers == ((b"sTeP", "ßnek\t\xa0".encode()),) + assert not msg.should_close + assert msg.compression is None + assert not msg.upgrade + assert not msg.chunked + assert msg.url.path == URL("/P%C3%BCnktchen\udca0\udcef\udcb7").path + + def test_http_request_parser_utf8(parser) -> None: text = "GET /path HTTP/1.1\r\nx-test:тест\r\n\r\n".encode() messages, upgrade, tail = parser.feed_data(text) @@ -760,9 +881,15 @@ def test_http_request_parser_two_slashes(parser) -> None: assert not msg.chunked -def test_http_request_parser_bad_method(parser) -> None: +@pytest.mark.parametrize( + "rfc9110_5_6_2_token_delim", + [bytes([i]) for i in rb'"(),/:;<=>?@[\]{}'], +) +def test_http_request_parser_bad_method( + parser, rfc9110_5_6_2_token_delim: bytes +) -> None: with pytest.raises(http_exceptions.BadStatusLine): - parser.feed_data(b'G=":<>(e),[T];?" /get HTTP/1.1\r\n\r\n') + parser.feed_data(rfc9110_5_6_2_token_delim + b'ET" /get HTTP/1.1\r\n\r\n') def test_http_request_parser_bad_version(parser) -> None: @@ -974,6 +1101,12 @@ def test_http_response_parser_code_not_int(response) -> None: response.feed_data(b"HTTP/1.1 ttt test\r\n\r\n") +@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values()) +def test_http_response_parser_code_not_ascii(response, nonascii_digit: bytes) -> None: + with pytest.raises(http_exceptions.BadStatusLine): + response.feed_data(b"HTTP/1.1 20" + nonascii_digit + b" test\r\n\r\n") + + def test_http_request_chunked_payload(parser) -> None: text = b"GET /test HTTP/1.1\r\n" b"transfer-encoding: chunked\r\n\r\n" msg, payload = parser.feed_data(text)[0][0]