Improve validation in HTTP parser #8074
Changes from all commits: dc567ba, 00cb2a2, 3b7e9eb, e0dde7c, e2b0483, b38306b, ea85a3c, f0314ca, 70cd60e, 4878638, 372fe5b, 43c127a, 4e8306c, 9234224, 39549e5, ddb4bc2, dd4bdc4, c38e243, 8cc25ff, 75b4489, d1b746f, 480fc51, 5ecbf79, 6b82936
@@ -0,0 +1,5 @@
Fixed an unhandled exception in the Python HTTP parser on header lines starting with a colon -- by :user:`pajod`.

Invalid request lines with anything but a dot between the HTTP major and minor version are now rejected. Invalid header field names containing a question mark or slash are now rejected. Such requests are incompatible with :rfc:`9110#section-5.6.2` and are not known to be of any legitimate use.

(BACKWARD INCOMPATIBLE)
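To make the new rules concrete, here is a minimal sketch of the checks the entries above describe. This is not aiohttp's actual parser code; the regexes and helper names are invented for illustration:

# Illustrative sketch only; not aiohttp's implementation.
import re

# RFC 9110 section 5.6.2: field names are "tokens". '?', '/' and ':' are
# delimiters, not tchars, so names containing them are rejected.
TCHAR = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
# Note [0-9] rather than \d: in Python str patterns, \d also matches
# non-ASCII decimal digits such as \N{MATHEMATICAL DOUBLE-STRUCK DIGIT ONE}.
VERSION = re.compile(r"HTTP/([0-9])\.([0-9])")

def field_name_ok(name: str) -> bool:
    return TCHAR.fullmatch(name) is not None

def parse_version(token: str) -> "tuple[int, int]":
    m = VERSION.fullmatch(token)
    if m is None:  # anything but an ASCII dot between major and minor
        raise ValueError(f"bad HTTP version: {token!r}")
    return int(m[1]), int(m[2])

assert field_name_ok("Content-Length")
assert not field_name_ok("head?er")        # now rejected
assert parse_version("HTTP/1.1") == (1, 1)
# parse_version("HTTP/1\N{SUPERSCRIPT ONE}1") raises ValueError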
@@ -3,7 +3,8 @@
 import asyncio
 import re
-from typing import Any, List
+from contextlib import nullcontext
+from typing import Any, Dict, List
 from unittest import mock
 from urllib.parse import quote
@@ -168,11 +169,27 @@ | |
parser.feed_data(text) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"rfc9110_5_6_2_token_delim", | ||
r'"(),/:;<=>?@[\]{}', | ||
) | ||
def test_bad_header_name(parser: Any, rfc9110_5_6_2_token_delim: str) -> None: | ||
text = f"POST / HTTP/1.1\r\nhead{rfc9110_5_6_2_token_delim}er: val\r\n\r\n".encode() | ||
expectation = pytest.raises(http_exceptions.BadHttpMessage) | ||
if rfc9110_5_6_2_token_delim == ":": | ||
# Inserting colon into header just splits name/value earlier. | ||
expectation = nullcontext() | ||
with expectation: | ||
parser.feed_data(text) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"hdr", | ||
( | ||
"Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length | ||
"Content-Length: +256", | ||
"Content-Length: \N{superscript one}", | ||
"Content-Length: \N{mathematical double-struck digit one}", | ||
"Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5 | ||
"Bar: abc\ndef", | ||
"Baz: abc\x00def", | ||
|
@@ -265,6 +282,20 @@
        parser.feed_data(text)


def test_parse_unusual_request_line(parser: Any) -> None:
    if not isinstance(parser, HttpRequestParserPy):
        pytest.xfail("Regression test for Py parser. May match C behaviour later.")
    text = b"#smol //a HTTP/1.3\r\n\r\n"
    messages, upgrade, tail = parser.feed_data(text)
    assert len(messages) == 1
    msg, _ = messages[0]
    assert msg.compression is None
    assert not msg.upgrade
    assert msg.method == "#smol"
    assert msg.path == "//a"
    assert msg.version == (1, 3)


def test_parse(parser: Any) -> None:
    text = b"GET /test HTTP/1.1\r\n\r\n"
    messages, upgrade, tail = parser.feed_data(text)
@@ -567,6 +598,45 @@
        parser.feed_data(text)


_pad: Dict[bytes, str] = {
    b"": "empty",
    # not a typo: Python likes triple zero
    b"\000": "NUL",
    b" ": "SP",
    b"  ": "SPSP",
    # not a typo: both 0xa0 and 0x0a, in case of 8-bit fun
    b"\n": "LF",
    b"\xa0": "NBSP",
    b"\t ": "TABSP",
}


@pytest.mark.parametrize("hdr", [b"", b"foo"], ids=["name-empty", "with-name"])
@pytest.mark.parametrize("pad2", _pad.keys(), ids=["post-" + n for n in _pad.values()])
@pytest.mark.parametrize("pad1", _pad.keys(), ids=["pre-" + n for n in _pad.values()])
def test_invalid_header_spacing(
    parser: Any, pad1: bytes, pad2: bytes, hdr: bytes
) -> None:
    text = b"GET /test HTTP/1.1\r\n" b"%s%s%s: value\r\n\r\n" % (pad1, hdr, pad2)
    expectation = pytest.raises(http_exceptions.BadHttpMessage)
    if pad1 == pad2 == b"" and hdr != b"":
        # One entry in the param matrix is correct: non-empty name, no padding.
        expectation = nullcontext()
    if pad1 == pad2 == hdr == b"":
        if not isinstance(parser, HttpRequestParserPy):
            pytest.xfail("Regression test for Py parser. May match C behaviour later.")
    with expectation:
        parser.feed_data(text)


def test_empty_header_name(parser: Any) -> None:
    if not isinstance(parser, HttpRequestParserPy):
        pytest.xfail("Regression test for Py parser. May match C behaviour later.")
    text = b"GET /test HTTP/1.1\r\n" b":test\r\n\r\n"
    with pytest.raises(http_exceptions.BadHttpMessage):
        parser.feed_data(text)


def test_invalid_header(parser: Any) -> None:
    text = b"GET /test HTTP/1.1\r\n" b"test line\r\n\r\n"
    with pytest.raises(http_exceptions.BadHttpMessage):
@@ -689,6 +759,34 @@
    assert r"\n" not in exc_info.value.message


_num: Dict[bytes, str] = {
    # dangerous: accepted by Python int()
    # unicodedata.category("\U0001D7D9") == 'Nd'
    "\N{mathematical double-struck digit one}".encode(): "utf8digit",
    # only added for interop tests; refused by Python int()
    # unicodedata.category("\U000000B9") == 'No'
    "\N{superscript one}".encode(): "utf8number",
    "\N{superscript one}".encode("latin-1"): "latin1number",
}
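The two comments in _num can be verified directly; a quick check, assuming standard CPython int() and unicodedata semantics:

import unicodedata

# Category Nd ("decimal digit") is accepted by int(); category No is not.
assert unicodedata.category("\N{MATHEMATICAL DOUBLE-STRUCK DIGIT ONE}") == "Nd"
assert int("\N{MATHEMATICAL DOUBLE-STRUCK DIGIT ONE}") == 1  # the dangerous case

assert unicodedata.category("\N{SUPERSCRIPT ONE}") == "No"
try:
    int("\N{SUPERSCRIPT ONE}")
except ValueError:
    pass  # refused by int(), as the comment says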


@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
def test_http_request_bad_status_line_number(
    parser: Any, nonascii_digit: bytes
) -> None:
    text = b"GET /digit HTTP/1." + nonascii_digit + b"\r\n\r\n"
    with pytest.raises(http_exceptions.BadStatusLine):
        parser.feed_data(text)


def test_http_request_bad_status_line_separator(parser: Any) -> None:
    # single code point, old, multibyte NFKC, multibyte NFKD
    utf8sep = "\N{arabic ligature sallallahou alayhe wasallam}".encode()
    text = b"GET /ligature HTTP/1" + utf8sep + b"1\r\n\r\n"
    with pytest.raises(http_exceptions.BadStatusLine):
        parser.feed_data(text)


def test_http_request_bad_status_line_whitespace(parser: Any) -> None:
    text = b"GET\n/path\fHTTP/1.1\r\n\r\n"
    with pytest.raises(http_exceptions.BadStatusLine):
@@ -710,6 +808,31 @@
    assert tail == b"some raw data"


def test_http_request_parser_utf8_request_line(parser: Any) -> None:
    if not isinstance(parser, HttpRequestParserPy):
        pytest.xfail("Regression test for Py parser. May match C behaviour later.")
    messages, upgrade, tail = parser.feed_data(
        # note the truncated unicode sequence
        b"GET /P\xc3\xbcnktchen\xa0\xef\xb7 HTTP/1.1\r\n" +
        # for easier grep: byte 0xA0 is more commonly known as non-breaking space
        # note the leading and trailing spaces
        "sTeP: \N{latin small letter sharp s}nek\t\N{no-break space} "
        "\r\n\r\n".encode()
    )
    msg = messages[0][0]

    assert msg.method == "GET"
    assert msg.path == "/Pünktchen\udca0\udcef\udcb7"
    assert msg.version == (1, 1)
    assert msg.headers == CIMultiDict([("STEP", "ßnek\t\xa0")])
    assert msg.raw_headers == ((b"sTeP", "ßnek\t\xa0".encode()),)
    assert not msg.should_close
    assert msg.compression is None
    assert not msg.upgrade
    assert not msg.chunked
    assert msg.url.path == URL("/P%C3%BCnktchen\udca0\udcef\udcb7").path

[Review thread on the isinstance check above]

Reviewer: @pajod FYI it's much cleaner to use the …

pajod: But that only explains why a llhttp-only code coverage report complains!? There is something much more wrong here. Like, I completely broke that test levels of wrong. Edit: Sorry, I did. Four times. I meant to acknowledge (in two cases, expected to be changed) behaviour differences of the C parser, while keeping my tests parametrized so both parsers keep running anyway. But each time I copied the …

Reviewer: Yeah, I noticed there was no coverage on these tests for some reason. Something to look at later.

Second reviewer: Yep. I started looking into the coverage drop and that's how I noticed this thing. FWIW, we really should refactor how the tests with and without extensions are parametrized/generated, like I did in multidict recently. Essentially, an import loop in some places in the tests prevented the C-extension tests from being executed (aio-libs/multidict#837 / aio-libs/multidict#915 / https://multidict.aio-libs.org/en/latest/changes/#contributor-facing-changes). I fixed that with an explicit global option for requiring one mode or the other, plus a collection of fixtures reused everywhere and zero magic around import attempts and handling failures in weird ways. Another thing I configured is module classification in Codecov with different expected coverage thresholds: the goal should be that the tests themselves get 100% coverage in every CI run (from all jobs combined, of course), while actual project code coverage is measured as a separate metric. Currently, a global threshold lets coverage drop in the tests as long as it is compensated by coverage in the project, meaning we may be accumulating dead code (read: tests that are never executed), which gives a false sense of things being tested when they aren't.
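For illustration, a minimal conftest.py sketch of the fixture approach described in the thread above. The option name --parser-impl and the fixture wiring are hypothetical, not aiohttp's or multidict's actual code:

# conftest.py -- hypothetical sketch, not actual aio-libs code.
import pytest

def pytest_addoption(parser):
    # Explicit global switch instead of silent ImportError magic.
    parser.addoption("--parser-impl", choices=["py", "c"], default=None)

@pytest.fixture(params=["py", "c"])
def parser_impl(request):
    forced = request.config.getoption("--parser-impl")
    if forced and request.param != forced:
        pytest.skip(f"--parser-impl={forced} excludes {request.param!r}")
    if request.param == "c":
        # Visible skip (or failure, if required) instead of silently dead tests.
        pytest.importorskip("aiohttp._http_parser")
    return request.param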


def test_http_request_parser_utf8(parser: Any) -> None:
    text = "GET /path HTTP/1.1\r\nx-test:тест\r\n\r\n".encode()
    messages, upgrade, tail = parser.feed_data(text)
@@ -759,9 +882,15 @@
     assert not msg.chunked


-def test_http_request_parser_bad_method(parser: Any) -> None:
+@pytest.mark.parametrize(
+    "rfc9110_5_6_2_token_delim",
+    [bytes([i]) for i in rb'"(),/:;<=>?@[\]{}'],
+)
+def test_http_request_parser_bad_method(
+    parser: Any, rfc9110_5_6_2_token_delim: bytes
+) -> None:
     with pytest.raises(http_exceptions.BadStatusLine):
-        parser.feed_data(b'G=":<>(e),[T];?" /get HTTP/1.1\r\n\r\n')
+        parser.feed_data(rfc9110_5_6_2_token_delim + b'ET" /get HTTP/1.1\r\n\r\n')


def test_http_request_parser_bad_version(parser: Any) -> None:
@@ -979,6 +1108,14 @@
        response.feed_data(b"HTTP/1.1 ttt test\r\n\r\n")


@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
def test_http_response_parser_code_not_ascii(
    response: Any, nonascii_digit: bytes
) -> None:
    with pytest.raises(http_exceptions.BadStatusLine):
        response.feed_data(b"HTTP/1.1 20" + nonascii_digit + b" test\r\n\r\n")


def test_http_request_chunked_payload(parser: Any) -> None:
    text = b"GET /test HTTP/1.1\r\n" b"transfer-encoding: chunked\r\n\r\n"
    msg, payload = parser.feed_data(text)[0][0]
[Review thread on the utf8sep comment in test_http_request_bad_status_line_separator]

Reviewer: Hey @pajod, could you expand on what this comment refers to? Does the utf8sep variable contain a value that matches all the listed cases? I'm rather confused. Or did you mean to test different cases but added just one?

pajod: That utf8sep has all those properties (and is RTL), a popular choice precisely because it is multiple edge cases in one. None of them are strictly needed for comparing against the literal ASCII dot, as we do here, but I expect some of them to regain relevance in future refactoring.
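Those claimed properties are easy to check; a quick verification sketch, assuming standard CPython unicodedata:

import unicodedata

sep = "\N{ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM}"  # U+FDFA
assert len(sep) == 1                                # a single code point
assert len(sep.encode("utf-8")) == 3                # multibyte in UTF-8
assert len(unicodedata.normalize("NFKC", sep)) > 1  # expands under NFKC
assert len(unicodedata.normalize("NFKD", sep)) > 1  # expands under NFKD
assert unicodedata.bidirectional(sep) == "AL"       # right-to-left (Arabic Letter)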