diff --git a/CHANGES/880.bugfix.rst b/CHANGES/880.bugfix.rst new file mode 100644 index 00000000..39a64ea0 --- /dev/null +++ b/CHANGES/880.bugfix.rst @@ -0,0 +1,3 @@ +Started rejecting ASCII hostnames with invalid characters. For host strings that +look like authority strings, the exception message includes advice on what to do +instead -- by :user:`mjpieters`. diff --git a/CHANGES/954.bugfix.rst b/CHANGES/954.bugfix.rst new file mode 120000 index 00000000..e6d28843 --- /dev/null +++ b/CHANGES/954.bugfix.rst @@ -0,0 +1 @@ +880.bugfix.rst \ No newline at end of file diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 4465e908..f7c1120c 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -18,6 +18,7 @@ downstreams facto glibc google +hostnames mailto manylinux multi diff --git a/tests/test_url_build.py b/tests/test_url_build.py index 51969fa8..88ed6642 100644 --- a/tests/test_url_build.py +++ b/tests/test_url_build.py @@ -84,6 +84,25 @@ def test_build_with_authority_and_host(): URL.build(authority="host.com", host="example.com") +@pytest.mark.parametrize( + ("host", "is_authority"), + [ + ("user:pass@host.com", True), + ("user@host.com", True), + ("host:com", False), + ("not_percent_encoded%Zf", False), + ("still_not_percent_encoded%fZ", False), + *(("other_gen_delim_" + c, False) for c in "/?#[]"), + ], +) +def test_build_with_invalid_host(host: str, is_authority: bool): + match = r"Host '[^']+' cannot contain '[^']+' \(at position \d+\)" + if is_authority: + match += ", if .* use 'authority' instead of 'host'" + with pytest.raises(ValueError, match=f"{match}$"): + URL.build(host=host) + + def test_build_with_authority(): url = URL.build(scheme="http", authority="ваня:bar@host.com:8000", path="path") assert str(url) == "http://%D0%B2%D0%B0%D0%BD%D1%8F:bar@host.com:8000/path" diff --git a/tests/test_url_update_netloc.py b/tests/test_url_update_netloc.py index cf0cc1c4..fbb08243 100644 --- a/tests/test_url_update_netloc.py +++ b/tests/test_url_update_netloc.py @@ -166,6 +166,26 @@ def test_with_host_non_ascii(): assert url2.authority == "историк.рф:123" +@pytest.mark.parametrize( + ("host", "is_authority"), + [ + ("user:pass@host.com", True), + ("user@host.com", True), + ("host:com", False), + ("not_percent_encoded%Zf", False), + ("still_not_percent_encoded%fZ", False), + *(("other_gen_delim_" + c, False) for c in "/?#[]"), + ], +) +def test_with_invalid_host(host: str, is_authority: bool): + url = URL("http://example.com:123") + match = r"Host '[^']+' cannot contain '[^']+' \(at position \d+\)" + if is_authority: + match += ", if .* use 'authority' instead of 'host'" + with pytest.raises(ValueError, match=f"{match}$"): + url.with_host(host=host) + + def test_with_host_percent_encoded(): url = URL("http://%25cf%2580%cf%80:%25cf%2580%cf%80@example.com:123") url2 = url.with_host("%cf%80.org") diff --git a/yarl/_url.py b/yarl/_url.py index c8f2acb3..d5f8f877 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -1,5 +1,6 @@ import functools import math +import re import warnings from collections.abc import Mapping, Sequence from contextlib import suppress @@ -15,6 +16,22 @@ sentinel = object() +# reg-name: unreserved / pct-encoded / sub-delims +# this pattern matches anything that is *not* in those classes. and is only used +# on lower-cased ASCII values. +_not_reg_name = re.compile( + r""" + # any character not in the unreserved or sub-delims sets, plus % + # (validated with the additional check for pct-encoded sequences below) + [^a-z0-9\-._~!$&'()*+,;=%] + | + # % only allowed if it is part of a pct-encoded + # sequence of 2 hex digits. + %(?![0-9a-f]{2}) + """, + re.VERBOSE, +) + def rewrite_module(obj: object) -> object: obj.__module__ = "yarl" @@ -765,11 +782,27 @@ def _encode_host(cls, host, human=False): ip = ip_address(ip) except ValueError: host = host.lower() - # IDNA encoding is slow, - # skip it for ASCII-only strings + # skip it for humanized and ASCII-only strings # Don't move the check into _idna_encode() helper # to reduce the cache size - if human or host.isascii(): + if human: + return host + if host.isascii(): + # Check for invalid characters explicitly; _idna_encode() does this + # for non-ascii host names. + invalid = _not_reg_name.search(host) + if invalid is not None: + value, pos, extra = invalid.group(), invalid.start(), "" + if value == "@" or (value == ":" and "@" in host[pos:]): + # this looks like an authority string + extra = ( + ", if the value includes a username or password, " + "use 'authority' instead of 'host'" + ) + raise ValueError( + f"Host {host!r} cannot contain {value!r} (at position " + f"{pos}){extra}" + ) from None return host host = _idna_encode(host) else: