Skip to content

Commit

Permalink
Validate ASCII host values
Browse files Browse the repository at this point in the history
Host values must match the unreserved, sub-delims or pct-encoded grammar
rules. The IDNA encoder already checks for this, but for ASCII values we
skip IDNA encoding.

If the invalid character is `@`, or `:` followed by `@` further in the
host value, tack on advice about using `authority` instead of `host`.
  • Loading branch information
mjpieters committed Nov 20, 2023
1 parent ceaad70 commit 7f21ff2
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 3 deletions.
3 changes: 3 additions & 0 deletions CHANGES/880.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Started rejecting ASCII hostnames with invalid characters. For host strings that
look like authority strings, the exception message includes advice on what to do
instead -- by :user:`mjpieters`.
1 change: 1 addition & 0 deletions CHANGES/954.bugfix.rst
1 change: 1 addition & 0 deletions docs/spelling_wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ downstreams
facto
glibc
google
hostnames
mailto
manylinux
multi
Expand Down
19 changes: 19 additions & 0 deletions tests/test_url_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,25 @@ def test_build_with_authority_and_host():
URL.build(authority="host.com", host="example.com")


@pytest.mark.parametrize(
("host", "is_authority"),
[
("user:pass@host.com", True),
("user@host.com", True),
("host:com", False),
("not_percent_encoded%Zf", False),
("still_not_percent_encoded%fZ", False),
*(("other_gen_delim_" + c, False) for c in "/?#[]"),
],
)
def test_build_with_invalid_host(host: str, is_authority: bool):
match = r"Host '[^']+' cannot contain '[^']+' \(at position \d+\)"
if is_authority:
match += ", if .* use 'authority' instead of 'host'"
with pytest.raises(ValueError, match=f"{match}$"):
URL.build(host=host)


def test_build_with_authority():
url = URL.build(scheme="http", authority="ваня:bar@host.com:8000", path="path")
assert str(url) == "http://%D0%B2%D0%B0%D0%BD%D1%8F:bar@host.com:8000/path"
Expand Down
20 changes: 20 additions & 0 deletions tests/test_url_update_netloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,26 @@ def test_with_host_non_ascii():
assert url2.authority == "историк.рф:123"


@pytest.mark.parametrize(
("host", "is_authority"),
[
("user:pass@host.com", True),
("user@host.com", True),
("host:com", False),
("not_percent_encoded%Zf", False),
("still_not_percent_encoded%fZ", False),
*(("other_gen_delim_" + c, False) for c in "/?#[]"),
],
)
def test_with_invalid_host(host: str, is_authority: bool):
url = URL("http://example.com:123")
match = r"Host '[^']+' cannot contain '[^']+' \(at position \d+\)"
if is_authority:
match += ", if .* use 'authority' instead of 'host'"
with pytest.raises(ValueError, match=f"{match}$"):
url.with_host(host=host)


def test_with_host_percent_encoded():
url = URL("http://%25cf%2580%cf%80:%25cf%2580%cf%80@example.com:123")
url2 = url.with_host("%cf%80.org")
Expand Down
39 changes: 36 additions & 3 deletions yarl/_url.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import functools
import math
import re

Check warning on line 3 in yarl/_url.py

View check run for this annotation

Codecov / codecov/patch

yarl/_url.py#L3

Added line #L3 was not covered by tests
import warnings
from collections.abc import Mapping, Sequence
from contextlib import suppress
Expand All @@ -15,6 +16,22 @@

sentinel = object()

# reg-name: unreserved / pct-encoded / sub-delims
# this pattern matches anything that is *not* in those classes. and is only used
# on lower-cased ASCII values.
_not_reg_name = re.compile(

Check warning on line 22 in yarl/_url.py

View check run for this annotation

Codecov / codecov/patch

yarl/_url.py#L22

Added line #L22 was not covered by tests
r"""
# any character not in the unreserved or sub-delims sets, plus %
# (validated with the additional check for pct-encoded sequences below)
[^a-z0-9\-._~!$&'()*+,;=%]
|
# % only allowed if it is part of a pct-encoded
# sequence of 2 hex digits.
%(?![0-9a-f]{2})
""",
re.VERBOSE,
)


def rewrite_module(obj: object) -> object:
obj.__module__ = "yarl"
Expand Down Expand Up @@ -765,11 +782,27 @@ def _encode_host(cls, host, human=False):
ip = ip_address(ip)
except ValueError:
host = host.lower()
# IDNA encoding is slow,
# skip it for ASCII-only strings
# skip it for humanized and ASCII-only strings
# Don't move the check into _idna_encode() helper
# to reduce the cache size
if human or host.isascii():
if human:
return host
if host.isascii():

Check warning on line 790 in yarl/_url.py

View check run for this annotation

Codecov / codecov/patch

yarl/_url.py#L788-L790

Added lines #L788 - L790 were not covered by tests
# Check for invalid characters explicitly; _idna_encode() does this
# for non-ascii host names.
invalid = _not_reg_name.search(host)
if invalid is not None:
value, pos, extra = invalid.group(), invalid.start(), ""
if value == "@" or (value == ":" and "@" in host[pos:]):

Check warning on line 796 in yarl/_url.py

View check run for this annotation

Codecov / codecov/patch

yarl/_url.py#L793-L796

Added lines #L793 - L796 were not covered by tests
# this looks like an authority string
extra = (

Check warning on line 798 in yarl/_url.py

View check run for this annotation

Codecov / codecov/patch

yarl/_url.py#L798

Added line #L798 was not covered by tests
", if the value includes a username or password, "
"use 'authority' instead of 'host'"
)
raise ValueError(

Check warning on line 802 in yarl/_url.py

View check run for this annotation

Codecov / codecov/patch

yarl/_url.py#L802

Added line #L802 was not covered by tests
f"Host {host!r} cannot contain {value!r} (at position "
f"{pos}){extra}"
) from None
return host
host = _idna_encode(host)
else:
Expand Down

0 comments on commit 7f21ff2

Please sign in to comment.