Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-87389: avoid treating path as URI with netloc #93894

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 31 additions & 8 deletions Lib/http/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,33 @@ def do_HEAD(self):
if f:
f.close()

def _request_path_split(self, path):
    """Split a request target into its path, query and fragment.

    Only the 'abs_path' form of the Request-URI (the second word of the
    request line) is handled.  The scheme and netloc of the result are
    always empty, so a leading '//' or a colon in the first segment is
    kept as part of the path instead of being parsed as a netloc or
    scheme.

    Returns a urllib.parse.SplitResult with empty scheme and netloc.
    """
    # The fragment starts at the first '#' and the query at the first '?'
    # that precedes it (RFC 3986, section 3).  Split off the fragment
    # first so that a '#' after the query is not swallowed by the query,
    # and a '?' inside the fragment is not mistaken for a query delimiter.
    remainder, _, fragment = path.partition('#')
    path, _, query = remainder.partition('?')
    return urllib.parse.SplitResult('', '', path, query, fragment)

def _get_redirect_url_for_dir(self):
    """Return the redirect URL for a directory request, or None.

    When the path of self.path lacks a trailing slash, the returned URL
    is the same request target with '/' appended to the path and the
    query and fragment preserved.  When the path already ends with a
    slash no redirect is needed and None is returned.
    """
    # urllib.parse.urlsplit() was used here in earlier versions of this
    # class.  The request 'path' is handled as a local filesystem path,
    # though, so it never carries a scheme or netloc and must not be
    # parsed as if it could.  As reported in gh-87389, a path that begins
    # with a double slash must not be treated as a relative URI, and a
    # colon in the first path component could be misparsed as well.
    split = self._request_path_split(self.path)
    dir_path = split.path
    if not dir_path.endswith('/'):
        components = ('', '', dir_path + '/', split.query, split.fragment)
        return urllib.parse.urlunsplit(components)
    # The path already ends with a slash, so there is nothing to redirect.
    return None

def send_head(self):
"""Common code for GET and HEAD commands.

Expand All @@ -678,13 +705,10 @@ def send_head(self):
path = self.translate_path(self.path)
f = None
if os.path.isdir(path):
parts = urllib.parse.urlsplit(self.path)
if not parts.path.endswith('/'):
new_url = self._get_redirect_url_for_dir()
if new_url:
# redirect browser - doing basically what apache does
self.send_response(HTTPStatus.MOVED_PERMANENTLY)
new_parts = (parts[0], parts[1], parts[2] + '/',
parts[3], parts[4])
new_url = urllib.parse.urlunsplit(new_parts)
self.send_header("Location", new_url)
self.send_header("Content-Length", "0")
self.end_headers()
Expand Down Expand Up @@ -817,9 +841,8 @@ def translate_path(self, path):
probably be diagnosed.)

"""
# abandon query parameters
path = path.split('?',1)[0]
path = path.split('#',1)[0]
# extract only path, abandon query parameters and fragment
path = self._request_path_split(path).path
# Don't forget explicit trailing slash when normalizing. Issue17324
trailing_slash = path.rstrip().endswith('/')
try:
Expand Down
54 changes: 52 additions & 2 deletions Lib/test/test_httpservers.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ class request_handler(NoLogRequestHandler, SimpleHTTPRequestHandler):
pass

def setUp(self):
BaseTestCase.setUp(self)
super().setUp()
self.cwd = os.getcwd()
basetempdir = tempfile.gettempdir()
os.chdir(basetempdir)
Expand Down Expand Up @@ -362,7 +362,7 @@ def tearDown(self):
except:
pass
finally:
BaseTestCase.tearDown(self)
super().tearDown()

def check_status_and_reason(self, response, status, data=None):
def close_conn():
Expand Down Expand Up @@ -418,6 +418,56 @@ def test_undecodable_filename(self):
self.check_status_and_reason(response, HTTPStatus.OK,
data=os_helper.TESTFN_UNDECODABLE)

def test_get_dir_redirect_location_domain_injection_bug(self):
nascheme marked this conversation as resolved.
Show resolved Hide resolved
"""Ensure //evil.co/..%2f../../X does not put //evil.co/ in Location.

//netloc/ in a Location header is a redirect to a new host.
https://github.com/python/cpython/issues/87389

This checks that a path resolving to a directory on our server cannot
resolve into a redirect to another server.
"""
os.mkdir(os.path.join(self.tempdir, 'existing_directory'))
url = f'/python.org/..%2f..%2f..%2f..%2f..%2f../%0a%0d/../{self.tempdir_name}/existing_directory'
expected_location = f'{url}/' # /python.org.../ single slash single prefix, trailing slash
# Canonicalizes to /tmp/tempdir_name/existing_directory which does
# exist and is a dir, triggering the 301 redirect logic.
response = self.request(url)
self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY)
location = response.getheader('Location')
self.assertEqual(location, expected_location, msg='non-attack failed!')

# //python.org... multi-slash prefix, no trailing slash
attack_url = f'/{url}'
response = self.request(attack_url)
self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY)
location = response.getheader('Location')
self.assertFalse(location.startswith('//'), msg=location)
self.assertEqual(location, expected_location,
msg='Expected Location header to start with a single / and '
'end with a / as this is a directory redirect.')

# ///python.org... triple-slash prefix, no trailing slash
attack3_url = f'//{url}'
response = self.request(attack3_url)
self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY)
self.assertEqual(response.getheader('Location'), expected_location)

# If the second word in the http request (Request-URI for the http
# method) has a scheme and netloc, it still gets treated as an
# absolute path by the server. In that case, the redirect is
# constructed so it is parsed as a path. The './' part of the path
# is added by urlunsplit() so that the 'https:' part of what is being
# treated as a path is not treated as a scheme in the redirect
# location. http.server is not a proxy and doesn't handle Request-URI
# being an absolute URI with a scheme and/or netloc.
attack_scheme_netloc_2slash_url = f'https://pypi.org/{url}'
expected_location = f'./{attack_scheme_netloc_2slash_url}/'
response = self.request(attack_scheme_netloc_2slash_url)
self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY)
location = response.getheader('Location')
self.assertEqual(location, expected_location)

def test_get(self):
#constructs the path relative to the root directory of the HTTPServer
response = self.request(self.base_url + '/test')
Expand Down
19 changes: 19 additions & 0 deletions Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,25 @@ def test_urlsplit_normalization(self):
with self.assertRaises(ValueError):
urllib.parse.urlsplit(url)

def test_urlunsplit_relative(self):
cases = [
# netloc without a scheme: expected result is a scheme-relative URL
(('', 'a', '', '', ''), '//a'),
# extra leading slashes need to be stripped to avoid confusion
# with a protocol-relative URL (one that starts with '//netloc')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

confusion with a protocol-relative URL? [as opposed to a host-relative URL]

(('', '', '//a', '', ''), '/a'),
(('', '', '///a', '', ''), '/a'),
# not relative so extra leading slashes don't need stripping since
# they don't cause confusion
(('http', 'x.y', '//a', '', ''), 'http://x.y//a'),
# avoid confusion with path containing colon
(('', '', 'a:b', '', ''), './a:b'),
]
for parts, result in cases:
self.assertEqual(urllib.parse.urlunsplit(parts), result,
msg=f'{parts=}')


class Utility_Tests(unittest.TestCase):
"""Testcase to test the various utility functions in the urllib."""
# In Python 2 this test class was in test_urllib.
Expand Down
20 changes: 19 additions & 1 deletion Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,14 +491,32 @@ def urlunparse(components):
url = "%s;%s" % (url, params)
return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))

# Returns a match if path can be confused with a scheme, i.e. a relative
# path whose first component does not start with a dot (or a slash) and
# includes a colon.
_is_scheme_like = re.compile(r'[^/.][^/]*:').match
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the special allowance for a leading dot? Is there a test case for it? Yes, a scheme cannot start with a dot, but a path-noscheme component of .: is no more legal than https: according to RFC 3986.


def urlunsplit(components):
"""Combine the elements of a tuple as returned by urlsplit() into a
complete URL as a string. The data argument can be any five-item iterable.
This may result in a slightly different, but equivalent URL, if the URL that
was parsed originally had unnecessary delimiters (for example, a ? with an
empty query; the RFC states that these are equivalent)."""
scheme, netloc, url, query, fragment, _coerce_result = (
scheme, netloc, path, query, fragment, _coerce_result = (
_coerce_args(*components))
if not scheme and not netloc:
# Building a relative URI. Need to be careful that path is not
# confused with scheme or netloc.
if path.startswith('//'):
# gh-87389: don't treat first component of path as netloc
url = '/' + path.lstrip('/')
elif _is_scheme_like(path):
# first component has colon, ensure it will not be parsed as the
# scheme
url = './' + path
else:
url = path
else:
url = path
if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
if url and url[:1] != '/': url = '/' + url
url = '//' + (netloc or '') + url
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Change :func:`urllib.parse.urlunsplit` to sanitize the ``path`` argument so
that the first component of the path cannot be mistaken for a net location or
a scheme.

Co-authored-by: Gregory P. Smith <greg@krypto.org> [Google]