python · nascheme · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022
diff --git a/Lib/http/server.py b/Lib/http/server.py
@@ -664,6 +664,33 @@ def do_HEAD(self):
         if f:
             f.close()
 
+    def _request_path_split(self, path):
+        """Parse a path that can include an optional query and fragment.
+        """
+        # We only handle the 'abs_path' case for the Request-URI part of the
+        # request line (the second word).  We don't handle the case of a URL
+        # containing a scheme or netloc.
+        path, _, query = path.partition('?')
+        path, _, fragment = path.partition('#')
+        return urllib.parse.SplitResult('', '', path, query, fragment)
+
+    def _get_redirect_url_for_dir(self):
+        """Returns URL with trailing slash on path, if required.  If not
+        required, returns None.
+        """
+        # Previous versions of this class used urllib.parse.urlsplit() here.
+        # However, the 'path' is being treated as a local filesystem path and
+        # it can't have a scheme or netloc.  We need to avoid parsing it
+        # incorrectly.  For example, as reported in gh-87389, a path starting
+        # with a double slash should not be treated as a relative URI.  Also, a
+        # path with a colon in the first component could also be parsed
+        # wrongly.
+        parts = self._request_path_split(self.path)
+        if parts.path.endswith('/'):
+            return None  # already has slash, no redirect needed
+        return urllib.parse.urlunsplit(('', '', parts.path + '/', parts.query,
+                                        parts.fragment))
+
     def send_head(self):
         """Common code for GET and HEAD commands.
 
@@ -678,13 +705,10 @@ def send_head(self):
         path = self.translate_path(self.path)
         f = None
         if os.path.isdir(path):
-            parts = urllib.parse.urlsplit(self.path)
-            if not parts.path.endswith('/'):
+            new_url = self._get_redirect_url_for_dir()
+            if new_url:
                 # redirect browser - doing basically what apache does
                 self.send_response(HTTPStatus.MOVED_PERMANENTLY)
-                new_parts = (parts[0], parts[1], parts[2] + '/',
-                             parts[3], parts[4])
-                new_url = urllib.parse.urlunsplit(new_parts)
                 self.send_header("Location", new_url)
                 self.send_header("Content-Length", "0")
                 self.end_headers()
@@ -817,9 +841,8 @@ def translate_path(self, path):
         probably be diagnosed.)
 
         """
-        # abandon query parameters
-        path = path.split('?',1)[0]
-        path = path.split('#',1)[0]
+        # extract only path, abandon query parameters and fragment
+        path = self._request_path_split(path).path
         # Don't forget explicit trailing slash when normalizing. Issue17324
         trailing_slash = path.rstrip().endswith('/')
         try:

diff --git a/Lib/test/test_httpservers.py b/Lib/test/test_httpservers.py
@@ -334,7 +334,7 @@ class request_handler(NoLogRequestHandler, SimpleHTTPRequestHandler):
         pass
 
     def setUp(self):
-        BaseTestCase.setUp(self)
+        super().setUp()
         self.cwd = os.getcwd()
         basetempdir = tempfile.gettempdir()
         os.chdir(basetempdir)
@@ -362,7 +362,7 @@ def tearDown(self):
             except:
                 pass
         finally:
-            BaseTestCase.tearDown(self)
+            super().tearDown()
 
     def check_status_and_reason(self, response, status, data=None):
         def close_conn():
@@ -418,6 +418,56 @@ def test_undecodable_filename(self):
         self.check_status_and_reason(response, HTTPStatus.OK,
                                      data=os_helper.TESTFN_UNDECODABLE)
 
+    def test_get_dir_redirect_location_domain_injection_bug(self):
+        """Ensure //evil.co/..%2f../../X does not put //evil.co/ in Location.
+
+        //netloc/ in a Location header is a redirect to a new host.
+        https://github.com/python/cpython/issues/87389
+
+        This checks that a path resolving to a directory on our server cannot
+        resolve into a redirect to another server.
+        """
+        os.mkdir(os.path.join(self.tempdir, 'existing_directory'))
+        url = f'/python.org/..%2f..%2f..%2f..%2f..%2f../%0a%0d/../{self.tempdir_name}/existing_directory'
+        expected_location = f'{url}/'  # /python.org.../ single slash single prefix, trailing slash
+        # Canonicalizes to /tmp/tempdir_name/existing_directory which does
+        # exist and is a dir, triggering the 301 redirect logic.
+        response = self.request(url)
+        self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY)
+        location = response.getheader('Location')
+        self.assertEqual(location, expected_location, msg='non-attack failed!')
+
+        # //python.org... multi-slash prefix, no trailing slash
+        attack_url = f'/{url}'
+        response = self.request(attack_url)
+        self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY)
+        location = response.getheader('Location')
+        self.assertFalse(location.startswith('//'), msg=location)
+        self.assertEqual(location, expected_location,
+                msg='Expected Location header to start with a single / and '
+                'end with a / as this is a directory redirect.')
+
+        # ///python.org... triple-slash prefix, no trailing slash
+        attack3_url = f'//{url}'
+        response = self.request(attack3_url)
+        self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY)
+        self.assertEqual(response.getheader('Location'), expected_location)
+
+        # If the second word in the http request (Request-URI for the http
+        # method) has a scheme and netloc, it still gets treated as an
+        # absolute path by the server.  In that case, the redirect is
+        # constructed so it is parsed as a path.  The './' part of the path
+        # is added by urlunsplit() so that the 'https:' part of what is being
+        # treated as a path is not treated as a scheme in the redirect
+        # location.  http.server is not a proxy and doesn't handle Request-URI
+        # being an absolute URI with a scheme and or netloc.
+        attack_scheme_netloc_2slash_url = f'https://pypi.org/{url}'
+        expected_location = f'./{attack_scheme_netloc_2slash_url}/'
+        response = self.request(attack_scheme_netloc_2slash_url)
+        self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY)
+        location = response.getheader('Location')
+        self.assertEqual(location, expected_location)
+
     def test_get(self):
         #constructs the path relative to the root directory of the HTTPServer
         response = self.request(self.base_url + '/test')

diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
@@ -1101,6 +1101,25 @@ def test_urlsplit_normalization(self):
                         with self.assertRaises(ValueError):
                             urllib.parse.urlsplit(url)
 
+    def test_urlunsplit_relative(self):
+        cases = [
+            # expected result is a relative URL without netloc and scheme
+            (('', 'a', '', '', ''), '//a'),
+            # extra leading slashes need to be stripped to avoid confusion
+            # with a relative URL
+            (('', '', '//a', '', ''), '/a'),
+            (('', '', '///a', '', ''), '/a'),
+            # not relative so extra leading slashes don't need stripping since
+            # they don't cause confusion
+            (('http', 'x.y', '//a', '', ''), 'http://x.y//a'),
+            # avoid confusion with path containing colon
+            (('', '', 'a:b', '', ''), './a:b'),
+        ]
+        for parts, result in cases:
+            self.assertEqual(urllib.parse.urlunsplit(parts), result,
+                             msg=f'{parts=}')
+
+
 class Utility_Tests(unittest.TestCase):
     """Testcase to test the various utility functions in the urllib."""
     # In Python 2 this test class was in test_urllib.

diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
@@ -491,14 +491,32 @@ def urlunparse(components):
         url = "%s;%s" % (url, params)
     return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
 
+# Returns true if path can confused with a scheme.  I.e. a relative path
+# without leading dot that includes a colon in the first component.
+_is_scheme_like = re.compile(r'[^/.][^/]*:').match
+
 def urlunsplit(components):
     """Combine the elements of a tuple as returned by urlsplit() into a
     complete URL as a string. The data argument can be any five-item iterable.
     This may result in a slightly different, but equivalent URL, if the URL that
     was parsed originally had unnecessary delimiters (for example, a ? with an
     empty query; the RFC states that these are equivalent)."""
-    scheme, netloc, url, query, fragment, _coerce_result = (
+    scheme, netloc, path, query, fragment, _coerce_result = (
                                           _coerce_args(*components))
+    if not scheme and not netloc:
+        # Building a relative URI.  Need to be careful that path is not
+        # confused with scheme or netloc.
+        if path.startswith('//'):
+            # gh-87389: don't treat first component of path as netloc
+            url = '/' + path.lstrip('/')
+        elif _is_scheme_like(path):
+            # first component has colon, ensure it will not be parsed as the
+            # scheme
+            url = './' + path
+        else:
+            url = path
+    else:
+        url = path
     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
         if url and url[:1] != '/': url = '/' + url
         url = '//' + (netloc or '') + url

diff --git a/Misc/NEWS.d/next/Security/2022-06-16-12-13-55.gh-issue-87389.MS9wAR.rst b/Misc/NEWS.d/next/Security/2022-06-16-12-13-55.gh-issue-87389.MS9wAR.rst
@@ -0,0 +1,5 @@
+Change :func:`urllib.parse.urlunsplit` to sanitize ``path`` argument in order
+to avoid confusing the first component of the path as a net location or
+scheme.
+
+Co-authored-by: Gregory P. Smith <greg@krypto.org> [Google]