Skip to content

Commit

Permalink
pythongh-66543: Fix mimetypes.guess_type() (pythonGH-117217)
Browse files Browse the repository at this point in the history
Fix parsing of the following corner cases:

* URLs with only a host name
* URLs containing a fragment
* URLs containing a query
* filenames with only a UNC sharepoint on Windows

Co-authored-by: Dong-hee Na <donghee.na92@gmail.com>
  • Loading branch information
2 people authored and diegorusso committed Apr 17, 2024
1 parent 6ca8a3e commit 13bf03d
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 9 deletions.
8 changes: 7 additions & 1 deletion Lib/mimetypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,13 @@ def guess_type(self, url, strict=True):
but non-standard types.
"""
url = os.fspath(url)
scheme, url = urllib.parse._splittype(url)
p = urllib.parse.urlparse(url)
if p.scheme and len(p.scheme) > 1:
scheme = p.scheme
url = p.path
else:
scheme = None
url = os.path.splitdrive(url)[1]
if scheme == 'data':
# syntax of data URLs:
# dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
Expand Down
40 changes: 33 additions & 7 deletions Lib/test/test_mimetypes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import io
import mimetypes
import os
import pathlib
import sys
import unittest.mock
Expand Down Expand Up @@ -109,15 +110,40 @@ def test_filename_with_url_delimiters(self):
# compared to when interpreted as filename because of the semicolon.
eq = self.assertEqual
gzip_expected = ('application/x-tar', 'gzip')
eq(self.db.guess_type(";1.tar.gz"), gzip_expected)
eq(self.db.guess_type("?1.tar.gz"), gzip_expected)
eq(self.db.guess_type("#1.tar.gz"), gzip_expected)
eq(self.db.guess_type("#1#.tar.gz"), gzip_expected)
eq(self.db.guess_type(";1#.tar.gz"), gzip_expected)
eq(self.db.guess_type(";&1=123;?.tar.gz"), gzip_expected)
eq(self.db.guess_type("?k1=v1&k2=v2.tar.gz"), gzip_expected)
for name in (
';1.tar.gz',
'?1.tar.gz',
'#1.tar.gz',
'#1#.tar.gz',
';1#.tar.gz',
';&1=123;?.tar.gz',
'?k1=v1&k2=v2.tar.gz',
):
for prefix in ('', '/', '\\',
'c:', 'c:/', 'c:\\', 'c:/d/', 'c:\\d\\',
'//share/server/', '\\\\share\\server\\'):
path = prefix + name
with self.subTest(path=path):
eq(self.db.guess_type(path), gzip_expected)
expected = (None, None) if os.name == 'nt' else gzip_expected
for prefix in ('//', '\\\\', '//share/', '\\\\share\\'):
path = prefix + name
with self.subTest(path=path):
eq(self.db.guess_type(path), expected)
eq(self.db.guess_type(r" \"\`;b&b&c |.tar.gz"), gzip_expected)

def test_url(self):
    """Check guess_type() on http URLs.

    Covers a URL with only a host name, a URL with a file path, and
    URLs whose fragment or query ends in a different extension.
    """
    html = ('text/html', None)
    # (url, expected result, assertion message; None uses the default)
    cases = (
        ('http://host.html', (None, None),
         'URL only has a host name, not a file'),
        ('http://example.com/host.html', html,
         'Should be text/html'),
        ('http://example.com/host.html#x.tar', html, None),
        ('http://example.com/host.html?q=x.tar', html, None),
    )
    for url, expected, msg in cases:
        self.assertSequenceEqual(self.db.guess_type(url), expected, msg)

def test_guess_all_types(self):
# First try strict. Use a set here for testing the results because if
# test_urllib2 is run before test_mimetypes, global state is modified
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_urllib2.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,7 @@ def connect_ftp(self, user, passwd, host, port, dirs,
["foo", "bar"], "", None),
("ftp://localhost/baz.gif;type=a",
"localhost", ftplib.FTP_PORT, "", "", "A",
[], "baz.gif", None), # XXX really this should guess image/gif
[], "baz.gif", "image/gif"),
]:
req = Request(url)
req.timeout = None
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Make :func:`mimetypes.guess_type` properly parse URLs with only a host
name, URLs containing a fragment or query, and filenames with only a UNC
sharepoint on Windows.
Based on patch by Dong-hee Na.

0 comments on commit 13bf03d

Please sign in to comment.