Add checks against requirements-file-dwelling hashes for most kinds of packages. Close #1175.

* Add --require-hashes option. This is handy in deployment scripts to force application authors to hash their requirements. It is also a convenient way to get pip to show computed hashes for a virgin, unhashed requirements file. Eventually, additions to `pip freeze` should fill a superset of this use case. (A usage sketch follows this list.)
  * In --require-hashes mode, at least one hash is required to match for each requirement.
  * Option-based requirements (--sha256=...) turn on --require-hashes mode implicitly.
  * Internet-derived URL-based hashes are "necessary but not sufficient": they do not satisfy --require-hashes mode when they match, but they are still used to guard against transmission errors.
  * Other URL-based requirements (#md5=...) are treated just like flag-based ones, except they don't turn on --require-hashes.
* Complain informatively, with the most devastating errors first so you don't chase your tail all day only to run up against a brick wall at the end. This also means we don't complain that a hash is missing, only for the user to find, after fixing it, that we have no idea how to even compute a hash for that type of requirement.
  * Complain about unpinned requirements when hash-checking mode is on, lest they cause the user surprise later.
  * Complain about missing hashes.
  * Complain about requirement types we don't know how to hash (like VCS ones and local dirs).
* Have InstallRequirement keep its original Link around (original_link) so we can differentiate between URL hashes from requirements files and ones downloaded from the (untrustworthy) internet.
* Remove test_download_hashes, which is obsolete. Similar coverage is provided in test_utils.TestHashes and the various hash cases in test_req.py.
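A minimal usage sketch of the workflow described above (the package name and digest are placeholders, not real values):

    # requirements.txt
    SomePackage==1.2.3 --sha256=0000000000000000000000000000000000000000000000000000000000000000

    $ pip install --require-hashes -r requirements.txt

Since a per-requirement hash flag such as --sha256 already implies hash-checking mode, the explicit --require-hashes flag mainly serves to make the install fail loudly when some requirement carries no hash at all.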
erikrose committed Sep 25, 2015
1 parent 3303be0 commit 1e41f01
Showing 14 changed files with 777 additions and 378 deletions.
10 changes: 10 additions & 0 deletions pip/commands/install.py
@@ -159,6 +159,15 @@ def __init__(self, *args, **kw):

cmd_opts.add_option(cmdoptions.no_clean())

cmd_opts.add_option(
'--require-hashes',
dest='require_hashes',
action='store_true',
help='Perform a provably repeatable installation by requiring a '
'hash to check each package against. Implied by the presence '
'of a hash flag, like --sha256, on any individual '
'requirement')

index_opts = cmdoptions.make_option_group(
cmdoptions.index_group,
self.parser,
@@ -266,6 +275,7 @@ def run(self, options, args):
pycompile=options.compile,
isolated=options.isolated_mode,
wheel_cache=wheel_cache,
require_hashes=options.require_hashes,
)

self.populate_requirement_set(
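For context, the flag added above goes through pip's optparse-based option machinery and surfaces as options.require_hashes, which the run() hunk then hands to the requirement set it builds. A standalone sketch of the same flag definition (hypothetical script, not part of this commit):

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option(
        '--require-hashes',
        dest='require_hashes',
        action='store_true',
        default=False,
        help='Require a hash to check each package against.')

    options, args = parser.parse_args(['--require-hashes'])
    assert options.require_hashes is True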
128 changes: 57 additions & 71 deletions pip/download.py
@@ -29,7 +29,7 @@
from pip.models import PyPI
from pip.utils import (splitext, rmtree, format_size, display_path,
backup_dir, ask_path_exists, unpack_file,
call_subprocess, ARCHIVE_EXTENSIONS)
call_subprocess, ARCHIVE_EXTENSIONS, consume)
from pip.utils.filesystem import check_path_owner
from pip.utils.logging import indent_log
from pip.utils.ui import DownloadProgressBar, DownloadProgressSpinner
@@ -485,57 +485,22 @@ def is_file_url(link):
return link.url.lower().startswith('file:')


def _check_hash(download_hash, link):
if download_hash.digest_size != hashlib.new(link.hash_name).digest_size:
logger.critical(
"Hash digest size of the package %d (%s) doesn't match the "
"expected hash name %s!",
download_hash.digest_size, link, link.hash_name,
)
raise HashMismatch('Hash name mismatch for package %s' % link)
if download_hash.hexdigest() != link.hash:
logger.critical(
"Hash of the package %s (%s) doesn't match the expected hash %s!",
link, download_hash.hexdigest(), link.hash,
)
raise HashMismatch(
'Bad %s hash for package %s' % (link.hash_name, link)
)
def _get_hash_from_file(target_file, link):
    try:
        download_hash = hashlib.new(link.hash_name)
    except (ValueError, TypeError):
        logger.warning(
            "Unsupported hash name %s for package %s", link.hash_name, link,
        )
        return None

    with open(target_file, 'rb') as fp:
        while True:
            chunk = fp.read(4096)
            if not chunk:
                break
            download_hash.update(chunk)
    return download_hash


def is_dir_url(link):
    """Return whether a file:// Link points to a directory.

    ``link`` must not have any other scheme but file://. Call is_file_url()
    first.
    """
    link_path = url_to_path(link.url_without_fragment)
    return os.path.isdir(link_path)


def _progress_indicator(iterable, *args, **kwargs):
return iterable


def _download_url(resp, link, content_file):
download_hash = None
if link.hash and link.hash_name:
try:
download_hash = hashlib.new(link.hash_name)
except ValueError:
logger.warning(
"Unsupported hash name %s for package %s",
link.hash_name, link,
)

def _download_url(resp, link, content_file, hashes):
try:
total_length = int(resp.headers['content-length'])
except (ValueError, KeyError, TypeError):
@@ -593,6 +558,11 @@ def resp_read(chunk_size):
break
yield chunk

def written_chunks(chunks):
for chunk in chunks:
content_file.write(chunk)
yield chunk

progress_indicator = _progress_indicator

if link.netloc == PyPI.netloc:
@@ -614,13 +584,12 @@ def resp_read(chunk_size):

logger.debug('Downloading from URL %s', link)

for chunk in progress_indicator(resp_read(4096), 4096):
if download_hash is not None:
download_hash.update(chunk)
content_file.write(chunk)
if link.hash and link.hash_name:
_check_hash(download_hash, link)
return download_hash
downloaded_chunks = written_chunks(progress_indicator(resp_read(4096),
4096))
if hashes:
hashes.check_against_chunks(downloaded_chunks)
else:
consume(downloaded_chunks)


def _copy_file(filename, location, content_type, link):
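The rewritten download loop streams each chunk through written_chunks() and then either verifies the stream with hashes.check_against_chunks() or simply drains it with consume(). Both helpers live outside this file (the Hashes class is added in pip/utils/hashes.py elsewhere in this commit); the sketch below is a simplified guess at their shape using only names visible in this diff, not the commit's actual implementation:

    # Illustrative sketch only -- simplified, not the code this commit adds.
    import hashlib
    from collections import deque

    from pip.exceptions import HashMismatch


    def consume(iterator):
        """Exhaust an iterator for its side effects (here, the writes done
        by written_chunks())."""
        deque(iterator, maxlen=0)


    class Hashes(object):
        def __init__(self, allowed=None):
            # Maps hash names to lists of acceptable hex digests,
            # e.g. {'sha256': ['<hex digest>']}.
            self._allowed = allowed or {}

        def __nonzero__(self):
            # An empty Hashes is falsy, so callers can guard with `if hashes:`.
            return bool(self._allowed)
        __bool__ = __nonzero__

        def check_against_chunks(self, chunks):
            # Feed every chunk to one hasher per expected hash name; succeed
            # as soon as any computed digest matches an allowed one.
            gots = dict((name, hashlib.new(name)) for name in self._allowed)
            for chunk in chunks:
                for hasher in gots.values():
                    hasher.update(chunk)
            for name, hasher in gots.items():
                if hasher.hexdigest() in self._allowed[name]:
                    return
            raise HashMismatch('none of the expected hashes matched')

        def check_against_path(self, path):
            # Stream a file in 4 KB chunks and reuse the chunk-based check.
            with open(path, 'rb') as f:
                return self.check_against_chunks(iter(lambda: f.read(4096), b''))

Passing the chunk generator into the checker, rather than re-reading the file afterwards, is what lets a single pass both write the download and verify it.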
@@ -648,7 +617,11 @@ def _copy_file(filename, location, content_type, link):
logger.info('Saved %s', display_path(download_location))


def unpack_http_url(link, location, download_dir=None, session=None):
def unpack_http_url(link,
location,
download_dir=None,
session=None,
hashes=None):
if session is None:
raise TypeError(
"unpack_http_url() missing 1 required keyword argument: 'session'"
@@ -659,14 +632,19 @@ def unpack_http_url(link, location, download_dir=None, session=None):
# If a download dir is specified, is the file already downloaded there?
already_downloaded_path = None
if download_dir:
already_downloaded_path = _check_download_dir(link, download_dir)
already_downloaded_path = _check_download_dir(link,
download_dir,
hashes)

if already_downloaded_path:
from_path = already_downloaded_path
content_type = mimetypes.guess_type(from_path)[0]
else:
# let's download to a tmp dir
from_path, content_type = _download_http_url(link, session, temp_dir)
from_path, content_type = _download_http_url(link,
session,
temp_dir,
hashes)

# unpack the archive to the build dir location. even when only downloading
# archives, they have to be unpacked to parse dependencies
@@ -681,31 +659,34 @@ def unpack_http_url(link, location, download_dir=None, session=None):
rmtree(temp_dir)


def unpack_file_url(link, location, download_dir=None):
def unpack_file_url(link, location, download_dir=None, hashes=None):
"""Unpack link into location.
If download_dir is provided and link points to a file, make a copy
of the link file inside download_dir."""
If download_dir is provided and link points to a file, make a copy
of the link file inside download_dir.
"""
link_path = url_to_path(link.url_without_fragment)

# If it's a url to a local directory
if os.path.isdir(link_path):
if is_dir_url(link):
if os.path.isdir(location):
rmtree(location)
shutil.copytree(link_path, location, symlinks=True)
if download_dir:
logger.info('Link is a directory, ignoring download_dir')
return

# if link has a hash, let's confirm it matches
if link.hash:
link_path_hash = _get_hash_from_file(link_path, link)
_check_hash(link_path_hash, link)
# If --require-hashes is off, `hashes` is either empty, the link hash, or
# MissingHashes, and it's required to match. If --require-hashes is on, we
# are satisfied by any hash in `hashes` matching: a URL-based or an
# option-based one; no internet-sourced hash will be in `hashes`.
if hashes:
hashes.check_against_path(link_path)

# If a download dir is specified, is the file already there and valid?
already_downloaded_path = None
if download_dir:
already_downloaded_path = _check_download_dir(link, download_dir)
already_downloaded_path = _check_download_dir(link, download_dir, hashes)

if already_downloaded_path:
from_path = already_downloaded_path
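The MissingHashes mentioned in the comment above is, roughly, a Hashes that can never be satisfied: it lets hash-checking mode treat "no hash supplied" as a failed check whose report can show the digests pip actually computed. Continuing the hypothetical sketch from earlier (again a guess at the shape, not the commit's real class):

    class MissingHashes(Hashes):
        """A Hashes whose list of acceptable digests is empty, so any real
        file fails the check and its computed digest gets surfaced."""

        def __init__(self):
            # No digest can ever match an empty list of allowed values.
            super(MissingHashes, self).__init__({'sha256': []})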
@@ -752,7 +733,7 @@ def request(self, host, handler, request_body, verbose=False):


def unpack_url(link, location, download_dir=None,
only_download=False, session=None):
only_download=False, session=None, hashes=None):
"""Unpack link.
If link is a VCS link:
if only_download, export into download_dir and ignore location
@@ -761,14 +742,19 @@ def unpack_url(link, location, download_dir=None,
- unpack into location
- if download_dir, copy the file into download_dir
- if only_download, mark location for deletion
:param hashes: A Hashes object, one of whose embedded hashes must match,
or I'll raise HashMismatch. If the Hashes is empty, no matches are
required, and unhashable types of requirements (like VCS ones, which
would ordinarily raise HashUnsupported) are allowed.
"""
# non-editable vcs urls
if is_vcs_url(link):
unpack_vcs_link(link, location)

# file urls
elif is_file_url(link):
unpack_file_url(link, location, download_dir)
unpack_file_url(link, location, download_dir, hashes=hashes)

# http urls
else:
@@ -780,12 +766,13 @@ def unpack_url(link, location, download_dir=None,
location,
download_dir,
session,
hashes=hashes
)
if only_download:
write_delete_marker_file(location)


def _download_http_url(link, session, temp_dir):
def _download_http_url(link, session, temp_dir, hashes):
"""Download link url into temp_dir using provided session"""
target_url = link.url.split('#', 1)[0]
try:
@@ -840,22 +827,21 @@ def _download_http_url(link, session, temp_dir):
filename += ext
file_path = os.path.join(temp_dir, filename)
with open(file_path, 'wb') as content_file:
_download_url(resp, link, content_file)
_download_url(resp, link, content_file, hashes)
return file_path, content_type


def _check_download_dir(link, download_dir):
def _check_download_dir(link, download_dir, hashes):
""" Check download_dir for previously downloaded file with correct hash
If a correct file is found return its path else None
"""
download_path = os.path.join(download_dir, link.filename)
if os.path.exists(download_path):
# If already downloaded, does its hash match?
logger.info('File was already downloaded %s', download_path)
if link.hash:
download_hash = _get_hash_from_file(download_path, link)
if hashes:
try:
_check_hash(download_hash, link)
hashes.check_against_path(download_path)
except HashMismatch:
logger.warning(
'Previously-downloaded file %s has bad hash. '
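Note how _check_download_dir treats a hash mismatch on a previously downloaded file as a stale cache entry rather than a fatal error. A standalone sketch of that pattern (hypothetical helper name; whether pip deletes the offending file here is an assumption, since the rest of the function is truncated above):

    import os

    from pip.exceptions import HashMismatch


    def cached_download_path(filename, download_dir, hashes):
        """Return the previously downloaded file if it exists and satisfies
        `hashes`; otherwise return None so the caller re-downloads."""
        path = os.path.join(download_dir, filename)
        if not os.path.exists(path):
            return None
        if hashes:
            try:
                hashes.check_against_path(path)
            except HashMismatch:
                os.unlink(path)  # assumption: discard the bad file
                return None
        return path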