Add checks against requirements-file-dwelling hashes for most kinds of packages. Close #1175.

* Add --require-hashes option. This is handy in deployment scripts to force application authors to hash their requirements. It is also a convenient way to get pip to show computed hashes for a virgin, unhashed requirements file. Eventually, additions to `pip freeze` should fill a superset of this use case. (A usage sketch follows this list.)
  * In --require-hashes mode, at least one hash is required to match for each requirement.
  * Option-based requirements (--sha256=...) turn on --require-hashes mode implicitly.
  * Internet-derived URL-based hashes are "necessary but not sufficient": they do not satisfy --require-hashes mode when they match, but they are still used to guard against transmission errors.
  * Other URL-based requirements (#md5=...) are treated just like flag-based ones, except they don't turn on --require-hashes.
* Complain informatively, with the most devastating errors first so you don't chase your tail all day only to run up against a brick wall at the end. This also means we don't complain that a hash is missing, only for the user to find, after fixing it, that we have no idea how to even compute a hash for that type of requirement.
  * Complain about unpinned requirements when hash-checking mode is on, lest they cause the user surprise later.
  * Complain about missing hashes.
  * Complain about requirement types we don't know how to hash (like VCS ones and local dirs).
* Have InstallRequirement keep its original Link around (original_link) so we can differentiate between URL hashes from requirements files and ones downloaded from the (untrustworthy) internet.
* Remove test_download_hashes, which is obsolete. Similar coverage is provided in test_utils.TestHashes and the various hash cases in test_req.py.
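A minimal usage sketch of the workflow described above (the package name and digest are placeholders, not real values):

    # requirements.txt
    SomePackage==1.2.3 --sha256=0000000000000000000000000000000000000000000000000000000000000000

    $ pip install --require-hashes -r requirements.txt

Since a per-requirement hash flag such as --sha256 already implies hash-checking mode, the explicit --require-hashes flag mainly serves to make the install fail loudly when some requirement carries no hash at all.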
erikrose committed Sep 25, 2015
1 parent 3303be0 commit 1e41f01
Showing 14 changed files with 777 additions and 378 deletions.
10 changes: 10 additions & 0 deletions pip/commands/install.py
@@ -159,6 +159,15 @@ def __init__(self, *args, **kw):

cmd_opts.add_option(cmdoptions.no_clean())

cmd_opts.add_option(
'--require-hashes',
dest='require_hashes',
action='store_true',
help='Perform a provably repeatable installation by requiring a '
'hash to check each package against. Implied by the presence '
'of a hash flag, like --sha256, on any individual '
'requirement')

index_opts = cmdoptions.make_option_group(
cmdoptions.index_group,
self.parser,
@@ -266,6 +275,7 @@ def run(self, options, args):
pycompile=options.compile,
isolated=options.isolated_mode,
wheel_cache=wheel_cache,
require_hashes=options.require_hashes,
)

self.populate_requirement_set(
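For context, the flag added above goes through pip's optparse-based option machinery and surfaces as options.require_hashes, which the run() hunk then hands to the requirement set it builds. A standalone sketch of the same flag definition (hypothetical script, not part of this commit):

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option(
        '--require-hashes',
        dest='require_hashes',
        action='store_true',
        default=False,
        help='Require a hash to check each package against.')

    options, args = parser.parse_args(['--require-hashes'])
    assert options.require_hashes is True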
128 changes: 57 additions & 71 deletions pip/download.py
@@ -29,7 +29,7 @@
from pip.models import PyPI
from pip.utils import (splitext, rmtree, format_size, display_path,
backup_dir, ask_path_exists, unpack_file,
call_subprocess, ARCHIVE_EXTENSIONS)
call_subprocess, ARCHIVE_EXTENSIONS, consume)
from pip.utils.filesystem import check_path_owner
from pip.utils.logging import indent_log
from pip.utils.ui import DownloadProgressBar, DownloadProgressSpinner
@@ -485,57 +485,22 @@ def is_file_url(link):
return link.url.lower().startswith('file:')


def _check_hash(download_hash, link):
if download_hash.digest_size != hashlib.new(link.hash_name).digest_size:
logger.critical(
"Hash digest size of the package %d (%s) doesn't match the "
"expected hash name %s!",
download_hash.digest_size, link, link.hash_name,
)
raise HashMismatch('Hash name mismatch for package %s' % link)
if download_hash.hexdigest() != link.hash:
logger.critical(
"Hash of the package %s (%s) doesn't match the expected hash %s!",
link, download_hash.hexdigest(), link.hash,
)
raise HashMismatch(
'Bad %s hash for package %s' % (link.hash_name, link)
)
def _get_hash_from_file(target_file, link):
    try:
        download_hash = hashlib.new(link.hash_name)
    except (ValueError, TypeError):
        logger.warning(
            "Unsupported hash name %s for package %s", link.hash_name, link,
        )
        return None

    with open(target_file, 'rb') as fp:
        while True:
            chunk = fp.read(4096)
            if not chunk:
                break
            download_hash.update(chunk)
    return download_hash


def is_dir_url(link):
    """Return whether a file:// Link points to a directory.

    ``link`` must not have any other scheme but file://. Call is_file_url()
    first.
    """
    link_path = url_to_path(link.url_without_fragment)
    return os.path.isdir(link_path)


def _progress_indicator(iterable, *args, **kwargs):
return iterable


def _download_url(resp, link, content_file):
download_hash = None
if link.hash and link.hash_name:
try:
download_hash = hashlib.new(link.hash_name)
except ValueError:
logger.warning(
"Unsupported hash name %s for package %s",
link.hash_name, link,
)

def _download_url(resp, link, content_file, hashes):
try:
total_length = int(resp.headers['content-length'])
except (ValueError, KeyError, TypeError):
@@ -593,6 +558,11 @@ def resp_read(chunk_size):
break
yield chunk

def written_chunks(chunks):
for chunk in chunks:
content_file.write(chunk)
yield chunk

progress_indicator = _progress_indicator

if link.netloc == PyPI.netloc:
@@ -614,13 +584,12 @@ def resp_read(chunk_size):

logger.debug('Downloading from URL %s', link)

for chunk in progress_indicator(resp_read(4096), 4096):
if download_hash is not None:
download_hash.update(chunk)
content_file.write(chunk)
if link.hash and link.hash_name:
_check_hash(download_hash, link)
return download_hash
downloaded_chunks = written_chunks(progress_indicator(resp_read(4096),
4096))
if hashes:
hashes.check_against_chunks(downloaded_chunks)
else:
consume(downloaded_chunks)


def _copy_file(filename, location, content_type, link):
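The rewritten download loop streams each chunk through written_chunks() and then either verifies the stream with hashes.check_against_chunks() or simply drains it with consume(). Both helpers live outside this file (the Hashes class is added in pip/utils/hashes.py elsewhere in this commit); the sketch below is a simplified guess at their shape using only names visible in this diff, not the commit's actual implementation:

    # Illustrative sketch only -- simplified, not the code this commit adds.
    import hashlib
    from collections import deque

    from pip.exceptions import HashMismatch


    def consume(iterator):
        """Exhaust an iterator for its side effects (here, the writes done
        by written_chunks())."""
        deque(iterator, maxlen=0)


    class Hashes(object):
        def __init__(self, allowed=None):
            # Maps hash names to lists of acceptable hex digests,
            # e.g. {'sha256': ['<hex digest>']}.
            self._allowed = allowed or {}

        def __nonzero__(self):
            # An empty Hashes is falsy, so callers can guard with `if hashes:`.
            return bool(self._allowed)
        __bool__ = __nonzero__

        def check_against_chunks(self, chunks):
            # Feed every chunk to one hasher per expected hash name; succeed
            # as soon as any computed digest matches an allowed one.
            gots = dict((name, hashlib.new(name)) for name in self._allowed)
            for chunk in chunks:
                for hasher in gots.values():
                    hasher.update(chunk)
            for name, hasher in gots.items():
                if hasher.hexdigest() in self._allowed[name]:
                    return
            raise HashMismatch('none of the expected hashes matched')

        def check_against_path(self, path):
            # Stream a file in 4 KB chunks and reuse the chunk-based check.
            with open(path, 'rb') as f:
                return self.check_against_chunks(iter(lambda: f.read(4096), b''))

Passing the chunk generator into the checker, rather than re-reading the file afterwards, is what lets a single pass both write the download and verify it.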
@@ -648,7 +617,11 @@ def _copy_file(filename, location, content_type, link):
logger.info('Saved %s', display_path(download_location))


def unpack_http_url(link, location, download_dir=None, session=None):
def unpack_http_url(link,
location,
download_dir=None,
session=None,
hashes=None):
if session is None:
raise TypeError(
"unpack_http_url() missing 1 required keyword argument: 'session'"
@@ -659,14 +632,19 @@ def unpack_http_url(link, location, download_dir=None, session=None):
# If a download dir is specified, is the file already downloaded there?
already_downloaded_path = None
if download_dir:
already_downloaded_path = _check_download_dir(link, download_dir)
already_downloaded_path = _check_download_dir(link,
download_dir,
hashes)

if already_downloaded_path:
from_path = already_downloaded_path
content_type = mimetypes.guess_type(from_path)[0]
else:
# let's download to a tmp dir
from_path, content_type = _download_http_url(link, session, temp_dir)
from_path, content_type = _download_http_url(link,
session,
temp_dir,
hashes)

# unpack the archive to the build dir location. even when only downloading
# archives, they have to be unpacked to parse dependencies
@@ -681,31 +659,34 @@ def unpack_http_url(link, location, download_dir=None, session=None):
rmtree(temp_dir)


def unpack_file_url(link, location, download_dir=None):
def unpack_file_url(link, location, download_dir=None, hashes=None):
"""Unpack link into location.
If download_dir is provided and link points to a file, make a copy
of the link file inside download_dir."""
If download_dir is provided and link points to a file, make a copy
of the link file inside download_dir.
"""
link_path = url_to_path(link.url_without_fragment)

# If it's a url to a local directory
if os.path.isdir(link_path):
if is_dir_url(link):
if os.path.isdir(location):
rmtree(location)
shutil.copytree(link_path, location, symlinks=True)
if download_dir:
logger.info('Link is a directory, ignoring download_dir')
return

# if link has a hash, let's confirm it matches
if link.hash:
link_path_hash = _get_hash_from_file(link_path, link)
_check_hash(link_path_hash, link)
# If --require-hashes is off, `hashes` is either empty, the link hash, or
# MissingHashes, and it's required to match. If --require-hashes is on, we
# are satisfied by any hash in `hashes` matching: a URL-based or an
# option-based one; no internet-sourced hash will be in `hashes`.
if hashes:
hashes.check_against_path(link_path)

# If a download dir is specified, is the file already there and valid?
already_downloaded_path = None
if download_dir:
already_downloaded_path = _check_download_dir(link, download_dir)
already_downloaded_path = _check_download_dir(link, download_dir, hashes)

if already_downloaded_path:
from_path = already_downloaded_path
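The MissingHashes mentioned in the comment above is, roughly, a Hashes that can never be satisfied: it lets hash-checking mode treat "no hash supplied" as a failed check whose report can show the digests pip actually computed. Continuing the hypothetical sketch from earlier (again a guess at the shape, not the commit's real class):

    class MissingHashes(Hashes):
        """A Hashes whose list of acceptable digests is empty, so any real
        file fails the check and its computed digest gets surfaced."""

        def __init__(self):
            # No digest can ever match an empty list of allowed values.
            super(MissingHashes, self).__init__({'sha256': []})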
@@ -752,7 +733,7 @@ def request(self, host, handler, request_body, verbose=False):


def unpack_url(link, location, download_dir=None,
only_download=False, session=None):
only_download=False, session=None, hashes=None):
"""Unpack link.
If link is a VCS link:
if only_download, export into download_dir and ignore location
@@ -761,14 +742,19 @@ def unpack_url(link, location, download_dir=None,
- unpack into location
- if download_dir, copy the file into download_dir
- if only_download, mark location for deletion
:param hashes: A Hashes object, one of whose embedded hashes must match,
or I'll raise HashMismatch. If the Hashes is empty, no matches are
required, and unhashable types of requirements (like VCS ones, which
would ordinarily raise HashUnsupported) are allowed.
"""
# non-editable vcs urls
if is_vcs_url(link):
unpack_vcs_link(link, location)

# file urls
elif is_file_url(link):
unpack_file_url(link, location, download_dir)
unpack_file_url(link, location, download_dir, hashes=hashes)

# http urls
else:
@@ -780,12 +766,13 @@ def unpack_url(link, location, download_dir=None,
location,
download_dir,
session,
hashes=hashes
)
if only_download:
write_delete_marker_file(location)


def _download_http_url(link, session, temp_dir):
def _download_http_url(link, session, temp_dir, hashes):
"""Download link url into temp_dir using provided session"""
target_url = link.url.split('#', 1)[0]
try:
@@ -840,22 +827,21 @@ def _download_http_url(link, session, temp_dir):
filename += ext
file_path = os.path.join(temp_dir, filename)
with open(file_path, 'wb') as content_file:
_download_url(resp, link, content_file)
_download_url(resp, link, content_file, hashes)
return file_path, content_type


def _check_download_dir(link, download_dir):
def _check_download_dir(link, download_dir, hashes):
""" Check download_dir for previously downloaded file with correct hash
If a correct file is found return its path else None
"""
download_path = os.path.join(download_dir, link.filename)
if os.path.exists(download_path):
# If already downloaded, does its hash match?
logger.info('File was already downloaded %s', download_path)
if link.hash:
download_hash = _get_hash_from_file(download_path, link)
if hashes:
try:
_check_hash(download_hash, link)
hashes.check_against_path(download_path)
except HashMismatch:
logger.warning(
'Previously-downloaded file %s has bad hash. '
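Note how _check_download_dir treats a hash mismatch on a previously downloaded file as a stale cache entry rather than a fatal error. A standalone sketch of that pattern (hypothetical helper name; whether pip deletes the offending file here is an assumption, since the rest of the function is truncated above):

    import os

    from pip.exceptions import HashMismatch


    def cached_download_path(filename, download_dir, hashes):
        """Return the previously downloaded file if it exists and satisfies
        `hashes`; otherwise return None so the caller re-downloads."""
        path = os.path.join(download_dir, filename)
        if not os.path.exists(path):
            return None
        if hashes:
            try:
                hashes.check_against_path(path)
            except HashMismatch:
                os.unlink(path)  # assumption: discard the bad file
                return None
        return path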