Skip to content

Commit

Permalink
Use a hash based file paths on *new* uploads
Browse files Browse the repository at this point in the history
Instead of using a file path like {pyver}/{name[0]}/{name}/{filename}, we will
instead use one that looks like {hash[:2]}/{hash[2:4]}/{hash[4:]}/{filename},
where `hash` is the blake2b(digest_size=32) hex digest (lowercased) of the
actual file contents. This will ensure that the contents referred to by a
specific file URL cannot change without also changing the URL.

This also adds File.blake2_256_digest to hold this new hash.
  • Loading branch information
dstufft committed Apr 19, 2016
1 parent b1e15f4 commit 43e9a7b
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 27 deletions.
1 change: 1 addition & 0 deletions requirements/main.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ packaging>=15.2
paginate>=0.5.2
passlib>=1.6.4
psycopg2
pyblake2
pyramid>=1.7a1
pyramid_jinja2>=2.5
pyramid_multiauth
Expand Down
2 changes: 2 additions & 0 deletions requirements/main.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ PasteDeploy==1.5.2 \
--hash=sha256:d5858f89a255e6294e63ed46b73613c56e3b9a2d82a42f1df4d06c8421a9e3cb
psycopg2==2.6.1 \
--hash=sha256:6acf9abbbe757ef75dc2ecd9d91ba749547941abaffbe69ff2086a9e37d4904c
pyblake2==0.9.3 \
--hash=sha256:626448e1fe1cc01d2197118954bec9f158378577e12686d5b01979f7f0fa2212
pycparser==2.14 \
--hash=sha256:7959b4a74abdc27b312fed1c21e6caf9309ce0b29ea86b591fd2e99ecdf27f73
Pygments==2.1.3 \
Expand Down
37 changes: 21 additions & 16 deletions tests/unit/legacy/api/test_pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

import hashlib
import io
import os.path
import re
import tempfile
import zipfile
Expand Down Expand Up @@ -733,12 +732,12 @@ def storage_service_store(path, file_path, *, meta):
assert db_request.find_service.calls == [pretend.call(IFileStorage)]
assert len(storage_service.store.calls) == 2 if has_signature else 1
assert storage_service.store.calls[0] == pretend.call(
os.path.join(
"source",
project.name[0],
project.name,
"/".join([
"4e",
"6e",
"fa4c0ee2bbad071b4f5b5ea68f1aea89fa716e7754eb13e2314d45a5916e",
filename,
),
]),
mock.ANY,
meta={
"project": project.normalized_name,
Expand All @@ -750,12 +749,15 @@ def storage_service_store(path, file_path, *, meta):

if has_signature:
assert storage_service.store.calls[1] == pretend.call(
os.path.join(
"source",
project.name[0],
project.name,
"/".join([
"4e",
"6e",
(
"fa4c0ee2bbad071b4f5b5ea68f1aea89fa716e7754eb13e2314d"
"45a5916e"
),
filename + ".asc",
),
]),
mock.ANY,
meta={
"project": project.normalized_name,
Expand Down Expand Up @@ -1352,12 +1354,15 @@ def storage_service_store(path, file_path, *, meta):
assert db_request.find_service.calls == [pretend.call(IFileStorage)]
assert storage_service.store.calls == [
pretend.call(
os.path.join(
"cp34",
project.name[0],
project.name,
"/".join([
"4e",
"6e",
(
"fa4c0ee2bbad071b4f5b5ea68f1aea89fa716e7754eb13e2314d4"
"5a5916e"
),
filename,
),
]),
mock.ANY,
meta={
"project": project.normalized_name,
Expand Down
20 changes: 20 additions & 0 deletions tests/unit/packaging/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,23 @@ def test_stores_metadata(self, tmpdir):
ExtraArgs={"Metadata": {"foo": "bar"}},
),
]

def test_hashed_path_with_prefix(self):
    """When a prefix is configured, it is prepended to the S3 key used
    for hashed paths, without the caller-visible path changing."""
    body = io.BytesIO(b"my contents")
    fake_key = pretend.stub(get=lambda: {"Body": body})
    fake_bucket = pretend.stub(
        Object=pretend.call_recorder(lambda path: fake_key),
    )
    storage = S3FileStorage(fake_bucket, prefix="packages/")

    retrieved = storage.get("ab/file.txt")

    assert retrieved.read() == b"my contents"
    assert fake_bucket.Object.calls == [pretend.call("packages/ab/file.txt")]

def test_hashed_path_without_prefix(self):
    """Without a configured prefix, the hashed path is used as the S3
    key verbatim."""
    body = io.BytesIO(b"my contents")
    fake_key = pretend.stub(get=lambda: {"Body": body})
    fake_bucket = pretend.stub(
        Object=pretend.call_recorder(lambda path: fake_key),
    )
    storage = S3FileStorage(fake_bucket)

    retrieved = storage.get("ab/file.txt")

    assert retrieved.read() == b"my contents"
    assert fake_bucket.Object.calls == [pretend.call("ab/file.txt")]
42 changes: 35 additions & 7 deletions warehouse/legacy/api/pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import wtforms.validators
from rfc3986 import uri_reference

from pyblake2 import blake2b
from pyramid.httpexceptions import HTTPBadRequest, HTTPForbidden, HTTPGone
from pyramid.response import Response
from pyramid.view import forbidden_view_config, view_config
Expand All @@ -44,6 +45,8 @@
MAX_FILESIZE = 60 * 1024 * 1024 # 60M
MAX_SIGSIZE = 8 * 1024 # 8K

PATH_HASHER = "blake2_256"


# Wheel platform checking
# These platforms can be handled by a simple static list:
Expand Down Expand Up @@ -381,6 +384,16 @@ class MetadataForm(forms.Form):
),
]
)
# Optional hex-encoded BLAKE2b-256 digest supplied by the uploader.
# When present it is compared (via hmac.compare_digest in file_upload)
# against the digest computed from the uploaded file's actual contents.
blake2_256_digest = wtforms.StringField(
    validators=[
        wtforms.validators.Optional(),
        wtforms.validators.Regexp(
            r"^[A-F0-9]{64}$",
            re.IGNORECASE,
            message="Must be a valid, hex encoded, blake2 message digest.",
        ),
    ]
)

# Legacy dependency information
requires = ListField(
Expand Down Expand Up @@ -775,7 +788,11 @@ def file_upload(request):
# go along.
with open(temporary_filename, "wb") as fp:
file_size = 0
file_hashes = {"md5": hashlib.md5(), "sha256": hashlib.sha256()}
file_hashes = {
"md5": hashlib.md5(),
"sha256": hashlib.sha256(),
"blake2_256": blake2b(digest_size=256 // 8),
}
for chunk in iter(
lambda: request.POST["content"].file.read(8096), b""):
file_size += len(chunk)
Expand All @@ -785,14 +802,20 @@ def file_upload(request):
for hasher in file_hashes.values():
hasher.update(chunk)

# Take our hash functions and compute the final hashes for them now.
file_hashes = {
k: h.hexdigest().lower()
for k, h in file_hashes.items()
}

# Actually verify the digests that we've gotten. We're going to use
# hmac.compare_digest even though we probably don't actually need to
# because it's better safe than sorry. In the case of multiple digests
# we expect them all to be given.
if not all([
hmac.compare_digest(
getattr(form, "{}_digest".format(digest_name)).data.lower(),
digest_value.hexdigest().lower(),
digest_value,
)
for digest_name, digest_value in file_hashes.items()
if getattr(form, "{}_digest".format(digest_name)).data
Expand Down Expand Up @@ -863,12 +886,17 @@ def file_upload(request):
comment_text=form.comment.data,
size=file_size,
has_signature=bool(has_signature),
md5_digest=file_hashes["md5"].hexdigest().lower(),
sha256_digest=file_hashes["sha256"].hexdigest().lower(),
md5_digest=file_hashes["md5"],
sha256_digest=file_hashes["sha256"],
blake2_256_digest=file_hashes["blake2_256"],
# Figure out what our filepath is going to be, we're going to use a
# directory structure based on the hash of the file contents. This
# will ensure that the contents of the file cannot change without
# it also changing the path that the file is saved too.
path="/".join([
form.pyversion.data,
release.project.name[0],
release.project.name,
file_hashes[PATH_HASHER][:2],
file_hashes[PATH_HASHER][2:4],
file_hashes[PATH_HASHER][4:],
filename,
]),
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Add File.blake2_256_digest
Revision ID: 0977b97fce94
Revises: f46672a776f1
Create Date: 2016-04-18 20:40:22.101245
"""

import citext
import sqlalchemy as sa

from alembic import op


revision = "0977b97fce94"
down_revision = "f46672a776f1"


def upgrade():
    """Add File.blake2_256_digest with a uniqueness and format constraint."""
    # Nullable: rows uploaded before this migration have no blake2 digest;
    # new uploads always populate it.
    op.add_column(
        "release_files",
        sa.Column("blake2_256_digest", citext.CIText(), nullable=True),
    )
    op.create_unique_constraint(None, "release_files", ["blake2_256_digest"])
    # Fix: the check constraint must validate the *new* blake2_256_digest
    # column, not sha256_digest (which is already constrained). This matches
    # the CheckConstraint declared on warehouse.packaging.models.File.
    op.create_check_constraint(
        None,
        "release_files",
        "blake2_256_digest ~* '^[A-F0-9]{64}$'",
    )


def downgrade():
    # Dropping the digest column would irreversibly discard data, so this
    # migration deliberately refuses to be reversed.
    raise RuntimeError("Cannot Go Backwards In Time.")
2 changes: 2 additions & 0 deletions warehouse/packaging/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ class File(db.Model):
),

CheckConstraint("sha256_digest ~* '^[A-F0-9]{64}$'"),
CheckConstraint("blake2_256_digest ~* '^[A-F0-9]{64}$'"),

Index("release_files_name_idx", "name"),
Index("release_files_name_version_idx", "name", "version"),
Expand All @@ -373,6 +374,7 @@ class File(db.Model):
has_signature = Column(Boolean)
md5_digest = Column(Text, unique=True)
sha256_digest = Column(CIText, unique=True, nullable=False)
blake2_256_digest = Column(CIText, unique=True, nullable=True)
downloads = Column(Integer, server_default=sql.text("0"))
upload_time = Column(DateTime(timezone=False), server_default=func.now())

Expand Down
25 changes: 21 additions & 4 deletions warehouse/packaging/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,28 +116,45 @@ def store(self, path, file_path, *, meta=None):
@implementer(IFileStorage)
class S3FileStorage:

def __init__(self, bucket):
def __init__(self, bucket, *, prefix=None):
    # prefix, when given, is transparently prepended to hashed object
    # paths (see _get_path) so files can be stored under a sub-directory
    # without exposing that directory to end users.
    self.bucket = bucket
    self.prefix = prefix

@classmethod
def create_service(cls, context, request):
session = request.find_service(name="aws.session")
s3 = session.resource("s3")
bucket = s3.Bucket(request.registry.settings["files.bucket"])
return cls(bucket)
prefix = request.registry.settings.get("files.prefix")
return cls(bucket, prefix=prefix)

def _get_path(self, path):
    """Map a storage path to the actual S3 key.

    Legacy (pre-hash) paths start with a first segment longer than two
    characters (e.g. "2.7") and are passed through unchanged; hashed
    paths get the configured prefix prepended.
    """
    first_segment, _, _ = path.partition("/")

    # Legacy layout: keep supporting these keys as-is for now.
    if len(first_segment) > 2:
        return path

    # Prepend the prefix only when building the real key, so the
    # sub-directory stays invisible to end users.
    return self.prefix + path if self.prefix else path

def get(self, path):
try:
return self.bucket.Object(path).get()["Body"]
return self.bucket.Object(self._get_path(path)).get()["Body"]
except botocore.exceptions.ClientError as exc:
if exc.response["Error"]["Code"] != "NoSuchKey":
raise
raise FileNotFoundError("No such key: {!r}".format(path)) from None

def store(self, path, file_path, *, meta=None):
    """Upload the local file at *file_path* under the key derived from
    *path*.

    meta, when provided, is attached to the S3 object as its Metadata.
    """
    upload_kwargs = {}
    if meta is not None:
        upload_kwargs["Metadata"] = meta

    destination = self._get_path(path)
    self.bucket.upload_file(file_path, destination, ExtraArgs=upload_kwargs)

2 comments on commit 43e9a7b

@akx
Copy link

@akx akx commented on 43e9a7b Apr 25, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, why Blake2b instead of something, well, more standard?

@dstufft
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Blake2 is faster than MD5, Sha1, Sha2, and Sha3 while being at least as secure (in terms of bits of protection against preimage and collision resistance)... but really mostly because I like it and there wasn't any reason not to :)

Please sign in to comment.