diff --git a/pex/common.py b/pex/common.py index 7f9728638..622eb6c3d 100644 --- a/pex/common.py +++ b/pex/common.py @@ -20,6 +20,7 @@ from datetime import datetime from uuid import uuid4 +from pex.orderedset import OrderedSet from pex.typing import TYPE_CHECKING, cast if TYPE_CHECKING: @@ -621,11 +622,18 @@ def zip( # type: (...) -> None if labels: - selected_files = set( - itertools.chain.from_iterable(self.filesets.get(label, ()) for label in labels) + selected_files = OrderedSet( + itertools.chain.from_iterable( + # If labels are provided, respect the given ordering, but still sort the files + # within each label to get deterministic output. + sorted(self.filesets.get(label, ())) + # NB: An iterable of labels with non-deterministic order is not reproducible! + for label in labels + ) ) else: - selected_files = self.files() + # Otherwise, sort the files to get reproducible output by default. + selected_files = OrderedSet(sorted(self.files())) compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED with open_zip(filename, mode, compression) as zf: @@ -667,7 +675,7 @@ def maybe_write_parent_dirs(path): def iter_files(): # type: () -> Iterator[Tuple[str, str]] - for path in sorted(selected_files): + for path in selected_files: full_path = os.path.join(self.chroot, path) if os.path.isfile(full_path): if exclude_file(full_path): diff --git a/pex/pex_builder.py b/pex/pex_builder.py index 3f13b1545..926796b46 100644 --- a/pex/pex_builder.py +++ b/pex/pex_builder.py @@ -4,6 +4,7 @@ from __future__ import absolute_import import hashlib +import itertools import logging import os import shutil @@ -39,7 +40,7 @@ from pex.util import CacheHelper if TYPE_CHECKING: - from typing import Dict, Optional + from typing import ClassVar, Dict, Iterable, Iterator, Optional, Tuple # N.B.: __file__ will be relative when this module is loaded from a "" `sys.path` entry under @@ -95,14 +96,14 @@ def __maybe_run_venv__(pex, pex_root, pex_path): venv_dir = venv_dir( pex_file=pex, - pex_root=pex_root, + pex_root=pex_root, pex_hash={pex_hash!r}, has_interpreter_constraints={has_interpreter_constraints!r}, pex_path=pex_path, ) venv_pex = os.path.join(venv_dir, 'pex') if not __execute__ or not is_exe(venv_pex): - # Code in bootstrap_pex will (re)create the venv after selecting the correct interpreter. + # Code in bootstrap_pex will (re)create the venv after selecting the correct interpreter. return venv_dir TRACER.log('Executing venv PEX for {{}} at {{}}'.format(pex, venv_pex)) @@ -702,6 +703,59 @@ def set_sh_boot_script( ) self.set_header(script) + @classmethod + def iter_bootstrap_script_labels(cls): + # type: () -> Iterator[str] + """``Chroot`` labels covering the scripts immediately executed by the python interpreter.""" + # __pex__/__init__.py: This version of the bootstrap script will be executed by the python + # interpreter if the zipapp is on the PYTHONPATH. + yield "importhook" + + # __main__.py: This version of the bootstrap script is what is first executed by the python + # interpreter for an executable entry point (which includes use as a REPL). + yield "main" + + @classmethod + def iter_metadata_labels(cls): + # type: () -> Iterator[str] + """``Chroot`` labels covering metadata files.""" + # PEX-INFO + yield "manifest" + + @classmethod + def iter_bootstrap_libs_labels(cls): + # type: () -> Iterator[str] + """``Chroot`` labels covering code that may be imported from the bootstrap scripts.""" + # .bootstrap/ + yield "bootstrap" + + @classmethod + def iter_deps_libs_labels(cls, pex_info): + # type: (PexInfo) -> Iterator[str] + """``Chroot`` labels covering the third-party code that was resolved into dists.""" + # Subdirectories of .deps/: Keys need to be sorted for deterministic output. + for dist_label in sorted(pex_info.distributions.keys()): + yield dist_label + + @classmethod + def iter_direct_source_labels(cls): + # type: () -> Iterator[str] + """User source/resource files.""" + # Deprecated. + yield "resource" + + # Source files from -D/--sources-directory. + yield "source" + + # The value of --exe, if provided. Accessed after unpacking the zip. + yield "executable" + + def _setup_pex_info(self): + # type: () -> PexInfo + pex_info = self._pex_info.copy() + pex_info.update(PexInfo.from_env()) + return pex_info + def _build_packedapp( self, dirname, # type: str @@ -710,15 +764,20 @@ def _build_packedapp( ): # type: (...) -> None - pex_info = self._pex_info.copy() - pex_info.update(PexInfo.from_env()) + pex_info = self._setup_pex_info() # Include user sources, PEX-INFO and __main__ as loose files in src/. - for fileset in ("executable", "importhook", "main", "manifest", "resource", "source"): - for f in self._chroot.filesets.get(fileset, ()): - dest = os.path.join(dirname, f) - safe_mkdir(os.path.dirname(dest)) - safe_copy(os.path.realpath(os.path.join(self._chroot.chroot, f)), dest) + with TRACER.timed("copying over uncached sources", V=9): + uncached_label_groups = [ + self.iter_direct_source_labels(), + self.iter_metadata_labels(), + self.iter_bootstrap_script_labels(), + ] + for fileset in itertools.chain.from_iterable(uncached_label_groups): + for f in self._chroot.filesets.get(fileset, ()): + dest = os.path.join(dirname, f) + safe_mkdir(os.path.dirname(dest)) + safe_copy(os.path.realpath(os.path.join(self._chroot.chroot, f)), dest) # Pex historically only supported compressed zips in packed layout, so we don't disturb the # old cache structure for those zips and instead just use a subdir for un-compressed zips. @@ -779,6 +838,37 @@ def zip_cache_dir(path): os.path.join(internal_cache, location), ) + @classmethod + def iter_zipapp_labels(cls, pex_info): + # type: (PexInfo) -> Iterator[str] + """User sources, bootstrap sources, metadata, and dependencies, optimized for tail access. + + This method returns all the ``Chroot`` labels used by this ``PEXBuilder``. For zipapp + layouts, the order is significant. The python interpreter from the shebang line will process + the zipapp PEX by seeking to and parsing its central directory records at the end of the + file first. This method orders labels so the files which are first accessed by the python + interpreter will be closest to the location of the file pointer in the interpreter's file + handle for this zipapp after processing those central directory records. + """ + label_groups = [ + cls.iter_deps_libs_labels(pex_info), + cls.iter_direct_source_labels(), + # As these may also be imported before unzipping the zipapp, it should be as close to + # end of the file as possible. + cls.iter_bootstrap_libs_labels(), + # Metadata files such as PEX-INFO may be accessed outside the context of an executable + # zipapp, such as with ``PexInfo.from_pex()`` and ``unzip -p pex.pex PEX-INFO``. + # While they are only accessed after unpacking to the ``"unzipped_pexes"`` cache during + # execution, these should still be near enough to the central directory record to + # minimize the seek distance to then access these files in non-execution scenarios. + cls.iter_metadata_labels(), + # The following bootstrap scripts are executed immediately after opening the zipapp, so + # they should be right near the end by the central directory records. + cls.iter_bootstrap_script_labels(), + ] + for label in itertools.chain.from_iterable(label_groups): + yield label + def _build_zipapp( self, filename, # type: str @@ -805,5 +895,6 @@ def _build_zipapp( # racy. exclude_file=is_pyc_temporary_file, compress=compress, + labels=list(self.iter_zipapp_labels(self._setup_pex_info())), ) chmod_plus_x(filename) diff --git a/tests/integration/test_reproducible.py b/tests/integration/test_reproducible.py index aab05c84b..b43970e7b 100644 --- a/tests/integration/test_reproducible.py +++ b/tests/integration/test_reproducible.py @@ -97,6 +97,9 @@ def explode_pex(path): ), "{} and {} have different content.".format(member1, member2) # Check that the entire file is equal, including metadata. assert filecmp.cmp(member1, member2, shallow=False) + # Check that the file list is identical. + with ZipFile(pex1) as zfp1, ZipFile(pex2) as zfp2: + assert zfp1.namelist() == zfp2.namelist() # Finally, check that the .pex files are byte-for-byte identical. assert filecmp.cmp(pex1, pex2, shallow=False) diff --git a/tests/test_common.py b/tests/test_common.py index 67e3049f4..11e805f81 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -4,6 +4,7 @@ import contextlib import errno import os +import uuid import pytest @@ -27,7 +28,7 @@ import mock # type: ignore[no-redef,import] if TYPE_CHECKING: - from typing import Any, Iterator, Tuple + from typing import Any, Iterator, List, Tuple def extract_perms(path): @@ -140,6 +141,29 @@ def test_chroot_zip(): assert b"data" == zip.read("directory/subdirectory/file") +def test_chroot_zip_preserves_label_order(): + # type: () -> None + """Verify that the order of the labels presented to ``Chroot.zip()`` transfers to the order + of the zip file entries.""" + with temporary_dir() as tmp: + chroot = Chroot(os.path.join(tmp, "chroot")) + + ordered_labels = [] # type: List[str] + file_todo = [] # type: List[Tuple[str, str]] + for i in range(10): + label = uuid.uuid4().hex + file_todo.append(("{}-{}".format(uuid.uuid4().hex, i), label)) + ordered_labels.append(label) + for filename, label in reversed(file_todo): + chroot.touch(filename, label=label) + + zip_dst = os.path.join(tmp, "chroot.zip") + chroot.zip(zip_dst, labels=ordered_labels) + with open_zip(zip_dst) as zip: + for i, filename in enumerate(zip.namelist()): + assert filename.endswith("-{}".format(i)) + + def test_chroot_zip_symlink(): # type: () -> None with temporary_dir() as tmp: diff --git a/tests/test_pex_builder.py b/tests/test_pex_builder.py index a4c7b4a9c..3631ec2de 100644 --- a/tests/test_pex_builder.py +++ b/tests/test_pex_builder.py @@ -455,22 +455,35 @@ def assert_pex_env_var_nested(**env): assert_pex_env_var_nested(PEX_VENV=True) +@pytest.fixture(scope="function") +def tmp_pex_root_pex_builder(tmpdir): + # type: (Any) -> Iterator[PEXBuilder] + pex_root = os.path.join(str(tmpdir), "pex_root") + pb = PEXBuilder() + pb.info.pex_root = pex_root + yield pb + + +@pytest.fixture(scope="function") +def pex_dist_pex_builder(tmp_pex_root_pex_builder, pex_project_dir): + # type: (PEXBuilder, str) -> Iterator[PEXBuilder] + pb = tmp_pex_root_pex_builder + with ENV.patch(PEX_ROOT=pb.info.pex_root): + pb.add_dist_location(install_wheel(WheelBuilder(pex_project_dir).bdist()).location) + yield pb + + @pytest.mark.parametrize( "layout", [pytest.param(layout, id=layout.value) for layout in (Layout.PACKED, Layout.ZIPAPP)] ) def test_build_compression( tmpdir, # type: Any layout, # type: Layout.Value - pex_project_dir, # type: str + pex_dist_pex_builder, # type: PEXBuilder ): # type: (...) -> None + pb = pex_dist_pex_builder - pex_root = os.path.join(str(tmpdir), "pex_root") - - pb = PEXBuilder() - pb.info.pex_root = pex_root - with ENV.patch(PEX_ROOT=pex_root): - pb.add_dist_location(install_wheel(WheelBuilder(pex_project_dir).bdist()).location) exe = os.path.join(str(tmpdir), "exe.py") with open(exe, "w") as fp: fp.write("import pex; print(pex.__file__)") @@ -479,7 +492,9 @@ def test_build_compression( def assert_pex(pex): # type: (str) -> None assert ( - subprocess.check_output(args=[sys.executable, pex]).decode("utf-8").startswith(pex_root) + subprocess.check_output(args=[sys.executable, pex]) + .decode("utf-8") + .startswith(pb.info.pex_root) ) compressed_pex = os.path.join(str(tmpdir), "compressed.pex") @@ -499,3 +514,31 @@ def size(pex): ) assert size(compressed_pex) < size(uncompressed_pex) + + +def test_bootstrap_at_end_of_zipapp(tmpdir, pex_dist_pex_builder): + # type: (Any, PEXBuilder) -> None + pb = pex_dist_pex_builder + + src_file = os.path.join(str(tmpdir), "src.py") + with open(src_file, mode="w") as fp: + fp.write("print('hello')") + pb.add_source(src_file, "src.py") + + exe = os.path.join(str(tmpdir), "exe.py") + with open(exe, "w") as fp: + fp.write("import pex; print(pex.__file__)") + pb.set_executable(exe) + + compressed_pex = os.path.join(str(tmpdir), "test.pex") + pb.build(compressed_pex, layout=Layout.ZIPAPP) + + with open_zip(compressed_pex, mode="r") as zf: + names = list(reversed(zf.namelist())) + assert names[0:4] == [ + "__main__.py", + "__pex__/__init__.py", + "__pex__/", + "PEX-INFO", + ] + assert names[4].startswith(".bootstrap/") and names[4].endswith(".py")