From 1cd13a4863ce0dd41d69012e1f01eefd6ba186f9 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Tue, 30 Jan 2024 10:26:22 +0000 Subject: [PATCH 01/22] Add function to pull a packet from a location --- src/outpack/filestore.py | 32 +- src/outpack/index.py | 29 +- src/outpack/location.py | 113 +------ src/outpack/location_pull.py | 423 ++++++++++++++++++++++++++ src/outpack/metadata.py | 7 +- src/outpack/root.py | 10 +- src/outpack/search.py | 3 + src/outpack/util.py | 28 ++ tests/helpers/__init__.py | 80 +++++ tests/test_filestore.py | 44 ++- tests/test_location.py | 234 +-------------- tests/test_location_pull.py | 559 +++++++++++++++++++++++++++++++++++ tests/test_root.py | 3 +- tests/test_tools.py | 5 +- tests/test_util.py | 32 +- 15 files changed, 1224 insertions(+), 378 deletions(-) create mode 100644 src/outpack/location_pull.py create mode 100644 tests/test_location_pull.py diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index 2ad1e17..3851cca 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -2,39 +2,44 @@ import os.path import shutil import stat +import tempfile +from pathlib import Path from outpack.hash import Hash, hash_parse, hash_validate_file class FileStore: def __init__(self, path): - self._path = path + self._path = Path(path) os.makedirs(path, exist_ok=True) def filename(self, hash): dat = hash_parse(hash) - return os.path.join( - self._path, dat.algorithm, dat.value[:2], dat.value[2:] - ) + return self._path / dat.algorithm / dat.value[:2] / dat.value[2:] - def get(self, hash, dst): + def get(self, hash, dst, overwrite): src = self.filename(hash) if not os.path.exists(src): msg = f"Hash '{hash}' not found in store" raise Exception(msg) os.makedirs(os.path.dirname(dst), exist_ok=True) - # todo - control over overwrite args needed. + if not overwrite and os.path.exists(dst): + msg = f"Failed to copy '{src}' to '{dst}', file already exists" + raise Exception(msg) shutil.copyfile(src, dst) def exists(self, hash): return os.path.exists(self.filename(hash)) - def put(self, src, hash): + def put(self, src, hash, move=False): hash_validate_file(src, hash) dst = self.filename(hash) if not os.path.exists(dst): os.makedirs(os.path.dirname(dst), exist_ok=True) - shutil.copyfile(src, dst) + if move: + shutil.move(src, dst) + else: + shutil.copyfile(src, dst) os.chmod(dst, stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH) return hash @@ -43,9 +48,18 @@ def ls(self): # (os.walk, Path.glob etc), but this is probably clearest. 
ret = [] for algorithm in os.listdir(self._path): - path_alg = os.path.join(self._path, algorithm) + path_alg = self._path / algorithm for prefix in os.listdir(path_alg): path_prefix = os.path.join(path_alg, prefix) for suffix in os.listdir(path_prefix): ret.append(Hash(algorithm, prefix + suffix)) return ret + + def destroy(self) -> None: + shutil.rmtree(self._path) + + def tmp(self) -> str: + path = self._path / "tmp" + path.mkdir(exist_ok=True) + return tempfile.NamedTemporaryFile(dir=path).name + diff --git a/src/outpack/index.py b/src/outpack/index.py index d84b9e6..6c7ff84 100644 --- a/src/outpack/index.py +++ b/src/outpack/index.py @@ -1,15 +1,15 @@ import pathlib from dataclasses import dataclass -from typing import List -from outpack.metadata import read_metadata_core, read_packet_location +from outpack.metadata import read_metadata_core, read_packet_location, \ + MetadataCore, PacketLocation @dataclass class IndexData: - metadata: dict - location: dict - unpacked: List[str] + metadata: dict[str, MetadataCore] + location: dict[str, dict[str, PacketLocation]] + unpacked: list[str] @staticmethod def new(): @@ -29,21 +29,28 @@ def refresh(self): self.data = _index_update(self._path, self.data) return self - def all_metadata(self): + def all_metadata(self) -> dict[str, MetadataCore]: return self.refresh().data.metadata - def metadata(self, id): + def metadata(self, id) -> MetadataCore: if id in self.data.metadata: return self.data.metadata[id] return self.refresh().data.metadata[id] - def all_locations(self): + def all_locations(self) -> dict[str, dict[str, PacketLocation]]: return self.refresh().data.location - def location(self, name): + def location(self, name) -> dict[str, PacketLocation]: return self.refresh().data.location[name] - def unpacked(self): + def packets_in_location(self, name) -> list[str]: + try: + packets = list(self.location(name).keys()) + except KeyError: + packets = [] + return packets + + def unpacked(self) -> list[str]: return self.refresh().data.unpacked @@ -62,7 +69,7 @@ def _read_metadata(path_root, data): return data -def _read_locations(path_root, data): +def _read_locations(path_root, data) -> dict[str, dict[str, PacketLocation]]: path = path_root / ".outpack" / "location" for loc in path.iterdir(): if loc.name not in data: diff --git a/src/outpack/location.py b/src/outpack/location.py index cd4cdca..e0b37d7 100644 --- a/src/outpack/location.py +++ b/src/outpack/location.py @@ -1,13 +1,8 @@ import collections -import os import shutil -from typing import List from outpack.config import Location, update_config -from outpack.hash import hash_validate_string from outpack.location_path import OutpackLocationPath -from outpack.metadata import PacketLocation -from outpack.packet import mark_known from outpack.root import root_open from outpack.static import ( LOCATION_LOCAL, @@ -119,111 +114,6 @@ def location_resolve_valid( return location -def outpack_location_pull_metadata(location=None, root=None, *, locate=True): - root = root_open(root, locate=locate) - location_name = location_resolve_valid( - location, - root, - include_local=False, - include_orphan=False, - allow_no_locations=True, - ) - for name in location_name: - driver = _location_driver(name, root) - _pull_all_metadata(driver, root, name) - known_packets = [] - for packet_location in root.index.all_locations().values(): - known_packets.extend(list(packet_location.values())) - _validate_hashes(driver, name, known_packets) - _mark_all_known(driver, root, name) - - # TODO: mrc-4601 deorphan recovered 
packets - - -def _pull_all_metadata(driver, root, location_name): - known_there = driver.list() - known_here = root.index.all_metadata().keys() - for packet_id in known_there: - if packet_id not in known_here: - _pull_packet_metadata(driver, root, location_name, packet_id) - - -def _get_remove_location_hint(location_name): - return ( - f'Probably all you can do at this point is ' - f'remove this location from your configuration ' - f'by running ' - f'orderly_location_remove("{location_name}")' - ) - - -def _pull_packet_metadata(driver, root, location_name, packet_id): - metadata = driver.metadata(packet_id)[packet_id] - expected_hash = driver.list()[packet_id].hash - - hash_validate_string( - metadata, - expected_hash, - f"metadata for '{packet_id}' from '{location_name}'", - [ - "This is bad news, I'm afraid. Your location is sending data " - "that does not match the hash it says it does. Please let us " - "know how this might have happened.", - _get_remove_location_hint(location_name), - ], - ) - - path_metadata = root.path / ".outpack" / "metadata" - os.makedirs(path_metadata, exist_ok=True) - filename = path_metadata / packet_id - with open(filename, "w") as f: - f.writelines(metadata) - - -def _validate_hashes(driver, location_name, packets: List[PacketLocation]): - mismatched_hashes = set() - known_there = driver.list() - for packet in packets: - if known_there.get(packet.packet) is not None: - hash_there = known_there[packet.packet].hash - hash_here = packet.hash - if hash_there != hash_here: - mismatched_hashes.add(packet.packet) - - if len(mismatched_hashes) > 0: - id_text = "', '".join(mismatched_hashes) - msg = ( - f"Location '{location_name}' has conflicting metadata\n" - f"This is really bad news. We have been offered metadata " - f"from '{location_name}' that has a different hash to " - f"metadata that we have already imported from other " - f"locations. I'm not going to import this new metadata, " - f"but there's no guarantee that the older metadata is " - f"actually what you want!\nConflicts for: '{id_text}'\n" - f"We would be interested in this case, please let us know\n" - f"{_get_remove_location_hint(location_name)}" - ) - raise Exception(msg) - - -def _mark_all_known(driver, root, location_name): - try: - known_here = root.index.location(location_name) - except KeyError: - known_here = {} - - known_there = driver.list() - for packet_id in known_there: - if packet_id not in known_here.keys(): - mark_known( - root, - packet_id, - location_name, - known_there[packet_id].hash, - known_there[packet_id].time, - ) - - def _location_check_new_name(root, name): if _location_exists(root, name): msg = f"A location with name '{name}' already exists" @@ -250,3 +140,6 @@ def _location_driver(location_name, root): elif location.type == "custom": # pragma: no cover msg = "custom remote not yet supported" raise Exception(msg) + +# TODO: Create a driver interface and have location path extend it? 
+# Because atm we can't specify a type for driver as a function arg diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py new file mode 100644 index 0000000..6e92bf4 --- /dev/null +++ b/src/outpack/location_pull.py @@ -0,0 +1,423 @@ +import itertools +import os +import time +from dataclasses import dataclass +from typing import Callable, Union + +import humanize + +from outpack.filestore import FileStore +from outpack.hash import hash_validate_string +from outpack.location import location_resolve_valid, _location_driver +from outpack.metadata import MetadataCore, PacketFile, PacketLocation +from outpack.packet import mark_known +from outpack.root import OutpackRoot, root_open, find_file_by_hash +from outpack.search_options import SearchOptions +from outpack.static import LOCATION_LOCAL +from outpack.util import format_list, pl, partition + + +def outpack_location_pull_metadata(location=None, root=None, *, locate=True): + root = root_open(root, locate=locate) + location_name = location_resolve_valid( + location, + root, + include_local=False, + include_orphan=False, + allow_no_locations=True, + ) + for name in location_name: + driver = _location_driver(name, root) + _pull_all_metadata(driver, root, name) + known_packets = [] + for packet_location in root.index.all_locations().values(): + known_packets.extend(list(packet_location.values())) + _validate_hashes(driver, name, known_packets) + _mark_all_known(driver, root, name) + + # TODO: mrc-4601 deorphan recovered packets + + +def _pull_packet_metadata(driver, root, location_name, packet_id): + metadata = driver.metadata(packet_id)[packet_id] + expected_hash = driver.list()[packet_id].hash + + hash_validate_string( + metadata, + expected_hash, + f"metadata for '{packet_id}' from '{location_name}'", + [ + "This is bad news, I'm afraid. Your location is sending data " + "that does not match the hash it says it does. Please let us " + "know how this might have happened.", + _get_remove_location_hint(location_name), + ], + ) + + path_metadata = root.path / ".outpack" / "metadata" + os.makedirs(path_metadata, exist_ok=True) + filename = path_metadata / packet_id + with open(filename, "w") as f: + f.writelines(metadata) + + +def _get_remove_location_hint(location_name): + return ( + f'Probably all you can do at this point is ' + f'remove this location from your configuration ' + f'by running ' + f'outpack_location_remove("{location_name}")' + ) + + +def _validate_hashes(driver, location_name, packets: list[PacketLocation]): + mismatched_hashes = set() + known_there = driver.list() + for packet in packets: + if known_there.get(packet.packet) is not None: + hash_there = known_there[packet.packet].hash + hash_here = packet.hash + if hash_there != hash_here: + mismatched_hashes.add(packet.packet) + + if len(mismatched_hashes) > 0: + id_text = "', '".join(mismatched_hashes) + msg = ( + f"Location '{location_name}' has conflicting metadata\n" + f"This is really bad news. We have been offered metadata " + f"from '{location_name}' that has a different hash to " + f"metadata that we have already imported from other " + f"locations. 
I'm not going to import this new metadata, " + f"but there's no guarantee that the older metadata is " + f"actually what you want!\nConflicts for: '{id_text}'\n" + f"We would be interested in this case, please let us know\n" + f"{_get_remove_location_hint(location_name)}" + ) + raise Exception(msg) + + +def _mark_all_known(driver, root, location_name): + try: + known_here = root.index.location(location_name) + except KeyError: + known_here = {} + + known_there = driver.list() + for packet_id in known_there: + if packet_id not in known_here.keys(): + mark_known( + root, + packet_id, + location_name, + known_there[packet_id].hash, + known_there[packet_id].time, + ) + + +def outpack_location_pull_packet(ids: Union[str, list[str]], + options=None, + recursive=None, + root=None, *, locate=True): + root = root_open(root, locate=locate) + options = SearchOptions(options) + if isinstance(ids, str): + ids = [ids] + plan = location_build_pull_plan(ids, options.location, recursive, root) + + ## Warn people of extra pulls and skips + if plan.info.n_extra > 0: + print(f"Also pulling {plan.info.n_extra} " + f"{pl(plan.info.n_extra, 'packet')}, which are " + f"dependencies of those requested") + if plan.info.n_skip > 0: + print(f"Skipping {plan.info.n_skip} of {plan.info.n_total} " + f"{pl(plan.info.n_total, 'packet')} which are already " + f"unpacked") + + store, cleanup = location_pull_files(plan.files, root) + + use_archive = root.config.core.path_archive is not None + n_packets = len(plan.packets) + time_start = time.time() + for idx, packet in enumerate(plan.packets.values()): + if use_archive: + print( + f"Writing files for '{packet.packet}' (packet {idx + 1}/" + f"{n_packets})") + location_pull_files_archive(packet.packet, store, root) + + mark_known( + root, + packet.packet, + LOCATION_LOCAL, + packet.hash, + time.time() + ) + + print(f"Unpacked {n_packets} {pl(n_packets, 'packet')} in " + f"{humanize.time.precisedelta(int(time.time() - time_start))}.") + + cleanup() + return list(plan.packets.keys()) + + +# This approach may be suboptimal in the case where the user does not +# already have a file store, as it means that files will be copied +# around and hashed more than ideal: +# +# * hash the candidate file +# * rehash on entry into the file store +# * copy into the file store +# * copy from the file store into the final location +# +# So in the case where a hash is only present once in a chain of +# packets being pulled this will be one too many hashes and one too +# many copies. +# +# However, this approach makes the logic fairly easy to deal with, +# and copes well with data races and corruption of data on disk +# (e.g., users having edited files that we rely on, or editing them +# after we hash them the first time). 
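+#
+# The function below returns a (store, cleanup) pair: `store` ends up holding
+# every file required by the pull plan, and `cleanup()` removes it again when
+# it was only a temporary store (it is a no-op if the root already has a
+# persistent file store).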
+def location_pull_files(files: list[PacketFile], root: OutpackRoot) \ + -> (FileStore, Callable[[], None]): + + store = root.files + if store is not None: + def cleanup(): + return None + + exists, missing = partition(lambda file: store.exists(file.hash), files) + + if len(exists) > 0: + print(f"Found {len(exists)} {pl(exists, 'file')} in the " + f"file store") + else: + print("Looking for suitable files already on disk") + store = temporary_filestore(root) + + def cleanup(): + store.destroy() + + for file in files: + path = find_file_by_hash(root, file.hash) + if path is not None: + store.put(path, file.hash) + + if len(files) == 0: + print("All files available locally, no need to fetch any") + else: + locations = {file.location for file in files} + total_size = humanize.naturalsize(sum(file.size for file in files)) + print(f"Need to fetch {len(files)} {pl(files, 'file')} " + f"({total_size}) from {len(locations)} " + f"{pl(locations, 'location')}") + for location in locations: + from_this_location = [file for file in files + if file.location == location] + location_pull_hash_store(from_this_location, location, + _location_driver(location, root), store) + + return store, cleanup + + +def location_pull_hash_store(files: list[PacketFile], + location_name: str, + driver, + store: FileStore): + total_size = humanize.naturalsize(sum(file.size for file in files)) + no_of_files = len(files) + # TODO: show a nice progress bar for users + for idx, file in enumerate(files): + print(f"Fetching file {idx + 1}/{no_of_files} " + f"({humanize.naturalsize(file.size)}) from '{location_name}'") + tmp = driver.fetch_file(file.hash, store.tmp()) + store.put(tmp, file.hash) + + +def location_pull_files_archive(packet_id: str, + store, + root: OutpackRoot): + meta = root.index.metadata(packet_id) + dest_root = (root.path / root.config.core.path_archive / meta.name / + packet_id) + for file in meta.files: + store.get(file.hash, dest_root / file.path, overwrite=True) + + +def _pull_all_metadata(driver, root, location_name): + known_there = driver.list() + known_here = root.index.all_metadata().keys() + for packet_id in known_there: + if packet_id not in known_here: + _pull_packet_metadata(driver, root, location_name, packet_id) + + +@dataclass +class PullPlanInfo: + n_extra: int + n_skip: int + n_total: int + + +@dataclass +class LocationPullPlan: + packets: dict[str, PacketLocation] + files: list[PacketFile] + info: PullPlanInfo + + +@dataclass +class PullPlanPackets: + requested: list[str] + full: list[str] + skip: set[str] + fetch: set[str] + + +def location_build_pull_plan(packet_ids: list[str], + locations: list[str], + recursive: bool, + root: OutpackRoot) -> LocationPullPlan: + packets = location_build_pull_plan_packets(packet_ids, recursive, root) + locations = location_build_pull_plan_location(packets, locations, root) + files = location_build_pull_plan_files(packets.fetch, locations, root) + fetch = location_build_packet_locations(packets.fetch, locations, root) + + info = PullPlanInfo(n_extra=len(packets.full) - len(packets.requested), + n_skip=len(packets.skip), + n_total=len(packets.full)) + + return LocationPullPlan(packets=fetch, + files=files, + info=info) + + +def location_build_pull_plan_packets(packet_ids: list[str], + recursive: bool, + root: OutpackRoot) -> PullPlanPackets: + requested = packet_ids + if recursive is None: + recursive = root.config.core.require_complete_tree + if root.config.core.require_complete_tree and not recursive: + msg = (f"'recursive' must be True (or None) with your 
" + f"configuration\nBecause 'core.require_complete_tree' is " + f"true, we can't do a non-recursive pull, as this might " + f"leave an incomplete tree") + raise Exception(msg) + + index = root.index + if recursive: + full = find_all_dependencies(packet_ids, index.all_metadata()) + else: + full = packet_ids + + skip = set(full) & set(index.unpacked()) + fetch = set(full) - skip + + return PullPlanPackets(requested=requested, + full=full, + skip=skip, + fetch=fetch) + + +def find_all_dependencies(packet_ids: list[str], + metadata: dict[str, MetadataCore]) -> list[str]: + ret = set(packet_ids) + while len(packet_ids) > 0: + dependency_ids = set( + dependencies.packet + for packet_id in packet_ids + if packet_id in metadata.keys() + for dependencies in metadata.get(packet_id).depends) + packet_ids = dependency_ids - ret + ret = packet_ids | ret + + return sorted(ret) + + +def location_build_pull_plan_location(packets: PullPlanPackets, + locations: list[str], + root: OutpackRoot) -> list[str]: + location_names = location_resolve_valid( + locations, + root, + include_local=False, + include_orphan=False, + allow_no_locations=len(packets.fetch) == 0, + ) + + known_packets = [root.index.packets_in_location(location_name) + for location_name in location_names] + missing = packets.fetch - set(itertools.chain(*known_packets)) + if len(missing) > 0: + extra = missing - set(packets.requested) + if len(extra) > 0: + hint = (f"{len(extra)} missing " + f"{pl(extra, 'packet was', 'packets were')} " + f"requested as " + f"{pl(extra, 'dependency', 'dependencies')} " + f"of the {pl(extra, 'one')} you asked for: " + f"{format_list(extra)}") + else: + # In the case where the above is used, we probably have + # up-to-date metadata, so we don't display this. + hint = "Do you need to run 'outpack_location_pull_metadata()'?" 
+ + msg = (f"Failed to find {pl(missing, 'packet')} " + f"{format_list(missing)}\n" + f"Looked in {pl(location_names, 'location')} " + f"{format_list(location_names)}\n" + + hint) + raise Exception(msg) + + return location_names + + +def location_build_pull_plan_files(packet_ids: set[str], + locations: list[str], + root: OutpackRoot) -> list[PacketFile]: + metadata = root.index.all_metadata() + file_hashes = [file.hash + for packet_id in packet_ids + for file in metadata[packet_id].files] + n_files = len(file_hashes) + + if n_files == 0: + return [] + + # Find first location within the set which contains each packet + # We've already checked earlier that the file is in at least 1 + # location so we don't have to worry about that here + all_files = [] + files = {} + for location_name in locations: + location_packets = set(root.index.packets_in_location(location_name)) + packets_in_location = location_packets & packet_ids + for packet_id in packets_in_location: + for file in metadata[packet_id].files: + file.location = location_name + all_files.append(files.setdefault(file.hash, file)) + + # We only want to pull each file once, remove any with duplicate hashes + # files = {} + # packet_files = [files.setdefault(packet.hash, packet) + # for packet in packet_files if packet.hash not in files] + + return all_files + + +def location_build_packet_locations( + packets: set[str], + locations: list[str], + root: OutpackRoot) -> dict[str, PacketLocation]: + packets_fetch = {} + for location in locations: + packets_from_location = root.index.location(location) + packets_in_this_location = packets & packets_from_location.keys() + for packet_id in packets_in_this_location: + packets_fetch[packet_id] = packets_from_location[packet_id] + return packets_fetch + + +def temporary_filestore(root: OutpackRoot) -> FileStore: + return FileStore(root.path / "orderly" / "pull") diff --git a/src/outpack/metadata.py b/src/outpack/metadata.py index 26046a5..1dddc82 100644 --- a/src/outpack/metadata.py +++ b/src/outpack/metadata.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from typing import Dict, List, Optional, Union @@ -14,6 +14,7 @@ class PacketFile: path: str size: float hash: str + location: Optional[str] = field(default=None) @staticmethod def from_file(directory, path, hash_algorithm): @@ -74,11 +75,11 @@ class PacketLocation: hash: str -def read_metadata_core(path): +def read_metadata_core(path) -> MetadataCore: with open(path) as f: return MetadataCore.from_json(f.read().strip()) -def read_packet_location(path): +def read_packet_location(path) -> PacketLocation: with open(path) as f: return PacketLocation.from_json(f.read().strip()) diff --git a/src/outpack/root.py b/src/outpack/root.py index 672af84..cb78a13 100644 --- a/src/outpack/root.py +++ b/src/outpack/root.py @@ -1,7 +1,7 @@ import os import shutil from pathlib import Path -from typing import Union +from typing import Union, Optional from outpack.config import read_config from outpack.filestore import FileStore @@ -11,7 +11,7 @@ class OutpackRoot: - files = None + files: Optional[FileStore] = None def __init__(self, path): self.path = Path(path) @@ -26,7 +26,7 @@ def export_file(self, id, there, here, dest): dest = Path(dest) here_full = dest / here if self.config.core.use_file_store: - self.files.get(hash, here_full) + self.files.get(hash, here_full, False) else: # consider starting from the case most likely to contain # this hash, since we already know that it's 'id' unless 
@@ -78,8 +78,8 @@ def find_file_by_hash(root, hash): return path else: msg = ( - f"Rejecting file from archive '{f.path}'" - f"in {meta.name}/{meta.id}" + f"Rejecting file from archive '{f.path}' " + f"in '{meta.name}/{meta.id}'" ) print(msg) return None diff --git a/src/outpack/search.py b/src/outpack/search.py index 55646a9..aa45580 100644 --- a/src/outpack/search.py +++ b/src/outpack/search.py @@ -4,6 +4,7 @@ import outpack_query_parser as parser +from outpack.ids import is_outpack_id from outpack.metadata import MetadataCore, Parameters from outpack.root import OutpackRoot, root_open from outpack.search_options import SearchOptions @@ -70,6 +71,8 @@ def as_query(query: Union[Query, str]) -> Query: if isinstance(query, Query): return query else: + if is_outpack_id(query): + query = f"'{query}'" return Query.parse(query) diff --git a/src/outpack/util.py b/src/outpack/util.py index 697cc39..2dd38f0 100644 --- a/src/outpack/util.py +++ b/src/outpack/util.py @@ -3,6 +3,7 @@ import runpy import time from contextlib import contextmanager +from itertools import tee, filterfalse from pathlib import Path @@ -106,3 +107,30 @@ def read_string(path): with open(path) as f: lines = f.read().rstrip() return lines + + +def format_list(x): + return ", ".join("'" + item + "'" for item in x) + + +def pl(x, singular, plural=None): + if plural is None: + plural = singular + 's' + + if isinstance(x, int): + length = x + else: + length = len(x) + return "{}".format(singular if length == 1 else plural) + + +def partition(pred, iterable): + """Partition entries into false entries and true entries. + + This is slightly modified version of partition from itertools + recipes https://docs.python.org/dev/library/itertools.html#itertools-recipes + If *pred* is slow, consider wrapping it with functools.lru_cache(). + """ + # partition(is_odd, range(10)) --> 1 3 5 7 9 and 0 2 4 6 8 + t1, t2 = tee(iterable) + return list(filter(pred, t1)), list(filterfalse(pred, t2)) diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py index 1e0d720..7216496 100644 --- a/tests/helpers/__init__.py +++ b/tests/helpers/__init__.py @@ -1,12 +1,19 @@ +import os import random import shutil +import string from contextlib import contextmanager from pathlib import Path from tempfile import TemporaryDirectory +from typing import List +from outpack.ids import outpack_id from outpack.init import outpack_init +from outpack.metadata import MetadataCore, PacketDepends from outpack.packet import Packet from outpack.root import root_open +from outpack.schema import outpack_schema_version +from outpack.util import run_script @contextmanager @@ -41,11 +48,61 @@ def create_random_packet(root, name="data", *, parameters=None, packet_id=None): return p.id +## Create a chain of packets a, b, c, ... 
that depend on each other +def create_random_packet_chain(root, length, base=None): + ids = {} + with TemporaryDirectory() as src: + for i in range(length): + name = chr(i + ord("a")) + packet_id = outpack_id() + ids[name] = packet_id + packet_path = Path(src) / name / packet_id + os.makedirs(packet_path) + + packet = Packet( + root, packet_path, name, id=packet_id, locate=False + ) + + if i == 0 and base is None: + with open(packet_path / "data.txt", "w") as f: + f.write("0") + else: + lines = ["with open('input.txt', 'r') as f:", + " data = f.read()", + "with open('data.txt', 'w') as f:", + f" f.write(data + '{i}')"] + with open(packet_path / "orderly.py", "w") as f: + f.write('\n'.join(lines) + '\n') + + if i > 0: + id_use = ids[chr(i - 1 + ord("a"))] + else: + id_use = base + + if id_use is not None: + packet.use_dependency(id_use, {"input.txt": "data.txt"}) + + run_script(packet_path, "orderly.py", None) + + packet.end(insert=True) + + return ids + + def create_temporary_root(path, **kwargs): outpack_init(path, **kwargs) return root_open(path, locate=False) +def create_temporary_roots(path, location_names=None, **kwargs): + if location_names is None: + location_names = ["src", "dst"] + root = {} + for name in location_names: + root[name] = create_temporary_root(path / name, **kwargs) + return root + + def copy_examples(names, root): if isinstance(names, str): names = [names] @@ -53,3 +110,26 @@ def copy_examples(names, root): path_src = root.path / "src" / nm path_src.parent.mkdir(parents=True, exist_ok=True) shutil.copytree(Path("tests/orderly/examples") / nm, path_src) + + +def create_metadata_depends(id: str, depends: List[str] = None): + if depends is None: + depends = [] + dependencies = [PacketDepends(dependency_id, "", []) + for dependency_id in depends] + return {id: MetadataCore( + outpack_schema_version(), + id, + "name_" + random_characters(4), + {}, + {}, + [], + dependencies, + None, + None + )} + + +def random_characters(n): + return ''.join(random.choice(string.ascii_letters + string.digits) + for _ in range(n)) diff --git a/tests/test_filestore.py b/tests/test_filestore.py index 56e4a89..eb9de86 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -1,3 +1,4 @@ +import os import random import pytest @@ -33,7 +34,7 @@ def test_can_store_files(tmp_path): assert s.ls() == [h] dest = tmp_path / "dest" - s.get(h, dest) + s.get(h, dest, False) assert dest.exists() assert hash_file(dest, "md5") == h @@ -44,11 +45,50 @@ def test_can_store_files(tmp_path): assert len(s.ls()) == 10 +def test_get_files_fails_if_overwrite_false(tmp_path): + tmp = tmp_path / "tmp" + tmp.mkdir() + letters = [chr(i + ord("a")) for i in range(10)] + for i in range(10): + with open(tmp_path / "tmp" / letters[i], "w") as f: + f.write(randstr(10)) + + store = FileStore(str(tmp_path / "store")) + p = tmp_path / "tmp" / "a" + h = hash_file(p, "md5") + assert not store.exists(h) + assert store.put(p, h) == h + assert store.exists(h) + + dest = tmp_path / "dest" + assert not dest.exists() + store.get(h, dest, False) + assert dest.exists() + assert hash_file(dest, "md5") == h + + store.get(h, dest, True) + assert dest.exists() + assert hash_file(dest, "md5") == h + + with pytest.raises(Exception) as e: + store.get(h, dest, False) + assert e.match(f"Failed to copy '.+' to '{dest}', file already exists") + + def test_if_hash_not_found(tmp_path): p = str(tmp_path / "store") s = FileStore(p) h = Hash("md5", "7c4d97e580abb6c2ffb8b1872907d84b") dest = tmp_path / "dest" with pytest.raises(Exception) 
as e: - s.get(h, dest) + s.get(h, dest, False) assert e.match("Hash 'md5:7c4d97e.+' not found in store") + + +def test_can_create_filename_within_the_store(tmp_path): + path = str(tmp_path / "store") + store = FileStore(path) + temp_file = store.tmp() + assert os.path.dirname(temp_file) == str(store._path / "tmp") + assert not os.path.exists(temp_file) + assert os.path.exists(os.path.dirname(temp_file)) diff --git a/tests/test_location.py b/tests/test_location.py index 1f31568..5937184 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -1,19 +1,12 @@ -import json - import pytest -from outpack.ids import outpack_id from outpack.location import ( location_resolve_valid, outpack_location_add, outpack_location_list, - outpack_location_pull_metadata, outpack_location_remove, - outpack_location_rename, -) -from outpack.util import read_string - -from .helpers import create_random_packet, create_temporary_root + outpack_location_rename) +from .helpers import create_temporary_root def test_no_locations_except_local_by_default(tmp_path): @@ -217,205 +210,6 @@ def test_cant_add_wip_location_type(tmp_path): assert e.match("Cannot add a location with type 'custom' yet.") -def test_can_pull_metadata_from_a_file_base_location(tmp_path): - root_upstream = create_temporary_root( - tmp_path / "upstream", use_file_store=True - ) - - ids = [create_random_packet(root_upstream) for _ in range(3)] - root_downstream = create_temporary_root( - tmp_path / "downstream", use_file_store=True - ) - - outpack_location_add( - "upstream", - "path", - {"path": str(root_upstream.path)}, - root=root_downstream, - ) - assert outpack_location_list(root_downstream) == ["local", "upstream"] - - outpack_location_pull_metadata("upstream", root=root_downstream) - - metadata = root_downstream.index.all_metadata() - assert len(metadata) == 3 - assert set(metadata.keys()) == set(ids) - assert metadata == root_upstream.index.all_metadata() - - packets = root_downstream.index.location("upstream") - assert len(packets) == 3 - assert set(packets.keys()) == set(ids) - - -def test_can_pull_empty_metadata(tmp_path): - root_upstream = create_temporary_root( - tmp_path / "upstream", use_file_store=True - ) - root_downstream = create_temporary_root( - tmp_path / "downstream", use_file_store=True - ) - - outpack_location_add( - "upstream", - "path", - {"path": str(root_upstream.path)}, - root=root_downstream, - ) - outpack_location_pull_metadata("upstream", root=root_downstream) - - index = root_downstream.index.data - assert len(index.metadata) == 0 - - -def test_can_pull_metadata_from_subset_of_locations(tmp_path): - root = {"a": create_temporary_root(tmp_path / "a", use_file_store=True)} - - location_names = ["x", "y", "z"] - for name in location_names: - root[name] = create_temporary_root(tmp_path / name, use_file_store=True) - outpack_location_add( - name, "path", {"path": str(root[name].path)}, root=root["a"] - ) - - assert outpack_location_list(root["a"]) == ["local", "x", "y", "z"] - - ids = {} - for name in location_names: - ids[name] = [create_random_packet(root[name]) for _ in range(3)] - - outpack_location_pull_metadata(["x", "y"], root=root["a"]) - index = root["a"].index - - assert set(index.all_metadata()) == set(ids["x"] + ids["y"]) - locations = index.all_locations() - assert set(locations.keys()) == {"local", "x", "y"} - assert len(locations["local"]) == 0 - assert len(locations["x"]) == 3 - assert len(locations["y"]) == 3 - - x_metadata = root["x"].index.all_metadata().keys() - y_metadata = 
root["y"].index.all_metadata().keys() - for packet_id in index.all_metadata().keys(): - if packet_id in ids["x"]: - assert packet_id in x_metadata - else: - assert packet_id in y_metadata - - outpack_location_pull_metadata(root=root["a"]) - index = root["a"].index - - assert set(index.all_metadata()) == set(ids["x"] + ids["y"] + ids["z"]) - locations = index.all_locations() - assert set(locations.keys()) == {"local", "x", "y", "z"} - assert len(locations["local"]) == 0 - assert len(locations["x"]) == 3 - assert len(locations["y"]) == 3 - assert len(locations["z"]) == 3 - z_metadata = root["z"].index.all_metadata().keys() - for packet_id in index.all_metadata().keys(): - if packet_id in ids["x"]: - assert packet_id in x_metadata - elif packet_id in ids["y"]: - assert packet_id in y_metadata - else: - assert packet_id in z_metadata - - -def test_cant_pull_metadata_from_an_unknown_location(tmp_path): - root = create_temporary_root(tmp_path) - with pytest.raises(Exception) as e: - outpack_location_pull_metadata("upstream", root=root) - - assert e.match("Unknown location: 'upstream'") - - -def test_noop_to_pull_metadata_from_no_locations(tmp_path): - root = create_temporary_root(tmp_path) - outpack_location_pull_metadata("local", root=root) - outpack_location_pull_metadata(root=root) - - -def test_handle_metadata_where_hash_does_not_match_reported(tmp_path): - here = create_temporary_root(tmp_path / "here") - there = create_temporary_root(tmp_path / "there") - outpack_location_add("server", "path", {"path": str(there.path)}, root=here) - packet_id = create_random_packet(there) - - path_metadata = there.path / ".outpack" / "metadata" / packet_id - parsed = json.loads(read_string(path_metadata)) - with open(path_metadata, "w") as f: - f.write(json.dumps(parsed, indent=4)) - - with pytest.raises(Exception) as e: - outpack_location_pull_metadata(root=here) - - assert e.match( - f"Hash of metadata for '{packet_id}' from 'server' does not match:" - ) - assert e.match("This is bad news") - assert e.match("remove this location") - - -def test_handle_metadata_where_two_locations_differ_in_hash_for_same_id( - tmp_path, -): - root = {} - - for name in ["a", "b", "us"]: - root[name] = create_temporary_root(tmp_path / name) - - packet_id = outpack_id() - create_random_packet(root["a"], packet_id=packet_id) - create_random_packet(root["b"], packet_id=packet_id) - - outpack_location_add( - "a", "path", {"path": str(root["a"].path)}, root=root["us"] - ) - outpack_location_add( - "b", "path", {"path": str(root["b"].path)}, root=root["us"] - ) - - outpack_location_pull_metadata(location="a", root=root["us"]) - - with pytest.raises(Exception) as e: - outpack_location_pull_metadata(location="b", root=root["us"]) - - assert e.match( - "We have been offered metadata from 'b' that has a different" - ) - assert e.match(f"Conflicts for: '{packet_id}'") - assert e.match("please let us know") - assert e.match("remove this location") - - -def test_can_pull_metadata_through_chain_of_locations(tmp_path): - root = {} - for name in ["a", "b", "c", "d"]: - root[name] = create_temporary_root(tmp_path / name) - - # More interesting topology, with a chain of locations, but d also - # knowing directly about an earlier location - # > a -> b -> c -> d - # > `-------/ - outpack_location_add( - "a", "path", {"path": str(root["a"].path)}, root=root["b"] - ) - outpack_location_add( - "b", "path", {"path": str(root["b"].path)}, root=root["c"] - ) - outpack_location_add( - "b", "path", {"path": str(root["b"].path)}, root=root["d"] - ) - 
outpack_location_add( - "c", "path", {"path": str(root["c"].path)}, root=root["d"] - ) - - # Create a packet and make sure it's in both b and c - create_random_packet(root["a"]) - outpack_location_pull_metadata(root=root["b"]) - # TODO: complete test once orderly_location_pull_packet - - def test_can_resolve_locations(tmp_path): root = {} for name in ["dst", "a", "b", "c", "d"]: @@ -500,27 +294,3 @@ def test_can_resolve_locations(tmp_path): ) assert e.match("Unknown location: '[fg]', '[fg]'") - - -def test_informative_error_when_no_locations_configured(tmp_path): - root = create_temporary_root(tmp_path) - - locations = location_resolve_valid( - None, - root, - include_local=False, - include_orphan=False, - allow_no_locations=True, - ) - assert locations == [] - - with pytest.raises(Exception) as e: - location_resolve_valid( - None, - root, - include_local=False, - include_orphan=False, - allow_no_locations=False, - ) - - assert e.match("No suitable location found") diff --git a/tests/test_location_pull.py b/tests/test_location_pull.py new file mode 100644 index 0000000..a26ad89 --- /dev/null +++ b/tests/test_location_pull.py @@ -0,0 +1,559 @@ +import contextlib +import io +import json +import os +import re +from operator import itemgetter + +import pytest + +from outpack.hash import hash_file +from outpack.ids import outpack_id +from outpack.location import outpack_location_add, location_resolve_valid, \ + outpack_location_list +from outpack.location_pull import find_all_dependencies, \ + outpack_location_pull_metadata, outpack_location_pull_packet +from outpack.packet import Packet +from outpack.util import read_string +from .helpers import create_metadata_depends, create_temporary_root, \ + create_random_packet, create_temporary_roots, create_random_packet_chain + + +def test_can_pull_metadata_from_a_file_base_location(tmp_path): + root_upstream = create_temporary_root( + tmp_path / "upstream", use_file_store=True + ) + + ids = [create_random_packet(root_upstream) for _ in range(3)] + root_downstream = create_temporary_root( + tmp_path / "downstream", use_file_store=True + ) + + outpack_location_add( + "upstream", + "path", + {"path": str(root_upstream.path)}, + root=root_downstream, + ) + assert outpack_location_list(root_downstream) == ["local", "upstream"] + + outpack_location_pull_metadata("upstream", root=root_downstream) + + metadata = root_downstream.index.all_metadata() + assert len(metadata) == 3 + assert set(metadata.keys()) == set(ids) + assert metadata == root_upstream.index.all_metadata() + + packets = root_downstream.index.location("upstream") + assert len(packets) == 3 + assert set(packets.keys()) == set(ids) + + +def test_can_pull_empty_metadata(tmp_path): + root_upstream = create_temporary_root( + tmp_path / "upstream", use_file_store=True + ) + root_downstream = create_temporary_root( + tmp_path / "downstream", use_file_store=True + ) + + outpack_location_add( + "upstream", + "path", + {"path": str(root_upstream.path)}, + root=root_downstream, + ) + outpack_location_pull_metadata("upstream", root=root_downstream) + + index = root_downstream.index.data + assert len(index.metadata) == 0 + + +def test_can_pull_metadata_from_subset_of_locations(tmp_path): + root = {"a": create_temporary_root(tmp_path / "a", use_file_store=True)} + + location_names = ["x", "y", "z"] + for name in location_names: + root[name] = create_temporary_root(tmp_path / name, use_file_store=True) + outpack_location_add( + name, "path", {"path": str(root[name].path)}, root=root["a"] + ) + + assert 
outpack_location_list(root["a"]) == ["local", "x", "y", "z"] + + ids = {} + for name in location_names: + ids[name] = [create_random_packet(root[name]) for _ in range(3)] + + outpack_location_pull_metadata(["x", "y"], root=root["a"]) + index = root["a"].index + + assert set(index.all_metadata()) == set(ids["x"] + ids["y"]) + locations = index.all_locations() + assert set(locations.keys()) == {"local", "x", "y"} + assert len(locations["local"]) == 0 + assert len(locations["x"]) == 3 + assert len(locations["y"]) == 3 + + x_metadata = root["x"].index.all_metadata().keys() + y_metadata = root["y"].index.all_metadata().keys() + for packet_id in index.all_metadata().keys(): + if packet_id in ids["x"]: + assert packet_id in x_metadata + else: + assert packet_id in y_metadata + + outpack_location_pull_metadata(root=root["a"]) + index = root["a"].index + + assert set(index.all_metadata()) == set(ids["x"] + ids["y"] + ids["z"]) + locations = index.all_locations() + assert set(locations.keys()) == {"local", "x", "y", "z"} + assert len(locations["local"]) == 0 + assert len(locations["x"]) == 3 + assert len(locations["y"]) == 3 + assert len(locations["z"]) == 3 + z_metadata = root["z"].index.all_metadata().keys() + for packet_id in index.all_metadata().keys(): + if packet_id in ids["x"]: + assert packet_id in x_metadata + elif packet_id in ids["y"]: + assert packet_id in y_metadata + else: + assert packet_id in z_metadata + + +def test_cant_pull_metadata_from_an_unknown_location(tmp_path): + root = create_temporary_root(tmp_path) + with pytest.raises(Exception) as e: + outpack_location_pull_metadata("upstream", root=root) + + assert e.match("Unknown location: 'upstream'") + + +def test_noop_to_pull_metadata_from_no_locations(tmp_path): + root = create_temporary_root(tmp_path) + outpack_location_pull_metadata("local", root=root) + outpack_location_pull_metadata(root=root) + + +def test_handle_metadata_where_hash_does_not_match_reported(tmp_path): + here = create_temporary_root(tmp_path / "here") + there = create_temporary_root(tmp_path / "there") + outpack_location_add("server", "path", {"path": str(there.path)}, root=here) + packet_id = create_random_packet(there) + + path_metadata = there.path / ".outpack" / "metadata" / packet_id + parsed = json.loads(read_string(path_metadata)) + with open(path_metadata, "w") as f: + f.write(json.dumps(parsed, indent=4)) + + with pytest.raises(Exception) as e: + outpack_location_pull_metadata(root=here) + + assert e.match( + f"Hash of metadata for '{packet_id}' from 'server' does not match:" + ) + assert e.match("This is bad news") + assert e.match("remove this location") + + +def test_handle_metadata_where_two_locations_differ_in_hash_for_same_id( + tmp_path, +): + root = {} + + for name in ["a", "b", "us"]: + root[name] = create_temporary_root(tmp_path / name) + + packet_id = outpack_id() + create_random_packet(root["a"], packet_id=packet_id) + create_random_packet(root["b"], packet_id=packet_id) + + outpack_location_add( + "a", "path", {"path": str(root["a"].path)}, root=root["us"] + ) + outpack_location_add( + "b", "path", {"path": str(root["b"].path)}, root=root["us"] + ) + + outpack_location_pull_metadata(location="a", root=root["us"]) + + with pytest.raises(Exception) as e: + outpack_location_pull_metadata(location="b", root=root["us"]) + + assert e.match( + "We have been offered metadata from 'b' that has a different" + ) + assert e.match(f"Conflicts for: '{packet_id}'") + assert e.match("please let us know") + assert e.match("remove this location") + + +def 
test_can_pull_metadata_through_chain_of_locations(tmp_path): + root = {} + for name in ["a", "b", "c", "d"]: + root[name] = create_temporary_root(tmp_path / name) + + # More interesting topology, with a chain of locations, but d also + # knowing directly about an earlier location + # > a -> b -> c -> d + # > `-------/ + outpack_location_add( + "a", "path", {"path": str(root["a"].path)}, root=root["b"] + ) + outpack_location_add( + "b", "path", {"path": str(root["b"].path)}, root=root["c"] + ) + outpack_location_add( + "b", "path", {"path": str(root["b"].path)}, root=root["d"] + ) + outpack_location_add( + "c", "path", {"path": str(root["c"].path)}, root=root["d"] + ) + + # Create a packet and make sure it's in both b and c + create_random_packet(root["a"]) + outpack_location_pull_metadata(root=root["b"]) + # TODO: complete test once orderly_location_pull_packet + + +def test_informative_error_when_no_locations_configured(tmp_path): + root = create_temporary_root(tmp_path) + + locations = location_resolve_valid( + None, + root, + include_local=False, + include_orphan=False, + allow_no_locations=True, + ) + assert locations == [] + + with pytest.raises(Exception) as e: + location_resolve_valid( + None, + root, + include_local=False, + include_orphan=False, + allow_no_locations=False, + ) + + assert e.match("No suitable location found") + + with pytest.raises(Exception) as e: + outpack_location_pull_packet(outpack_id(), root=root) + assert e.match("No suitable location found") + + +def test_can_resolve_dependencies_where_there_are_none(): + metadata = create_metadata_depends("a") + deps = find_all_dependencies(["a"], metadata) + assert deps == ["a"] + + metadata = (create_metadata_depends("a") | + create_metadata_depends("b", ["a"])) + deps = find_all_dependencies(["a"], metadata) + assert deps == ["a"] + + +def test_can_find_dependencies(): + metadata = (create_metadata_depends("a") | + create_metadata_depends("b") | + create_metadata_depends("c") | + create_metadata_depends("d", ["a", "b"]) | + create_metadata_depends("e", ["b", "c"]) | + create_metadata_depends("f", ["a", "c"]) | + create_metadata_depends("g", ["a", "f", "c"]) | + create_metadata_depends("h", ["a", "b", "c"]) | + create_metadata_depends("i", ["f"]) | + create_metadata_depends("j", ["i", "e", "a"])) + + assert find_all_dependencies(["a"], metadata) == ["a"] + assert find_all_dependencies(["b"], metadata) == ["b"] + assert find_all_dependencies(["c"], metadata) == ["c"] + + assert find_all_dependencies(["d"], metadata) == ["a", "b", "d"] + assert find_all_dependencies(["e"], metadata) == ["b", "c", "e"] + assert find_all_dependencies(["f"], metadata) == ["a", "c", "f"] + + assert (find_all_dependencies(["g"], metadata) == + ["a", "c", "f", "g"]) + assert (find_all_dependencies(["h"], metadata) == + ["a", "b", "c", "h"]) + assert (find_all_dependencies(["i"], metadata) == + ["a", "c", "f", "i"]) + assert (find_all_dependencies(["j"], metadata) == + ["a", "b", "c", "e", "f", "i", "j"]) + + +def test_can_find_multiple_dependencies_at_once(): + metadata = (create_metadata_depends("a") | + create_metadata_depends("b") | + create_metadata_depends("c") | + create_metadata_depends("d", ["a", "b"]) | + create_metadata_depends("e", ["b", "c"]) | + create_metadata_depends("f", ["a", "c"]) | + create_metadata_depends("g", ["a", "f", "c"]) | + create_metadata_depends("h", ["a", "b", "c"]) | + create_metadata_depends("i", ["f"]) | + create_metadata_depends("j", ["i", "e", "a"])) + + assert find_all_dependencies([], metadata) == [] + 
assert (find_all_dependencies(["c", "b", "a"], metadata) == + ["a", "b", "c"]) + assert (find_all_dependencies(["d", "e", "f"], metadata) == + ["a", "b", "c", "d", "e", "f"]) + + +def test_can_pull_packet_from_location_into_another_file_store(tmp_path): + root = create_temporary_roots(tmp_path, use_file_store=True) + + id = create_random_packet(root["src"]) + outpack_location_add("src", "path", + {"path": str(root["src"].path)}, + root=root["dst"]) + outpack_location_pull_metadata(root=root["dst"]) + outpack_location_pull_packet(id, root=root["dst"]) + + index = root["dst"].index + assert index.unpacked() == [id] + assert os.path.exists(root["dst"].path / "archive" / "data" / id / + "data.txt") + + meta = index.metadata(id) + assert all(root["dst"].files.exists(file.hash) for file in meta.files) + + +def test_can_pull_packet_from_one_location_to_another_archive(tmp_path): + root = create_temporary_roots(tmp_path, use_file_store=False) + + id = create_random_packet(root["src"]) + outpack_location_add("src", "path", + {"path": str(root["src"].path)}, + root=root["dst"]) + outpack_location_pull_metadata(root=root["dst"]) + outpack_location_pull_packet(id, root=root["dst"]) + + index = root["dst"].index + assert index.unpacked() == [id] + assert os.path.exists(root["dst"].path / "archive" / "data" / id / + "data.txt") + + +def test_detect_and_avoid_modified_files_in_source_repository(tmp_path): + root = create_temporary_roots(tmp_path, use_file_store=False) + + tmp_dir = tmp_path / "new_dir" + os.mkdir(tmp_dir) + with open(tmp_dir / "a.txt", "w") as f: + f.writelines("my data a") + with open(tmp_dir / "b.txt", "w") as f: + f.writelines("my data b") + ids = [None] * 2 + for i in range(len(ids)): + p = Packet(root["src"], tmp_dir, "data") + p.end() + ids[i] = p.id + + outpack_location_add("src", "path", + {"path": str(root["src"].path)}, + root["dst"]) + outpack_location_pull_metadata(root=root["dst"]) + + ## When I corrupt the file in the first ID by truncating it: + src_data = root["src"].path / "archive" / "data" + dest_data = root["dst"].path / "archive" / "data" + with open(src_data / ids[0] / "a.txt", "w") as f: + f.truncate(0) + + ## and then try and pull it, user is warned + f = io.StringIO() + with contextlib.redirect_stdout(f): + outpack_location_pull_packet(ids[0], root=root["dst"]) + + print(f.getvalue()) + assert re.search( + r"Rejecting file from archive 'a\.txt' in 'data/", + f.getvalue()) + + assert (hash_file(dest_data / ids[0] / "a.txt") == + hash_file(src_data / ids[1] / "a.txt")) + assert (hash_file(dest_data / ids[0] / "b.txt") == + hash_file(src_data / ids[1] / "b.txt")) + + +def test_do_not_unpack_packet_twice(tmp_path): + root = create_temporary_roots(tmp_path) + + id = create_random_packet(root["src"]) + outpack_location_add("src", "path", + {"path": str(root["src"].path)}, + root["dst"]) + outpack_location_pull_metadata(root=root["dst"]) + + assert outpack_location_pull_packet(id, root=root["dst"]) == [id] + assert outpack_location_pull_packet(id, root=root["dst"]) == [] + + +def test_sensible_error_if_packet_not_known(tmp_path): + root = create_temporary_roots(tmp_path) + id = create_random_packet(root["src"]) + outpack_location_add("src", "path", + {"path": str(root["src"].path)}, + root=root["dst"]) + + with pytest.raises(Exception) as e: + outpack_location_pull_packet(id, root=root["dst"]) + assert e.match(f"Failed to find packet '{id}'") + assert e.match("Looked in location 'src'") + assert e.match("Do you need to run 'outpack_location_pull_metadata'?") + + +def 
test_error_if_dependent_packet_not_known(tmp_path): + root = create_temporary_roots(tmp_path, ["a", "c"], + require_complete_tree=True) + root["b"] = create_temporary_root(tmp_path / "b", + require_complete_tree=False) + + ids = create_random_packet_chain(root["a"], 5) + outpack_location_add("a", "path", + {"path": str(root["a"].path)}, + root=root["b"]) + outpack_location_pull_metadata(root=root["b"]) + outpack_location_pull_packet(ids["e"], root=root["b"]) + + outpack_location_add("b", "path", + {"path": str(root["b"].path)}, + root=root["c"]) + outpack_location_pull_metadata(root=root["c"]) + + with pytest.raises(Exception) as e: + outpack_location_pull_packet(ids["e"], root=root["c"]) + print(e.value) + assert e.match(f"Failed to find packet '{ids['d']}") + assert e.match("Looked in location 'b'") + assert e.match("1 missing packet was requested as dependency of the " + f"one you asked for: '{ids['d']}'") + + +def test_can_pull_a_tree_recursively(tmp_path): + root = create_temporary_roots(tmp_path) + ids = create_random_packet_chain(root["src"], 3) + + outpack_location_add("src", "path", + {"path": str(root["src"].path)}, + root["dst"]) + outpack_location_pull_metadata(root=root["dst"]) + pulled_packets = outpack_location_pull_packet(ids["c"], recursive=True, + root=root["dst"]) + assert set(pulled_packets) == set(itemgetter("a", "b", "c")(ids)) + + assert (set(root["dst"].index.unpacked()) == + set(root["src"].index.unpacked())) + + pulled_packets = outpack_location_pull_packet(ids["c"], recursive=True, + root=root["dst"]) + assert pulled_packets == [] + + +def test_can_filter_locations(tmp_path): + location_names = ["dst", "a", "b", "c", "d"] + root = create_temporary_roots(tmp_path, location_names, use_file_store=True) + for name in location_names: + if name != "dst": + outpack_location_add(name, "path", + {"path": str(root[name].path)}, + root=root["dst"]) + + ids_a = [create_random_packet(root["a"]) for _ in range(3)] + outpack_location_add("a", "path", + {"path": str(root["a"].path)}, + root=root["b"]) + outpack_location_pull_metadata(root=root["b"]) + outpack_location_pull_packet(ids_a, root=root["b"]) + + ids_b = ids_a + [create_random_packet(root["b"]) for _ in range(3)] + ids_c = [create_random_packet(root["c"]) for _ in range(3)] + outpack_location_add("a", "path", + {"path": str(root["a"].path)}, + root=root["d"]) + outpack_location_add("c", "path", + {"path": str(root["c"].path)}, + root=root["d"]) + outpack_location_pull_metadata(root=root["d"]) + outpack_location_pull_packet(ids_a, root=root["d"]) + outpack_location_pull_packet(ids_c, root=root["d"]) + ids_d = ids_c + [create_random_packet(root["d"]) for _ in range(3)] + + outpack_location_pull_metadata(root=root["dst"]) + + ids = set(ids_a + ids_b + ids_c + ids_d) + + +def test_nonrecursive_pulls_are_prevented_by_configuration(tmp_path): + root = create_temporary_roots(tmp_path, require_complete_tree=True) + ids = create_random_packet_chain(root["src"], 3) + + with pytest.raises(Exception) as e: + outpack_location_pull_packet(ids["c"], recursive=False, + root=root["dst"]) + + assert e.match("'recursive' must be True \(or None\) with your " + "configuration") + + +def test_if_recursive_pulls_are_required_pulls_are_default_recursive(tmp_path): + root = create_temporary_roots(tmp_path, ["src", "shallow"], + require_complete_tree=False) + root["deep"] = create_temporary_root(tmp_path, require_complete_tree=True) + + ids = create_random_packet_chain(root["src"], 3) + + for r in [root["shallow"], root["deep"]]: + 
outpack_location_add("src", "path", + {"path": str(root["src"].path)}, + root=r) + outpack_location_pull_metadata(root=r) + + outpack_location_pull_packet(ids["c"], recursive=None, root=root["shallow"]) + assert root["shallow"].index.unpacked() == [ids["c"]] + + outpack_location_pull_packet(ids["c"], recursive=None, root=root["deep"]) + assert root["deep"].index.unpacked() == list(ids.values()) + +## TODO: Uncomment when wired up with searching +# def test_can_pull_packets_as_result_of_query(tmp_path): +# TODO +# def test_pull_packet_errors_if_allow_remote_is_false(tmp_path): +# root = create_temporary_roots(tmp_path) +# id = create_random_packet(root["src"]) +# +# outpack_location_add("src", "path", +# {"path": str(root["src"].path)}, +# root=root["dst"]) +# outpack_location_pull_metadata(root=root["dst"]) +# +# with pytest.raises(Exception): +# outpack_location_pull_packet(None, options={"allow_remote" = False}, root=root["dst"]) +# assert e.match("If specifying 'options', 'allow_remote' must be True") + + +def test_skip_files_in_file_store(tmp_path): + root = create_temporary_roots(tmp_path, use_file_store=True) + + ids = create_random_packet_chain(root["src"], 3) + outpack_location_add("src", "path", + {"path": str(root["src"].path)}, + root=root["dst"]) + + outpack_location_pull_metadata(root=root["dst"]) + outpack_location_pull_packet(ids["a"], root=root["dst"]) + + f = io.StringIO() + with contextlib.redirect_stdout(f): + outpack_location_pull_packet(ids["b"], root=root["dst"]) + text = f.getvalue() + assert re.search("Found 1 file in the file store", text) + assert re.search( + r"Need to fetch 3 files \([0-9]* Bytes\) from 1 location", text) diff --git a/tests/test_root.py b/tests/test_root.py index 1b283ab..fba9a73 100644 --- a/tests/test_root.py +++ b/tests/test_root.py @@ -8,7 +8,6 @@ from outpack.init import outpack_init from outpack.root import find_file_by_hash, root_open from outpack.util import transient_working_directory - from . 
import helpers @@ -89,7 +88,7 @@ def test_can_reject_corrupted_files(tmp_path, capsys): captured = capsys.readouterr() assert ( captured.out - == f"Rejecting file from archive 'data.txt'in data/{id[1]}\n" + == f"Rejecting file from archive 'data.txt' in 'data/{id[1]}'\n" ) dest = tmp_path / "dest" with pytest.raises(Exception, match="File not found in archive"): diff --git a/tests/test_tools.py b/tests/test_tools.py index 78746dc..d03b3a4 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -29,9 +29,8 @@ def test_git_report_no_info_without_git_repo(tmp_path): def test_git_report_git_info_if_possible(tmp_path): sha = simple_git_example(tmp_path) res = git_info(tmp_path) - # This default branch name won't be robust to changes in future - # git versions - assert res == GitInfo(branch="master", sha=sha, url=[]) + assert (res == GitInfo(branch="master", sha=sha, url=[]) or + res == GitInfo(branch="main", sha=sha, url=[])) def test_git_report_single_url(tmp_path): diff --git a/tests/test_util.py b/tests/test_util.py index 43c8fcf..708a128 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -12,7 +12,7 @@ num_to_time, read_string, run_script, - time_to_num, + time_to_num, format_list, pl, partition, ) @@ -115,3 +115,33 @@ def test_can_inject_data_into_run(tmp_path): assert tmp_path.joinpath("result.txt").exists() with open(tmp_path.joinpath("result.txt")) as f: assert f.read() == "hello" + + +def test_can_format_list(): + assert format_list(["one", "two"]) == "'one', 'two'" + assert format_list(["one"]) == "'one'" + format_set = format_list({"one", "two"}) + assert format_set == "'one', 'two'" or format_set == "'two', 'one'" + assert format_list({"one", "one"}) == "'one'" + + +def test_can_pluralise(): + assert pl([], "item") == "items" + assert pl(["one"], "item") == "item" + assert pl(["one", "two"], "item") == "items" + assert pl({"Inky"}, "octopus", "octopodes") == "octopus" + assert (pl({"Inky", "Tentacool"}, "octopus", "octopodes") == + "octopodes") + assert pl(2, "item") == "items" + assert pl(1, "item") == "item" + + +def test_can_partition(): + test_list = ["one", "two", "three", "four", "five"] + true_list, false_list = partition(lambda x: "e" in x, test_list) + assert true_list == ["one", "three", "five"] + assert false_list == ["two", "four"] + + true_list, false_list = partition(lambda x: "z" in x, test_list) + assert true_list == [] + assert false_list == test_list From 5f6c5b6a44a22a987dfec28f53e2ee6195094d8b Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 5 Feb 2024 11:06:48 +0000 Subject: [PATCH 02/22] Add remaining testing, make functions explicitly private --- src/outpack/location_pull.py | 90 +++++++++++++++++------------------ tests/helpers/__init__.py | 14 ++++++ tests/test_location_pull.py | 92 +++++++++++++++++++++++++++--------- tests/test_util.py | 5 +- 4 files changed, 132 insertions(+), 69 deletions(-) diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py index 6e92bf4..67f2196 100644 --- a/src/outpack/location_pull.py +++ b/src/outpack/location_pull.py @@ -1,8 +1,9 @@ +import dataclasses import itertools import os import time from dataclasses import dataclass -from typing import Callable, Union +from typing import Callable, Union, Optional import humanize @@ -122,7 +123,7 @@ def outpack_location_pull_packet(ids: Union[str, list[str]], options = SearchOptions(options) if isinstance(ids, str): ids = [ids] - plan = location_build_pull_plan(ids, options.location, recursive, root) + plan = _location_build_pull_plan(ids, 
options.location, recursive, root) ## Warn people of extra pulls and skips if plan.info.n_extra > 0: @@ -134,7 +135,7 @@ def outpack_location_pull_packet(ids: Union[str, list[str]], f"{pl(plan.info.n_total, 'packet')} which are already " f"unpacked") - store, cleanup = location_pull_files(plan.files, root) + store, cleanup = _location_pull_files(plan.files, root) use_archive = root.config.core.path_archive is not None n_packets = len(plan.packets) @@ -144,7 +145,7 @@ def outpack_location_pull_packet(ids: Union[str, list[str]], print( f"Writing files for '{packet.packet}' (packet {idx + 1}/" f"{n_packets})") - location_pull_files_archive(packet.packet, store, root) + _location_pull_files_archive(packet.packet, store, root) mark_known( root, @@ -178,7 +179,7 @@ def outpack_location_pull_packet(ids: Union[str, list[str]], # and copes well with data races and corruption of data on disk # (e.g., users having edited files that we rely on, or editing them # after we hash them the first time). -def location_pull_files(files: list[PacketFile], root: OutpackRoot) \ +def _location_pull_files(files: list[PacketFile], root: OutpackRoot) \ -> (FileStore, Callable[[], None]): store = root.files @@ -193,7 +194,7 @@ def cleanup(): f"file store") else: print("Looking for suitable files already on disk") - store = temporary_filestore(root) + store = _temporary_filestore(root) def cleanup(): store.destroy() @@ -214,17 +215,16 @@ def cleanup(): for location in locations: from_this_location = [file for file in files if file.location == location] - location_pull_hash_store(from_this_location, location, - _location_driver(location, root), store) + _location_pull_hash_store(from_this_location, location, + _location_driver(location, root), store) return store, cleanup -def location_pull_hash_store(files: list[PacketFile], - location_name: str, - driver, - store: FileStore): - total_size = humanize.naturalsize(sum(file.size for file in files)) +def _location_pull_hash_store(files: list[PacketFile], + location_name: str, + driver, + store: FileStore): no_of_files = len(files) # TODO: show a nice progress bar for users for idx, file in enumerate(files): @@ -234,9 +234,9 @@ def location_pull_hash_store(files: list[PacketFile], store.put(tmp, file.hash) -def location_pull_files_archive(packet_id: str, - store, - root: OutpackRoot): +def _location_pull_files_archive(packet_id: str, + store, + root: OutpackRoot): meta = root.index.metadata(packet_id) dest_root = (root.path / root.config.core.path_archive / meta.name / packet_id) @@ -274,14 +274,14 @@ class PullPlanPackets: fetch: set[str] -def location_build_pull_plan(packet_ids: list[str], - locations: list[str], - recursive: bool, - root: OutpackRoot) -> LocationPullPlan: - packets = location_build_pull_plan_packets(packet_ids, recursive, root) - locations = location_build_pull_plan_location(packets, locations, root) - files = location_build_pull_plan_files(packets.fetch, locations, root) - fetch = location_build_packet_locations(packets.fetch, locations, root) +def _location_build_pull_plan(packet_ids: list[str], + locations: Optional[list[str]], + recursive: Optional[bool], + root: OutpackRoot) -> LocationPullPlan: + packets = _location_build_pull_plan_packets(packet_ids, recursive, root) + locations = _location_build_pull_plan_location(packets, locations, root) + files = _location_build_pull_plan_files(packets.fetch, locations, root) + fetch = _location_build_packet_locations(packets.fetch, locations, root) info = PullPlanInfo(n_extra=len(packets.full) - 
len(packets.requested), n_skip=len(packets.skip), @@ -292,9 +292,9 @@ def location_build_pull_plan(packet_ids: list[str], info=info) -def location_build_pull_plan_packets(packet_ids: list[str], - recursive: bool, - root: OutpackRoot) -> PullPlanPackets: +def _location_build_pull_plan_packets(packet_ids: list[str], + recursive: bool, + root: OutpackRoot) -> PullPlanPackets: requested = packet_ids if recursive is None: recursive = root.config.core.require_complete_tree @@ -307,7 +307,7 @@ def location_build_pull_plan_packets(packet_ids: list[str], index = root.index if recursive: - full = find_all_dependencies(packet_ids, index.all_metadata()) + full = _find_all_dependencies(packet_ids, index.all_metadata()) else: full = packet_ids @@ -320,8 +320,8 @@ def location_build_pull_plan_packets(packet_ids: list[str], fetch=fetch) -def find_all_dependencies(packet_ids: list[str], - metadata: dict[str, MetadataCore]) -> list[str]: +def _find_all_dependencies(packet_ids: list[str], + metadata: dict[str, MetadataCore]) -> list[str]: ret = set(packet_ids) while len(packet_ids) > 0: dependency_ids = set( @@ -335,9 +335,9 @@ def find_all_dependencies(packet_ids: list[str], return sorted(ret) -def location_build_pull_plan_location(packets: PullPlanPackets, - locations: list[str], - root: OutpackRoot) -> list[str]: +def _location_build_pull_plan_location(packets: PullPlanPackets, + locations: list[str], + root: OutpackRoot) -> list[str]: location_names = location_resolve_valid( locations, root, @@ -373,9 +373,9 @@ def location_build_pull_plan_location(packets: PullPlanPackets, return location_names -def location_build_pull_plan_files(packet_ids: set[str], - locations: list[str], - root: OutpackRoot) -> list[PacketFile]: +def _location_build_pull_plan_files(packet_ids: set[str], + locations: list[str], + root: OutpackRoot) -> list[PacketFile]: metadata = root.index.all_metadata() file_hashes = [file.hash for packet_id in packet_ids @@ -389,24 +389,22 @@ def location_build_pull_plan_files(packet_ids: set[str], # We've already checked earlier that the file is in at least 1 # location so we don't have to worry about that here all_files = [] - files = {} + file_hashes = set() for location_name in locations: location_packets = set(root.index.packets_in_location(location_name)) packets_in_location = location_packets & packet_ids for packet_id in packets_in_location: for file in metadata[packet_id].files: - file.location = location_name - all_files.append(files.setdefault(file.hash, file)) - - # We only want to pull each file once, remove any with duplicate hashes - # files = {} - # packet_files = [files.setdefault(packet.hash, packet) - # for packet in packet_files if packet.hash not in files] + new_file = dataclasses.replace(file) + new_file.location = location_name + if file.hash not in file_hashes: + file_hashes.add(file.hash) + all_files.append(new_file) return all_files -def location_build_packet_locations( +def _location_build_packet_locations( packets: set[str], locations: list[str], root: OutpackRoot) -> dict[str, PacketLocation]: @@ -419,5 +417,5 @@ def location_build_packet_locations( return packets_fetch -def temporary_filestore(root: OutpackRoot) -> FileStore: +def _temporary_filestore(root: OutpackRoot) -> FileStore: return FileStore(root.path / "orderly" / "pull") diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py index 7216496..653a19f 100644 --- a/tests/helpers/__init__.py +++ b/tests/helpers/__init__.py @@ -133,3 +133,17 @@ def create_metadata_depends(id: str, depends: List[str] = 
None): def random_characters(n): return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(n)) + + +# Like Rs rep function, useful for setting up test values +def rep(x, each): + ret = [] + if isinstance(each, int): + each = [each] * len(x) + if len(x) != len(each): + raise Exception("Repeats must be int or same length as the thing you " + "want to repeat") + for item, times in zip(x, each): + ret.extend([item] * times) + + return ret diff --git a/tests/test_location_pull.py b/tests/test_location_pull.py index a26ad89..780482f 100644 --- a/tests/test_location_pull.py +++ b/tests/test_location_pull.py @@ -9,14 +9,22 @@ from outpack.hash import hash_file from outpack.ids import outpack_id -from outpack.location import outpack_location_add, location_resolve_valid, \ - outpack_location_list -from outpack.location_pull import find_all_dependencies, \ - outpack_location_pull_metadata, outpack_location_pull_packet +from outpack.location import (outpack_location_add, + location_resolve_valid, + outpack_location_list) +from outpack.location_pull import (_find_all_dependencies, + outpack_location_pull_metadata, + outpack_location_pull_packet, + _location_build_pull_plan, + PullPlanInfo) from outpack.packet import Packet from outpack.util import read_string -from .helpers import create_metadata_depends, create_temporary_root, \ - create_random_packet, create_temporary_roots, create_random_packet_chain +from .helpers import (create_metadata_depends, + create_temporary_root, + create_random_packet, + create_temporary_roots, + create_random_packet_chain, + rep) def test_can_pull_metadata_from_a_file_base_location(tmp_path): @@ -248,12 +256,12 @@ def test_informative_error_when_no_locations_configured(tmp_path): def test_can_resolve_dependencies_where_there_are_none(): metadata = create_metadata_depends("a") - deps = find_all_dependencies(["a"], metadata) + deps = _find_all_dependencies(["a"], metadata) assert deps == ["a"] metadata = (create_metadata_depends("a") | create_metadata_depends("b", ["a"])) - deps = find_all_dependencies(["a"], metadata) + deps = _find_all_dependencies(["a"], metadata) assert deps == ["a"] @@ -269,21 +277,21 @@ def test_can_find_dependencies(): create_metadata_depends("i", ["f"]) | create_metadata_depends("j", ["i", "e", "a"])) - assert find_all_dependencies(["a"], metadata) == ["a"] - assert find_all_dependencies(["b"], metadata) == ["b"] - assert find_all_dependencies(["c"], metadata) == ["c"] + assert _find_all_dependencies(["a"], metadata) == ["a"] + assert _find_all_dependencies(["b"], metadata) == ["b"] + assert _find_all_dependencies(["c"], metadata) == ["c"] - assert find_all_dependencies(["d"], metadata) == ["a", "b", "d"] - assert find_all_dependencies(["e"], metadata) == ["b", "c", "e"] - assert find_all_dependencies(["f"], metadata) == ["a", "c", "f"] + assert _find_all_dependencies(["d"], metadata) == ["a", "b", "d"] + assert _find_all_dependencies(["e"], metadata) == ["b", "c", "e"] + assert _find_all_dependencies(["f"], metadata) == ["a", "c", "f"] - assert (find_all_dependencies(["g"], metadata) == + assert (_find_all_dependencies(["g"], metadata) == ["a", "c", "f", "g"]) - assert (find_all_dependencies(["h"], metadata) == + assert (_find_all_dependencies(["h"], metadata) == ["a", "b", "c", "h"]) - assert (find_all_dependencies(["i"], metadata) == + assert (_find_all_dependencies(["i"], metadata) == ["a", "c", "f", "i"]) - assert (find_all_dependencies(["j"], metadata) == + assert (_find_all_dependencies(["j"], metadata) == ["a", "b", 
"c", "e", "f", "i", "j"]) @@ -299,10 +307,10 @@ def test_can_find_multiple_dependencies_at_once(): create_metadata_depends("i", ["f"]) | create_metadata_depends("j", ["i", "e", "a"])) - assert find_all_dependencies([], metadata) == [] - assert (find_all_dependencies(["c", "b", "a"], metadata) == + assert _find_all_dependencies([], metadata) == [] + assert (_find_all_dependencies(["c", "b", "a"], metadata) == ["a", "b", "c"]) - assert (find_all_dependencies(["d", "e", "f"], metadata) == + assert (_find_all_dependencies(["d", "e", "f"], metadata) == ["a", "b", "c", "d", "e", "f"]) @@ -488,7 +496,47 @@ def test_can_filter_locations(tmp_path): outpack_location_pull_metadata(root=root["dst"]) - ids = set(ids_a + ids_b + ids_c + ids_d) + ids = list(set(ids_a + ids_b + ids_c + ids_d)) + + def locs(location_names): + return location_resolve_valid(location_names, root["dst"], + include_local=False, + include_orphan=False, + allow_no_locations=False) + + plan = _location_build_pull_plan(ids, None, None, + root=root["dst"]) + locations = [file.location for file in plan.files] + assert locations == rep(["a", "b", "c", "d"], 3) + + # Invert order, prefer "d" + plan = _location_build_pull_plan(ids, locs(["d", "c", "b", "a"]), None, + root=root["dst"]) + locations = [file.location for file in plan.files] + assert locations == rep(["d", "b"], [9, 3]) + + # Drop redundant locations + plan = _location_build_pull_plan(ids, locs(["b", "d"]), None, + root=root["dst"]) + locations = [file.location for file in plan.files] + assert locations == rep(["b", "d"], 6) + + # Corner cases + plan = _location_build_pull_plan([ids_a[0]], None, None, root=root["dst"]) + locations = [file.location for file in plan.files] + assert locations == ["a"] + + plan = _location_build_pull_plan([], None, None, root=root["dst"]) + assert plan.files == [] + assert plan.packets == {} + assert plan.info == PullPlanInfo(0, 0, 0) + + # Failure to find packets + with pytest.raises(Exception) as e: + _location_build_pull_plan(ids, ["a", "b", "c"], None, root=root["dst"]) + assert "Looked in locations 'a', 'b', 'c'" in e.value.args[0] + assert ("Do you need to run 'outpack_location_pull_metadata()'?" 
+ in e.value.args[0]) def test_nonrecursive_pulls_are_prevented_by_configuration(tmp_path): diff --git a/tests/test_util.py b/tests/test_util.py index 708a128..2f6467e 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -12,7 +12,10 @@ num_to_time, read_string, run_script, - time_to_num, format_list, pl, partition, + time_to_num, + format_list, + pl, + partition, ) From 667a69e4f5643c576ea11254cb0e0418a227bceb Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 5 Feb 2024 11:23:43 +0000 Subject: [PATCH 03/22] Run formatter, fix codestyle issues --- src/outpack/filestore.py | 3 +- src/outpack/index.py | 8 +- src/outpack/location.py | 1 + src/outpack/location_pull.py | 220 ++++++++++++++----------- src/outpack/root.py | 2 +- src/outpack/util.py | 6 +- tests/helpers/__init__.py | 61 +++---- tests/test_location.py | 4 +- tests/test_location_pull.py | 306 ++++++++++++++++++++--------------- tests/test_root.py | 1 + tests/test_tools.py | 5 +- tests/test_util.py | 14 +- 12 files changed, 359 insertions(+), 272 deletions(-) diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index 3851cca..f6abadd 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -31,7 +31,7 @@ def get(self, hash, dst, overwrite): def exists(self, hash): return os.path.exists(self.filename(hash)) - def put(self, src, hash, move=False): + def put(self, src, hash, *, move=False): hash_validate_file(src, hash) dst = self.filename(hash) if not os.path.exists(dst): @@ -62,4 +62,3 @@ def tmp(self) -> str: path = self._path / "tmp" path.mkdir(exist_ok=True) return tempfile.NamedTemporaryFile(dir=path).name - diff --git a/src/outpack/index.py b/src/outpack/index.py index 6c7ff84..7295158 100644 --- a/src/outpack/index.py +++ b/src/outpack/index.py @@ -1,8 +1,12 @@ import pathlib from dataclasses import dataclass -from outpack.metadata import read_metadata_core, read_packet_location, \ - MetadataCore, PacketLocation +from outpack.metadata import ( + MetadataCore, + PacketLocation, + read_metadata_core, + read_packet_location, +) @dataclass diff --git a/src/outpack/location.py b/src/outpack/location.py index e0b37d7..12a8828 100644 --- a/src/outpack/location.py +++ b/src/outpack/location.py @@ -141,5 +141,6 @@ def _location_driver(location_name, root): msg = "custom remote not yet supported" raise Exception(msg) + # TODO: Create a driver interface and have location path extend it? 
# Because atm we can't specify a type for driver as a function arg diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py index 67f2196..aae88f4 100644 --- a/src/outpack/location_pull.py +++ b/src/outpack/location_pull.py @@ -3,19 +3,19 @@ import os import time from dataclasses import dataclass -from typing import Callable, Union, Optional +from typing import Callable, Optional, Union import humanize from outpack.filestore import FileStore from outpack.hash import hash_validate_string -from outpack.location import location_resolve_valid, _location_driver +from outpack.location import _location_driver, location_resolve_valid from outpack.metadata import MetadataCore, PacketFile, PacketLocation from outpack.packet import mark_known -from outpack.root import OutpackRoot, root_open, find_file_by_hash +from outpack.root import OutpackRoot, find_file_by_hash, root_open from outpack.search_options import SearchOptions from outpack.static import LOCATION_LOCAL -from outpack.util import format_list, pl, partition +from outpack.util import format_list, partition, pl def outpack_location_pull_metadata(location=None, root=None, *, locate=True): @@ -115,10 +115,14 @@ def _mark_all_known(driver, root, location_name): ) -def outpack_location_pull_packet(ids: Union[str, list[str]], - options=None, - recursive=None, - root=None, *, locate=True): +def outpack_location_pull_packet( + ids: Union[str, list[str]], + options=None, + recursive=None, + root=None, + *, + locate=True, +): root = root_open(root, locate=locate) options = SearchOptions(options) if isinstance(ids, str): @@ -127,13 +131,17 @@ def outpack_location_pull_packet(ids: Union[str, list[str]], ## Warn people of extra pulls and skips if plan.info.n_extra > 0: - print(f"Also pulling {plan.info.n_extra} " - f"{pl(plan.info.n_extra, 'packet')}, which are " - f"dependencies of those requested") + print( + f"Also pulling {plan.info.n_extra} " + f"{pl(plan.info.n_extra, 'packet')}, which are " + f"dependencies of those requested" + ) if plan.info.n_skip > 0: - print(f"Skipping {plan.info.n_skip} of {plan.info.n_total} " - f"{pl(plan.info.n_total, 'packet')} which are already " - f"unpacked") + print( + f"Skipping {plan.info.n_skip} of {plan.info.n_total} " + f"{pl(plan.info.n_total, 'packet')} which are already " + f"unpacked" + ) store, cleanup = _location_pull_files(plan.files, root) @@ -144,19 +152,18 @@ def outpack_location_pull_packet(ids: Union[str, list[str]], if use_archive: print( f"Writing files for '{packet.packet}' (packet {idx + 1}/" - f"{n_packets})") + f"{n_packets})" + ) _location_pull_files_archive(packet.packet, store, root) mark_known( - root, - packet.packet, - LOCATION_LOCAL, - packet.hash, - time.time() + root, packet.packet, LOCATION_LOCAL, packet.hash, time.time() ) - print(f"Unpacked {n_packets} {pl(n_packets, 'packet')} in " - f"{humanize.time.precisedelta(int(time.time() - time_start))}.") + print( + f"Unpacked {n_packets} {pl(n_packets, 'packet')} in " + f"{humanize.time.precisedelta(int(time.time() - time_start))}." + ) cleanup() return list(plan.packets.keys()) @@ -179,19 +186,22 @@ def outpack_location_pull_packet(ids: Union[str, list[str]], # and copes well with data races and corruption of data on disk # (e.g., users having edited files that we rely on, or editing them # after we hash them the first time). 
-def _location_pull_files(files: list[PacketFile], root: OutpackRoot) \ - -> (FileStore, Callable[[], None]): - +def _location_pull_files( + files: list[PacketFile], root: OutpackRoot +) -> (FileStore, Callable[[], None]): store = root.files if store is not None: + def cleanup(): return None exists, missing = partition(lambda file: store.exists(file.hash), files) if len(exists) > 0: - print(f"Found {len(exists)} {pl(exists, 'file')} in the " - f"file store") + print( + f"Found {len(exists)} {pl(exists, 'file')} in the " + f"file store" + ) else: print("Looking for suitable files already on disk") store = _temporary_filestore(root) @@ -209,37 +219,44 @@ def cleanup(): else: locations = {file.location for file in files} total_size = humanize.naturalsize(sum(file.size for file in files)) - print(f"Need to fetch {len(files)} {pl(files, 'file')} " + print( + f"Need to fetch {len(files)} {pl(files, 'file')} " f"({total_size}) from {len(locations)} " - f"{pl(locations, 'location')}") + f"{pl(locations, 'location')}" + ) for location in locations: - from_this_location = [file for file in files - if file.location == location] - _location_pull_hash_store(from_this_location, location, - _location_driver(location, root), store) + from_this_location = [ + file for file in files if file.location == location + ] + _location_pull_hash_store( + from_this_location, + location, + _location_driver(location, root), + store, + ) return store, cleanup -def _location_pull_hash_store(files: list[PacketFile], - location_name: str, - driver, - store: FileStore): +def _location_pull_hash_store( + files: list[PacketFile], location_name: str, driver, store: FileStore +): no_of_files = len(files) # TODO: show a nice progress bar for users for idx, file in enumerate(files): - print(f"Fetching file {idx + 1}/{no_of_files} " - f"({humanize.naturalsize(file.size)}) from '{location_name}'") + print( + f"Fetching file {idx + 1}/{no_of_files} " + f"({humanize.naturalsize(file.size)}) from '{location_name}'" + ) tmp = driver.fetch_file(file.hash, store.tmp()) store.put(tmp, file.hash) -def _location_pull_files_archive(packet_id: str, - store, - root: OutpackRoot): +def _location_pull_files_archive(packet_id: str, store, root: OutpackRoot): meta = root.index.metadata(packet_id) - dest_root = (root.path / root.config.core.path_archive / meta.name / - packet_id) + dest_root = ( + root.path / root.config.core.path_archive / meta.name / packet_id + ) for file in meta.files: store.get(file.hash, dest_root / file.path, overwrite=True) @@ -274,35 +291,41 @@ class PullPlanPackets: fetch: set[str] -def _location_build_pull_plan(packet_ids: list[str], - locations: Optional[list[str]], - recursive: Optional[bool], - root: OutpackRoot) -> LocationPullPlan: - packets = _location_build_pull_plan_packets(packet_ids, recursive, root) +def _location_build_pull_plan( + packet_ids: list[str], + locations: Optional[list[str]], + recursive: Optional[bool], + root: OutpackRoot, +) -> LocationPullPlan: + packets = _location_build_pull_plan_packets( + packet_ids, root, recursive=recursive + ) locations = _location_build_pull_plan_location(packets, locations, root) files = _location_build_pull_plan_files(packets.fetch, locations, root) fetch = _location_build_packet_locations(packets.fetch, locations, root) - info = PullPlanInfo(n_extra=len(packets.full) - len(packets.requested), - n_skip=len(packets.skip), - n_total=len(packets.full)) + info = PullPlanInfo( + n_extra=len(packets.full) - len(packets.requested), + n_skip=len(packets.skip), + 
n_total=len(packets.full), + ) - return LocationPullPlan(packets=fetch, - files=files, - info=info) + return LocationPullPlan(packets=fetch, files=files, info=info) -def _location_build_pull_plan_packets(packet_ids: list[str], - recursive: bool, - root: OutpackRoot) -> PullPlanPackets: +def _location_build_pull_plan_packets( + packet_ids: list[str], root: OutpackRoot, *, recursive: bool +) -> PullPlanPackets: requested = packet_ids if recursive is None: recursive = root.config.core.require_complete_tree if root.config.core.require_complete_tree and not recursive: - msg = (f"'recursive' must be True (or None) with your " - f"configuration\nBecause 'core.require_complete_tree' is " - f"true, we can't do a non-recursive pull, as this might " - f"leave an incomplete tree") + msg = ( + "'recursive' must be True (or None) with your " + "configuration\nBecause 'core.require_complete_tree' is " + "true, we can't do a non-recursive pull, as this might " + "leave an incomplete tree" + ) raise Exception(msg) index = root.index @@ -314,30 +337,31 @@ def _location_build_pull_plan_packets(packet_ids: list[str], skip = set(full) & set(index.unpacked()) fetch = set(full) - skip - return PullPlanPackets(requested=requested, - full=full, - skip=skip, - fetch=fetch) + return PullPlanPackets( + requested=requested, full=full, skip=skip, fetch=fetch + ) -def _find_all_dependencies(packet_ids: list[str], - metadata: dict[str, MetadataCore]) -> list[str]: +def _find_all_dependencies( + packet_ids: list[str], metadata: dict[str, MetadataCore] +) -> list[str]: ret = set(packet_ids) while len(packet_ids) > 0: - dependency_ids = set( + dependency_ids = { dependencies.packet for packet_id in packet_ids if packet_id in metadata.keys() - for dependencies in metadata.get(packet_id).depends) + for dependencies in metadata.get(packet_id).depends + } packet_ids = dependency_ids - ret ret = packet_ids | ret return sorted(ret) -def _location_build_pull_plan_location(packets: PullPlanPackets, - locations: list[str], - root: OutpackRoot) -> list[str]: +def _location_build_pull_plan_location( + packets: PullPlanPackets, locations: list[str], root: OutpackRoot +) -> list[str]: location_names = location_resolve_valid( locations, root, @@ -346,40 +370,47 @@ def _location_build_pull_plan_location(packets: PullPlanPackets, allow_no_locations=len(packets.fetch) == 0, ) - known_packets = [root.index.packets_in_location(location_name) - for location_name in location_names] + known_packets = [ + root.index.packets_in_location(location_name) + for location_name in location_names + ] missing = packets.fetch - set(itertools.chain(*known_packets)) if len(missing) > 0: extra = missing - set(packets.requested) if len(extra) > 0: - hint = (f"{len(extra)} missing " - f"{pl(extra, 'packet was', 'packets were')} " - f"requested as " - f"{pl(extra, 'dependency', 'dependencies')} " - f"of the {pl(extra, 'one')} you asked for: " - f"{format_list(extra)}") + hint = ( + f"{len(extra)} missing " + f"{pl(extra, 'packet was', 'packets were')} " + f"requested as " + f"{pl(extra, 'dependency', 'dependencies')} " + f"of the {pl(extra, 'one')} you asked for: " + f"{format_list(extra)}" + ) else: # In the case where the above is used, we probably have # up-to-date metadata, so we don't display this. hint = "Do you need to run 'outpack_location_pull_metadata()'?" 
- msg = (f"Failed to find {pl(missing, 'packet')} " - f"{format_list(missing)}\n" - f"Looked in {pl(location_names, 'location')} " - f"{format_list(location_names)}\n" - + hint) + msg = ( + f"Failed to find {pl(missing, 'packet')} " + f"{format_list(missing)}\n" + f"Looked in {pl(location_names, 'location')} " + f"{format_list(location_names)}\n" + hint + ) raise Exception(msg) return location_names -def _location_build_pull_plan_files(packet_ids: set[str], - locations: list[str], - root: OutpackRoot) -> list[PacketFile]: +def _location_build_pull_plan_files( + packet_ids: set[str], locations: list[str], root: OutpackRoot +) -> list[PacketFile]: metadata = root.index.all_metadata() - file_hashes = [file.hash - for packet_id in packet_ids - for file in metadata[packet_id].files] + file_hashes = [ + file.hash + for packet_id in packet_ids + for file in metadata[packet_id].files + ] n_files = len(file_hashes) if n_files == 0: @@ -405,9 +436,8 @@ def _location_build_pull_plan_files(packet_ids: set[str], def _location_build_packet_locations( - packets: set[str], - locations: list[str], - root: OutpackRoot) -> dict[str, PacketLocation]: + packets: set[str], locations: list[str], root: OutpackRoot +) -> dict[str, PacketLocation]: packets_fetch = {} for location in locations: packets_from_location = root.index.location(location) diff --git a/src/outpack/root.py b/src/outpack/root.py index cb78a13..d3e66ae 100644 --- a/src/outpack/root.py +++ b/src/outpack/root.py @@ -1,7 +1,7 @@ import os import shutil from pathlib import Path -from typing import Union, Optional +from typing import Optional, Union from outpack.config import read_config from outpack.filestore import FileStore diff --git a/src/outpack/util.py b/src/outpack/util.py index 2dd38f0..e319ad4 100644 --- a/src/outpack/util.py +++ b/src/outpack/util.py @@ -3,7 +3,7 @@ import runpy import time from contextlib import contextmanager -from itertools import tee, filterfalse +from itertools import filterfalse, tee from pathlib import Path @@ -115,13 +115,13 @@ def format_list(x): def pl(x, singular, plural=None): if plural is None: - plural = singular + 's' + plural = singular + "s" if isinstance(x, int): length = x else: length = len(x) - return "{}".format(singular if length == 1 else plural) + return f"{singular if length == 1 else plural}" def partition(pred, iterable): diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py index 653a19f..d13fbc2 100644 --- a/tests/helpers/__init__.py +++ b/tests/helpers/__init__.py @@ -5,7 +5,7 @@ from contextlib import contextmanager from pathlib import Path from tempfile import TemporaryDirectory -from typing import List +from typing import List, Optional from outpack.ids import outpack_id from outpack.init import outpack_init @@ -59,20 +59,20 @@ def create_random_packet_chain(root, length, base=None): packet_path = Path(src) / name / packet_id os.makedirs(packet_path) - packet = Packet( - root, packet_path, name, id=packet_id, locate=False - ) + packet = Packet(root, packet_path, name, id=packet_id, locate=False) if i == 0 and base is None: with open(packet_path / "data.txt", "w") as f: f.write("0") else: - lines = ["with open('input.txt', 'r') as f:", - " data = f.read()", - "with open('data.txt', 'w') as f:", - f" f.write(data + '{i}')"] + lines = [ + "with open('input.txt', 'r') as f:", + " data = f.read()", + "with open('data.txt', 'w') as f:", + f" f.write(data + '{i}')", + ] with open(packet_path / "orderly.py", "w") as f: - f.write('\n'.join(lines) + '\n') + f.write("\n".join(lines) + 
"\n") if i > 0: id_use = ids[chr(i - 1 + ord("a"))] @@ -112,27 +112,32 @@ def copy_examples(names, root): shutil.copytree(Path("tests/orderly/examples") / nm, path_src) -def create_metadata_depends(id: str, depends: List[str] = None): +def create_metadata_depends(id: str, depends: Optional[List[str]] = None): if depends is None: depends = [] - dependencies = [PacketDepends(dependency_id, "", []) - for dependency_id in depends] - return {id: MetadataCore( - outpack_schema_version(), - id, - "name_" + random_characters(4), - {}, - {}, - [], - dependencies, - None, - None - )} + dependencies = [ + PacketDepends(dependency_id, "", []) for dependency_id in depends + ] + return { + id: MetadataCore( + outpack_schema_version(), + id, + "name_" + random_characters(4), + {}, + {}, + [], + dependencies, + None, + None, + ) + } def random_characters(n): - return ''.join(random.choice(string.ascii_letters + string.digits) - for _ in range(n)) + return "".join( + random.choice(string.ascii_letters + string.digits) # noqa: S311 + for _ in range(n) + ) # Like Rs rep function, useful for setting up test values @@ -141,8 +146,10 @@ def rep(x, each): if isinstance(each, int): each = [each] * len(x) if len(x) != len(each): - raise Exception("Repeats must be int or same length as the thing you " - "want to repeat") + msg = ( + "Repeats must be int or same length as the thing you want to repeat" + ) + raise Exception(msg) for item, times in zip(x, each): ret.extend([item] * times) diff --git a/tests/test_location.py b/tests/test_location.py index 5937184..3da0dea 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -5,7 +5,9 @@ outpack_location_add, outpack_location_list, outpack_location_remove, - outpack_location_rename) + outpack_location_rename, +) + from .helpers import create_temporary_root diff --git a/tests/test_location_pull.py b/tests/test_location_pull.py index 780482f..8ada822 100644 --- a/tests/test_location_pull.py +++ b/tests/test_location_pull.py @@ -9,22 +9,29 @@ from outpack.hash import hash_file from outpack.ids import outpack_id -from outpack.location import (outpack_location_add, - location_resolve_valid, - outpack_location_list) -from outpack.location_pull import (_find_all_dependencies, - outpack_location_pull_metadata, - outpack_location_pull_packet, - _location_build_pull_plan, - PullPlanInfo) +from outpack.location import ( + location_resolve_valid, + outpack_location_add, + outpack_location_list, +) +from outpack.location_pull import ( + PullPlanInfo, + _find_all_dependencies, + _location_build_pull_plan, + outpack_location_pull_metadata, + outpack_location_pull_packet, +) from outpack.packet import Packet from outpack.util import read_string -from .helpers import (create_metadata_depends, - create_temporary_root, - create_random_packet, - create_temporary_roots, - create_random_packet_chain, - rep) + +from .helpers import ( + create_metadata_depends, + create_random_packet, + create_random_packet_chain, + create_temporary_root, + create_temporary_roots, + rep, +) def test_can_pull_metadata_from_a_file_base_location(tmp_path): @@ -259,23 +266,26 @@ def test_can_resolve_dependencies_where_there_are_none(): deps = _find_all_dependencies(["a"], metadata) assert deps == ["a"] - metadata = (create_metadata_depends("a") | - create_metadata_depends("b", ["a"])) + metadata = create_metadata_depends("a") | create_metadata_depends( + "b", ["a"] + ) deps = _find_all_dependencies(["a"], metadata) assert deps == ["a"] def test_can_find_dependencies(): - metadata = 
(create_metadata_depends("a") | - create_metadata_depends("b") | - create_metadata_depends("c") | - create_metadata_depends("d", ["a", "b"]) | - create_metadata_depends("e", ["b", "c"]) | - create_metadata_depends("f", ["a", "c"]) | - create_metadata_depends("g", ["a", "f", "c"]) | - create_metadata_depends("h", ["a", "b", "c"]) | - create_metadata_depends("i", ["f"]) | - create_metadata_depends("j", ["i", "e", "a"])) + metadata = ( + create_metadata_depends("a") + | create_metadata_depends("b") + | create_metadata_depends("c") + | create_metadata_depends("d", ["a", "b"]) + | create_metadata_depends("e", ["b", "c"]) + | create_metadata_depends("f", ["a", "c"]) + | create_metadata_depends("g", ["a", "f", "c"]) + | create_metadata_depends("h", ["a", "b", "c"]) + | create_metadata_depends("i", ["f"]) + | create_metadata_depends("j", ["i", "e", "a"]) + ) assert _find_all_dependencies(["a"], metadata) == ["a"] assert _find_all_dependencies(["b"], metadata) == ["b"] @@ -285,49 +295,61 @@ def test_can_find_dependencies(): assert _find_all_dependencies(["e"], metadata) == ["b", "c", "e"] assert _find_all_dependencies(["f"], metadata) == ["a", "c", "f"] - assert (_find_all_dependencies(["g"], metadata) == - ["a", "c", "f", "g"]) - assert (_find_all_dependencies(["h"], metadata) == - ["a", "b", "c", "h"]) - assert (_find_all_dependencies(["i"], metadata) == - ["a", "c", "f", "i"]) - assert (_find_all_dependencies(["j"], metadata) == - ["a", "b", "c", "e", "f", "i", "j"]) + assert _find_all_dependencies(["g"], metadata) == ["a", "c", "f", "g"] + assert _find_all_dependencies(["h"], metadata) == ["a", "b", "c", "h"] + assert _find_all_dependencies(["i"], metadata) == ["a", "c", "f", "i"] + assert _find_all_dependencies(["j"], metadata) == [ + "a", + "b", + "c", + "e", + "f", + "i", + "j", + ] def test_can_find_multiple_dependencies_at_once(): - metadata = (create_metadata_depends("a") | - create_metadata_depends("b") | - create_metadata_depends("c") | - create_metadata_depends("d", ["a", "b"]) | - create_metadata_depends("e", ["b", "c"]) | - create_metadata_depends("f", ["a", "c"]) | - create_metadata_depends("g", ["a", "f", "c"]) | - create_metadata_depends("h", ["a", "b", "c"]) | - create_metadata_depends("i", ["f"]) | - create_metadata_depends("j", ["i", "e", "a"])) + metadata = ( + create_metadata_depends("a") + | create_metadata_depends("b") + | create_metadata_depends("c") + | create_metadata_depends("d", ["a", "b"]) + | create_metadata_depends("e", ["b", "c"]) + | create_metadata_depends("f", ["a", "c"]) + | create_metadata_depends("g", ["a", "f", "c"]) + | create_metadata_depends("h", ["a", "b", "c"]) + | create_metadata_depends("i", ["f"]) + | create_metadata_depends("j", ["i", "e", "a"]) + ) assert _find_all_dependencies([], metadata) == [] - assert (_find_all_dependencies(["c", "b", "a"], metadata) == - ["a", "b", "c"]) - assert (_find_all_dependencies(["d", "e", "f"], metadata) == - ["a", "b", "c", "d", "e", "f"]) + assert _find_all_dependencies(["c", "b", "a"], metadata) == ["a", "b", "c"] + assert _find_all_dependencies(["d", "e", "f"], metadata) == [ + "a", + "b", + "c", + "d", + "e", + "f", + ] def test_can_pull_packet_from_location_into_another_file_store(tmp_path): root = create_temporary_roots(tmp_path, use_file_store=True) id = create_random_packet(root["src"]) - outpack_location_add("src", "path", - {"path": str(root["src"].path)}, - root=root["dst"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=root["dst"] + ) 
outpack_location_pull_metadata(root=root["dst"]) outpack_location_pull_packet(id, root=root["dst"]) index = root["dst"].index assert index.unpacked() == [id] - assert os.path.exists(root["dst"].path / "archive" / "data" / id / - "data.txt") + assert os.path.exists( + root["dst"].path / "archive" / "data" / id / "data.txt" + ) meta = index.metadata(id) assert all(root["dst"].files.exists(file.hash) for file in meta.files) @@ -337,16 +359,17 @@ def test_can_pull_packet_from_one_location_to_another_archive(tmp_path): root = create_temporary_roots(tmp_path, use_file_store=False) id = create_random_packet(root["src"]) - outpack_location_add("src", "path", - {"path": str(root["src"].path)}, - root=root["dst"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=root["dst"] + ) outpack_location_pull_metadata(root=root["dst"]) outpack_location_pull_packet(id, root=root["dst"]) index = root["dst"].index assert index.unpacked() == [id] - assert os.path.exists(root["dst"].path / "archive" / "data" / id / - "data.txt") + assert os.path.exists( + root["dst"].path / "archive" / "data" / id / "data.txt" + ) def test_detect_and_avoid_modified_files_in_source_repository(tmp_path): @@ -364,9 +387,9 @@ def test_detect_and_avoid_modified_files_in_source_repository(tmp_path): p.end() ids[i] = p.id - outpack_location_add("src", "path", - {"path": str(root["src"].path)}, - root["dst"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root["dst"] + ) outpack_location_pull_metadata(root=root["dst"]) ## When I corrupt the file in the first ID by truncating it: @@ -382,22 +405,24 @@ def test_detect_and_avoid_modified_files_in_source_repository(tmp_path): print(f.getvalue()) assert re.search( - r"Rejecting file from archive 'a\.txt' in 'data/", - f.getvalue()) + r"Rejecting file from archive 'a\.txt' in 'data/", f.getvalue() + ) - assert (hash_file(dest_data / ids[0] / "a.txt") == - hash_file(src_data / ids[1] / "a.txt")) - assert (hash_file(dest_data / ids[0] / "b.txt") == - hash_file(src_data / ids[1] / "b.txt")) + assert hash_file(dest_data / ids[0] / "a.txt") == hash_file( + src_data / ids[1] / "a.txt" + ) + assert hash_file(dest_data / ids[0] / "b.txt") == hash_file( + src_data / ids[1] / "b.txt" + ) def test_do_not_unpack_packet_twice(tmp_path): root = create_temporary_roots(tmp_path) id = create_random_packet(root["src"]) - outpack_location_add("src", "path", - {"path": str(root["src"].path)}, - root["dst"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root["dst"] + ) outpack_location_pull_metadata(root=root["dst"]) assert outpack_location_pull_packet(id, root=root["dst"]) == [id] @@ -407,9 +432,9 @@ def test_do_not_unpack_packet_twice(tmp_path): def test_sensible_error_if_packet_not_known(tmp_path): root = create_temporary_roots(tmp_path) id = create_random_packet(root["src"]) - outpack_location_add("src", "path", - {"path": str(root["src"].path)}, - root=root["dst"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=root["dst"] + ) with pytest.raises(Exception) as e: outpack_location_pull_packet(id, root=root["dst"]) @@ -419,21 +444,23 @@ def test_sensible_error_if_packet_not_known(tmp_path): def test_error_if_dependent_packet_not_known(tmp_path): - root = create_temporary_roots(tmp_path, ["a", "c"], - require_complete_tree=True) - root["b"] = create_temporary_root(tmp_path / "b", - require_complete_tree=False) + root = create_temporary_roots( + tmp_path, ["a", "c"], require_complete_tree=True + ) + 
root["b"] = create_temporary_root( + tmp_path / "b", require_complete_tree=False + ) ids = create_random_packet_chain(root["a"], 5) - outpack_location_add("a", "path", - {"path": str(root["a"].path)}, - root=root["b"]) + outpack_location_add( + "a", "path", {"path": str(root["a"].path)}, root=root["b"] + ) outpack_location_pull_metadata(root=root["b"]) outpack_location_pull_packet(ids["e"], root=root["b"]) - outpack_location_add("b", "path", - {"path": str(root["b"].path)}, - root=root["c"]) + outpack_location_add( + "b", "path", {"path": str(root["b"].path)}, root=root["c"] + ) outpack_location_pull_metadata(root=root["c"]) with pytest.raises(Exception) as e: @@ -441,27 +468,32 @@ def test_error_if_dependent_packet_not_known(tmp_path): print(e.value) assert e.match(f"Failed to find packet '{ids['d']}") assert e.match("Looked in location 'b'") - assert e.match("1 missing packet was requested as dependency of the " - f"one you asked for: '{ids['d']}'") + assert e.match( + "1 missing packet was requested as dependency of the " + f"one you asked for: '{ids['d']}'" + ) def test_can_pull_a_tree_recursively(tmp_path): root = create_temporary_roots(tmp_path) ids = create_random_packet_chain(root["src"], 3) - outpack_location_add("src", "path", - {"path": str(root["src"].path)}, - root["dst"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root["dst"] + ) outpack_location_pull_metadata(root=root["dst"]) - pulled_packets = outpack_location_pull_packet(ids["c"], recursive=True, - root=root["dst"]) + pulled_packets = outpack_location_pull_packet( + ids["c"], recursive=True, root=root["dst"] + ) assert set(pulled_packets) == set(itemgetter("a", "b", "c")(ids)) - assert (set(root["dst"].index.unpacked()) == - set(root["src"].index.unpacked())) + assert set(root["dst"].index.unpacked()) == set( + root["src"].index.unpacked() + ) - pulled_packets = outpack_location_pull_packet(ids["c"], recursive=True, - root=root["dst"]) + pulled_packets = outpack_location_pull_packet( + ids["c"], recursive=True, root=root["dst"] + ) assert pulled_packets == [] @@ -470,25 +502,25 @@ def test_can_filter_locations(tmp_path): root = create_temporary_roots(tmp_path, location_names, use_file_store=True) for name in location_names: if name != "dst": - outpack_location_add(name, "path", - {"path": str(root[name].path)}, - root=root["dst"]) + outpack_location_add( + name, "path", {"path": str(root[name].path)}, root=root["dst"] + ) ids_a = [create_random_packet(root["a"]) for _ in range(3)] - outpack_location_add("a", "path", - {"path": str(root["a"].path)}, - root=root["b"]) + outpack_location_add( + "a", "path", {"path": str(root["a"].path)}, root=root["b"] + ) outpack_location_pull_metadata(root=root["b"]) outpack_location_pull_packet(ids_a, root=root["b"]) ids_b = ids_a + [create_random_packet(root["b"]) for _ in range(3)] ids_c = [create_random_packet(root["c"]) for _ in range(3)] - outpack_location_add("a", "path", - {"path": str(root["a"].path)}, - root=root["d"]) - outpack_location_add("c", "path", - {"path": str(root["c"].path)}, - root=root["d"]) + outpack_location_add( + "a", "path", {"path": str(root["a"].path)}, root=root["d"] + ) + outpack_location_add( + "c", "path", {"path": str(root["c"].path)}, root=root["d"] + ) outpack_location_pull_metadata(root=root["d"]) outpack_location_pull_packet(ids_a, root=root["d"]) outpack_location_pull_packet(ids_c, root=root["d"]) @@ -499,25 +531,29 @@ def test_can_filter_locations(tmp_path): ids = list(set(ids_a + ids_b + ids_c + ids_d)) def 
locs(location_names): - return location_resolve_valid(location_names, root["dst"], - include_local=False, - include_orphan=False, - allow_no_locations=False) + return location_resolve_valid( + location_names, + root["dst"], + include_local=False, + include_orphan=False, + allow_no_locations=False, + ) - plan = _location_build_pull_plan(ids, None, None, - root=root["dst"]) + plan = _location_build_pull_plan(ids, None, None, root=root["dst"]) locations = [file.location for file in plan.files] assert locations == rep(["a", "b", "c", "d"], 3) # Invert order, prefer "d" - plan = _location_build_pull_plan(ids, locs(["d", "c", "b", "a"]), None, - root=root["dst"]) + plan = _location_build_pull_plan( + ids, locs(["d", "c", "b", "a"]), None, root=root["dst"] + ) locations = [file.location for file in plan.files] assert locations == rep(["d", "b"], [9, 3]) # Drop redundant locations - plan = _location_build_pull_plan(ids, locs(["b", "d"]), None, - root=root["dst"]) + plan = _location_build_pull_plan( + ids, locs(["b", "d"]), None, root=root["dst"] + ) locations = [file.location for file in plan.files] assert locations == rep(["b", "d"], 6) @@ -535,8 +571,10 @@ def locs(location_names): with pytest.raises(Exception) as e: _location_build_pull_plan(ids, ["a", "b", "c"], None, root=root["dst"]) assert "Looked in locations 'a', 'b', 'c'" in e.value.args[0] - assert ("Do you need to run 'outpack_location_pull_metadata()'?" - in e.value.args[0]) + assert ( + "Do you need to run 'outpack_location_pull_metadata()'?" + in e.value.args[0] + ) def test_nonrecursive_pulls_are_prevented_by_configuration(tmp_path): @@ -544,24 +582,28 @@ def test_nonrecursive_pulls_are_prevented_by_configuration(tmp_path): ids = create_random_packet_chain(root["src"], 3) with pytest.raises(Exception) as e: - outpack_location_pull_packet(ids["c"], recursive=False, - root=root["dst"]) + outpack_location_pull_packet( + ids["c"], recursive=False, root=root["dst"] + ) - assert e.match("'recursive' must be True \(or None\) with your " - "configuration") + assert ( + "'recursive' must be True (or None) with your configuration" + in e.value.args[0] + ) def test_if_recursive_pulls_are_required_pulls_are_default_recursive(tmp_path): - root = create_temporary_roots(tmp_path, ["src", "shallow"], - require_complete_tree=False) + root = create_temporary_roots( + tmp_path, ["src", "shallow"], require_complete_tree=False + ) root["deep"] = create_temporary_root(tmp_path, require_complete_tree=True) ids = create_random_packet_chain(root["src"], 3) for r in [root["shallow"], root["deep"]]: - outpack_location_add("src", "path", - {"path": str(root["src"].path)}, - root=r) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=r + ) outpack_location_pull_metadata(root=r) outpack_location_pull_packet(ids["c"], recursive=None, root=root["shallow"]) @@ -570,6 +612,7 @@ def test_if_recursive_pulls_are_required_pulls_are_default_recursive(tmp_path): outpack_location_pull_packet(ids["c"], recursive=None, root=root["deep"]) assert root["deep"].index.unpacked() == list(ids.values()) + ## TODO: Uncomment when wired up with searching # def test_can_pull_packets_as_result_of_query(tmp_path): # TODO @@ -591,9 +634,9 @@ def test_skip_files_in_file_store(tmp_path): root = create_temporary_roots(tmp_path, use_file_store=True) ids = create_random_packet_chain(root["src"], 3) - outpack_location_add("src", "path", - {"path": str(root["src"].path)}, - root=root["dst"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, 
root=root["dst"] + ) outpack_location_pull_metadata(root=root["dst"]) outpack_location_pull_packet(ids["a"], root=root["dst"]) @@ -604,4 +647,5 @@ def test_skip_files_in_file_store(tmp_path): text = f.getvalue() assert re.search("Found 1 file in the file store", text) assert re.search( - r"Need to fetch 3 files \([0-9]* Bytes\) from 1 location", text) + r"Need to fetch 3 files \([0-9]* Bytes\) from 1 location", text + ) diff --git a/tests/test_root.py b/tests/test_root.py index fba9a73..d1bbf51 100644 --- a/tests/test_root.py +++ b/tests/test_root.py @@ -8,6 +8,7 @@ from outpack.init import outpack_init from outpack.root import find_file_by_hash, root_open from outpack.util import transient_working_directory + from . import helpers diff --git a/tests/test_tools.py b/tests/test_tools.py index d03b3a4..87c6885 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -29,8 +29,9 @@ def test_git_report_no_info_without_git_repo(tmp_path): def test_git_report_git_info_if_possible(tmp_path): sha = simple_git_example(tmp_path) res = git_info(tmp_path) - assert (res == GitInfo(branch="master", sha=sha, url=[]) or - res == GitInfo(branch="main", sha=sha, url=[])) + assert res == GitInfo(branch="master", sha=sha, url=[]) or res == GitInfo( + branch="main", sha=sha, url=[] + ) def test_git_report_single_url(tmp_path): diff --git a/tests/test_util.py b/tests/test_util.py index 2f6467e..9d03794 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -7,15 +7,15 @@ assert_file_exists, expand_dirs, find_file_descend, + format_list, iso_time_str, match_value, num_to_time, + partition, + pl, read_string, run_script, time_to_num, - format_list, - pl, - partition, ) @@ -123,9 +123,8 @@ def test_can_inject_data_into_run(tmp_path): def test_can_format_list(): assert format_list(["one", "two"]) == "'one', 'two'" assert format_list(["one"]) == "'one'" - format_set = format_list({"one", "two"}) - assert format_set == "'one', 'two'" or format_set == "'two', 'one'" - assert format_list({"one", "one"}) == "'one'" + assert format_list({"one", "two"}) in ("'one', 'two'", "'two', 'one'") + assert format_list({"one", "one"}) == "'one'" # noqa:B033 def test_can_pluralise(): @@ -133,8 +132,7 @@ def test_can_pluralise(): assert pl(["one"], "item") == "item" assert pl(["one", "two"], "item") == "items" assert pl({"Inky"}, "octopus", "octopodes") == "octopus" - assert (pl({"Inky", "Tentacool"}, "octopus", "octopodes") == - "octopodes") + assert pl({"Inky", "Tentacool"}, "octopus", "octopodes") == "octopodes" assert pl(2, "item") == "items" assert pl(1, "item") == "item" From 00d88118545d455f1913c104119bc7b56212a6a9 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 5 Feb 2024 11:27:28 +0000 Subject: [PATCH 04/22] Add humanize dependency --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 89b949c..51a81e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,8 @@ dependencies = [ "importlib_resources", "jsonschema", "pygit2", - "outpack-query-parser" + "outpack-query-parser", + "humanize" ] [project.urls] From f6eef58835168469597b2735908be06c5fd09341 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 5 Feb 2024 12:21:52 +0000 Subject: [PATCH 05/22] Fix mypy issues --- src/outpack/location_pull.py | 54 +++++++++++++++++++++--------------- src/outpack/metadata.py | 19 ++++++++++--- 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py index aae88f4..aa3e700 100644 --- 
a/src/outpack/location_pull.py +++ b/src/outpack/location_pull.py @@ -1,16 +1,19 @@ -import dataclasses import itertools import os import time from dataclasses import dataclass -from typing import Callable, Optional, Union +from typing import Callable, Optional, Tuple, Union import humanize from outpack.filestore import FileStore from outpack.hash import hash_validate_string from outpack.location import _location_driver, location_resolve_valid -from outpack.metadata import MetadataCore, PacketFile, PacketLocation +from outpack.metadata import ( + MetadataCore, + PacketFileWithLocation, + PacketLocation, +) from outpack.packet import mark_known from outpack.root import OutpackRoot, find_file_by_hash, root_open from outpack.search_options import SearchOptions @@ -187,8 +190,8 @@ def outpack_location_pull_packet( # (e.g., users having edited files that we rely on, or editing them # after we hash them the first time). def _location_pull_files( - files: list[PacketFile], root: OutpackRoot -) -> (FileStore, Callable[[], None]): + files: list[PacketFileWithLocation], root: OutpackRoot +) -> Tuple[FileStore, Callable[[], None]]: store = root.files if store is not None: @@ -239,7 +242,10 @@ def cleanup(): def _location_pull_hash_store( - files: list[PacketFile], location_name: str, driver, store: FileStore + files: list[PacketFileWithLocation], + location_name: str, + driver, + store: FileStore, ): no_of_files = len(files) # TODO: show a nice progress bar for users @@ -279,7 +285,7 @@ class PullPlanInfo: @dataclass class LocationPullPlan: packets: dict[str, PacketLocation] - files: list[PacketFile] + files: list[PacketFileWithLocation] info: PullPlanInfo @@ -314,7 +320,7 @@ def _location_build_pull_plan( def _location_build_pull_plan_packets( - packet_ids: list[str], root: OutpackRoot, *, recursive: bool + packet_ids: list[str], root: OutpackRoot, *, recursive: Optional[bool] ) -> PullPlanPackets: requested = packet_ids if recursive is None: @@ -346,21 +352,26 @@ def _find_all_dependencies( packet_ids: list[str], metadata: dict[str, MetadataCore] ) -> list[str]: ret = set(packet_ids) - while len(packet_ids) > 0: + packets = set(packet_ids) + while len(packets) > 0: dependency_ids = { dependencies.packet - for packet_id in packet_ids + for packet_id in packets if packet_id in metadata.keys() - for dependencies in metadata.get(packet_id).depends + for dependencies in ( + [] + if metadata.get(packet_id) is None + else metadata[packet_id].depends + ) } - packet_ids = dependency_ids - ret - ret = packet_ids | ret + packets = dependency_ids.difference(ret) + ret = packets.union(ret) return sorted(ret) def _location_build_pull_plan_location( - packets: PullPlanPackets, locations: list[str], root: OutpackRoot + packets: PullPlanPackets, locations: Optional[list[str]], root: OutpackRoot ) -> list[str]: location_names = location_resolve_valid( locations, @@ -404,13 +415,13 @@ def _location_build_pull_plan_location( def _location_build_pull_plan_files( packet_ids: set[str], locations: list[str], root: OutpackRoot -) -> list[PacketFile]: +) -> list[PacketFileWithLocation]: metadata = root.index.all_metadata() - file_hashes = [ + file_hashes = { file.hash for packet_id in packet_ids for file in metadata[packet_id].files - ] + } n_files = len(file_hashes) if n_files == 0: @@ -420,17 +431,16 @@ def _location_build_pull_plan_files( # We've already checked earlier that the file is in at least 1 # location so we don't have to worry about that here all_files = [] - file_hashes = set() for location_name in locations: 
location_packets = set(root.index.packets_in_location(location_name)) packets_in_location = location_packets & packet_ids for packet_id in packets_in_location: for file in metadata[packet_id].files: - new_file = dataclasses.replace(file) - new_file.location = location_name + file_with_location = PacketFileWithLocation.from_packet_file( + file, location_name + ) if file.hash not in file_hashes: - file_hashes.add(file.hash) - all_files.append(new_file) + all_files.append(file_with_location) return all_files diff --git a/src/outpack/metadata.py b/src/outpack/metadata.py index 1dddc82..8411dbd 100644 --- a/src/outpack/metadata.py +++ b/src/outpack/metadata.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass, field +from dataclasses import dataclass from pathlib import Path from typing import Dict, List, Optional, Union @@ -14,7 +14,6 @@ class PacketFile: path: str size: float hash: str - location: Optional[str] = field(default=None) @staticmethod def from_file(directory, path, hash_algorithm): @@ -24,6 +23,18 @@ def from_file(directory, path, hash_algorithm): return PacketFile(path, s, h) +@dataclass +class PacketFileWithLocation: + path: str + size: float + hash: str + location: str + + @staticmethod + def from_packet_file(file: PacketFile, location: str): + return PacketFileWithLocation(file.path, file.size, file.hash, location) + + @dataclass_json() @dataclass class PacketDependsPath: @@ -77,9 +88,9 @@ class PacketLocation: def read_metadata_core(path) -> MetadataCore: with open(path) as f: - return MetadataCore.from_json(f.read().strip()) + return MetadataCore.from_json(f.read().strip()) # type: ignore def read_packet_location(path) -> PacketLocation: with open(path) as f: - return PacketLocation.from_json(f.read().strip()) + return PacketLocation.from_json(f.read().strip()) # type: ignore From 7c6913b9768cc03914e567f9f039508d9e697489 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 5 Feb 2024 14:09:11 +0000 Subject: [PATCH 06/22] Use types compatible with python 3.8 --- src/outpack/index.py | 19 +++++++------- src/outpack/location_pull.py | 48 +++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/outpack/index.py b/src/outpack/index.py index 7295158..cd4649f 100644 --- a/src/outpack/index.py +++ b/src/outpack/index.py @@ -1,5 +1,6 @@ import pathlib from dataclasses import dataclass +from typing import Dict, List from outpack.metadata import ( MetadataCore, @@ -11,9 +12,9 @@ @dataclass class IndexData: - metadata: dict[str, MetadataCore] - location: dict[str, dict[str, PacketLocation]] - unpacked: list[str] + metadata: Dict[str, MetadataCore] + location: Dict[str, Dict[str, PacketLocation]] + unpacked: List[str] @staticmethod def new(): @@ -33,7 +34,7 @@ def refresh(self): self.data = _index_update(self._path, self.data) return self - def all_metadata(self) -> dict[str, MetadataCore]: + def all_metadata(self) -> Dict[str, MetadataCore]: return self.refresh().data.metadata def metadata(self, id) -> MetadataCore: @@ -41,20 +42,20 @@ def metadata(self, id) -> MetadataCore: return self.data.metadata[id] return self.refresh().data.metadata[id] - def all_locations(self) -> dict[str, dict[str, PacketLocation]]: + def all_locations(self) -> Dict[str, Dict[str, PacketLocation]]: return self.refresh().data.location - def location(self, name) -> dict[str, PacketLocation]: + def location(self, name) -> Dict[str, PacketLocation]: return self.refresh().data.location[name] - def packets_in_location(self, name) -> list[str]: + def 
packets_in_location(self, name) -> List[str]: try: packets = list(self.location(name).keys()) except KeyError: packets = [] return packets - def unpacked(self) -> list[str]: + def unpacked(self) -> List[str]: return self.refresh().data.unpacked @@ -73,7 +74,7 @@ def _read_metadata(path_root, data): return data -def _read_locations(path_root, data) -> dict[str, dict[str, PacketLocation]]: +def _read_locations(path_root, data) -> Dict[str, Dict[str, PacketLocation]]: path = path_root / ".outpack" / "location" for loc in path.iterdir(): if loc.name not in data: diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py index aa3e700..96ce5c0 100644 --- a/src/outpack/location_pull.py +++ b/src/outpack/location_pull.py @@ -2,7 +2,7 @@ import os import time from dataclasses import dataclass -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Set, Tuple, Union import humanize @@ -74,7 +74,7 @@ def _get_remove_location_hint(location_name): ) -def _validate_hashes(driver, location_name, packets: list[PacketLocation]): +def _validate_hashes(driver, location_name, packets: List[PacketLocation]): mismatched_hashes = set() known_there = driver.list() for packet in packets: @@ -119,7 +119,7 @@ def _mark_all_known(driver, root, location_name): def outpack_location_pull_packet( - ids: Union[str, list[str]], + ids: Union[str, List[str]], options=None, recursive=None, root=None, @@ -190,7 +190,7 @@ def outpack_location_pull_packet( # (e.g., users having edited files that we rely on, or editing them # after we hash them the first time). def _location_pull_files( - files: list[PacketFileWithLocation], root: OutpackRoot + files: List[PacketFileWithLocation], root: OutpackRoot ) -> Tuple[FileStore, Callable[[], None]]: store = root.files if store is not None: @@ -242,7 +242,7 @@ def cleanup(): def _location_pull_hash_store( - files: list[PacketFileWithLocation], + files: List[PacketFileWithLocation], location_name: str, driver, store: FileStore, @@ -284,22 +284,22 @@ class PullPlanInfo: @dataclass class LocationPullPlan: - packets: dict[str, PacketLocation] - files: list[PacketFileWithLocation] + packets: Dict[str, PacketLocation] + files: List[PacketFileWithLocation] info: PullPlanInfo @dataclass class PullPlanPackets: - requested: list[str] - full: list[str] - skip: set[str] - fetch: set[str] + requested: List[str] + full: List[str] + skip: Set[str] + fetch: Set[str] def _location_build_pull_plan( - packet_ids: list[str], - locations: Optional[list[str]], + packet_ids: List[str], + locations: Optional[List[str]], recursive: Optional[bool], root: OutpackRoot, ) -> LocationPullPlan: @@ -320,7 +320,7 @@ def _location_build_pull_plan( def _location_build_pull_plan_packets( - packet_ids: list[str], root: OutpackRoot, *, recursive: Optional[bool] + packet_ids: List[str], root: OutpackRoot, *, recursive: Optional[bool] ) -> PullPlanPackets: requested = packet_ids if recursive is None: @@ -349,8 +349,8 @@ def _location_build_pull_plan_packets( def _find_all_dependencies( - packet_ids: list[str], metadata: dict[str, MetadataCore] -) -> list[str]: + packet_ids: List[str], metadata: Dict[str, MetadataCore] +) -> List[str]: ret = set(packet_ids) packets = set(packet_ids) while len(packets) > 0: @@ -371,8 +371,8 @@ def _find_all_dependencies( def _location_build_pull_plan_location( - packets: PullPlanPackets, locations: Optional[list[str]], root: OutpackRoot -) -> list[str]: + packets: PullPlanPackets, locations: Optional[List[str]], root: OutpackRoot +) 
-> List[str]: location_names = location_resolve_valid( locations, root, @@ -414,8 +414,8 @@ def _location_build_pull_plan_location( def _location_build_pull_plan_files( - packet_ids: set[str], locations: list[str], root: OutpackRoot -) -> list[PacketFileWithLocation]: + packet_ids: Set[str], locations: List[str], root: OutpackRoot +) -> List[PacketFileWithLocation]: metadata = root.index.all_metadata() file_hashes = { file.hash @@ -431,6 +431,7 @@ def _location_build_pull_plan_files( # We've already checked earlier that the file is in at least 1 # location so we don't have to worry about that here all_files = [] + seen_hashes = set() for location_name in locations: location_packets = set(root.index.packets_in_location(location_name)) packets_in_location = location_packets & packet_ids @@ -439,15 +440,16 @@ def _location_build_pull_plan_files( file_with_location = PacketFileWithLocation.from_packet_file( file, location_name ) - if file.hash not in file_hashes: + if file.hash not in seen_hashes: + seen_hashes.add(file_with_location.hash) all_files.append(file_with_location) return all_files def _location_build_packet_locations( - packets: set[str], locations: list[str], root: OutpackRoot -) -> dict[str, PacketLocation]: + packets: Set[str], locations: List[str], root: OutpackRoot +) -> Dict[str, PacketLocation]: packets_fetch = {} for location in locations: packets_from_location = root.index.location(location) From 2a62672f3e01ed010213b2c8c11b0fb947b65ba7 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 5 Feb 2024 15:22:03 +0000 Subject: [PATCH 07/22] Use dict concatenation compatible with python 3.8 --- tests/test_location_pull.py | 56 ++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/tests/test_location_pull.py b/tests/test_location_pull.py index 8ada822..3f013c2 100644 --- a/tests/test_location_pull.py +++ b/tests/test_location_pull.py @@ -23,7 +23,6 @@ ) from outpack.packet import Packet from outpack.util import read_string - from .helpers import ( create_metadata_depends, create_random_packet, @@ -266,26 +265,27 @@ def test_can_resolve_dependencies_where_there_are_none(): deps = _find_all_dependencies(["a"], metadata) assert deps == ["a"] - metadata = create_metadata_depends("a") | create_metadata_depends( - "b", ["a"] - ) + metadata = { + **create_metadata_depends("a"), + **create_metadata_depends("b", ["a"]), + } deps = _find_all_dependencies(["a"], metadata) assert deps == ["a"] def test_can_find_dependencies(): - metadata = ( - create_metadata_depends("a") - | create_metadata_depends("b") - | create_metadata_depends("c") - | create_metadata_depends("d", ["a", "b"]) - | create_metadata_depends("e", ["b", "c"]) - | create_metadata_depends("f", ["a", "c"]) - | create_metadata_depends("g", ["a", "f", "c"]) - | create_metadata_depends("h", ["a", "b", "c"]) - | create_metadata_depends("i", ["f"]) - | create_metadata_depends("j", ["i", "e", "a"]) - ) + metadata = { + **create_metadata_depends("a") + **create_metadata_depends("b") + **create_metadata_depends("c") + **create_metadata_depends("d", ["a", "b"]) + **create_metadata_depends("e", ["b", "c"]) + **create_metadata_depends("f", ["a", "c"]) + **create_metadata_depends("g", ["a", "f", "c"]) + **create_metadata_depends("h", ["a", "b", "c"]) + **create_metadata_depends("i", ["f"]) + **create_metadata_depends("j", ["i", "e", "a"]) + } assert _find_all_dependencies(["a"], metadata) == ["a"] assert _find_all_dependencies(["b"], metadata) == ["b"] @@ -310,18 +310,18 @@ def 
test_can_find_dependencies(): def test_can_find_multiple_dependencies_at_once(): - metadata = ( - create_metadata_depends("a") - | create_metadata_depends("b") - | create_metadata_depends("c") - | create_metadata_depends("d", ["a", "b"]) - | create_metadata_depends("e", ["b", "c"]) - | create_metadata_depends("f", ["a", "c"]) - | create_metadata_depends("g", ["a", "f", "c"]) - | create_metadata_depends("h", ["a", "b", "c"]) - | create_metadata_depends("i", ["f"]) - | create_metadata_depends("j", ["i", "e", "a"]) - ) + metadata = { + **create_metadata_depends("a") + **create_metadata_depends("b") + **create_metadata_depends("c") + **create_metadata_depends("d", ["a", "b"]) + **create_metadata_depends("e", ["b", "c"]) + **create_metadata_depends("f", ["a", "c"]) + **create_metadata_depends("g", ["a", "f", "c"]) + **create_metadata_depends("h", ["a", "b", "c"]) + **create_metadata_depends("i", ["f"]) + **create_metadata_depends("j", ["i", "e", "a"]) + } assert _find_all_dependencies([], metadata) == [] assert _find_all_dependencies(["c", "b", "a"], metadata) == ["a", "b", "c"] From 9f3793ac1aae1281eb4573380f05d1318837d701 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 5 Feb 2024 15:42:32 +0000 Subject: [PATCH 08/22] Fix issues removing files on windows --- src/outpack/filestore.py | 21 ++++++++++++++++++- tests/test_filestore.py | 2 +- tests/test_location_pull.py | 41 +++++++++++++++++++------------------ 3 files changed, 42 insertions(+), 22 deletions(-) diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index f6abadd..ee8b785 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -56,7 +56,26 @@ def ls(self): return ret def destroy(self) -> None: - shutil.rmtree(self._path) + def onerror(func, path, _exc_info): + """ + Error handler for ``shutil.rmtree``. + + If the error is due to an access error (read only file) + it attempts to add write permission and then retries. + + If the error is for another reason it re-raises the error. 
+ We manually remove write permission in ``put`` above so this + is expected + + Usage : ``shutil.rmtree(path, onerror=onerror)`` + """ + if not os.access(path, os.W_OK): + os.chmod(path, stat.S_IWUSR) + func(path) + else: + raise + + shutil.rmtree(self._path, onerror=onerror) def tmp(self) -> str: path = self._path / "tmp" diff --git a/tests/test_filestore.py b/tests/test_filestore.py index eb9de86..cd94ab2 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -72,7 +72,7 @@ def test_get_files_fails_if_overwrite_false(tmp_path): with pytest.raises(Exception) as e: store.get(h, dest, False) - assert e.match(f"Failed to copy '.+' to '{dest}', file already exists") + assert e.match(rf"Failed to copy '.+' to '{dest}', file already exists") def test_if_hash_not_found(tmp_path): diff --git a/tests/test_location_pull.py b/tests/test_location_pull.py index 3f013c2..f0b4a34 100644 --- a/tests/test_location_pull.py +++ b/tests/test_location_pull.py @@ -23,6 +23,7 @@ ) from outpack.packet import Packet from outpack.util import read_string + from .helpers import ( create_metadata_depends, create_random_packet, @@ -275,16 +276,16 @@ def test_can_resolve_dependencies_where_there_are_none(): def test_can_find_dependencies(): metadata = { - **create_metadata_depends("a") - **create_metadata_depends("b") - **create_metadata_depends("c") - **create_metadata_depends("d", ["a", "b"]) - **create_metadata_depends("e", ["b", "c"]) - **create_metadata_depends("f", ["a", "c"]) - **create_metadata_depends("g", ["a", "f", "c"]) - **create_metadata_depends("h", ["a", "b", "c"]) - **create_metadata_depends("i", ["f"]) - **create_metadata_depends("j", ["i", "e", "a"]) + **create_metadata_depends("a"), + **create_metadata_depends("b"), + **create_metadata_depends("c"), + **create_metadata_depends("d", ["a", "b"]), + **create_metadata_depends("e", ["b", "c"]), + **create_metadata_depends("f", ["a", "c"]), + **create_metadata_depends("g", ["a", "f", "c"]), + **create_metadata_depends("h", ["a", "b", "c"]), + **create_metadata_depends("i", ["f"]), + **create_metadata_depends("j", ["i", "e", "a"]), } assert _find_all_dependencies(["a"], metadata) == ["a"] @@ -311,16 +312,16 @@ def test_can_find_dependencies(): def test_can_find_multiple_dependencies_at_once(): metadata = { - **create_metadata_depends("a") - **create_metadata_depends("b") - **create_metadata_depends("c") - **create_metadata_depends("d", ["a", "b"]) - **create_metadata_depends("e", ["b", "c"]) - **create_metadata_depends("f", ["a", "c"]) - **create_metadata_depends("g", ["a", "f", "c"]) - **create_metadata_depends("h", ["a", "b", "c"]) - **create_metadata_depends("i", ["f"]) - **create_metadata_depends("j", ["i", "e", "a"]) + **create_metadata_depends("a"), + **create_metadata_depends("b"), + **create_metadata_depends("c"), + **create_metadata_depends("d", ["a", "b"]), + **create_metadata_depends("e", ["b", "c"]), + **create_metadata_depends("f", ["a", "c"]), + **create_metadata_depends("g", ["a", "f", "c"]), + **create_metadata_depends("h", ["a", "b", "c"]), + **create_metadata_depends("i", ["f"]), + **create_metadata_depends("j", ["i", "e", "a"]), } assert _find_all_dependencies([], metadata) == [] From 4d2e4854dece16d584b6d56d0c0ceb0cb8f8ace8 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 5 Feb 2024 15:46:13 +0000 Subject: [PATCH 09/22] Make regex more lenient to avoid issues on windows --- tests/test_filestore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_filestore.py b/tests/test_filestore.py index 
cd94ab2..26b9b17 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -72,7 +72,7 @@ def test_get_files_fails_if_overwrite_false(tmp_path): with pytest.raises(Exception) as e: store.get(h, dest, False) - assert e.match(rf"Failed to copy '.+' to '{dest}', file already exists") + assert e.match(r"Failed to copy '.+' to '.+', file already exists") def test_if_hash_not_found(tmp_path): From c38b16f35c4013de9989ad36edcd5ff126c5e01b Mon Sep 17 00:00:00 2001 From: Robert Ashton Date: Wed, 7 Feb 2024 10:41:43 +0000 Subject: [PATCH 10/22] Tidy up packet fetch logic and code review quality improvements --- src/outpack/location.py | 13 +++++---- src/outpack/location_path.py | 2 +- src/outpack/location_pull.py | 51 ++++++++++++++++++++---------------- tests/test_location_pull.py | 23 +++++++++++++++- 4 files changed, 60 insertions(+), 29 deletions(-) diff --git a/src/outpack/location.py b/src/outpack/location.py index 12a8828..365f95b 100644 --- a/src/outpack/location.py +++ b/src/outpack/location.py @@ -91,7 +91,7 @@ def location_resolve_valid( isinstance(item, str) for item in location ): unknown = set(location).difference(outpack_location_list(root)) - if len(unknown) > 0: + if unknown: unknown_text = "', '".join(unknown) msg = f"Unknown location: '{unknown_text}'" raise Exception(msg) @@ -130,6 +130,13 @@ def _location_exists(root, name): return name in outpack_location_list(root) +# TODO: Create a driver interface type +# atm we can't specify a type for driver return +# in this function. We want to return either an +# OutpackLocationPath driver or an http driver +# or other types down the line. We could set union type but +# would be nicer to use an interface-like pattern +# see mrc-5043 def _location_driver(location_name, root): location = root.config.location[location_name] if location.type == "path": @@ -140,7 +147,3 @@ def _location_driver(location_name, root): elif location.type == "custom": # pragma: no cover msg = "custom remote not yet supported" raise Exception(msg) - - -# TODO: Create a driver interface and have location path extend it? 
-# Because atm we can't specify a type for driver as a function arg diff --git a/src/outpack/location_path.py b/src/outpack/location_path.py index 48dfa7a..3ea285c 100644 --- a/src/outpack/location_path.py +++ b/src/outpack/location_path.py @@ -19,7 +19,7 @@ def metadata(self, packet_ids): all_ids = self.__root.index.location(LOCATION_LOCAL).keys() missing_ids = set(packet_ids).difference(all_ids) - if len(missing_ids) > 0: + if missing_ids: missing_msg = "', '".join(missing_ids) msg = f"Some packet ids not found: '{missing_msg}'" raise Exception(msg) diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py index 96ce5c0..da13122 100644 --- a/src/outpack/location_pull.py +++ b/src/outpack/location_pull.py @@ -84,7 +84,7 @@ def _validate_hashes(driver, location_name, packets: List[PacketLocation]): if hash_there != hash_here: mismatched_hashes.add(packet.packet) - if len(mismatched_hashes) > 0: + if mismatched_hashes: id_text = "', '".join(mismatched_hashes) msg = ( f"Location '{location_name}' has conflicting metadata\n" @@ -120,10 +120,10 @@ def _mark_all_known(driver, root, location_name): def outpack_location_pull_packet( ids: Union[str, List[str]], + *, options=None, recursive=None, root=None, - *, locate=True, ): root = root_open(root, locate=locate) @@ -200,7 +200,7 @@ def cleanup(): exists, missing = partition(lambda file: store.exists(file.hash), files) - if len(exists) > 0: + if exists: print( f"Found {len(exists)} {pl(exists, 'file')} in the " f"file store" @@ -212,24 +212,31 @@ def cleanup(): def cleanup(): store.destroy() + missing = [] + no_found = 0 for file in files: path = find_file_by_hash(root, file.hash) if path is not None: store.put(path, file.hash) + no_found += 1 + else: + missing.append(file) + + print(f"Found {no_found} {pl(no_found, 'file')} on disk ") - if len(files) == 0: + if len(missing) == 0: print("All files available locally, no need to fetch any") else: - locations = {file.location for file in files} - total_size = humanize.naturalsize(sum(file.size for file in files)) + locations = {file.location for file in missing} + total_size = humanize.naturalsize(sum(file.size for file in missing)) print( - f"Need to fetch {len(files)} {pl(files, 'file')} " + f"Need to fetch {len(missing)} {pl(missing, 'file')} " f"({total_size}) from {len(locations)} " f"{pl(locations, 'location')}" ) for location in locations: from_this_location = [ - file for file in files if file.location == location + file for file in missing if file.location == location ] _location_pull_hash_store( from_this_location, @@ -326,12 +333,12 @@ def _location_build_pull_plan_packets( if recursive is None: recursive = root.config.core.require_complete_tree if root.config.core.require_complete_tree and not recursive: - msg = ( - "'recursive' must be True (or None) with your " - "configuration\nBecause 'core.require_complete_tree' is " - "true, we can't do a non-recursive pull, as this might " - "leave an incomplete tree" - ) + msg = """ + 'recursive' must be True (or None) with your + configuration\nBecause 'core.require_complete_tree' is + true, we can't do a non-recursive pull, as this might + leave an incomplete tree + """ raise Exception(msg) index = root.index @@ -340,8 +347,8 @@ def _location_build_pull_plan_packets( else: full = packet_ids - skip = set(full) & set(index.unpacked()) - fetch = set(full) - skip + skip = set(full).intersection(index.unpacked()) + fetch = set(full).difference(skip) return PullPlanPackets( requested=requested, full=full, skip=skip, fetch=fetch @@ 
-353,7 +360,7 @@ def _find_all_dependencies( ) -> List[str]: ret = set(packet_ids) packets = set(packet_ids) - while len(packets) > 0: + while packets: dependency_ids = { dependencies.packet for packet_id in packets @@ -385,10 +392,10 @@ def _location_build_pull_plan_location( root.index.packets_in_location(location_name) for location_name in location_names ] - missing = packets.fetch - set(itertools.chain(*known_packets)) - if len(missing) > 0: - extra = missing - set(packets.requested) - if len(extra) > 0: + missing = packets.fetch.difference(itertools.chain(*known_packets)) + if missing: + extra = missing.difference(packets.requested) + if extra: hint = ( f"{len(extra)} missing " f"{pl(extra, 'packet was', 'packets were')} " @@ -434,7 +441,7 @@ def _location_build_pull_plan_files( seen_hashes = set() for location_name in locations: location_packets = set(root.index.packets_in_location(location_name)) - packets_in_location = location_packets & packet_ids + packets_in_location = location_packets.intersection(packet_ids) for packet_id in packets_in_location: for file in metadata[packet_id].files: file_with_location = PacketFileWithLocation.from_packet_file( diff --git a/tests/test_location_pull.py b/tests/test_location_pull.py index f0b4a34..77c8b4d 100644 --- a/tests/test_location_pull.py +++ b/tests/test_location_pull.py @@ -648,5 +648,26 @@ def test_skip_files_in_file_store(tmp_path): text = f.getvalue() assert re.search("Found 1 file in the file store", text) assert re.search( - r"Need to fetch 3 files \([0-9]* Bytes\) from 1 location", text + r"Need to fetch 2 files \([0-9]* Bytes\) from 1 location", text + ) + + +def test_skip_files_already_on_disk(tmp_path): + root = create_temporary_roots(tmp_path, use_file_store=False) + + ids = create_random_packet_chain(root["src"], 3) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=root["dst"] + ) + + outpack_location_pull_metadata(root=root["dst"]) + outpack_location_pull_packet(ids["a"], root=root["dst"]) + + f = io.StringIO() + with contextlib.redirect_stdout(f): + outpack_location_pull_packet(ids["b"], root=root["dst"]) + text = f.getvalue() + assert re.search("Found 1 file on disk", text) + assert re.search( + r"Need to fetch 2 files \([0-9]* Bytes\) from 1 location", text ) From 59675866ecd45b3c0c04ee61306959d89816e0cb Mon Sep 17 00:00:00 2001 From: Robert Ashton Date: Wed, 7 Feb 2024 11:24:25 +0000 Subject: [PATCH 11/22] Tidy up creating chain of packets helper function --- src/outpack/filestore.py | 2 +- src/outpack/location_pull.py | 9 +++---- src/outpack/root.py | 2 +- tests/helpers/__init__.py | 49 +++++++++++------------------------- tests/test_filestore.py | 8 +++--- tests/test_location_pull.py | 6 ++--- 6 files changed, 25 insertions(+), 51 deletions(-) diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index ee8b785..2602b0f 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -17,7 +17,7 @@ def filename(self, hash): dat = hash_parse(hash) return self._path / dat.algorithm / dat.value[:2] / dat.value[2:] - def get(self, hash, dst, overwrite): + def get(self, hash, dst, *, overwrite=False): src = self.filename(hash) if not os.path.exists(src): msg = f"Hash '{hash}' not found in store" diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py index da13122..a0739ad 100644 --- a/src/outpack/location_pull.py +++ b/src/outpack/location_pull.py @@ -333,12 +333,9 @@ def _location_build_pull_plan_packets( if recursive is None: recursive = 
root.config.core.require_complete_tree if root.config.core.require_complete_tree and not recursive: - msg = """ - 'recursive' must be True (or None) with your - configuration\nBecause 'core.require_complete_tree' is - true, we can't do a non-recursive pull, as this might - leave an incomplete tree - """ + msg = """'recursive' must be True (or None) with your configuration +Because 'core.require_complete_tree' is true, we can't do a \ +non-recursive pull, as this might leave an incomplete tree""" raise Exception(msg) index = root.index diff --git a/src/outpack/root.py b/src/outpack/root.py index d3e66ae..cdb10b6 100644 --- a/src/outpack/root.py +++ b/src/outpack/root.py @@ -26,7 +26,7 @@ def export_file(self, id, there, here, dest): dest = Path(dest) here_full = dest / here if self.config.core.use_file_store: - self.files.get(hash, here_full, False) + self.files.get(hash, here_full, overwrite=False) else: # consider starting from the case most likely to contain # this hash, since we already know that it's 'id' unless diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py index d13fbc2..3146d2a 100644 --- a/tests/helpers/__init__.py +++ b/tests/helpers/__init__.py @@ -29,7 +29,8 @@ def create_packet(root, name, *, packet_id=None, parameters=None): p = Packet(root, src, name, id=packet_id, parameters=parameters) try: yield p - except BaseException: + except BaseException as e: + print("Error in packet creation: ", e) p.end(insert=False) else: p.end(insert=True) @@ -51,40 +52,18 @@ def create_random_packet(root, name="data", *, parameters=None, packet_id=None): ## Create a chain of packets a, b, c, ... that depend on each other def create_random_packet_chain(root, length, base=None): ids = {} - with TemporaryDirectory() as src: - for i in range(length): - name = chr(i + ord("a")) - packet_id = outpack_id() - ids[name] = packet_id - packet_path = Path(src) / name / packet_id - os.makedirs(packet_path) - - packet = Packet(root, packet_path, name, id=packet_id, locate=False) - - if i == 0 and base is None: - with open(packet_path / "data.txt", "w") as f: - f.write("0") - else: - lines = [ - "with open('input.txt', 'r') as f:", - " data = f.read()", - "with open('data.txt', 'w') as f:", - f" f.write(data + '{i}')", - ] - with open(packet_path / "orderly.py", "w") as f: - f.write("\n".join(lines) + "\n") - - if i > 0: - id_use = ids[chr(i - 1 + ord("a"))] - else: - id_use = base - - if id_use is not None: - packet.use_dependency(id_use, {"input.txt": "data.txt"}) - - run_script(packet_path, "orderly.py", None) - - packet.end(insert=True) + for i in range(length): + name = chr(i + ord("a")) + with create_packet(root, name) as p: + if base is not None: + p.use_dependency(base, {"input.txt": "data.txt"}) + + d = [f"{random.random()}\n" for _ in range(10)] # noqa: S311 + with open(p.path / "data.txt", "w") as f: + f.writelines(d) + + ids[name] = p.id + base = p.id return ids diff --git a/tests/test_filestore.py b/tests/test_filestore.py index 26b9b17..af2d9c3 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -34,7 +34,7 @@ def test_can_store_files(tmp_path): assert s.ls() == [h] dest = tmp_path / "dest" - s.get(h, dest, False) + s.get(h, dest, overwrite=False) assert dest.exists() assert hash_file(dest, "md5") == h @@ -62,16 +62,16 @@ def test_get_files_fails_if_overwrite_false(tmp_path): dest = tmp_path / "dest" assert not dest.exists() - store.get(h, dest, False) + store.get(h, dest, overwrite=False) assert dest.exists() assert hash_file(dest, "md5") == h - 
store.get(h, dest, True) + store.get(h, dest, overwrite=True) assert dest.exists() assert hash_file(dest, "md5") == h with pytest.raises(Exception) as e: - store.get(h, dest, False) + store.get(h, dest, overwrite=False) assert e.match(r"Failed to copy '.+' to '.+', file already exists") diff --git a/tests/test_location_pull.py b/tests/test_location_pull.py index 77c8b4d..5639254 100644 --- a/tests/test_location_pull.py +++ b/tests/test_location_pull.py @@ -404,7 +404,6 @@ def test_detect_and_avoid_modified_files_in_source_repository(tmp_path): with contextlib.redirect_stdout(f): outpack_location_pull_packet(ids[0], root=root["dst"]) - print(f.getvalue()) assert re.search( r"Rejecting file from archive 'a\.txt' in 'data/", f.getvalue() ) @@ -466,7 +465,6 @@ def test_error_if_dependent_packet_not_known(tmp_path): with pytest.raises(Exception) as e: outpack_location_pull_packet(ids["e"], root=root["c"]) - print(e.value) assert e.match(f"Failed to find packet '{ids['d']}") assert e.match("Looked in location 'b'") assert e.match( @@ -648,7 +646,7 @@ def test_skip_files_in_file_store(tmp_path): text = f.getvalue() assert re.search("Found 1 file in the file store", text) assert re.search( - r"Need to fetch 2 files \([0-9]* Bytes\) from 1 location", text + r"Need to fetch 1 file \([0-9]* Bytes\) from 1 location", text ) @@ -669,5 +667,5 @@ def test_skip_files_already_on_disk(tmp_path): text = f.getvalue() assert re.search("Found 1 file on disk", text) assert re.search( - r"Need to fetch 2 files \([0-9]* Bytes\) from 1 location", text + r"Need to fetch 1 file \([0-9]* Bytes\) from 1 location", text ) From 217babe550806868dd4ff19670ebfd1577d3a13f Mon Sep 17 00:00:00 2001 From: Robert Ashton Date: Wed, 7 Feb 2024 12:29:26 +0000 Subject: [PATCH 12/22] Add better cleanup of resources using contextmanagers --- src/outpack/filestore.py | 5 ++-- src/outpack/location_pull.py | 52 +++++++++++++++++------------------- tests/helpers/__init__.py | 3 --- tests/test_filestore.py | 10 +++---- 4 files changed, 33 insertions(+), 37 deletions(-) diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index 2602b0f..afb946d 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -4,6 +4,7 @@ import stat import tempfile from pathlib import Path +from tempfile import _TemporaryFileWrapper from outpack.hash import Hash, hash_parse, hash_validate_file @@ -77,7 +78,7 @@ def onerror(func, path, _exc_info): shutil.rmtree(self._path, onerror=onerror) - def tmp(self) -> str: + def tmp(self) -> _TemporaryFileWrapper: path = self._path / "tmp" path.mkdir(exist_ok=True) - return tempfile.NamedTemporaryFile(dir=path).name + return tempfile.NamedTemporaryFile(dir=path) diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py index a0739ad..9c80ec2 100644 --- a/src/outpack/location_pull.py +++ b/src/outpack/location_pull.py @@ -1,8 +1,9 @@ import itertools import os import time +from contextlib import contextmanager from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Dict, Generator, List, Optional, Set, Union import humanize @@ -146,29 +147,27 @@ def outpack_location_pull_packet( f"unpacked" ) - store, cleanup = _location_pull_files(plan.files, root) + with _location_pull_files(plan.files, root) as store: + use_archive = root.config.core.path_archive is not None + n_packets = len(plan.packets) + time_start = time.time() + for idx, packet in enumerate(plan.packets.values()): + if use_archive: + print( + f"Writing files 
for '{packet.packet}' (packet {idx + 1}/" + f"{n_packets})" + ) + _location_pull_files_archive(packet.packet, store, root) - use_archive = root.config.core.path_archive is not None - n_packets = len(plan.packets) - time_start = time.time() - for idx, packet in enumerate(plan.packets.values()): - if use_archive: - print( - f"Writing files for '{packet.packet}' (packet {idx + 1}/" - f"{n_packets})" + mark_known( + root, packet.packet, LOCATION_LOCAL, packet.hash, time.time() ) - _location_pull_files_archive(packet.packet, store, root) - - mark_known( - root, packet.packet, LOCATION_LOCAL, packet.hash, time.time() - ) print( f"Unpacked {n_packets} {pl(n_packets, 'packet')} in " f"{humanize.time.precisedelta(int(time.time() - time_start))}." ) - cleanup() return list(plan.packets.keys()) @@ -189,15 +188,13 @@ def outpack_location_pull_packet( # and copes well with data races and corruption of data on disk # (e.g., users having edited files that we rely on, or editing them # after we hash them the first time). +@contextmanager def _location_pull_files( files: List[PacketFileWithLocation], root: OutpackRoot -) -> Tuple[FileStore, Callable[[], None]]: +) -> Generator[FileStore, None, None]: store = root.files + cleanup_store = False if store is not None: - - def cleanup(): - return None - exists, missing = partition(lambda file: store.exists(file.hash), files) if exists: @@ -208,9 +205,7 @@ def cleanup(): else: print("Looking for suitable files already on disk") store = _temporary_filestore(root) - - def cleanup(): - store.destroy() + cleanup_store = True missing = [] no_found = 0 @@ -245,7 +240,9 @@ def cleanup(): store, ) - return store, cleanup + yield store + if cleanup_store: + store.destroy() def _location_pull_hash_store( @@ -261,8 +258,9 @@ def _location_pull_hash_store( f"Fetching file {idx + 1}/{no_of_files} " f"({humanize.naturalsize(file.size)}) from '{location_name}'" ) - tmp = driver.fetch_file(file.hash, store.tmp()) - store.put(tmp, file.hash) + with store.tmp() as temp_file: + tmp = driver.fetch_file(file.hash, temp_file.name) + store.put(tmp, file.hash) def _location_pull_files_archive(packet_id: str, store, root: OutpackRoot): diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py index 3146d2a..8d0e0a4 100644 --- a/tests/helpers/__init__.py +++ b/tests/helpers/__init__.py @@ -1,4 +1,3 @@ -import os import random import shutil import string @@ -7,13 +6,11 @@ from tempfile import TemporaryDirectory from typing import List, Optional -from outpack.ids import outpack_id from outpack.init import outpack_init from outpack.metadata import MetadataCore, PacketDepends from outpack.packet import Packet from outpack.root import root_open from outpack.schema import outpack_schema_version -from outpack.util import run_script @contextmanager diff --git a/tests/test_filestore.py b/tests/test_filestore.py index af2d9c3..0860495 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -81,14 +81,14 @@ def test_if_hash_not_found(tmp_path): h = Hash("md5", "7c4d97e580abb6c2ffb8b1872907d84b") dest = tmp_path / "dest" with pytest.raises(Exception) as e: - s.get(h, dest, False) + s.get(h, dest, overwrite=False) assert e.match("Hash 'md5:7c4d97e.+' not found in store") def test_can_create_filename_within_the_store(tmp_path): path = str(tmp_path / "store") store = FileStore(path) - temp_file = store.tmp() - assert os.path.dirname(temp_file) == str(store._path / "tmp") - assert not os.path.exists(temp_file) - assert os.path.exists(os.path.dirname(temp_file)) + with store.tmp() 
as temp_file: + assert os.path.dirname(temp_file.name) == str(store._path / "tmp") + assert os.path.exists(temp_file.name) + assert os.path.exists(os.path.dirname(temp_file.name)) From 6a2ad5e1a408ea800ae7d22245192332e6a99727 Mon Sep 17 00:00:00 2001 From: Robert Ashton Date: Wed, 7 Feb 2024 17:38:27 +0000 Subject: [PATCH 13/22] Set permissions on temporary file --- src/outpack/filestore.py | 2 +- src/outpack/location_path.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index afb946d..f98c5eb 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -81,4 +81,4 @@ def onerror(func, path, _exc_info): def tmp(self) -> _TemporaryFileWrapper: path = self._path / "tmp" path.mkdir(exist_ok=True) - return tempfile.NamedTemporaryFile(dir=path) + return tempfile.NamedTemporaryFile(dir=path, mode="wb") diff --git a/src/outpack/location_path.py b/src/outpack/location_path.py index 3ea285c..288caae 100644 --- a/src/outpack/location_path.py +++ b/src/outpack/location_path.py @@ -32,7 +32,6 @@ def metadata(self, packet_ids): def fetch_file(self, hash, dest): if self.__root.config.core.use_file_store: path = self.__root.files.filename(hash) - print(path) if not os.path.exists(path): msg = f"Hash '{hash}' not found at location" raise Exception(msg) From a6411f4ef8e8f3790dca633614a10d9da8f7e54c Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Thu, 8 Feb 2024 13:44:50 +0000 Subject: [PATCH 14/22] Use custom contextmanager and manually manage file deletion This is so we can control when cleanup on windows --- src/outpack/filestore.py | 15 ++++++++++++--- src/outpack/location_pull.py | 4 ++-- tests/test_filestore.py | 6 +++--- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index f98c5eb..e0a5086 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -3,8 +3,8 @@ import shutil import stat import tempfile +from contextlib import contextmanager from pathlib import Path -from tempfile import _TemporaryFileWrapper from outpack.hash import Hash, hash_parse, hash_validate_file @@ -78,7 +78,16 @@ def onerror(func, path, _exc_info): shutil.rmtree(self._path, onerror=onerror) - def tmp(self) -> _TemporaryFileWrapper: + @contextmanager + def tmp(self): + # On a newer version of tempfile we could use `delete_on_close = True` path = self._path / "tmp" path.mkdir(exist_ok=True) - return tempfile.NamedTemporaryFile(dir=path, mode="wb") + f = tempfile.NamedTemporaryFile(dir=path, delete=False) + try: + yield f.name + finally: + try: + os.unlink(f.name) + except OSError: + pass diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py index 9c80ec2..6d2741c 100644 --- a/src/outpack/location_pull.py +++ b/src/outpack/location_pull.py @@ -258,8 +258,8 @@ def _location_pull_hash_store( f"Fetching file {idx + 1}/{no_of_files} " f"({humanize.naturalsize(file.size)}) from '{location_name}'" ) - with store.tmp() as temp_file: - tmp = driver.fetch_file(file.hash, temp_file.name) + with store.tmp() as path: + tmp = driver.fetch_file(file.hash, path) store.put(tmp, file.hash) diff --git a/tests/test_filestore.py b/tests/test_filestore.py index 0860495..fe7c08a 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -89,6 +89,6 @@ def test_can_create_filename_within_the_store(tmp_path): path = str(tmp_path / "store") store = FileStore(path) with store.tmp() as temp_file: - assert os.path.dirname(temp_file.name) == str(store._path / "tmp") 
- assert os.path.exists(temp_file.name) - assert os.path.exists(os.path.dirname(temp_file.name)) + assert os.path.dirname(temp_file) == str(store._path / "tmp") + assert os.path.exists(temp_file) + assert os.path.exists(os.path.dirname(temp_file)) From 19be89404e2b25fb14bd86a76f52fd104aa4ecc9 Mon Sep 17 00:00:00 2001 From: Rob <39248272+r-ash@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:36:39 +0000 Subject: [PATCH 15/22] Handle error inside context manager MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Paul LiƩtar --- src/outpack/location_pull.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py index 6d2741c..36d6e49 100644 --- a/src/outpack/location_pull.py +++ b/src/outpack/location_pull.py @@ -240,9 +240,11 @@ def _location_pull_files( store, ) - yield store - if cleanup_store: - store.destroy() + try: + yield store + finally: + if cleanup_store: + store.destroy() def _location_pull_hash_store( From 18ed99d456441fa3a50e19f9fd48b83f5e7bf913 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Fri, 23 Feb 2024 11:17:49 +0000 Subject: [PATCH 16/22] Improve note and link to discussion --- src/outpack/filestore.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index e0a5086..4cbfbb8 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -80,7 +80,9 @@ def onerror(func, path, _exc_info): @contextmanager def tmp(self): - # On a newer version of tempfile we could use `delete_on_close = True` + # On a newer version of tempfile we could use `delete_on_close = False` + # see + # https://github.com/mrc-ide/outpack-py/pull/33#discussion_r1500522877 path = self._path / "tmp" path.mkdir(exist_ok=True) f = tempfile.NamedTemporaryFile(dir=path, delete=False) From 5a47a33e72113d63abc585f89315220d03e99b65 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Fri, 23 Feb 2024 12:37:42 +0000 Subject: [PATCH 17/22] Fix test coverage holes and issue with permissions being set on linux --- pyproject.toml | 1 + src/outpack/filestore.py | 3 ++- tests/test_filestore.py | 51 ++++++++++++++++++++++++++++++++-------- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 51a81e6..0e323ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ path = "src/outpack/__about__.py" dependencies = [ "coverage[toml]>=6.5", "pytest", + "pytest_mock" ] [tool.hatch.envs.default.scripts] test = "pytest {args:tests}" diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index 4cbfbb8..199e037 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -41,7 +41,8 @@ def put(self, src, hash, *, move=False): shutil.move(src, dst) else: shutil.copyfile(src, dst) - os.chmod(dst, stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH) + # Make file readonly for everyone + dst.chmod(0o444) return hash def ls(self): diff --git a/tests/test_filestore.py b/tests/test_filestore.py index fe7c08a..b7c259f 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -26,25 +26,56 @@ def test_can_store_files(tmp_path): f.write(randstr(10)) s = FileStore(str(tmp_path / "store")) - p = tmp_path / "tmp" / "a" - h = hash_file(p, "md5") - assert not s.exists(h) - assert s.put(p, h) == h - assert s.exists(h) - assert s.ls() == [h] + path_a = tmp_path / "tmp" / "a" + hash_a = hash_file(path_a, "md5") + assert not s.exists(hash_a) + assert 
s.put(path_a, hash_a, move=True) == hash_a + assert s.exists(hash_a) + assert not path_a.exists() + assert s.ls() == [hash_a] + + path_b = tmp_path / "tmp" / "b" + hash_b = hash_file(path_b, "md5") + assert not s.exists(hash_b) + assert s.put(path_b, hash_b) == hash_b + assert s.exists(hash_b) + assert path_b.exists() + assert all(h in s.ls() for h in [hash_a, hash_b]) dest = tmp_path / "dest" - s.get(h, dest, overwrite=False) + s.get(hash_a, dest, overwrite=False) assert dest.exists() - assert hash_file(dest, "md5") == h + assert hash_file(dest, "md5") == hash_a - for i in range(10): - p = tmp_path / "tmp" / letters[i] + for i in range(9): + p = tmp_path / "tmp" / letters[i + 1] s.put(p, hash_file(p, "md5")) assert len(s.ls()) == 10 +def test_destroy_store_raises_error(tmp_path, mocker): + store_path = tmp_path / "store" + + with mocker.patch("os.chmod", side_effect=Exception("unexpected err")): + store = FileStore(str(store_path)) + assert store_path.exists() + file_path = tmp_path / "a" + with open(file_path, "w") as f: + f.write(randstr(10)) + file_hash = hash_file(file_path, "md5") + assert store.put(file_path, file_hash, move=False) == file_hash + assert store.ls() == [file_hash] + # Force the directory to be readonly, otherwise on linux + # this is removing the directory and it's contents without + # dropping into the onerror function we are trying to test here + store_path.chmod(0o444) + + # Error raised from anything other than file permission issue + with pytest.raises(Exception, matches="unexpected err") as e: + store.destroy() + + def test_get_files_fails_if_overwrite_false(tmp_path): tmp = tmp_path / "tmp" tmp.mkdir() From e9d89444a1c2ce8dc3e4c8dd3367dc8cd9930931 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Fri, 23 Feb 2024 14:14:14 +0000 Subject: [PATCH 18/22] Try only run destroy test on Windows --- src/outpack/filestore.py | 6 +++++- tests/test_filestore.py | 12 +++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index 199e037..1bdcb3d 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -67,7 +67,11 @@ def onerror(func, path, _exc_info): If the error is for another reason it re-raises the error. We manually remove write permission in ``put`` above so this - is expected + is expected. 
+ + Note we only need this on windows, on Linux shutils.rmtree will + successfully remove the dir and its contents without having + to add write permission to individual files Usage : ``shutil.rmtree(path, onerror=onerror)`` """ diff --git a/tests/test_filestore.py b/tests/test_filestore.py index b7c259f..f7a92bd 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -1,4 +1,5 @@ import os +import platform import random import pytest @@ -54,7 +55,11 @@ def test_can_store_files(tmp_path): assert len(s.ls()) == 10 +@pytest.mark.skipif(platform.system() != "Windows", + reason="destroy onerror handler only invoked on Windows \ +so only run this test on Windows") def test_destroy_store_raises_error(tmp_path, mocker): + store_path = tmp_path / "store" with mocker.patch("os.chmod", side_effect=Exception("unexpected err")): @@ -66,16 +71,13 @@ def test_destroy_store_raises_error(tmp_path, mocker): file_hash = hash_file(file_path, "md5") assert store.put(file_path, file_hash, move=False) == file_hash assert store.ls() == [file_hash] - # Force the directory to be readonly, otherwise on linux - # this is removing the directory and it's contents without - # dropping into the onerror function we are trying to test here - store_path.chmod(0o444) # Error raised from anything other than file permission issue - with pytest.raises(Exception, matches="unexpected err") as e: + with pytest.raises(Exception, match="unexpected err"): store.destroy() + def test_get_files_fails_if_overwrite_false(tmp_path): tmp = tmp_path / "tmp" tmp.mkdir() From e5508207fa069718fa85ee4924f28da5335346e3 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Fri, 23 Feb 2024 14:49:33 +0000 Subject: [PATCH 19/22] Fix lint issue --- tests/test_filestore.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_filestore.py b/tests/test_filestore.py index f7a92bd..6dab9d7 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -55,14 +55,15 @@ def test_can_store_files(tmp_path): assert len(s.ls()) == 10 -@pytest.mark.skipif(platform.system() != "Windows", - reason="destroy onerror handler only invoked on Windows \ -so only run this test on Windows") +@pytest.mark.skipif( + platform.system() != "Windows", + reason="destroy onerror handler only invoked on Windows \ +so only run this test on Windows", +) def test_destroy_store_raises_error(tmp_path, mocker): - store_path = tmp_path / "store" - with mocker.patch("os.chmod", side_effect=Exception("unexpected err")): + with mocker.patch("os.chmod", side_effect=[1, Exception("unexpected err")]): store = FileStore(str(store_path)) assert store_path.exists() file_path = tmp_path / "a" @@ -77,7 +78,6 @@ def test_destroy_store_raises_error(tmp_path, mocker): store.destroy() - def test_get_files_fails_if_overwrite_false(tmp_path): tmp = tmp_path / "tmp" tmp.mkdir() From a8a88ca2b437eb41759fbfa1e88d432181055bca Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Fri, 23 Feb 2024 15:25:45 +0000 Subject: [PATCH 20/22] Create patch as call, not context manager to avoid warning --- tests/test_filestore.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/test_filestore.py b/tests/test_filestore.py index 6dab9d7..934a6ab 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -63,19 +63,20 @@ def test_can_store_files(tmp_path): def test_destroy_store_raises_error(tmp_path, mocker): store_path = tmp_path / "store" - with mocker.patch("os.chmod", side_effect=[1, Exception("unexpected 
err")]): - store = FileStore(str(store_path)) - assert store_path.exists() - file_path = tmp_path / "a" - with open(file_path, "w") as f: - f.write(randstr(10)) - file_hash = hash_file(file_path, "md5") - assert store.put(file_path, file_hash, move=False) == file_hash - assert store.ls() == [file_hash] - - # Error raised from anything other than file permission issue - with pytest.raises(Exception, match="unexpected err"): - store.destroy() + mocker.patch("os.chmod", side_effect=[1, Exception("unexpected err")]) + + store = FileStore(str(store_path)) + assert store_path.exists() + file_path = tmp_path / "a" + with open(file_path, "w") as f: + f.write(randstr(10)) + file_hash = hash_file(file_path, "md5") + assert store.put(file_path, file_hash, move=False) == file_hash + assert store.ls() == [file_hash] + + # Error raised from anything other than file permission issue + with pytest.raises(Exception, match="unexpected err"): + store.destroy() def test_get_files_fails_if_overwrite_false(tmp_path): From a8c42c2b92d1cea19fe94a23c4d5e83359a52572 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 26 Feb 2024 11:54:02 +0000 Subject: [PATCH 21/22] Mock method which checks access instead of forcing throw --- tests/test_filestore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_filestore.py b/tests/test_filestore.py index 934a6ab..af12abe 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -63,7 +63,7 @@ def test_can_store_files(tmp_path): def test_destroy_store_raises_error(tmp_path, mocker): store_path = tmp_path / "store" - mocker.patch("os.chmod", side_effect=[1, Exception("unexpected err")]) + mocker.patch("os.access", return_value=True) store = FileStore(str(store_path)) assert store_path.exists() @@ -75,7 +75,7 @@ def test_destroy_store_raises_error(tmp_path, mocker): assert store.ls() == [file_hash] # Error raised from anything other than file permission issue - with pytest.raises(Exception, match="unexpected err"): + with pytest.raises(Exception, match="Access is denied"): store.destroy() From 45c7e77ddfd9caaac6b1789f3ff69411f420aea5 Mon Sep 17 00:00:00 2001 From: Rob Ashton Date: Mon, 26 Feb 2024 13:50:51 +0000 Subject: [PATCH 22/22] Fix remaining coverage hole --- tests/test_location_pull.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_location_pull.py b/tests/test_location_pull.py index 5639254..bd8f530 100644 --- a/tests/test_location_pull.py +++ b/tests/test_location_pull.py @@ -356,6 +356,28 @@ def test_can_pull_packet_from_location_into_another_file_store(tmp_path): assert all(root["dst"].files.exists(file.hash) for file in meta.files) +def test_can_pull_packet_from_location_file_store_only(tmp_path): + root = create_temporary_roots( + tmp_path, use_file_store=True, path_archive=None + ) + + id = create_random_packet(root["src"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=root["dst"] + ) + outpack_location_pull_metadata(root=root["dst"]) + outpack_location_pull_packet(id, root=root["dst"]) + + index = root["dst"].index + assert index.unpacked() == [id] + assert not os.path.exists( + root["dst"].path / "archive" / "data" / id / "data.txt" + ) + + meta = index.metadata(id) + assert all(root["dst"].files.exists(file.hash) for file in meta.files) + + def test_can_pull_packet_from_one_location_to_another_archive(tmp_path): root = create_temporary_roots(tmp_path, use_file_store=False)