From 962cbe7ba8433e0b6d7e4e93b18b7b4217304723 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Tue, 1 Sep 2020 07:20:44 +0300 Subject: [PATCH] dvc: separate cache and tree hashes Related to #4144 , #3069 , #1676 --- dvc/cache/base.py | 65 +++++++++++++++++++++++++++++++----- dvc/hash_info.py | 3 +- dvc/output/base.py | 4 ++- dvc/tree/base.py | 55 ++++++------------------------ tests/unit/tree/test_dvc.py | 13 ++++---- tests/unit/tree/test_repo.py | 13 ++++---- 6 files changed, 87 insertions(+), 66 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 92d5b5bd0c..cd79d8fafd 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -1,6 +1,7 @@ import json import logging from copy import copy +from operator import itemgetter from shortuuid import uuid @@ -255,10 +256,52 @@ def _cache_is_copy(self, path_info): self.cache_type_confirmed = True return self.cache_types[0] == "copy" + def _get_dir_info_hash(self, dir_info): + import tempfile + + from dvc.path_info import PathInfo + from dvc.utils import tmp_fname + + # Sorting the list by path to ensure reproducibility + dir_info = sorted(dir_info, key=itemgetter(self.tree.PARAM_RELPATH)) + + tmp = tempfile.NamedTemporaryFile(delete=False).name + with open(tmp, "w+") as fobj: + json.dump(dir_info, fobj, sort_keys=True) + + from_info = PathInfo(tmp) + to_info = self.tree.path_info / tmp_fname("") + self.tree.upload(from_info, to_info, no_progress_bar=True) + + hash_info = self.tree.get_hash(to_info) + hash_info.value += self.tree.CHECKSUM_DIR_SUFFIX + hash_info.dir_info = dir_info + + return hash_info, to_info + + def save_dir_info(self, dir_info, hash_info=None): + if hash_info and not self.changed_cache_file(hash_info): + return hash_info + + hi, tmp_info = self._get_dir_info_hash(dir_info) + if hash_info: + assert hi == hash_info + + new_info = self.tree.hash_to_path_info(hi.value) + if self.changed_cache_file(hi): + self.tree.makedirs(new_info.parent) + self.tree.move(tmp_info, new_info, mode=self.CACHE_MODE) + + self.tree.state.save(new_info, hi.value) + + return hi + def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs): - dir_info = self.get_dir_cache(hash_info) + if not hash_info.dir_info: + hash_info.dir_info = tree.cache.get_dir_cache(hash_info) + hi = self.save_dir_info(hash_info.dir_info, hash_info) for entry in Tqdm( - dir_info, desc="Saving " + path_info.name, unit="file" + hi.dir_info, desc="Saving " + path_info.name, unit="file" ): entry_info = path_info / entry[self.tree.PARAM_RELPATH] entry_hash = HashInfo( @@ -271,10 +314,10 @@ def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs): if save_link: self.tree.state.save_link(path_info) if self.tree.exists(path_info): - self.tree.state.save(path_info, hash_info.value) + self.tree.state.save(path_info, hi.value) - cache_info = self.tree.hash_to_path_info(hash_info.value) - self.tree.state.save(cache_info, hash_info.value) + cache_info = self.tree.hash_to_path_info(hi.value) + self.tree.state.save(cache_info, hi.value) def save(self, path_info, tree, hash_info, save_link=True, **kwargs): if path_info.scheme != self.tree.scheme: @@ -602,8 +645,6 @@ def _diff(ancestor, other, allow_removed=False): return result def _merge_dirs(self, ancestor_info, our_info, their_info): - from operator import itemgetter - from dictdiffer import patch ancestor = self._to_dict(ancestor_info) @@ -641,4 +682,12 @@ def merge(self, ancestor_info, our_info, their_info): their = self.get_dir_cache(their_info) merged = self._merge_dirs(ancestor, our, their) - return self.tree.save_dir_info(merged) + return self.save_dir_info(merged) + + def get_hash(self, tree, path_info): + hash_info = tree.get_hash(path_info) + if not hash_info.isdir: + assert hash_info.name == self.tree.PARAM_CHECKSUM + return hash_info + + return self.save_dir_info(hash_info.dir_info, hash_info) diff --git a/dvc/hash_info.py b/dvc/hash_info.py index fdf3c914c8..715e76da39 100644 --- a/dvc/hash_info.py +++ b/dvc/hash_info.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field HASH_DIR_SUFFIX = ".dir" @@ -7,6 +7,7 @@ class HashInfo: name: str value: str + dir_info: list = field(default=None, compare=False) def __bool__(self): return bool(self.value) diff --git a/dvc/output/base.py b/dvc/output/base.py index c219b436aa..0a7b044544 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -174,7 +174,9 @@ def cache_path(self): return self.cache.tree.hash_to_path_info(self.hash_info.value).url def get_hash(self): - return self.tree.get_hash(self.path_info) + if not self.use_cache: + return self.tree.get_hash(self.path_info) + return self.cache.get_hash(self.tree, self.path_info) @property def is_dir_checksum(self): diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 329c90c880..614c53a246 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -1,11 +1,8 @@ import itertools -import json import logging -import tempfile from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial from multiprocessing import cpu_count -from operator import itemgetter from urllib.parse import urlparse from funcy import cached_property @@ -17,7 +14,7 @@ ) from dvc.hash_info import HashInfo from dvc.ignore import DvcIgnore -from dvc.path_info import PathInfo, URLInfo +from dvc.path_info import URLInfo from dvc.progress import Tqdm from dvc.state import StateNoop from dvc.utils import tmp_fname @@ -261,7 +258,10 @@ def get_hash(self, path_info, **kwargs): hash_ = None if hash_: - return HashInfo(self.PARAM_CHECKSUM, hash_) + hash_info = HashInfo(self.PARAM_CHECKSUM, hash_) + if hash_info.isdir: + hash_info.dir_info = self.cache.get_dir_cache(hash_info) + return hash_info if self.isdir(path_info): hash_info = self.get_dir_hash(path_info, **kwargs) @@ -276,13 +276,6 @@ def get_hash(self, path_info, **kwargs): def get_file_hash(self, path_info): raise NotImplementedError - def get_dir_hash(self, path_info, **kwargs): - if not self.cache: - raise RemoteCacheRequiredError(path_info) - - dir_info = self._collect_dir(path_info, **kwargs) - return self.save_dir_info(dir_info) - def hash_to_path_info(self, hash_): return self.path_info / hash_[0:2] / hash_[2:] @@ -321,7 +314,7 @@ def _collect_dir(self, path_info, **kwargs): new_hashes = self._calculate_hashes(not_in_state) hashes.update(new_hashes) - result = [ + return [ { self.PARAM_CHECKSUM: hashes[fi], # NOTE: this is lossy transformation: @@ -337,38 +330,12 @@ def _collect_dir(self, path_info, **kwargs): for fi in file_infos ] - # Sorting the list by path to ensure reproducibility - return sorted(result, key=itemgetter(self.PARAM_RELPATH)) - - def save_dir_info(self, dir_info): - hash_info, tmp_info = self._get_dir_info_hash(dir_info) - new_info = self.cache.tree.hash_to_path_info(hash_info.value) - if self.cache.changed_cache_file(hash_info): - self.cache.tree.makedirs(new_info.parent) - self.cache.tree.move( - tmp_info, new_info, mode=self.cache.CACHE_MODE - ) - - self.state.save(new_info, hash_info.value) - - return hash_info - - def _get_dir_info_hash(self, dir_info): - # Sorting the list by path to ensure reproducibility - dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH)) - - tmp = tempfile.NamedTemporaryFile(delete=False).name - with open(tmp, "w+") as fobj: - json.dump(dir_info, fobj, sort_keys=True) - - tree = self.cache.tree - from_info = PathInfo(tmp) - to_info = tree.path_info / tmp_fname("") - tree.upload(from_info, to_info, no_progress_bar=True) + def get_dir_hash(self, path_info, **kwargs): + if not self.cache: + raise RemoteCacheRequiredError(path_info) - hash_info = tree.get_file_hash(to_info) - hash_info.value += self.CHECKSUM_DIR_SUFFIX - return hash_info, to_info + dir_info = self._collect_dir(path_info, **kwargs) + return self.cache.save_dir_info(dir_info) def upload(self, from_info, to_info, name=None, no_progress_bar=False): if not hasattr(self, "_upload"): diff --git a/tests/unit/tree/test_dvc.py b/tests/unit/tree/test_dvc.py index 2966ebf8ba..5f1fc33ea0 100644 --- a/tests/unit/tree/test_dvc.py +++ b/tests/unit/tree/test_dvc.py @@ -230,9 +230,10 @@ def test_get_hash_granular(tmp_dir, dvc): ) tree = DvcTree(dvc, fetch=True) subdir = PathInfo(tmp_dir) / "dir" / "subdir" - assert tree.get_hash(subdir) == HashInfo( - "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir", - ) - assert tree.get_hash(subdir / "data") == HashInfo( - "md5", "8d777f385d3dfec8815d20f7496026dc", - ) + with dvc.state: + assert tree.get_hash(subdir) == HashInfo( + "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir", + ) + assert tree.get_hash(subdir / "data") == HashInfo( + "md5", "8d777f385d3dfec8815d20f7496026dc", + ) diff --git a/tests/unit/tree/test_repo.py b/tests/unit/tree/test_repo.py index 40998c1063..e22a4faf8e 100644 --- a/tests/unit/tree/test_repo.py +++ b/tests/unit/tree/test_repo.py @@ -391,10 +391,11 @@ def test_get_hash_cached_granular(tmp_dir, dvc, mocker): tree = RepoTree(dvc, fetch=True) dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash") subdir = PathInfo(tmp_dir) / "dir" / "subdir" - assert tree.get_hash(subdir) == HashInfo( - "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir", - ) - assert tree.get_hash(subdir / "data") == HashInfo( - "md5", "8d777f385d3dfec8815d20f7496026dc", - ) + with dvc.state: + assert tree.get_hash(subdir) == HashInfo( + "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir", + ) + assert tree.get_hash(subdir / "data") == HashInfo( + "md5", "8d777f385d3dfec8815d20f7496026dc", + ) assert dvc_tree_spy.called