Skip to content

Commit

Permalink
dvc: separate cache and tree hashes
Browse files Browse the repository at this point in the history
  • Loading branch information
efiop committed Sep 1, 2020
1 parent fc32168 commit 962cbe7
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 66 deletions.
65 changes: 57 additions & 8 deletions dvc/cache/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import logging
from copy import copy
from operator import itemgetter

from shortuuid import uuid

Expand Down Expand Up @@ -255,10 +256,52 @@ def _cache_is_copy(self, path_info):
self.cache_type_confirmed = True
return self.cache_types[0] == "copy"

def _get_dir_info_hash(self, dir_info):
import tempfile

from dvc.path_info import PathInfo
from dvc.utils import tmp_fname

# Sorting the list by path to ensure reproducibility
dir_info = sorted(dir_info, key=itemgetter(self.tree.PARAM_RELPATH))

tmp = tempfile.NamedTemporaryFile(delete=False).name
with open(tmp, "w+") as fobj:
json.dump(dir_info, fobj, sort_keys=True)

from_info = PathInfo(tmp)
to_info = self.tree.path_info / tmp_fname("")
self.tree.upload(from_info, to_info, no_progress_bar=True)

hash_info = self.tree.get_hash(to_info)
hash_info.value += self.tree.CHECKSUM_DIR_SUFFIX
hash_info.dir_info = dir_info

return hash_info, to_info

def save_dir_info(self, dir_info, hash_info=None):
if hash_info and not self.changed_cache_file(hash_info):
return hash_info

hi, tmp_info = self._get_dir_info_hash(dir_info)
if hash_info:
assert hi == hash_info

new_info = self.tree.hash_to_path_info(hi.value)
if self.changed_cache_file(hi):
self.tree.makedirs(new_info.parent)
self.tree.move(tmp_info, new_info, mode=self.CACHE_MODE)

self.tree.state.save(new_info, hi.value)

return hi

def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs):
dir_info = self.get_dir_cache(hash_info)
if not hash_info.dir_info:
hash_info.dir_info = tree.cache.get_dir_cache(hash_info)
hi = self.save_dir_info(hash_info.dir_info, hash_info)
for entry in Tqdm(
dir_info, desc="Saving " + path_info.name, unit="file"
hi.dir_info, desc="Saving " + path_info.name, unit="file"
):
entry_info = path_info / entry[self.tree.PARAM_RELPATH]
entry_hash = HashInfo(
Expand All @@ -271,10 +314,10 @@ def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs):
if save_link:
self.tree.state.save_link(path_info)
if self.tree.exists(path_info):
self.tree.state.save(path_info, hash_info.value)
self.tree.state.save(path_info, hi.value)

cache_info = self.tree.hash_to_path_info(hash_info.value)
self.tree.state.save(cache_info, hash_info.value)
cache_info = self.tree.hash_to_path_info(hi.value)
self.tree.state.save(cache_info, hi.value)

def save(self, path_info, tree, hash_info, save_link=True, **kwargs):
if path_info.scheme != self.tree.scheme:
Expand Down Expand Up @@ -602,8 +645,6 @@ def _diff(ancestor, other, allow_removed=False):
return result

def _merge_dirs(self, ancestor_info, our_info, their_info):
from operator import itemgetter

from dictdiffer import patch

ancestor = self._to_dict(ancestor_info)
Expand Down Expand Up @@ -641,4 +682,12 @@ def merge(self, ancestor_info, our_info, their_info):
their = self.get_dir_cache(their_info)

merged = self._merge_dirs(ancestor, our, their)
return self.tree.save_dir_info(merged)
return self.save_dir_info(merged)

def get_hash(self, tree, path_info):
hash_info = tree.get_hash(path_info)
if not hash_info.isdir:
assert hash_info.name == self.tree.PARAM_CHECKSUM
return hash_info

return self.save_dir_info(hash_info.dir_info, hash_info)
3 changes: 2 additions & 1 deletion dvc/hash_info.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from dataclasses import dataclass
from dataclasses import dataclass, field

HASH_DIR_SUFFIX = ".dir"

Expand All @@ -7,6 +7,7 @@
class HashInfo:
name: str
value: str
dir_info: list = field(default=None, compare=False)

def __bool__(self):
return bool(self.value)
Expand Down
4 changes: 3 additions & 1 deletion dvc/output/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,9 @@ def cache_path(self):
return self.cache.tree.hash_to_path_info(self.hash_info.value).url

def get_hash(self):
return self.tree.get_hash(self.path_info)
if not self.use_cache:
return self.tree.get_hash(self.path_info)
return self.cache.get_hash(self.tree, self.path_info)

@property
def is_dir_checksum(self):
Expand Down
55 changes: 11 additions & 44 deletions dvc/tree/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import itertools
import json
import logging
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from multiprocessing import cpu_count
from operator import itemgetter
from urllib.parse import urlparse

from funcy import cached_property
Expand All @@ -17,7 +14,7 @@
)
from dvc.hash_info import HashInfo
from dvc.ignore import DvcIgnore
from dvc.path_info import PathInfo, URLInfo
from dvc.path_info import URLInfo
from dvc.progress import Tqdm
from dvc.state import StateNoop
from dvc.utils import tmp_fname
Expand Down Expand Up @@ -261,7 +258,10 @@ def get_hash(self, path_info, **kwargs):
hash_ = None

if hash_:
return HashInfo(self.PARAM_CHECKSUM, hash_)
hash_info = HashInfo(self.PARAM_CHECKSUM, hash_)
if hash_info.isdir:
hash_info.dir_info = self.cache.get_dir_cache(hash_info)
return hash_info

if self.isdir(path_info):
hash_info = self.get_dir_hash(path_info, **kwargs)
Expand All @@ -276,13 +276,6 @@ def get_hash(self, path_info, **kwargs):
def get_file_hash(self, path_info):
raise NotImplementedError

def get_dir_hash(self, path_info, **kwargs):
if not self.cache:
raise RemoteCacheRequiredError(path_info)

dir_info = self._collect_dir(path_info, **kwargs)
return self.save_dir_info(dir_info)

def hash_to_path_info(self, hash_):
return self.path_info / hash_[0:2] / hash_[2:]

Expand Down Expand Up @@ -321,7 +314,7 @@ def _collect_dir(self, path_info, **kwargs):
new_hashes = self._calculate_hashes(not_in_state)
hashes.update(new_hashes)

result = [
return [
{
self.PARAM_CHECKSUM: hashes[fi],
# NOTE: this is lossy transformation:
Expand All @@ -337,38 +330,12 @@ def _collect_dir(self, path_info, **kwargs):
for fi in file_infos
]

# Sorting the list by path to ensure reproducibility
return sorted(result, key=itemgetter(self.PARAM_RELPATH))

def save_dir_info(self, dir_info):
hash_info, tmp_info = self._get_dir_info_hash(dir_info)
new_info = self.cache.tree.hash_to_path_info(hash_info.value)
if self.cache.changed_cache_file(hash_info):
self.cache.tree.makedirs(new_info.parent)
self.cache.tree.move(
tmp_info, new_info, mode=self.cache.CACHE_MODE
)

self.state.save(new_info, hash_info.value)

return hash_info

def _get_dir_info_hash(self, dir_info):
# Sorting the list by path to ensure reproducibility
dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

tmp = tempfile.NamedTemporaryFile(delete=False).name
with open(tmp, "w+") as fobj:
json.dump(dir_info, fobj, sort_keys=True)

tree = self.cache.tree
from_info = PathInfo(tmp)
to_info = tree.path_info / tmp_fname("")
tree.upload(from_info, to_info, no_progress_bar=True)
def get_dir_hash(self, path_info, **kwargs):
if not self.cache:
raise RemoteCacheRequiredError(path_info)

hash_info = tree.get_file_hash(to_info)
hash_info.value += self.CHECKSUM_DIR_SUFFIX
return hash_info, to_info
dir_info = self._collect_dir(path_info, **kwargs)
return self.cache.save_dir_info(dir_info)

def upload(self, from_info, to_info, name=None, no_progress_bar=False):
if not hasattr(self, "_upload"):
Expand Down
13 changes: 7 additions & 6 deletions tests/unit/tree/test_dvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,10 @@ def test_get_hash_granular(tmp_dir, dvc):
)
tree = DvcTree(dvc, fetch=True)
subdir = PathInfo(tmp_dir) / "dir" / "subdir"
assert tree.get_hash(subdir) == HashInfo(
"md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
)
assert tree.get_hash(subdir / "data") == HashInfo(
"md5", "8d777f385d3dfec8815d20f7496026dc",
)
with dvc.state:
assert tree.get_hash(subdir) == HashInfo(
"md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
)
assert tree.get_hash(subdir / "data") == HashInfo(
"md5", "8d777f385d3dfec8815d20f7496026dc",
)
13 changes: 7 additions & 6 deletions tests/unit/tree/test_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,10 +391,11 @@ def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
tree = RepoTree(dvc, fetch=True)
dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash")
subdir = PathInfo(tmp_dir) / "dir" / "subdir"
assert tree.get_hash(subdir) == HashInfo(
"md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
)
assert tree.get_hash(subdir / "data") == HashInfo(
"md5", "8d777f385d3dfec8815d20f7496026dc",
)
with dvc.state:
assert tree.get_hash(subdir) == HashInfo(
"md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
)
assert tree.get_hash(subdir / "data") == HashInfo(
"md5", "8d777f385d3dfec8815d20f7496026dc",
)
assert dvc_tree_spy.called

0 comments on commit 962cbe7

Please sign in to comment.