dvc: separate cache and tree hashes #4511

Merged · 1 commit · Sep 1, 2020

65 changes: 57 additions & 8 deletions dvc/cache/base.py
@@ -1,6 +1,7 @@
import json
import logging
from copy import copy
from operator import itemgetter

from shortuuid import uuid

@@ -255,10 +256,52 @@ def _cache_is_copy(self, path_info):
self.cache_type_confirmed = True
return self.cache_types[0] == "copy"

def _get_dir_info_hash(self, dir_info):
import tempfile

from dvc.path_info import PathInfo
from dvc.utils import tmp_fname

# Sorting the list by path to ensure reproducibility
dir_info = sorted(dir_info, key=itemgetter(self.tree.PARAM_RELPATH))

tmp = tempfile.NamedTemporaryFile(delete=False).name
with open(tmp, "w+") as fobj:
json.dump(dir_info, fobj, sort_keys=True)

from_info = PathInfo(tmp)
to_info = self.tree.path_info / tmp_fname("")
self.tree.upload(from_info, to_info, no_progress_bar=True)

hash_info = self.tree.get_hash(to_info)
hash_info.value += self.tree.CHECKSUM_DIR_SUFFIX
hash_info.dir_info = dir_info

return hash_info, to_info

def save_dir_info(self, dir_info, hash_info=None):
if hash_info and not self.changed_cache_file(hash_info):
return hash_info

hi, tmp_info = self._get_dir_info_hash(dir_info)
if hash_info:
assert hi == hash_info

Review comment (Contributor, Author):
Will go away in the next PR when we support md5s for external dirs. Just a sanity check for now.


new_info = self.tree.hash_to_path_info(hi.value)
if self.changed_cache_file(hi):
self.tree.makedirs(new_info.parent)
self.tree.move(tmp_info, new_info, mode=self.CACHE_MODE)

self.tree.state.save(new_info, hi.value)

return hi

def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs):
- dir_info = self.get_dir_cache(hash_info)
+ if not hash_info.dir_info:
+ hash_info.dir_info = tree.cache.get_dir_cache(hash_info)
+ hi = self.save_dir_info(hash_info.dir_info, hash_info)
for entry in Tqdm(
- dir_info, desc="Saving " + path_info.name, unit="file"
+ hi.dir_info, desc="Saving " + path_info.name, unit="file"
):
entry_info = path_info / entry[self.tree.PARAM_RELPATH]
entry_hash = HashInfo(
@@ -271,10 +314,10 @@ def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs):
if save_link:
self.tree.state.save_link(path_info)
if self.tree.exists(path_info):
- self.tree.state.save(path_info, hash_info.value)
+ self.tree.state.save(path_info, hi.value)

- cache_info = self.tree.hash_to_path_info(hash_info.value)
- self.tree.state.save(cache_info, hash_info.value)
+ cache_info = self.tree.hash_to_path_info(hi.value)
+ self.tree.state.save(cache_info, hi.value)

def save(self, path_info, tree, hash_info, save_link=True, **kwargs):
if path_info.scheme != self.tree.scheme:
@@ -602,8 +645,6 @@ def _diff(ancestor, other, allow_removed=False):
return result

def _merge_dirs(self, ancestor_info, our_info, their_info):
- from operator import itemgetter

from dictdiffer import patch

ancestor = self._to_dict(ancestor_info)
@@ -641,4 +682,12 @@ def merge(self, ancestor_info, our_info, their_info):
their = self.get_dir_cache(their_info)

merged = self._merge_dirs(ancestor, our, their)
- return self.tree.save_dir_info(merged)
+ return self.save_dir_info(merged)

def get_hash(self, tree, path_info):
hash_info = tree.get_hash(path_info)
if not hash_info.isdir:
assert hash_info.name == self.tree.PARAM_CHECKSUM
return hash_info

return self.save_dir_info(hash_info.dir_info, hash_info)
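
Aside: a self-contained sketch of the directory-hash derivation that `_get_dir_info_hash` performs above (sort the manifest by path, serialize it to JSON, hash it, append the `.dir` suffix). This mirrors the logic rather than the actual DVC code path: it assumes an md5-based tree, and the helper name `dir_info_hash` plus the inline manifest are illustrative only.

```python
import hashlib
import json
from operator import itemgetter

CHECKSUM_DIR_SUFFIX = ".dir"


def dir_info_hash(dir_info):
    # Sort by relpath so the same set of entries always serializes (and
    # therefore hashes) identically -- the reproducibility note above.
    dir_info = sorted(dir_info, key=itemgetter("relpath"))
    payload = json.dumps(dir_info, sort_keys=True).encode("utf-8")
    # The real code uploads the JSON to the cache tree and hashes that file;
    # hashing the serialized bytes directly yields the same md5.
    return hashlib.md5(payload).hexdigest() + CHECKSUM_DIR_SUFFIX


# Example manifest with the same shape _collect_dir produces:
# a list of {"md5": ..., "relpath": ...} entries.
manifest = [
    {"md5": "8d777f385d3dfec8815d20f7496026dc", "relpath": "data"},
    {"md5": "c157a79031e1c40f85931829bc5fc552", "relpath": "bar"},
]
print(dir_info_hash(manifest))  # -> "<md5 of the JSON manifest>.dir"
```
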
3 changes: 2 additions & 1 deletion dvc/hash_info.py
@@ -1,4 +1,4 @@
- from dataclasses import dataclass
+ from dataclasses import dataclass, field

HASH_DIR_SUFFIX = ".dir"

@@ -7,6 +7,7 @@
class HashInfo:
name: str
value: str
dir_info: list = field(default=None, compare=False)

Review comment (Member):
Is it possible to have a non-nullable HashInfo, i.e. without name and value being None?
I understand that it's just a simple container, but I want to avoid possibilities of hash_info being:

  1. None
  2. A regular hash_info
  3. hash_info with value=None.

1 and 2 are fine, it'd be safer to avoid 3rd case if we can.

Reply (Contributor, Author):
> Is it possible to have a non-nullable HashInfo, i.e. without name and value being None?

@skshetry it is not, but we allow it. Very good points! I also thought about it, but decided not to mess with it for now. Ideally thinking about making this dataclass read-only, so everything will be set when creating an instance and so we can put appropriate checks/asserts for such things


def __bool__(self):
return bool(self.value)
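
Following up on the review thread above: a minimal sketch of the read-only variant the author mentions (everything set at construction, with a check that rules out case 3). This is hypothetical and not part of this PR; `FrozenHashInfo` is an illustrative name.

```python
from dataclasses import dataclass, field

HASH_DIR_SUFFIX = ".dir"


@dataclass(frozen=True)
class FrozenHashInfo:
    # name and value must be provided up front; a missing value is rejected
    # immediately instead of producing a half-initialized hash_info.
    name: str
    value: str
    dir_info: list = field(default=None, compare=False)

    def __post_init__(self):
        if not self.name or not self.value:
            raise ValueError("HashInfo requires both a name and a value")

    @property
    def isdir(self):
        # Same convention as the existing class: dir hashes end in ".dir".
        return self.value.endswith(HASH_DIR_SUFFIX)
```

With this shape, `FrozenHashInfo("md5", None)` raises at construction time, so callers only ever see case 1 (no hash at all) or case 2 (a fully populated one).
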
4 changes: 3 additions & 1 deletion dvc/output/base.py
@@ -174,7 +174,9 @@ def cache_path(self):
return self.cache.tree.hash_to_path_info(self.hash_info.value).url

def get_hash(self):
- return self.tree.get_hash(self.path_info)
+ if not self.use_cache:
+ return self.tree.get_hash(self.path_info)
+ return self.cache.get_hash(self.tree, self.path_info)

@property
def is_dir_checksum(self):
55 changes: 11 additions & 44 deletions dvc/tree/base.py
@@ -1,11 +1,8 @@
import itertools
- import json
import logging
- import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from multiprocessing import cpu_count
- from operator import itemgetter
from urllib.parse import urlparse

from funcy import cached_property
@@ -17,7 +14,7 @@
)
from dvc.hash_info import HashInfo
from dvc.ignore import DvcIgnore
- from dvc.path_info import PathInfo, URLInfo
+ from dvc.path_info import URLInfo
from dvc.progress import Tqdm
from dvc.state import StateNoop
from dvc.utils import tmp_fname
@@ -261,7 +258,10 @@ def get_hash(self, path_info, **kwargs):
hash_ = None

if hash_:
- return HashInfo(self.PARAM_CHECKSUM, hash_)
+ hash_info = HashInfo(self.PARAM_CHECKSUM, hash_)
+ if hash_info.isdir:
+ hash_info.dir_info = self.cache.get_dir_cache(hash_info)
+ return hash_info

if self.isdir(path_info):
hash_info = self.get_dir_hash(path_info, **kwargs)
@@ -276,13 +276,6 @@ def get_file_hash(self, path_info):
def get_file_hash(self, path_info):
raise NotImplementedError

- def get_dir_hash(self, path_info, **kwargs):
- if not self.cache:
- raise RemoteCacheRequiredError(path_info)

- dir_info = self._collect_dir(path_info, **kwargs)
- return self.save_dir_info(dir_info)

def hash_to_path_info(self, hash_):
return self.path_info / hash_[0:2] / hash_[2:]

@@ -321,7 +314,7 @@ def _collect_dir(self, path_info, **kwargs):
new_hashes = self._calculate_hashes(not_in_state)
hashes.update(new_hashes)

- result = [
+ return [
{
self.PARAM_CHECKSUM: hashes[fi],
# NOTE: this is lossy transformation:
@@ -337,38 +330,12 @@ def _collect_dir(self, path_info, **kwargs):
for fi in file_infos
]

- # Sorting the list by path to ensure reproducibility
- return sorted(result, key=itemgetter(self.PARAM_RELPATH))

- def save_dir_info(self, dir_info):
- hash_info, tmp_info = self._get_dir_info_hash(dir_info)
- new_info = self.cache.tree.hash_to_path_info(hash_info.value)
- if self.cache.changed_cache_file(hash_info):
- self.cache.tree.makedirs(new_info.parent)
- self.cache.tree.move(
- tmp_info, new_info, mode=self.cache.CACHE_MODE
- )

- self.state.save(new_info, hash_info.value)

- return hash_info

- def _get_dir_info_hash(self, dir_info):
- # Sorting the list by path to ensure reproducibility
- dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

- tmp = tempfile.NamedTemporaryFile(delete=False).name
- with open(tmp, "w+") as fobj:
- json.dump(dir_info, fobj, sort_keys=True)

- tree = self.cache.tree
- from_info = PathInfo(tmp)
- to_info = tree.path_info / tmp_fname("")
- tree.upload(from_info, to_info, no_progress_bar=True)
+ def get_dir_hash(self, path_info, **kwargs):
+ if not self.cache:
+ raise RemoteCacheRequiredError(path_info)

- hash_info = tree.get_file_hash(to_info)
- hash_info.value += self.CHECKSUM_DIR_SUFFIX
- return hash_info, to_info
+ dir_info = self._collect_dir(path_info, **kwargs)
+ return self.cache.save_dir_info(dir_info)

def upload(self, from_info, to_info, name=None, no_progress_bar=False):
if not hasattr(self, "_upload"):
13 changes: 7 additions & 6 deletions tests/unit/tree/test_dvc.py
@@ -230,9 +230,10 @@ def test_get_hash_granular(tmp_dir, dvc):
)
tree = DvcTree(dvc, fetch=True)
subdir = PathInfo(tmp_dir) / "dir" / "subdir"
- assert tree.get_hash(subdir) == HashInfo(
- "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
- )
- assert tree.get_hash(subdir / "data") == HashInfo(
- "md5", "8d777f385d3dfec8815d20f7496026dc",
- )
+ with dvc.state:
+ assert tree.get_hash(subdir) == HashInfo(
+ "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
+ )
+ assert tree.get_hash(subdir / "data") == HashInfo(
+ "md5", "8d777f385d3dfec8815d20f7496026dc",
+ )
13 changes: 7 additions & 6 deletions tests/unit/tree/test_repo.py
@@ -391,10 +391,11 @@ def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
tree = RepoTree(dvc, fetch=True)
dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash")
subdir = PathInfo(tmp_dir) / "dir" / "subdir"
- assert tree.get_hash(subdir) == HashInfo(
- "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
- )
- assert tree.get_hash(subdir / "data") == HashInfo(
- "md5", "8d777f385d3dfec8815d20f7496026dc",
- )
+ with dvc.state:
+ assert tree.get_hash(subdir) == HashInfo(
+ "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
+ )
+ assert tree.get_hash(subdir / "data") == HashInfo(
+ "md5", "8d777f385d3dfec8815d20f7496026dc",
+ )
assert dvc_tree_spy.called