Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ChunkIndexEntry.refcount -> .flags #8513

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -1874,7 +1874,7 @@ def add_reference(id_, size, cdata):
# either we already have this chunk in repo and chunks index or we add it now
if id_ not in self.chunks:
assert cdata is not None
self.chunks[id_] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=size)
self.chunks[id_] = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=size)
if self.repair:
self.repository.put(id_, cdata)

Expand Down
14 changes: 7 additions & 7 deletions src/borg/archiver/compact_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,20 @@ def get_repository_chunks(self) -> ChunkIndex:
"""Build a dict id -> size of all chunks present in the repository"""
chunks = ChunkIndex()
for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
# we add this id to the chunks index, using refcount == 0, because
# we add this id to the chunks index (as unused chunk), because
# we do not know yet whether it is actually referenced from some archives.
# we "abuse" the size field here. usually there is the plaintext size,
# but we use it for the size of the stored object here.
chunks[id] = ChunkIndexEntry(refcount=0, size=stored_size)
chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
return chunks

def save_chunk_index(self):
# first clean up:
for id, entry in self.chunks.iteritems():
# we already deleted the unused chunks, so everything left must be used:
assert entry.refcount == ChunkIndex.MAX_VALUE
assert entry.flags & ChunkIndex.F_USED
# as we put the wrong size in there, we need to clean up the size:
self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
self.chunks[id] = entry._replace(size=0)
# now self.chunks is an up-to-date ChunkIndex, usable for general borg usage!
write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True)
self.chunks = None # nothing there (cleared!)
Expand All @@ -74,8 +74,8 @@ def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:
def use_it(id, *, wanted=False):
entry = self.chunks.get(id)
if entry is not None:
# the chunk is in the repo, mark it used by setting refcount to max.
self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=entry.size)
# the chunk is in the repo, mark it used.
self.chunks[id] = entry._replace(flags=entry.flags | ChunkIndex.F_USED)
if wanted:
# chunk id is from chunks_healthy list: a lost chunk has re-appeared!
reappeared_chunks.add(id)
Expand Down Expand Up @@ -131,7 +131,7 @@ def report_and_delete(self):
logger.info("Determining unused objects...")
unused = set()
for id, entry in self.chunks.iteritems():
if entry.refcount == 0:
if not (entry.flags & ChunkIndex.F_USED):
unused.add(id)
logger.info(f"Deleting {len(unused)} unused objects...")
pi = ProgressIndicatorPercent(
Expand Down
13 changes: 6 additions & 7 deletions src/borg/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ def compress_entry(self, entry):
for id, size in entry.chunks:
cie = self.chunks.get(id)
assert cie is not None
assert cie.refcount > 0
assert cie.flags & ChunkIndex.F_USED
assert size == cie.size
idx = self.chunks.k_to_idx(id)
compressed_chunks.append(idx)
Expand All @@ -415,7 +415,7 @@ def decompress_entry(self, entry_packed):
id = self.chunks.idx_to_k(idx)
cie = self.chunks.get(id)
assert cie is not None
assert cie.refcount > 0
assert cie.flags & ChunkIndex.F_USED
assert cie.size > 0
chunks.append((id, cie.size))
entry = entry._replace(chunks=chunks)
Expand Down Expand Up @@ -671,7 +671,7 @@ def load_chunks_hash(repository) -> bytes:
return hash


CHUNKINDEX_HASH_SEED = 1
CHUNKINDEX_HASH_SEED = 2


def write_chunkindex_to_repo_cache(repository, chunks, *, clear=False, force_write=False):
Expand Down Expand Up @@ -722,10 +722,9 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
chunks = ChunkIndex()
t0 = perf_counter()
num_chunks = 0
# The repo says it has these chunks, so we assume they are referenced chunks.
# We do not care for refcounting anymore, so we just set refcount = MAX_VALUE.
# The repo says it has these chunks, so we assume they are referenced/used chunks.
# We do not know the plaintext size (!= stored_size), thus we set size = 0.
init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
for id, stored_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
num_chunks += 1
chunks[id] = init_entry
Expand Down Expand Up @@ -809,7 +808,7 @@ def add_chunk(
)
self.repository.put(id, cdata, wait=wait)
self.last_refresh_dt = now # .put also refreshed the lock
self.chunks.add(id, ChunkIndex.MAX_VALUE, size)
self.chunks.add(id, size)
stats.update(size, not exists)
return ChunkListEntry(id, size)

Expand Down
6 changes: 4 additions & 2 deletions src/borg/hashindex.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@ API_VERSION: str
PATH_OR_FILE = Union[str, IO]

class ChunkIndexEntry(NamedTuple):
refcount: int
flags: int
size: int

CIE = Union[Tuple[int, int], Type[ChunkIndexEntry]]

class ChunkIndex:
def add(self, key: bytes, refs: int, size: int) -> None: ...
F_NONE: int
F_USED: int
def add(self, key: bytes, size: int) -> None: ...
def iteritems(self, marker: bytes = ...) -> Iterator: ...
def __contains__(self, key: bytes) -> bool: ...
def __getitem__(self, key: bytes) -> Type[ChunkIndexEntry]: ...
Expand Down
18 changes: 12 additions & 6 deletions src/borg/hashindex.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,16 @@ class HTProxyMixin:
self.ht.clear()


ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size')
ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size')


class ChunkIndex(HTProxyMixin, MutableMapping):
"""
Mapping from key256 to (flags32, size32) to track chunks in the repository.
"""
MAX_VALUE = 2**32 - 1 # borghash has the full uint32_t range
# .flags values: 2^0 .. 2^31
F_NONE = 0 # all flags cleared
F_USED = 1 # chunk is used/referenced

def __init__(self, capacity=1000, path=None, usable=None):
if path:
Expand All @@ -54,10 +56,14 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
def iteritems(self):
yield from self.ht.items()

def add(self, key, refs, size):
v = self.get(key, ChunkIndexEntry(0, 0))
refcount = min(self.MAX_VALUE, v.refcount + refs)
self[key] = v._replace(refcount=refcount, size=size)
def add(self, key, size):
v = self.get(key)
if v is None:
flags = self.F_USED
else:
flags = v.flags | self.F_USED
assert v.size == 0 or v.size == size
self[key] = ChunkIndexEntry(flags=flags, size=size)

@classmethod
def read(cls, path):
Expand Down
6 changes: 3 additions & 3 deletions src/borg/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,10 +324,10 @@ def check_object(obj):
objs_checked = objs_errors = 0
chunks = ChunkIndex()
# we don't do refcounting anymore, nor can we know here whether any archive
# is using this object, but we assume that this is the case and set refcount to
# MAX_VALUE. As we don't do garbage collection here, this is not a problem.
# is using this object, but we assume that this is the case.
# As we don't do garbage collection here, this is not a problem.
# We also don't know the plaintext size, so we set it to 0.
init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
infos = self.store.list("data")
try:
for info in infos:
Expand Down
16 changes: 10 additions & 6 deletions src/borg/testsuite/hashindex_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pytest

from ..hashindex import ChunkIndex
from ..hashindex import ChunkIndex, ChunkIndexEntry


def H(x):
Expand All @@ -19,10 +19,14 @@ def H2(x):
def test_chunkindex_add():
chunks = ChunkIndex()
x = H2(1)
chunks.add(x, 5, 6)
assert chunks[x] == (5, 6)
chunks.add(x, 1, 2)
assert chunks[x] == (6, 2)
chunks.add(x, 0)
assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
chunks.add(x, 2) # updating size (we do not have a size yet)
assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
chunks.add(x, 2)
assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
with pytest.raises(AssertionError):
chunks.add(x, 3) # inconsistent size (we already have a different size)


def test_keyerror():
Expand All @@ -31,4 +35,4 @@ def test_keyerror():
with pytest.raises(KeyError):
chunks[x]
with pytest.raises(struct.error):
chunks.add(x, -1, 0)
chunks[x] = ChunkIndexEntry(flags=2**33, size=0)
Loading