From 94effcd7826203425ad3fb041d0a16b6a80bfca3 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 1 Nov 2024 23:39:59 +0100 Subject: [PATCH 1/4] ChunkIndex: .refcount -> .flags We gave up refcounting quite a while ago and are only interested in whether a chunk is used (referenced) or not (orphan). So, let's keep that uint32_t value, but use it for bit flags, so we could use it to efficiently remember other chunk-related stuff also. --- src/borg/archive.py | 2 +- src/borg/archiver/compact_cmd.py | 14 +++++++------- src/borg/cache.py | 11 +++++------ src/borg/hashindex.pyx | 17 ++++++++++++----- src/borg/repository.py | 6 +++--- src/borg/testsuite/hashindex_test.py | 14 +++++++++----- 6 files changed, 37 insertions(+), 27 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 344a465058..24411f15b6 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1874,7 +1874,7 @@ def add_reference(id_, size, cdata): # either we already have this chunk in repo and chunks index or we add it now if id_ not in self.chunks: assert cdata is not None - self.chunks[id_] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=size) + self.chunks[id_] = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=size) if self.repair: self.repository.put(id_, cdata) diff --git a/src/borg/archiver/compact_cmd.py b/src/borg/archiver/compact_cmd.py index b854b12c3d..20c9fa4800 100644 --- a/src/borg/archiver/compact_cmd.py +++ b/src/borg/archiver/compact_cmd.py @@ -50,20 +50,20 @@ def get_repository_chunks(self) -> ChunkIndex: """Build a dict id -> size of all chunks present in the repository""" chunks = ChunkIndex() for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT): - # we add this id to the chunks index, using refcount == 0, because + # we add this id to the chunks index (as unused chunk), because # we do not know yet whether it is actually referenced from some archives. # we "abuse" the size field here. 
usually there is the plaintext size, # but we use it for the size of the stored object here. - chunks[id] = ChunkIndexEntry(refcount=0, size=stored_size) + chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size) return chunks def save_chunk_index(self): # first clean up: for id, entry in self.chunks.iteritems(): # we already deleted the unused chunks, so everything left must be used: - assert entry.refcount == ChunkIndex.MAX_VALUE + assert entry.flags & ChunkIndex.F_USED # as we put the wrong size in there, we need to clean up the size: - self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0) + self.chunks[id] = entry._replace(size=0) # now self.chunks is an uptodate ChunkIndex, usable for general borg usage! write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True) self.chunks = None # nothing there (cleared!) @@ -74,8 +74,8 @@ def analyze_archives(self) -> Tuple[Set, Set, int, int, int]: def use_it(id, *, wanted=False): entry = self.chunks.get(id) if entry is not None: - # the chunk is in the repo, mark it used by setting refcount to max. - self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=entry.size) + # the chunk is in the repo, mark it used. + self.chunks[id] = entry._replace(flags=entry.flags | ChunkIndex.F_USED) if wanted: # chunk id is from chunks_healthy list: a lost chunk has re-appeared! 
reappeared_chunks.add(id) @@ -131,7 +131,7 @@ def report_and_delete(self): logger.info("Determining unused objects...") unused = set() for id, entry in self.chunks.iteritems(): - if entry.refcount == 0: + if not (entry.flags & ChunkIndex.F_USED): unused.add(id) logger.info(f"Deleting {len(unused)} unused objects...") pi = ProgressIndicatorPercent( diff --git a/src/borg/cache.py b/src/borg/cache.py index 0a5c380ded..9e50e69757 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -396,7 +396,7 @@ def compress_entry(self, entry): for id, size in entry.chunks: cie = self.chunks.get(id) assert cie is not None - assert cie.refcount > 0 + assert cie.flags & ChunkIndex.F_USED assert size == cie.size idx = self.chunks.k_to_idx(id) compressed_chunks.append(idx) @@ -415,7 +415,7 @@ def decompress_entry(self, entry_packed): id = self.chunks.idx_to_k(idx) cie = self.chunks.get(id) assert cie is not None - assert cie.refcount > 0 + assert cie.flags & ChunkIndex.F_USED assert cie.size > 0 chunks.append((id, cie.size)) entry = entry._replace(chunks=chunks) @@ -722,10 +722,9 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi chunks = ChunkIndex() t0 = perf_counter() num_chunks = 0 - # The repo says it has these chunks, so we assume they are referenced chunks. - # We do not care for refcounting anymore, so we just set refcount = MAX_VALUE. + # The repo says it has these chunks, so we assume they are referenced/used chunks. # We do not know the plaintext size (!= stored_size), thus we set size = 0. 
- init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0) + init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0) for id, stored_size in repo_lister(repository, limit=LIST_SCAN_LIMIT): num_chunks += 1 chunks[id] = init_entry @@ -809,7 +808,7 @@ def add_chunk( ) self.repository.put(id, cdata, wait=wait) self.last_refresh_dt = now # .put also refreshed the lock - self.chunks.add(id, ChunkIndex.MAX_VALUE, size) + self.chunks.add(id, 1, size) stats.update(size, not exists) return ChunkListEntry(id, size) diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx index c596928b19..0d0418e43f 100644 --- a/src/borg/hashindex.pyx +++ b/src/borg/hashindex.pyx @@ -34,14 +34,16 @@ class HTProxyMixin: self.ht.clear() -ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size') +ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size') class ChunkIndex(HTProxyMixin, MutableMapping): """ Mapping from key256 to (refcount32, size32) to track chunks in the repository. """ - MAX_VALUE = 2**32 - 1 # borghash has the full uint32_t range + # .flags values: 2^0 .. 
2^31 + F_NONE = 0 # all flags cleared + F_USED = 1 # chunk is used/referenced def __init__(self, capacity=1000, path=None, usable=None): if path: @@ -55,9 +57,14 @@ class ChunkIndex(HTProxyMixin, MutableMapping): yield from self.ht.items() def add(self, key, refs, size): - v = self.get(key, ChunkIndexEntry(0, 0)) - refcount = min(self.MAX_VALUE, v.refcount + refs) - self[key] = v._replace(refcount=refcount, size=size) + assert refs > 0 + v = self.get(key) + if v is None: + flags = self.F_USED + else: + flags = v.flags | self.F_USED + assert v.size == 0 or v.size == size + self[key] = ChunkIndexEntry(flags=flags, size=size) @classmethod def read(cls, path): diff --git a/src/borg/repository.py b/src/borg/repository.py index 6f87f8ccf9..93c5a4f74c 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -324,10 +324,10 @@ def check_object(obj): objs_checked = objs_errors = 0 chunks = ChunkIndex() # we don't do refcounting anymore, neither we can know here whether any archive - # is using this object, but we assume that this is the case and set refcount to - # MAX_VALUE. As we don't do garbage collection here, this is not a problem. + # is using this object, but we assume that this is the case. + # As we don't do garbage collection here, this is not a problem. # We also don't know the plaintext size, so we set it to 0. 
- init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0) + init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0) infos = self.store.list("data") try: for info in infos: diff --git a/src/borg/testsuite/hashindex_test.py b/src/borg/testsuite/hashindex_test.py index 539242835a..f0c3d9f8a3 100644 --- a/src/borg/testsuite/hashindex_test.py +++ b/src/borg/testsuite/hashindex_test.py @@ -3,7 +3,7 @@ import pytest -from ..hashindex import ChunkIndex +from ..hashindex import ChunkIndex, ChunkIndexEntry def H(x): @@ -19,10 +19,14 @@ def H2(x): def test_chunkindex_add(): chunks = ChunkIndex() x = H2(1) - chunks.add(x, 5, 6) - assert chunks[x] == (5, 6) + chunks.add(x, 1, 0) + assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0) + chunks.add(x, 1, 2) # updating size (we do not have a size yet) + assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2) chunks.add(x, 1, 2) - assert chunks[x] == (6, 2) + assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2) + with pytest.raises(AssertionError): + chunks.add(x, 1, 3) # inconsistent size (we already have a different size) def test_keyerror(): @@ -31,4 +35,4 @@ def test_keyerror(): with pytest.raises(KeyError): chunks[x] with pytest.raises(struct.error): - chunks.add(x, -1, 0) + chunks[x] = ChunkIndexEntry(flags=2**33, size=0) From ba3e701730f3e49847af966a20de34e19c8237d0 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 1 Nov 2024 23:49:52 +0100 Subject: [PATCH 2/4] ChunkIndex.add: remove useless refs parameter --- src/borg/cache.py | 2 +- src/borg/hashindex.pyx | 3 +-- src/borg/testsuite/hashindex_test.py | 8 ++++---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/borg/cache.py b/src/borg/cache.py index 9e50e69757..27f38825fd 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -808,7 +808,7 @@ def add_chunk( ) self.repository.put(id, cdata, wait=wait) self.last_refresh_dt = now # .put also refreshed the lock - self.chunks.add(id, 1, 
size) + self.chunks.add(id, size) stats.update(size, not exists) return ChunkListEntry(id, size) diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx index 0d0418e43f..00fe684047 100644 --- a/src/borg/hashindex.pyx +++ b/src/borg/hashindex.pyx @@ -56,8 +56,7 @@ class ChunkIndex(HTProxyMixin, MutableMapping): def iteritems(self): yield from self.ht.items() - def add(self, key, refs, size): - assert refs > 0 + def add(self, key, size): v = self.get(key) if v is None: flags = self.F_USED diff --git a/src/borg/testsuite/hashindex_test.py b/src/borg/testsuite/hashindex_test.py index f0c3d9f8a3..2539d351a1 100644 --- a/src/borg/testsuite/hashindex_test.py +++ b/src/borg/testsuite/hashindex_test.py @@ -19,14 +19,14 @@ def H2(x): def test_chunkindex_add(): chunks = ChunkIndex() x = H2(1) - chunks.add(x, 1, 0) + chunks.add(x, 0) assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0) - chunks.add(x, 1, 2) # updating size (we do not have a size yet) + chunks.add(x, 2) # updating size (we do not have a size yet) assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2) - chunks.add(x, 1, 2) + chunks.add(x, 2) assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2) with pytest.raises(AssertionError): - chunks.add(x, 1, 3) # inconsistent size (we already have a different size) + chunks.add(x, 3) # inconsistent size (we already have a different size) def test_keyerror(): From 2ab3c163ce5fb24de063520f86a49b646289afe9 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 1 Nov 2024 23:57:38 +0100 Subject: [PATCH 3/4] invalidate existing cache by incrementing the ChunkIndex seed We changed the semantics (and name) of the first tuple element (refcount -> flags); thus, we better invalidate the existing cache. 
--- src/borg/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/borg/cache.py b/src/borg/cache.py index 27f38825fd..ed5c11c1e2 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -671,7 +671,7 @@ def load_chunks_hash(repository) -> bytes: return hash -CHUNKINDEX_HASH_SEED = 1 +CHUNKINDEX_HASH_SEED = 2 def write_chunkindex_to_repo_cache(repository, chunks, *, clear=False, force_write=False): From 8a13cf2c4d2d41e79d5cbdf7ca6ede65b89b3ec2 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 2 Nov 2024 00:06:09 +0100 Subject: [PATCH 4/4] make mypy happy --- src/borg/hashindex.pyi | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/borg/hashindex.pyi b/src/borg/hashindex.pyi index 4291c9691a..6b236afa78 100644 --- a/src/borg/hashindex.pyi +++ b/src/borg/hashindex.pyi @@ -5,13 +5,15 @@ API_VERSION: str PATH_OR_FILE = Union[str, IO] class ChunkIndexEntry(NamedTuple): - refcount: int + flags: int size: int CIE = Union[Tuple[int, int], Type[ChunkIndexEntry]] class ChunkIndex: - def add(self, key: bytes, refs: int, size: int) -> None: ... + F_NONE: int + F_USED: int + def add(self, key: bytes, size: int) -> None: ... def iteritems(self, marker: bytes = ...) -> Iterator: ... def __contains__(self, key: bytes) -> bool: ... def __getitem__(self, key: bytes) -> Type[ChunkIndexEntry]: ...