From 94effcd7826203425ad3fb041d0a16b6a80bfca3 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 1 Nov 2024 23:39:59 +0100
Subject: [PATCH 1/4] ChunkIndex: .refcount -> .flags

We gave up refcounting quite a while ago and are only interested
in whether a chunk is used (referenced) or not (orphan).

So, let's keep that uint32_t value, but use it for bit flags, so we
can also use it to efficiently remember other chunk-related state.
---
 src/borg/archive.py                  |  2 +-
 src/borg/archiver/compact_cmd.py     | 14 +++++++-------
 src/borg/cache.py                    | 11 +++++------
 src/borg/hashindex.pyx               | 17 ++++++++++++-----
 src/borg/repository.py               |  6 +++---
 src/borg/testsuite/hashindex_test.py | 14 +++++++++-----
 6 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 344a465058..24411f15b6 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -1874,7 +1874,7 @@ def add_reference(id_, size, cdata):
             # either we already have this chunk in repo and chunks index or we add it now
             if id_ not in self.chunks:
                 assert cdata is not None
-                self.chunks[id_] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=size)
+                self.chunks[id_] = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=size)
                 if self.repair:
                     self.repository.put(id_, cdata)
 
diff --git a/src/borg/archiver/compact_cmd.py b/src/borg/archiver/compact_cmd.py
index b854b12c3d..20c9fa4800 100644
--- a/src/borg/archiver/compact_cmd.py
+++ b/src/borg/archiver/compact_cmd.py
@@ -50,20 +50,20 @@ def get_repository_chunks(self) -> ChunkIndex:
         """Build a dict id -> size of all chunks present in the repository"""
         chunks = ChunkIndex()
         for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
-            # we add this id to the chunks index, using refcount == 0, because
+            # we add this id to the chunks index (as unused chunk), because
             # we do not know yet whether it is actually referenced from some archives.
             # we "abuse" the size field here. usually there is the plaintext size,
             # but we use it for the size of the stored object here.
-            chunks[id] = ChunkIndexEntry(refcount=0, size=stored_size)
+            chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
         return chunks
 
     def save_chunk_index(self):
         # first clean up:
         for id, entry in self.chunks.iteritems():
             # we already deleted the unused chunks, so everything left must be used:
-            assert entry.refcount == ChunkIndex.MAX_VALUE
+            assert entry.flags & ChunkIndex.F_USED
             # as we put the wrong size in there, we need to clean up the size:
-            self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
+            self.chunks[id] = entry._replace(size=0)
         # now self.chunks is an uptodate ChunkIndex, usable for general borg usage!
         write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True)
         self.chunks = None  # nothing there (cleared!)
@@ -74,8 +74,8 @@ def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:
         def use_it(id, *, wanted=False):
             entry = self.chunks.get(id)
             if entry is not None:
-                # the chunk is in the repo, mark it used by setting refcount to max.
-                self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=entry.size)
+                # the chunk is in the repo, mark it used.
+                self.chunks[id] = entry._replace(flags=entry.flags | ChunkIndex.F_USED)
                 if wanted:
                     # chunk id is from chunks_healthy list: a lost chunk has re-appeared!
                     reappeared_chunks.add(id)
@@ -131,7 +131,7 @@ def report_and_delete(self):
         logger.info("Determining unused objects...")
         unused = set()
         for id, entry in self.chunks.iteritems():
-            if entry.refcount == 0:
+            if not (entry.flags & ChunkIndex.F_USED):
                 unused.add(id)
         logger.info(f"Deleting {len(unused)} unused objects...")
         pi = ProgressIndicatorPercent(
diff --git a/src/borg/cache.py b/src/borg/cache.py
index 0a5c380ded..9e50e69757 100644
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -396,7 +396,7 @@ def compress_entry(self, entry):
         for id, size in entry.chunks:
             cie = self.chunks.get(id)
             assert cie is not None
-            assert cie.refcount > 0
+            assert cie.flags & ChunkIndex.F_USED
             assert size == cie.size
             idx = self.chunks.k_to_idx(id)
             compressed_chunks.append(idx)
@@ -415,7 +415,7 @@ def decompress_entry(self, entry_packed):
             id = self.chunks.idx_to_k(idx)
             cie = self.chunks.get(id)
             assert cie is not None
-            assert cie.refcount > 0
+            assert cie.flags & ChunkIndex.F_USED
             assert cie.size > 0
             chunks.append((id, cie.size))
         entry = entry._replace(chunks=chunks)
@@ -722,10 +722,9 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
     chunks = ChunkIndex()
     t0 = perf_counter()
     num_chunks = 0
-    # The repo says it has these chunks, so we assume they are referenced chunks.
-    # We do not care for refcounting anymore, so we just set refcount = MAX_VALUE.
+    # The repo says it has these chunks, so we assume they are referenced/used chunks.
     # We do not know the plaintext size (!= stored_size), thus we set size = 0.
-    init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
+    init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
     for id, stored_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
         num_chunks += 1
         chunks[id] = init_entry
@@ -809,7 +808,7 @@ def add_chunk(
         )
         self.repository.put(id, cdata, wait=wait)
         self.last_refresh_dt = now  # .put also refreshed the lock
-        self.chunks.add(id, ChunkIndex.MAX_VALUE, size)
+        self.chunks.add(id, 1, size)
         stats.update(size, not exists)
         return ChunkListEntry(id, size)
 
diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx
index c596928b19..0d0418e43f 100644
--- a/src/borg/hashindex.pyx
+++ b/src/borg/hashindex.pyx
@@ -34,14 +34,16 @@ class HTProxyMixin:
         self.ht.clear()
 
 
-ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size')
+ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size')
 
 
 class ChunkIndex(HTProxyMixin, MutableMapping):
     """
     Mapping from key256 to (refcount32, size32) to track chunks in the repository.
     """
-    MAX_VALUE = 2**32 - 1  # borghash has the full uint32_t range
+    # .flags values: 2^0 .. 2^31
+    F_NONE = 0  # all flags cleared
+    F_USED = 1  # chunk is used/referenced
 
     def __init__(self, capacity=1000, path=None, usable=None):
         if path:
@@ -55,9 +57,14 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
         yield from self.ht.items()
 
     def add(self, key, refs, size):
-        v = self.get(key, ChunkIndexEntry(0, 0))
-        refcount = min(self.MAX_VALUE, v.refcount + refs)
-        self[key] = v._replace(refcount=refcount, size=size)
+        assert refs > 0
+        v = self.get(key)
+        if v is None:
+            flags = self.F_USED
+        else:
+            flags = v.flags | self.F_USED
+            assert v.size == 0 or v.size == size
+        self[key] = ChunkIndexEntry(flags=flags, size=size)
 
     @classmethod
     def read(cls, path):
diff --git a/src/borg/repository.py b/src/borg/repository.py
index 6f87f8ccf9..93c5a4f74c 100644
--- a/src/borg/repository.py
+++ b/src/borg/repository.py
@@ -324,10 +324,10 @@ def check_object(obj):
         objs_checked = objs_errors = 0
         chunks = ChunkIndex()
         # we don't do refcounting anymore, neither we can know here whether any archive
-        # is using this object, but we assume that this is the case and set refcount to
-        # MAX_VALUE. As we don't do garbage collection here, this is not a problem.
+        # is using this object, but we assume that this is the case.
+        # As we don't do garbage collection here, this is not a problem.
         # We also don't know the plaintext size, so we set it to 0.
-        init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
+        init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
         infos = self.store.list("data")
         try:
             for info in infos:
diff --git a/src/borg/testsuite/hashindex_test.py b/src/borg/testsuite/hashindex_test.py
index 539242835a..f0c3d9f8a3 100644
--- a/src/borg/testsuite/hashindex_test.py
+++ b/src/borg/testsuite/hashindex_test.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from ..hashindex import ChunkIndex
+from ..hashindex import ChunkIndex, ChunkIndexEntry
 
 
 def H(x):
@@ -19,10 +19,14 @@ def H2(x):
 def test_chunkindex_add():
     chunks = ChunkIndex()
     x = H2(1)
-    chunks.add(x, 5, 6)
-    assert chunks[x] == (5, 6)
+    chunks.add(x, 1, 0)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
+    chunks.add(x, 1, 2)  # updating size (we do not have a size yet)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
     chunks.add(x, 1, 2)
-    assert chunks[x] == (6, 2)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
+    with pytest.raises(AssertionError):
+        chunks.add(x, 1, 3)  # inconsistent size (we already have a different size)
 
 
 def test_keyerror():
@@ -31,4 +35,4 @@ def test_keyerror():
     with pytest.raises(KeyError):
         chunks[x]
     with pytest.raises(struct.error):
-        chunks.add(x, -1, 0)
+        chunks[x] = ChunkIndexEntry(flags=2**33, size=0)

From ba3e701730f3e49847af966a20de34e19c8237d0 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 1 Nov 2024 23:49:52 +0100
Subject: [PATCH 2/4] ChunkIndex.add: remove useless refs parameter

---
 src/borg/cache.py                    | 2 +-
 src/borg/hashindex.pyx               | 3 +--
 src/borg/testsuite/hashindex_test.py | 8 ++++----
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/borg/cache.py b/src/borg/cache.py
index 9e50e69757..27f38825fd 100644
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -808,7 +808,7 @@ def add_chunk(
         )
         self.repository.put(id, cdata, wait=wait)
         self.last_refresh_dt = now  # .put also refreshed the lock
-        self.chunks.add(id, 1, size)
+        self.chunks.add(id, size)
         stats.update(size, not exists)
         return ChunkListEntry(id, size)
 
diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx
index 0d0418e43f..00fe684047 100644
--- a/src/borg/hashindex.pyx
+++ b/src/borg/hashindex.pyx
@@ -56,8 +56,7 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
     def iteritems(self):
         yield from self.ht.items()
 
-    def add(self, key, refs, size):
-        assert refs > 0
+    def add(self, key, size):
         v = self.get(key)
         if v is None:
             flags = self.F_USED
diff --git a/src/borg/testsuite/hashindex_test.py b/src/borg/testsuite/hashindex_test.py
index f0c3d9f8a3..2539d351a1 100644
--- a/src/borg/testsuite/hashindex_test.py
+++ b/src/borg/testsuite/hashindex_test.py
@@ -19,14 +19,14 @@ def H2(x):
 def test_chunkindex_add():
     chunks = ChunkIndex()
     x = H2(1)
-    chunks.add(x, 1, 0)
+    chunks.add(x, 0)
     assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
-    chunks.add(x, 1, 2)  # updating size (we do not have a size yet)
+    chunks.add(x, 2)  # updating size (we do not have a size yet)
     assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
-    chunks.add(x, 1, 2)
+    chunks.add(x, 2)
     assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
     with pytest.raises(AssertionError):
-        chunks.add(x, 1, 3)  # inconsistent size (we already have a different size)
+        chunks.add(x, 3)  # inconsistent size (we already have a different size)
 
 
 def test_keyerror():

From 2ab3c163ce5fb24de063520f86a49b646289afe9 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 1 Nov 2024 23:57:38 +0100
Subject: [PATCH 3/4] invalidate existing cache by incrementing the ChunkIndex
 seed

We changed the semantics (and the name) of the first tuple element:
refcount -> flags

Thus, it is better to invalidate any existing cache.
---
 src/borg/cache.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/borg/cache.py b/src/borg/cache.py
index 27f38825fd..ed5c11c1e2 100644
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -671,7 +671,7 @@ def load_chunks_hash(repository) -> bytes:
     return hash
 
 
-CHUNKINDEX_HASH_SEED = 1
+CHUNKINDEX_HASH_SEED = 2
 
 
 def write_chunkindex_to_repo_cache(repository, chunks, *, clear=False, force_write=False):

From 8a13cf2c4d2d41e79d5cbdf7ca6ede65b89b3ec2 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Sat, 2 Nov 2024 00:06:09 +0100
Subject: [PATCH 4/4] make mypy happy

---
 src/borg/hashindex.pyi | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/borg/hashindex.pyi b/src/borg/hashindex.pyi
index 4291c9691a..6b236afa78 100644
--- a/src/borg/hashindex.pyi
+++ b/src/borg/hashindex.pyi
@@ -5,13 +5,15 @@ API_VERSION: str
 PATH_OR_FILE = Union[str, IO]
 
 class ChunkIndexEntry(NamedTuple):
-    refcount: int
+    flags: int
     size: int
 
 CIE = Union[Tuple[int, int], Type[ChunkIndexEntry]]
 
 class ChunkIndex:
-    def add(self, key: bytes, refs: int, size: int) -> None: ...
+    F_NONE: int
+    F_USED: int
+    def add(self, key: bytes, size: int) -> None: ...
     def iteritems(self, marker: bytes = ...) -> Iterator: ...
     def __contains__(self, key: bytes) -> bool: ...
     def __getitem__(self, key: bytes) -> Type[ChunkIndexEntry]: ...