diff --git a/src/borg/testsuite/chunker_pytest.py b/src/borg/testsuite/chunker_pytest.py
index 59c7a45151..e83c8ef3ed 100644
--- a/src/borg/testsuite/chunker_pytest.py
+++ b/src/borg/testsuite/chunker_pytest.py
@@ -5,7 +5,7 @@
 import pytest
 
 from .chunker import cf
-from ..chunker import ChunkerFixed, sparsemap, has_seek_hole
+from ..chunker import Chunker, ChunkerFixed, sparsemap, has_seek_hole
 from ..constants import *  # NOQA
 
 BS = 4096  # fs block size
@@ -136,3 +136,27 @@ def get_chunks(fname, sparse, header_size):
     fn = str(tmpdir / fname)
     make_sparsefile(fn, sparse_map, header_size=header_size)
     get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)
+
+
+def test_buzhash_chunksize_distribution():
+    data = os.urandom(1048576)  # 1 MiB of random data
+    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
+    chunker = Chunker(0, min_exp, max_exp, mask, 4095)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = sum(int(size == 2 ** min_exp) for size in chunk_sizes)
+    max_count = sum(int(size == 2 ** max_exp) for size in chunk_sizes)
+    print(f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+          f"min count: {min_count} max count: {max_count}")
+    # usually there will be about 64 chunks
+    assert 32 < chunks_count < 128
+    # chunks must always be between min and max size (clipping must work):
+    assert min_chunksize_observed >= 2 ** min_exp
+    assert max_chunksize_observed <= 2 ** max_exp
+    # most chunks should be cut by buzhash triggering, not by clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10
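
For reviewers, a back-of-envelope sketch of where the `32 < chunks_count < 128` bound in the new test comes from. This is plain arithmetic mirroring the test parameters above, not part of the patch; the variable names are just for illustration.

```python
# Rough sketch (assumption: buzhash cuts chunks at an average size of 2**mask bytes).
data_size = 1048576      # the 1 MiB of random input used by the test
mask = 14                # hash mask bits -> ~16 kiB average chunk size
expected_chunks = data_size / 2 ** mask
print(expected_chunks)   # 64.0 -> the test allows a generous (32, 128) window
```

To see the printed size distribution when the test passes, disable pytest's output capturing, e.g. with `pytest -s -k buzhash_chunksize_distribution`.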