Skip to content

Commit

Permalink
Implement chunk comparison and selective extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
alighazi288 committed Jan 12, 2025
1 parent b9498ca commit d20c269
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 0 deletions.
72 changes: 72 additions & 0 deletions src/borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,66 @@ def extract_helper(self, item, path, hlm, *, dry_run=False):
# In this case, we *want* to extract twice, because there is no other way.
pass

def compare_and_extract_chunks(self, item, fs_path):
    """Make the file at *fs_path* match *item*, fetching only the chunks that differ.

    For every chunk listed in ``item.chunks`` the corresponding byte range of the
    existing file is read and hashed with ``self.key.id_hash``. The chunk is fetched
    through ``self.pipeline`` and written in place only if the range is short/missing
    or its hash differs from the chunk id. Finally the file is truncated to
    ``item.size`` so stale trailing data from a longer pre-existing file is dropped.

    :param item: archive item; ``item.chunks`` entries must expose ``id`` and ``size``,
                 and ``item.size`` is the final file length
    :param fs_path: target path, either relative to ``self.cwd`` or already prefixed with it
    :raises OSError: if the target (or its parent directory) cannot be created, read or written
    """
    # Strip the extraction root only when it really is a leading prefix. A plain
    # str.replace() would also remove a matching segment from the middle of the path.
    cwd_prefix = self.cwd + os.sep
    rel_path = fs_path[len(cwd_prefix):] if fs_path.startswith(cwd_prefix) else fs_path
    fs_path = os.path.join(self.cwd, os.path.normpath(rel_path))

    parent_dir = os.path.dirname(fs_path)
    if parent_dir:  # os.makedirs("") raises, so skip it when there is no directory part
        os.makedirs(parent_dir, exist_ok=True)

    def fetch_chunk(chunk_id):
        # fetch_many yields the chunk's data pieces; join them into one bytes object
        return b"".join(self.pipeline.fetch_many([chunk_id], ro_type=ROBJ_FILE_STREAM))

    # "rb+" keeps existing content so it can be compared against the archive.
    # "wb+" creates an empty file: every read then comes back short, so every chunk is fetched.
    mode = "rb+" if os.path.isfile(fs_path) else "wb+"
    with open(fs_path, mode) as fs_file:
        chunk_offset = 0
        for chunk_entry in item.chunks:
            fs_file.seek(chunk_offset)
            existing = fs_file.read(chunk_entry.size)
            unchanged = len(existing) == chunk_entry.size and self.key.id_hash(existing) == chunk_entry.id
            if not unchanged:
                fs_file.seek(chunk_offset)  # the read above moved the file position
                fs_file.write(fetch_chunk(chunk_entry.id))
            chunk_offset += chunk_entry.size
        fs_file.truncate(item.size)

Check warning on line 780 in src/borg/archive.py

View check run for this annotation

Codecov / codecov/patch

src/borg/archive.py#L775-L780

Added lines #L775 - L780 were not covered by tests

def extract_item(
self,
item,
Expand All @@ -730,6 +790,7 @@ def extract_item(
hlm=None,
pi=None,
continue_extraction=False,
check_existing=False,
):
"""
Extract archive item.
Expand All @@ -742,6 +803,7 @@ def extract_item(
:param hlm: maps hlid to link_target for extracting subtrees with hardlinks correctly
:param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes)
:param continue_extraction: continue a previously interrupted extraction of same archive
:param check_existing: check against existing file/block device and only retrieve changed data
"""

def same_item(item, st):
Expand All @@ -762,6 +824,16 @@ def same_item(item, st):
# if a previous extraction was interrupted between setting the mtime and setting non-default flags.
return True

if check_existing:
dest = os.path.normpath(self.cwd)
fs_path = os.path.join(dest, item.path)

Check warning on line 829 in src/borg/archive.py

View check run for this annotation

Codecov / codecov/patch

src/borg/archive.py#L828-L829

Added lines #L828 - L829 were not covered by tests

if not os.path.normpath(fs_path).startswith(dest):
raise Exception(f"Path {fs_path} is outside of extraction directory {dest}")

Check warning on line 832 in src/borg/archive.py

View check run for this annotation

Codecov / codecov/patch

src/borg/archive.py#L832

Added line #L832 was not covered by tests

self.compare_and_extract_chunks(item, fs_path)
return

Check warning on line 835 in src/borg/archive.py

View check run for this annotation

Codecov / codecov/patch

src/borg/archive.py#L834-L835

Added lines #L834 - L835 were not covered by tests

has_damaged_chunks = "chunks_healthy" in item
if dry_run or stdout:
with self.extract_helper(item, "", hlm, dry_run=dry_run or stdout) as hardlink_set:
Expand Down
8 changes: 8 additions & 0 deletions src/borg/archiver/extract_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def do_extract(self, args, repository, manifest, archive):
sparse = args.sparse
strip_components = args.strip_components
continue_extraction = args.continue_extraction
check_existing = args.check_existing
dirs = []
hlm = HardLinkManager(id_type=bytes, info_type=str) # hlid -> path

Expand Down Expand Up @@ -96,6 +97,7 @@ def do_extract(self, args, repository, manifest, archive):
hlm=hlm,
pi=pi,
continue_extraction=continue_extraction,
check_existing=check_existing,
)
except BackupError as e:
self.print_warning_instance(BackupWarning(remove_surrogates(orig_path), e))
Expand Down Expand Up @@ -192,6 +194,12 @@ def build_parser_extract(self, subparsers, common_parser, mid_common_parser):
action="store_true",
help="continue a previously interrupted extraction of same archive",
)
subparser.add_argument(
"--check-existing",
dest="check_existing",
action="store_true",
help="check against existing file/block device and only retrieve changed data",
)
subparser.add_argument("name", metavar="NAME", type=archivename_validator, help="specify the archive name")
subparser.add_argument(
"paths", metavar="PATH", nargs="*", type=PathSpec, help="paths to extract; patterns are supported"
Expand Down

0 comments on commit d20c269

Please sign in to comment.