From 13a79bb2d7da44fd1f5e840077d7f52631af0665 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown"
Date: Tue, 4 May 2021 16:11:44 -0700
Subject: [PATCH] do not overwrite signature even if duplicate md5sum

When a signature is added whose md5sum-derived name is already present at
the save location, store it as "<md5>_<i>.sig.gz" (first unused i, counting
from 0) instead of overwriting the existing entry.

This applies to both directory output and .zip output. For .zip output the
renamed member is kept under "signatures/", next to the original entry,
rather than under a name built from the filesystem path of the zip file.
---
Notes:
  The post-image blob ids on the "index" lines below were recorded before
  the zip member-name fix and the extra test assertion were made.

 src/sourmash/sourmash_args.py | 27 +++++++++++++++++++
 tests/test_sourmash_args.py   | 50 +++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py
index 556271cf01..9cb31e4625 100644
--- a/src/sourmash/sourmash_args.py
+++ b/src/sourmash/sourmash_args.py
@@ -606,7 +606,17 @@ def open(self):
     def add(self, ss):
         super().add(ss)
         md5 = ss.md5sum()
+
+        # don't overwrite even if duplicate md5sum
         outname = os.path.join(self.location, f"{md5}.sig.gz")
+        if os.path.exists(outname):
+            i = 0
+            while 1:
+                outname = os.path.join(self.location, f"{md5}_{i}.sig.gz")
+                if not os.path.exists(outname):
+                    break
+                i += 1
+
         with gzip.open(outname, "wb") as fp:
             sig.save_signatures([ss], fp, compression=1)
 
@@ -663,12 +673,29 @@ def close(self):
     def open(self):
         self.zf = zipfile.ZipFile(self.location, 'w', zipfile.ZIP_STORED)
 
+    def _exists(self, name):
+        try:
+            self.zf.getinfo(name)
+            return True
+        except KeyError:
+            return False
+
     def add(self, ss):
         assert self.zf
         super().add(ss)
 
         md5 = ss.md5sum()
         outname = f"signatures/{md5}.sig.gz"
+
+        # don't overwrite even if duplicate md5sum.
+        if self._exists(outname):
+            i = 0
+            while 1:
+                outname = f"signatures/{md5}_{i}.sig.gz"
+                if not self._exists(outname):
+                    break
+                i += 1
+
         json_str = sourmash.save_signatures([ss], compression=1)
         self.zf.writestr(outname, json_str)
 
diff --git a/tests/test_sourmash_args.py b/tests/test_sourmash_args.py
index 10c0234b02..667d016958 100644
--- a/tests/test_sourmash_args.py
+++ b/tests/test_sourmash_args.py
@@ -135,6 +135,33 @@ def test_save_signatures_to_location_1_zip(runtmp):
     assert len(saved) == 2
 
 
+def test_save_signatures_to_location_1_zip_dup(runtmp):
+    # save to sigfile.zip
+    sig2 = utils.get_test_data('2.fa.sig')
+    ss2 = sourmash.load_one_signature(sig2, ksize=31)
+    sig47 = utils.get_test_data('47.fa.sig')
+    ss47 = sourmash.load_one_signature(sig47, ksize=31)
+
+    outloc = runtmp.output('foo.zip')
+    with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig:
+        print(save_sig)
+        save_sig.add(ss2)
+        save_sig.add(ss47)
+        save_sig.add(ss2)
+        save_sig.add(ss47)
+
+    # can we open as a .zip file?
+    with zipfile.ZipFile(outloc, "r") as zf:
+        assert list(zf.infolist())
+        # the duplicate must be stored next to the original, under signatures/
+        assert f"signatures/{ss2.md5sum()}_0.sig.gz" in zf.namelist()
+
+    saved = list(sourmash.load_file_as_signatures(outloc))
+    assert ss2 in saved
+    assert ss47 in saved
+    assert len(saved) == 4
+
+
 def test_save_signatures_to_location_1_dirout(runtmp):
     # save to sigout/ (directory)
     sig2 = utils.get_test_data('2.fa.sig')
@@ -154,3 +181,26 @@ def test_save_signatures_to_location_1_dirout(runtmp):
     assert ss2 in saved
     assert ss47 in saved
     assert len(saved) == 2
+
+
+def test_save_signatures_to_location_1_dirout_duplicate(runtmp):
+    # save to sigout/ (directory)
+    sig2 = utils.get_test_data('2.fa.sig')
+    ss2 = sourmash.load_one_signature(sig2, ksize=31)
+    sig47 = utils.get_test_data('47.fa.sig')
+    ss47 = sourmash.load_one_signature(sig47, ksize=31)
+
+    outloc = runtmp.output('sigout/')
+    with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig:
+        print(save_sig)
+        save_sig.add(ss2)
+        save_sig.add(ss47)
+        save_sig.add(ss2)
+        save_sig.add(ss47)
+
+    assert os.path.isdir(outloc)
+
+    saved = list(sourmash.load_file_as_signatures(outloc))
+    assert ss2 in saved
+    assert ss47 in saved
+    assert len(saved) == 4