From 017058869a4fc801e9e92f9f22e6bbea88dcd940 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 23 Feb 2021 16:15:59 -0800 Subject: [PATCH 1/9] fix some issues with identifiers in the LCA code --- src/sourmash/lca/command_index.py | 6 +++++- src/sourmash/lca/lca_db.py | 9 ++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index d4febd4ae1..494e515abc 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -207,7 +207,11 @@ def index(args): md5_to_name[sig.md5sum()] = str(sig) # parse identifier, potentially with splitting - ident = sig.name + if sig.name: + ident = sig.name + else: + ident = sig.filename + if args.split_identifiers: # hack for NCBI-style names, etc. # split on space... ident = ident.split(' ')[0] diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 1fabe47398..195db44067 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -129,10 +129,13 @@ def insert(self, sig, ident=None, lineage=None): except ValueError: raise ValueError("cannot downsample signature; is it a scaled signature?") - if ident is None: - ident = sig.name - if not ident: + if not ident: + if sig.name: + ident = sig.name + elif sig.filename: ident = sig.filename + else: + ident = sig.md5sum()[:8] if ident in self.ident_to_name: raise ValueError("signature {} is already in this LCA db.".format(ident)) From 2fe92f99bef5ace0cca4cdf9f3d6655679f9c8d6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 24 Feb 2021 06:10:25 -0800 Subject: [PATCH 2/9] add test for empty sig names in lca index --- tests/test_lca.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_lca.py b/tests/test_lca.py index e39ad1a394..b20ec16b8f 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -643,6 +643,25 @@ def test_basic_index_column_start(): assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err +@utils.in_tempdir +def test_index_empty_sketch_name(c): + # create two signatures with empty 'name' attributes + cmd = ['sketch', 'dna', utils.get_test_data('genome-s12.fa.gz'), + utils.get_test_data('genome-s11.fa.gz')] + c.run_sourmash(*cmd) + + sig1 = c.output('genome-s11.fa.gz.sig') + assert os.path.exists(sig1) + sig2 = c.output('genome-s12.fa.gz.sig') + assert os.path.exists(sig2) + + # can we insert them both? + taxcsv = utils.get_test_data('lca/delmont-1.csv') + cmd = ['lca', 'index', taxcsv, 'zzz', sig1, sig2] + c.run_sourmash(*cmd) + assert os.path.exists(c.output('zzz.lca.json')) + + def test_basic_index_and_classify_with_tsv_and_gz(): with utils.TempDirectory() as location: taxcsv = utils.get_test_data('lca/delmont-1.tsv') From 74c7b98faf632472d52d7d965b37714b6867072c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 24 Feb 2021 06:13:55 -0800 Subject: [PATCH 3/9] add check on number of distinct idents in test --- src/sourmash/lca/command_index.py | 4 ++-- tests/test_lca.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index 494e515abc..cbdbf1b206 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -183,7 +183,7 @@ def index(args): n = 0 total_n = len(inp_files) record_duplicates = set() - record_no_lineage = set() + record_no_lineage = [] record_remnants = set(assignments) record_used_lineages = set() record_used_idents = set() @@ -246,7 +246,7 @@ def index(args): # track lineage info - either no lineage, or this lineage used. else: debug('WARNING: no lineage assignment for {}.', ident) - record_no_lineage.add(ident) + record_no_lineage.append(ident) # end main add signatures loop diff --git a/tests/test_lca.py b/tests/test_lca.py index b20ec16b8f..f4530061d8 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -661,6 +661,10 @@ def test_index_empty_sketch_name(c): c.run_sourmash(*cmd) assert os.path.exists(c.output('zzz.lca.json')) + print(c.last_result.out) + print(c.last_result.err) + assert 'WARNING: no lineage provided for 2 sig' in c.last_result.err + def test_basic_index_and_classify_with_tsv_and_gz(): with utils.TempDirectory() as location: From e98095e5c576e919f7ad936ef4cceb8f29adca79 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 24 Feb 2021 06:29:32 -0800 Subject: [PATCH 4/9] set stdin filename to be blank, not - --- src/sourmash/command_compute.py | 3 +++ src/sourmash/sig/__main__.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py index 665bb48387..e4dae27ee9 100644 --- a/src/sourmash/command_compute.py +++ b/src/sourmash/command_compute.py @@ -245,9 +245,12 @@ def add_seq(sigs, seq, input_is_protein, check_sequence): def set_sig_name(sigs, filename, name=None): + if filename == '-': # if stdin, set filename to empty. + filename = '' for sig in sigs: if name is not None: sig._name = name + sig.filename = filename diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 72fbaf7141..3c7b327563 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -223,7 +223,7 @@ def describe(args): with_abundance = 1 md5 = sig.md5sum() name = sig.name or "** no name **" - filename = sig.filename + filename = sig.filename or "** no name **" license = sig.license if w: From 2e845c5de252b84f3a153986040b3749c06908ea Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 24 Feb 2021 06:29:48 -0800 Subject: [PATCH 5/9] have identifer default to sig display name --- src/sourmash/lca/lca_db.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 195db44067..1c6ef8f97c 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -130,12 +130,7 @@ def insert(self, sig, ident=None, lineage=None): raise ValueError("cannot downsample signature; is it a scaled signature?") if not ident: - if sig.name: - ident = sig.name - elif sig.filename: - ident = sig.filename - else: - ident = sig.md5sum()[:8] + ident = str(sig) if ident in self.ident_to_name: raise ValueError("signature {} is already in this LCA db.".format(ident)) From 1e82a2e828b958cc6747d079a4e17bff59c3dbcc Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 24 Feb 2021 16:30:23 -0800 Subject: [PATCH 6/9] fixes and tests for sourmash sketch on empty sequence input --- src/sourmash/command_compute.py | 38 +++++++++++++++++++-------------- tests/test_sourmash_sketch.py | 33 ++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py index e4dae27ee9..2c7122be98 100644 --- a/src/sourmash/command_compute.py +++ b/src/sourmash/command_compute.py @@ -160,6 +160,7 @@ def _compute_individual(args, signatures_factory): if args.singleton: siglist = [] + n = None for n, record in enumerate(screed.open(filename)): # make a new signature for each sequence sigs = signatures_factory() @@ -169,8 +170,9 @@ def _compute_individual(args, signatures_factory): set_sig_name(sigs, filename, name=record.name) siglist.extend(sigs) - notify('calculated {} signatures for {} sequences in {}', - len(siglist), n + 1, filename) + if n is not None: + notify('calculated {} signatures for {} sequences in {}', + len(siglist), n + 1, filename) else: # make a single sig for the whole file sigs = signatures_factory() @@ -178,6 +180,7 @@ def _compute_individual(args, signatures_factory): # consume & calculate signatures notify('... reading sequences from {}', filename) name = None + n = None for n, record in enumerate(screed.open(filename)): if n % 10000 == 0: if n: @@ -188,12 +191,13 @@ def _compute_individual(args, signatures_factory): add_seq(sigs, record.sequence, args.input_is_protein, args.check_sequence) - notify('...{} {} sequences', filename, n, end='') + if n is not None: # don't write out signatures if no input + notify('...{} {} sequences', filename, n, end='') - set_sig_name(sigs, filename, name) - siglist.extend(sigs) + set_sig_name(sigs, filename, name) + siglist.extend(sigs) - notify(f'calculated {len(siglist)} signatures for {n+1} sequences in {filename}') + notify(f'calculated {len(siglist)} signatures for {n+1} sequences in {filename}') # if no --output specified, save to individual files w/in for loop if not args.output: @@ -202,7 +206,8 @@ def _compute_individual(args, signatures_factory): # if --output specified, all collected signatures => args.output if args.output: - save_siglist(siglist, args.output) + if siglist: + save_siglist(siglist, args.output) siglist = [] assert not siglist # juuuust checking. @@ -212,28 +217,29 @@ def _compute_merged(args, signatures_factory): # make a signature for the whole file sigs = signatures_factory() - n = 0 total_seq = 0 for filename in args.filenames: # consume & calculate signatures notify('... reading sequences from {}', filename) + n = None for n, record in enumerate(screed.open(filename)): if n % 10000 == 0 and n: notify('\r... {} {}', filename, n, end='') add_seq(sigs, record.sequence, args.input_is_protein, args.check_sequence) - notify('... {} {} sequences', filename, n + 1) - - total_seq += n + 1 + if n is not None: + notify('... {} {} sequences', filename, n + 1) + total_seq += n + 1 - set_sig_name(sigs, filename, name=args.merge) - notify('calculated 1 signature for {} sequences taken from {} files', - total_seq, len(args.filenames)) + if total_seq: + set_sig_name(sigs, filename, name=args.merge) + notify('calculated 1 signature for {} sequences taken from {} files', + total_seq, len(args.filenames)) - # at end, save! - save_siglist(sigs, args.output) + # at end, save! + save_siglist(sigs, args.output) def add_seq(sigs, seq, input_is_protein, check_sequence): diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index ddf9a72d65..80e9a6aac3 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -240,6 +240,39 @@ def test_do_sourmash_sketchdna(): assert str(sig).endswith('short.fa') +@utils.in_tempdir +def test_do_sourmash_sketchdna_noinput(c): + data = "" + + cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig')] + c.run_sourmash(*cmd, stdin_data=data) + + sigfile = c.output('xxx.sig') + assert not os.path.exists(sigfile) + + +@utils.in_tempdir +def test_do_sourmash_sketchdna_noinput_singleton(c): + data = "" + + cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--singleton'] + c.run_sourmash(*cmd, stdin_data=data) + + sigfile = c.output('xxx.sig') + assert not os.path.exists(sigfile) + + +@utils.in_tempdir +def test_do_sourmash_sketchdna_noinput_merge(c): + data = "" + + cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--merge', 'name'] + c.run_sourmash(*cmd, stdin_data=data) + + sigfile = c.output('xxx.sig') + assert not os.path.exists(sigfile) + + @utils.in_tempdir def test_do_sourmash_sketchdna_outdir(c): testdata1 = utils.get_test_data('short.fa') From 90713931dafc4b68fcf648c1528d880f82cdeb74 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 24 Feb 2021 17:27:47 -0800 Subject: [PATCH 7/9] verify db.insert(...) can work with bad signature names --- src/sourmash/lca/lca_db.py | 2 +- tests/test_lca.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 1c6ef8f97c..956cf6841a 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -133,7 +133,7 @@ def insert(self, sig, ident=None, lineage=None): ident = str(sig) if ident in self.ident_to_name: - raise ValueError("signature {} is already in this LCA db.".format(ident)) + raise ValueError("signature '{}' is already in this LCA db.".format(ident)) # before adding, invalide any caching from @cached_property self._invalidate_cache() diff --git a/tests/test_lca.py b/tests/test_lca.py index f4530061d8..40215aefb6 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -69,6 +69,22 @@ def test_api_create_insert_bad_ksize(): lca_db.insert(ss) +def test_api_create_insert_bad_ident(): + # can we insert a ksize=21 signature into a ksize=31 DB? hopefully not. + ss1 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), + ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), + ksize=31) + ss1.name = '' + ss1.filename = '' + ss2.name = '' + ss2.filename = '' + + lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) + lca_db.insert(ss1) + lca_db.insert(ss2) + + def test_api_create_insert_bad_scaled(): # can we insert a scaled=1000 signature into a scaled=500 DB? # hopefully not. From 2635393758c3cd9e80f0891d8e6b4c9f8807ec4c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 24 Feb 2021 17:37:33 -0800 Subject: [PATCH 8/9] add test for sourmash describe with empty filename --- tests/test_cmd_signature.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index b5f0610a79..4152ceab33 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1432,6 +1432,35 @@ def test_sig_describe_stdin(c): assert 'signature: GCA_001593925' in c.last_result.out +@utils.in_tempdir +def test_sig_describe_empty(c): + sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + + ss = sourmash.load_file_as_signatures(sig) + ss = list(ss) + assert len(ss) == 1 + ss = ss[0] + + ss.name = '' + ss.filename = '' + + outsig = c.output('xxx.sig') + with open(outsig, 'wt') as fp: + sourmash.save_signatures([ss], fp) + + ss = sourmash.load_file_as_signatures(outsig) + ss = list(ss) + assert len(ss) == 1 + ss = ss[0] + assert ss.name == '' + assert ss.filename == '' + + c.run_sourmash('sig', 'describe', outsig) + print(c.last_result.out) + assert 'signature: ** no name **' in c.last_result.out + assert 'source file: ** no name **' in c.last_result.out + + @utils.in_tempdir def test_sig_describe_2(c): # get info in CSV spreadsheet From 5e73f0a6bfea2093b7ab51c4104ba6ebfd8bb730 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 25 Feb 2021 16:56:14 -0800 Subject: [PATCH 9/9] fix issues noted by @bluegenes --- src/sourmash/command_compute.py | 6 ++++++ tests/test_lca.py | 4 +++- tests/test_sourmash_sketch.py | 3 +++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py index 2c7122be98..cc52be1b3a 100644 --- a/src/sourmash/command_compute.py +++ b/src/sourmash/command_compute.py @@ -173,6 +173,8 @@ def _compute_individual(args, signatures_factory): if n is not None: notify('calculated {} signatures for {} sequences in {}', len(siglist), n + 1, filename) + else: + notify(f"no sequences found in '{filename}'?!") else: # make a single sig for the whole file sigs = signatures_factory() @@ -198,6 +200,8 @@ def _compute_individual(args, signatures_factory): siglist.extend(sigs) notify(f'calculated {len(siglist)} signatures for {n+1} sequences in {filename}') + else: + notify(f"no sequences found in '{filename}'?!") # if no --output specified, save to individual files w/in for loop if not args.output: @@ -232,6 +236,8 @@ def _compute_merged(args, signatures_factory): if n is not None: notify('... {} {} sequences', filename, n + 1) total_seq += n + 1 + else: + notify(f"no sequences found in '{filename}'?!") if total_seq: set_sig_name(sigs, filename, name=args.merge) diff --git a/tests/test_lca.py b/tests/test_lca.py index 40215aefb6..e9cfb9db37 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -70,7 +70,7 @@ def test_api_create_insert_bad_ksize(): def test_api_create_insert_bad_ident(): - # can we insert a ksize=21 signature into a ksize=31 DB? hopefully not. + # can we insert a signature with no/empty ident? ss1 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), @@ -83,6 +83,8 @@ def test_api_create_insert_bad_ident(): lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss1) lca_db.insert(ss2) + # SUCCESS! + # would fail, previously :) def test_api_create_insert_bad_scaled(): diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index 80e9a6aac3..d93e263181 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -249,6 +249,7 @@ def test_do_sourmash_sketchdna_noinput(c): sigfile = c.output('xxx.sig') assert not os.path.exists(sigfile) + assert 'no sequences found' in c.last_result.err @utils.in_tempdir @@ -260,6 +261,7 @@ def test_do_sourmash_sketchdna_noinput_singleton(c): sigfile = c.output('xxx.sig') assert not os.path.exists(sigfile) + assert 'no sequences found' in c.last_result.err @utils.in_tempdir @@ -271,6 +273,7 @@ def test_do_sourmash_sketchdna_noinput_merge(c): sigfile = c.output('xxx.sig') assert not os.path.exists(sigfile) + assert 'no sequences found' in c.last_result.err @utils.in_tempdir