sourmash-bio · ctb · Feb 26, 2021 · Feb 24, 2021 · Feb 24, 2021 · Feb 24, 2021
diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py
@@ -160,6 +160,7 @@ def _compute_individual(args, signatures_factory):
 
         if args.singleton:
             siglist = []
+            n = None
             for n, record in enumerate(screed.open(filename)):
                 # make a new signature for each sequence
                 sigs = signatures_factory()
@@ -169,15 +170,19 @@ def _compute_individual(args, signatures_factory):
                 set_sig_name(sigs, filename, name=record.name)
                 siglist.extend(sigs)
 
-            notify('calculated {} signatures for {} sequences in {}',
-                   len(siglist), n + 1, filename)
+            if n is not None:
+                notify('calculated {} signatures for {} sequences in {}',
+                       len(siglist), n + 1, filename)
+            else:
+                notify(f"no sequences found in '{filename}'?!")
         else:
             # make a single sig for the whole file
             sigs = signatures_factory()
 
             # consume & calculate signatures
             notify('... reading sequences from {}', filename)
             name = None
+            n = None
             for n, record in enumerate(screed.open(filename)):
                 if n % 10000 == 0:
                     if n:
@@ -188,12 +193,15 @@ def _compute_individual(args, signatures_factory):
                 add_seq(sigs, record.sequence,
                         args.input_is_protein, args.check_sequence)
 
-            notify('...{} {} sequences', filename, n, end='')
+            if n is not None:       # don't write out signatures if no input
+                notify('...{} {} sequences', filename, n, end='')
 
-            set_sig_name(sigs, filename, name)
-            siglist.extend(sigs)
+                set_sig_name(sigs, filename, name)
+                siglist.extend(sigs)
 
-            notify(f'calculated {len(siglist)} signatures for {n+1} sequences in {filename}')
+                notify(f'calculated {len(siglist)} signatures for {n+1} sequences in {filename}')
+            else:
+                notify(f"no sequences found in '{filename}'?!")
 
         # if no --output specified, save to individual files w/in for loop
         if not args.output:
@@ -202,7 +210,8 @@ def _compute_individual(args, signatures_factory):
 
     # if --output specified, all collected signatures => args.output
     if args.output:
-        save_siglist(siglist, args.output)
+        if siglist:
+            save_siglist(siglist, args.output)
         siglist = []
 
     assert not siglist                    # juuuust checking.
@@ -212,28 +221,31 @@ def _compute_merged(args, signatures_factory):
     # make a signature for the whole file
     sigs = signatures_factory()
 
-    n = 0
     total_seq = 0
     for filename in args.filenames:
         # consume & calculate signatures
         notify('... reading sequences from {}', filename)
 
+        n = None
         for n, record in enumerate(screed.open(filename)):
             if n % 10000 == 0 and n:
                 notify('\r... {} {}', filename, n, end='')
 
             add_seq(sigs, record.sequence,
                     args.input_is_protein, args.check_sequence)
-        notify('... {} {} sequences', filename, n + 1)
-
-        total_seq += n + 1
+        if n is not None:
+            notify('... {} {} sequences', filename, n + 1)
+            total_seq += n + 1
+        else:
+            notify(f"no sequences found in '{filename}'?!")
 
-    set_sig_name(sigs, filename, name=args.merge)
-    notify('calculated 1 signature for {} sequences taken from {} files',
-           total_seq, len(args.filenames))
+    if total_seq:
+        set_sig_name(sigs, filename, name=args.merge)
+        notify('calculated 1 signature for {} sequences taken from {} files',
+               total_seq, len(args.filenames))
 
-    # at end, save!
-    save_siglist(sigs, args.output)
+        # at end, save!
+        save_siglist(sigs, args.output)
 
 
 def add_seq(sigs, seq, input_is_protein, check_sequence):
@@ -245,9 +257,12 @@ def add_seq(sigs, seq, input_is_protein, check_sequence):
 
 
 def set_sig_name(sigs, filename, name=None):
+    if filename == '-':         # if stdin, set filename to empty.
+        filename = ''
     for sig in sigs:
         if name is not None:
             sig._name = name
+
         sig.filename = filename
 
 

diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py
@@ -183,7 +183,7 @@ def index(args):
     n = 0
     total_n = len(inp_files)
     record_duplicates = set()
-    record_no_lineage = set()
+    record_no_lineage = []
     record_remnants = set(assignments)
     record_used_lineages = set()
     record_used_idents = set()
@@ -207,7 +207,11 @@ def index(args):
             md5_to_name[sig.md5sum()] = str(sig)
 
             # parse identifier, potentially with splitting
-            ident = sig.name
+            if sig.name:
+                ident = sig.name
+            else:
+                ident = sig.filename
+
             if args.split_identifiers: # hack for NCBI-style names, etc.
                 # split on space...
                 ident = ident.split(' ')[0]
@@ -242,7 +246,7 @@ def index(args):
             # track lineage info - either no lineage, or this lineage used.
             else:
                 debug('WARNING: no lineage assignment for {}.', ident)
-                record_no_lineage.add(ident)
+                record_no_lineage.append(ident)
 
     # end main add signatures loop
 

diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py
@@ -129,13 +129,11 @@ def insert(self, sig, ident=None, lineage=None):
         except ValueError:
             raise ValueError("cannot downsample signature; is it a scaled signature?")
 
-        if ident is None:
-            ident = sig.name
-            if not ident:
-                ident = sig.filename
+        if not ident:
+            ident = str(sig)
 
         if ident in self.ident_to_name:
-            raise ValueError("signature {} is already in this LCA db.".format(ident))
+            raise ValueError("signature '{}' is already in this LCA db.".format(ident))
 
         # before adding, invalide any caching from @cached_property
         self._invalidate_cache()

diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py
@@ -223,7 +223,7 @@ def describe(args):
                     with_abundance = 1
                 md5 = sig.md5sum()
                 name = sig.name or "** no name **"
-                filename = sig.filename
+                filename = sig.filename or "** no name **"
                 license = sig.license
 
                 if w:

diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
@@ -1432,6 +1432,35 @@ def test_sig_describe_stdin(c):
     assert 'signature: GCA_001593925' in c.last_result.out
 
 
+@utils.in_tempdir
+def test_sig_describe_empty(c):
+    sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
+
+    ss = sourmash.load_file_as_signatures(sig)
+    ss = list(ss)
+    assert len(ss) == 1
+    ss = ss[0]
+
+    ss.name = ''
+    ss.filename = ''
+
+    outsig = c.output('xxx.sig')
+    with open(outsig, 'wt') as fp:
+        sourmash.save_signatures([ss], fp)
+
+    ss = sourmash.load_file_as_signatures(outsig)
+    ss = list(ss)
+    assert len(ss) == 1
+    ss = ss[0]
+    assert ss.name == ''
+    assert ss.filename == ''
+
+    c.run_sourmash('sig', 'describe', outsig)
+    print(c.last_result.out)
+    assert 'signature: ** no name **' in c.last_result.out
+    assert 'source file: ** no name **' in c.last_result.out
+
+
 @utils.in_tempdir
 def test_sig_describe_2(c):
     # get info in CSV spreadsheet

diff --git a/tests/test_lca.py b/tests/test_lca.py
@@ -69,6 +69,24 @@ def test_api_create_insert_bad_ksize():
         lca_db.insert(ss)
 
 
+def test_api_create_insert_bad_ident():
+    # can we insert a signature with no/empty ident?
+    ss1 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'),
+                                      ksize=31)
+    ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'),
+                                      ksize=31)
+    ss1.name = ''
+    ss1.filename = ''
+    ss2.name = ''
+    ss2.filename = ''
+
+    lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000)
+    lca_db.insert(ss1)
+    lca_db.insert(ss2)
+    # SUCCESS!
+    # would fail, previously :)
+
+
 def test_api_create_insert_bad_scaled():
     # can we insert a scaled=1000 signature into a scaled=500 DB?
     # hopefully not.
@@ -643,6 +661,29 @@ def test_basic_index_column_start():
         assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err
 
 
+@utils.in_tempdir
+def test_index_empty_sketch_name(c):
+    # create two signatures with empty 'name' attributes
+    cmd = ['sketch', 'dna', utils.get_test_data('genome-s12.fa.gz'),
+           utils.get_test_data('genome-s11.fa.gz')]
+    c.run_sourmash(*cmd)
+
+    sig1 = c.output('genome-s11.fa.gz.sig')
+    assert os.path.exists(sig1)
+    sig2 = c.output('genome-s12.fa.gz.sig')
+    assert os.path.exists(sig2)
+
+    # can we insert them both?
+    taxcsv = utils.get_test_data('lca/delmont-1.csv')
+    cmd = ['lca', 'index', taxcsv, 'zzz', sig1, sig2]
+    c.run_sourmash(*cmd)
+    assert os.path.exists(c.output('zzz.lca.json'))
+
+    print(c.last_result.out)
+    print(c.last_result.err)
+    assert 'WARNING: no lineage provided for 2 sig' in c.last_result.err
+
+
 def test_basic_index_and_classify_with_tsv_and_gz():
     with utils.TempDirectory() as location:
         taxcsv = utils.get_test_data('lca/delmont-1.tsv')

diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py
@@ -240,6 +240,42 @@ def test_do_sourmash_sketchdna():
         assert str(sig).endswith('short.fa')
 
 
+@utils.in_tempdir
+def test_do_sourmash_sketchdna_noinput(c):
+    data = ""
+
+    cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig')]
+    c.run_sourmash(*cmd, stdin_data=data)
+
+    sigfile = c.output('xxx.sig')
+    assert not os.path.exists(sigfile)
+    assert 'no sequences found' in c.last_result.err
+
+
+@utils.in_tempdir
+def test_do_sourmash_sketchdna_noinput_singleton(c):
+    data = ""
+
+    cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--singleton']
+    c.run_sourmash(*cmd, stdin_data=data)
+
+    sigfile = c.output('xxx.sig')
+    assert not os.path.exists(sigfile)
+    assert 'no sequences found' in c.last_result.err
+
+
+@utils.in_tempdir
+def test_do_sourmash_sketchdna_noinput_merge(c):
+    data = ""
+
+    cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--merge', 'name']
+    c.run_sourmash(*cmd, stdin_data=data)
+
+    sigfile = c.output('xxx.sig')
+    assert not os.path.exists(sigfile)
+    assert 'no sequences found' in c.last_result.err
+
+
 @utils.in_tempdir
 def test_do_sourmash_sketchdna_outdir(c):
     testdata1 = utils.get_test_data('short.fa')