Rationalize SourmashSignature.name and str(sig) (#1179)

* switch SourmashSignature.name to property
* fix the simple sig tests
* sig.name() to str(sig)
* fix sig.name() / str(sig) issues
* fix __repr__; refactor code a bit.
* fix categorize output
* regularize use of sig.name vs str(sig)
* require that name be set for lca index; fix related tests
* fix CSV output, re-visit str(sig)/sig.name uses
* rename signatures and tests
* trigger Rust CI checks when test data changes
* fix rust test by adjusting expectations for recomputed signature
* fix signature borked by jq
sourmash-bio · Oct 31, 2020 · 1e94bde · 1e94bde
1 parent 8587197
commit 1e94bde
Show file tree

Hide file tree

Showing 37 changed files with 243 additions and 20,172 deletions.
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -6,6 +6,7 @@ on:
   pull_request:
     paths:
       - 'src/core/**'
+      - 'tests/test-data/**'
   schedule:
     - cron: "0 0 * * *" # daily
 

diff --git a/doc/api-example.md b/doc/api-example.md
@@ -472,8 +472,8 @@ Now do a search --
 
 ```
 >>> for similarity, found_sig, filename in tree.search(query_sig, threshold=0.1):
-...    print(query_sig.name())
-...    print(found_sig.name())
+...    print(query_sig)
+...    print(found_sig)
 ...    print(similarity)
 my favorite query
 NC_000913.3 Escherichia coli str. K-12 substr. MG1655, complete genome

diff --git a/sourmash/commands.py b/sourmash/commands.py
@@ -123,7 +123,7 @@ def compare(args):
 
     # do all-by-all calculation
 
-    labeltext = [item.name() for item in siglist]
+    labeltext = [str(item) for item in siglist]
     if args.containment:
         similarity = compare_serial_containment(siglist)
     else:
@@ -133,7 +133,7 @@ def compare(args):
     if len(siglist) < 30:
         for i, E in enumerate(siglist):
             # for small matrices, pretty-print some output
-            name_num = '{}-{}'.format(i, E.name())
+            name_num = '{}-{}'.format(i, str(E))
             if len(name_num) > 20:
                 name_num = name_num[:17] + '...'
             print_results('{:20s}\t{}'.format(name_num, similarity[i, :, ],))
@@ -420,7 +420,7 @@ def search(args):
                                                ksize=args.ksize,
                                                select_moltype=moltype,
                                                select_md5=args.md5)
-    notify('loaded query: {}... (k={}, {})', query.name()[:30],
+    notify('loaded query: {}... (k={}, {})', str(query)[:30],
                                              query.minhash.ksize,
                                              sourmash_args.get_moltype(query))
 
@@ -526,7 +526,7 @@ def categorize(args):
         csv_w = csv.writer(csv_fp)
 
     for queryfile, query, query_moltype, query_ksize in loader:
-        notify('loaded query: {}... (k={}, {})', query.name()[:30],
+        notify('loaded query: {}... (k={}, {})', str(query)[:30],
                query_ksize, query_moltype)
 
         results = []
@@ -544,15 +544,15 @@ def categorize(args):
         if results:
             results.sort(key=lambda x: -x[0])   # reverse sort on similarity
             best_hit_sim, best_hit_query = results[0]
-            notify('for {}, found: {:.2f} {}', query.name(),
+            notify('for {}, found: {:.2f} {}', query,
                                                best_hit_sim,
-                                               best_hit_query.name())
-            best_hit_query_name = best_hit_query.name()
+                                               best_hit_query)
+            best_hit_query_name = best_hit_query.name
         else:
-            notify('for {}, no match found', query.name())
+            notify('for {}, no match found', query)
 
         if csv_w:
-            csv_w.writerow([queryfile, query.name(), best_hit_query_name,
+            csv_w.writerow([queryfile, query, best_hit_query_name,
                            best_hit_sim])
 
     if loader.skipped_ignore:
@@ -575,7 +575,7 @@ def gather(args):
                                                ksize=args.ksize,
                                                select_moltype=moltype,
                                                select_md5=args.md5)
-    notify('loaded query: {}... (k={}, {})', query.name()[:30],
+    notify('loaded query: {}... (k={}, {})', str(query)[:30],
                                              query.minhash.ksize,
                                              sourmash_args.get_moltype(query))
 
@@ -720,9 +720,8 @@ def multigather(args):
         for query in sourmash_args.load_file_as_signatures(queryfile,
                                                        ksize=args.ksize,
                                                        select_moltype=moltype):
-            notify('loaded query: {}... (k={}, {})', query.name()[:30],
-                                            query.minhash.ksize,
-                                            sourmash_args.get_moltype(query))
+            notify('loaded query: {}... (k={}, {})', str(query)[:30],
+                   query.minhash.ksize, sourmash_args.get_moltype(query))
 
             # verify signature was computed right.
             if query.minhash.max_hash == 0:
@@ -910,7 +909,7 @@ def do_search():
     else:
         results.sort(key=lambda x: -x[0])   # take best
         similarity, found_sig = results[0]
-        print_results('FOUND: {}, at {:.3f}', found_sig.name(),
+        print_results('FOUND: {}, at {:.3f}', found_sig,
                similarity)
 
     if args.output:

diff --git a/sourmash/index.py b/sourmash/index.py
@@ -113,7 +113,7 @@ def gather(self, query, *args, **kwargs):
             if cont and cont >= threshold:
                 results.append((cont, ss, self.filename))
 
-        results.sort(reverse=True, key=lambda x: (x[0], x[1].name()))
+        results.sort(reverse=True, key=lambda x: (x[0], x[1].md5sum()))
 
         return results
 

diff --git a/sourmash/lca/command_classify.py b/sourmash/lca/command_classify.py
@@ -128,9 +128,9 @@ def classify(args):
             for query_sig in load_file_as_signatures(query_filename,
                                                      ksize=ksize):
                 notify(u'\r\033[K', end=u'')
-                notify('... classifying {} (file {} of {})', query_sig.name(),
+                notify('... classifying {} (file {} of {})', query_sig,
                        n, total_n, end='\r')
-                debug('classifying', query_sig.name())
+                debug('classifying', query_sig)
                 total_count += 1
 
                 # make sure we're looking at the same scaled value as database
@@ -142,7 +142,7 @@ def classify(args):
                 debug(lineage)
 
                 # output each classification to the spreadsheet
-                row = [query_sig.name(), status]
+                row = [query_sig.name, status]
                 row += lca_utils.zip_lineage(lineage)
 
                 # when outputting to stdout, make output intelligible

diff --git a/sourmash/lca/command_gather.py b/sourmash/lca/command_gather.py
@@ -60,7 +60,7 @@ def gather_signature(query_sig, dblist, ignore_abundance):
     """
     Decompose 'query_sig' using the given list of databases.
     """
-    notify('loaded query: {}... (k={})', query_sig.name()[:30],
+    notify('loaded query: {}... (k={})', str(query_sig)[:30],
                                          query_sig.minhash.ksize)
 
     # extract the basic set of mins
@@ -196,7 +196,7 @@ def gather_main(args):
     # for each query, gather all the matches across databases
     moltype = dblist[0].moltype
     query_sig = sourmash_args.load_query_signature(args.query, ksize, moltype)
-    debug('classifying', query_sig.name())
+    debug('classifying', query_sig)
 
     # make sure we're looking at the same scaled value as database
     query_sig.minhash = query_sig.minhash.downsample(scaled=scaled)

diff --git a/sourmash/lca/command_index.py b/sourmash/lca/command_index.py
@@ -195,19 +195,19 @@ def index(args):
                                      yield_all_files=args.force)
         for sig in it:
             notify(u'\r\033[K', end=u'')
-            notify('\r... loading signature {} ({} of {}); skipped {} so far', sig.name()[:30], n, total_n, n_skipped, end='')
-            debug(filename, sig.name())
+            notify('\r... loading signature {} ({} of {}); skipped {} so far', str(sig)[:30], n, total_n, n_skipped, end='')
+            debug(filename, sig)
 
             # block off duplicates.
             if sig.md5sum() in md5_to_name:
                 debug('WARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum())
                 record_duplicates.add(filename)
                 continue
 
-            md5_to_name[sig.md5sum()] = sig.name()
+            md5_to_name[sig.md5sum()] = str(sig)
 
             # parse identifier, potentially with splitting
-            ident = sig.name()
+            ident = sig.name
             if args.split_identifiers: # hack for NCBI-style names, etc.
                 # split on space...
                 ident = ident.split(' ')[0]
@@ -227,7 +227,7 @@ def index(args):
                 db.insert(sig, ident=ident, lineage=lineage)
             except ValueError as e:
                 error("ERROR: cannot insert signature '{}' (md5 {}, loaded from '{}') into database.",
-                      sig.name(), sig.md5sum()[:8], filename)
+                      sig, sig.md5sum()[:8], filename)
                 error("ERROR: {}", str(e))
                 sys.exit(-1)
 

diff --git a/sourmash/lca/command_summarize.py b/sourmash/lca/command_summarize.py
@@ -74,7 +74,7 @@ def load_singletons_and_count(filenames, ksize, scaled, ignore_abundance):
         for query_sig in sourmash_args.load_file_as_signatures(query_filename,
                                                                ksize=ksize):
             notify(u'\r\033[K', end=u'')
-            notify('... loading {} (file {} of {})', query_sig.name(), n,
+            notify('... loading {} (file {} of {})', query_sig, n,
                    total_n, end='\r')
             total_count += 1
 
@@ -122,7 +122,7 @@ def output_results(lineage_counts, total_counts, filename=None, sig=None):
         p = '{:.1f}%'.format(p)
 
         if filename and sig:
-            print_results('{:5} {:>5}   {}   {}:{} {}'.format(p, count, lineage, filename, sig.md5sum()[:8], sig.name()))
+            print_results('{:5} {:>5}   {}   {}:{} {}'.format(p, count, lineage, filename, sig.md5sum()[:8], sig))
         else:
             print_results('{:5} {:>5}   {}'.format(p, count, lineage))
 
@@ -145,7 +145,7 @@ def output_csv(lineage_counts, csv_fp, filename, sig, write_header=True):
         debug('lineage:', lineage)
         row = [count] + lca_utils.zip_lineage(lineage, truncate_empty=False)
         if filename:
-            row += [filename, sig.name(), sig.md5sum()]
+            row += [filename, sig.name, sig.md5sum()]
         w.writerow(row)
 
 

diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py
@@ -110,7 +110,7 @@ def insert(self, sig, ident=None, lineage=None):
         Takes optional arguments 'ident' and 'lineage'.
 
         'ident' must be a unique string identifer across this database;
-        if not specified, the signature name (sig.name()) is used.
+        if not specified, the signature name (sig.name) is used.
 
         'lineage', if specified, must contain a tuple of LineagePair objects.
         """
@@ -130,7 +130,9 @@ def insert(self, sig, ident=None, lineage=None):
             raise ValueError("cannot downsample signature; is it a scaled signature?")
 
         if ident is None:
-            ident = sig.name()
+            ident = sig.name
+            if not ident:
+                ident = sig.filename
 
         if ident in self.ident_to_name:
             raise ValueError("signature {} is already in this LCA db.".format(ident))
@@ -139,7 +141,7 @@ def insert(self, sig, ident=None, lineage=None):
         self._invalidate_cache()
 
         # store full name
-        self.ident_to_name[ident] = sig.name()
+        self.ident_to_name[ident] = sig.name
 
         # identifier -> integer index (idx)
         idx = self._get_ident_index(ident, fail_on_duplicate=True)

diff --git a/sourmash/search.py b/sourmash/search.py
@@ -50,7 +50,7 @@ def search_databases(query, databases, threshold, do_containment, best_only,
                               match=match,
                               md5=match.md5sum(),
                               filename=filename,
-                              name=match.name()))
+                              name=match.name))
     return x
 
 ###
@@ -89,7 +89,7 @@ def _find_best(dblist, query, threshold_bp):
             assert cont                   # all matches should be nonzero.
 
             # note, break ties based on name, to ensure consistent order.
-            if (cont == best_cont and match.name() < best_match.name()) or \
+            if (cont == best_cont and str(match) < str(best_match)) or \
                cont > best_cont:
                 # update best match.
                 best_cont = cont
@@ -205,7 +205,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
                               std_abund=std_abund,
                               filename=filename,
                               md5=best_match.md5sum(),
-                              name=best_match.name(),
+                              name=best_match.name,
                               match=best_match)
 
         # construct a new query, subtracting hashes found in previous one.

diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
@@ -222,7 +222,7 @@ def describe(args):
                 if mh.track_abundance:
                     with_abundance = 1
                 md5 = sig.md5sum()
-                name = sig.name()
+                name = sig.name or "** no name **"
                 filename = sig.filename
                 license = sig.license
 
@@ -279,8 +279,8 @@ def overlap(args):
     sig1_file = args.signature1
     sig2_file = args.signature2
 
-    name1 = sig1.name()
-    name2 = sig2.name()
+    name1 = sig1.name
+    name2 = sig2.name
 
     md5_1 = sig1.md5sum()
     md5_2 = sig2.md5sum()
@@ -370,7 +370,7 @@ def merge(args):
                 mh.merge(sigobj_mh)
             except:
                 error("ERROR when merging signature '{}' ({}) from file {}",
-                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
+                      sigobj, sigobj.md5sum()[:8], sigfile)
                 raise
 
             this_n += 1
@@ -565,7 +565,7 @@ def extract(args):
         if args.md5 is not None:
             siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
         if args.name is not None:
-            siglist = [ ss for ss in siglist if args.name in ss.name() ]
+            siglist = [ ss for ss in siglist if args.name in str(ss) ]
 
         outlist.extend(siglist)
 
@@ -606,7 +606,7 @@ def filter(args):
         if args.md5 is not None:
             siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
         if args.name is not None:
-            siglist = [ ss for ss in siglist if args.name in ss.name() ]
+            siglist = [ ss for ss in siglist if args.name in str(ss) ]
 
         for ss in siglist:
             mh = ss.minhash
@@ -663,7 +663,7 @@ def flatten(args):
         if args.md5 is not None:
             siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
         if args.name is not None:
-            siglist = [ ss for ss in siglist if args.name in ss.name() ]
+            siglist = [ ss for ss in siglist if args.name in ss.name ]
 
         for ss in siglist:
             ss.minhash = ss.minhash.flatten()
@@ -799,7 +799,7 @@ def export(args):
 
     with FileOutput(args.output, 'wt') as fp:
         print(json.dumps(x), file=fp)
-    notify("exported signature {} ({})", query.name(), query.md5sum()[:8])
+    notify("exported signature {} ({})", query, query.md5sum()[:8])
 
 
 def main(arglist=None):