Skip to content

Commit

Permalink
Rationalize SourmashSignature.name and str(sig) (#1179)
Browse files Browse the repository at this point in the history
* switch SourmashSignature.name to property
* fix the simple sig tests
* sig.name() to str(sig)
* fix sig.name() / str(sig) issues
* fix __repr__; refactor code a bit.
* fix categorize output
* regularize use of sig.name vs str(sig)
* require that name be set for lca index; fix related tests
* fix CSV output, re-visit str(sig)/sig.name uses
* rename signatures and tests
* trigger Rust CI checks when test data changes
* fix rust test by adjusting expectations for recomputed signature
* fix signature borked by jq
  • Loading branch information
ctb authored Oct 31, 2020
1 parent 8587197 commit 1e94bde
Show file tree
Hide file tree
Showing 37 changed files with 243 additions and 20,172 deletions.
1 change: 1 addition & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ on:
pull_request:
paths:
- 'src/core/**'
- 'tests/test-data/**'
schedule:
- cron: "0 0 * * *" # daily

Expand Down
4 changes: 2 additions & 2 deletions doc/api-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -472,8 +472,8 @@ Now do a search --

```
>>> for similarity, found_sig, filename in tree.search(query_sig, threshold=0.1):
... print(query_sig.name())
... print(found_sig.name())
... print(query_sig)
... print(found_sig)
... print(similarity)
my favorite query
NC_000913.3 Escherichia coli str. K-12 substr. MG1655, complete genome
Expand Down
27 changes: 13 additions & 14 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def compare(args):

# do all-by-all calculation

labeltext = [item.name() for item in siglist]
labeltext = [str(item) for item in siglist]
if args.containment:
similarity = compare_serial_containment(siglist)
else:
Expand All @@ -133,7 +133,7 @@ def compare(args):
if len(siglist) < 30:
for i, E in enumerate(siglist):
# for small matrices, pretty-print some output
name_num = '{}-{}'.format(i, E.name())
name_num = '{}-{}'.format(i, str(E))
if len(name_num) > 20:
name_num = name_num[:17] + '...'
print_results('{:20s}\t{}'.format(name_num, similarity[i, :, ],))
Expand Down Expand Up @@ -420,7 +420,7 @@ def search(args):
ksize=args.ksize,
select_moltype=moltype,
select_md5=args.md5)
notify('loaded query: {}... (k={}, {})', query.name()[:30],
notify('loaded query: {}... (k={}, {})', str(query)[:30],
query.minhash.ksize,
sourmash_args.get_moltype(query))

Expand Down Expand Up @@ -526,7 +526,7 @@ def categorize(args):
csv_w = csv.writer(csv_fp)

for queryfile, query, query_moltype, query_ksize in loader:
notify('loaded query: {}... (k={}, {})', query.name()[:30],
notify('loaded query: {}... (k={}, {})', str(query)[:30],
query_ksize, query_moltype)

results = []
Expand All @@ -544,15 +544,15 @@ def categorize(args):
if results:
results.sort(key=lambda x: -x[0]) # reverse sort on similarity
best_hit_sim, best_hit_query = results[0]
notify('for {}, found: {:.2f} {}', query.name(),
notify('for {}, found: {:.2f} {}', query,
best_hit_sim,
best_hit_query.name())
best_hit_query_name = best_hit_query.name()
best_hit_query)
best_hit_query_name = best_hit_query.name
else:
notify('for {}, no match found', query.name())
notify('for {}, no match found', query)

if csv_w:
csv_w.writerow([queryfile, query.name(), best_hit_query_name,
csv_w.writerow([queryfile, query, best_hit_query_name,
best_hit_sim])

if loader.skipped_ignore:
Expand All @@ -575,7 +575,7 @@ def gather(args):
ksize=args.ksize,
select_moltype=moltype,
select_md5=args.md5)
notify('loaded query: {}... (k={}, {})', query.name()[:30],
notify('loaded query: {}... (k={}, {})', str(query)[:30],
query.minhash.ksize,
sourmash_args.get_moltype(query))

Expand Down Expand Up @@ -720,9 +720,8 @@ def multigather(args):
for query in sourmash_args.load_file_as_signatures(queryfile,
ksize=args.ksize,
select_moltype=moltype):
notify('loaded query: {}... (k={}, {})', query.name()[:30],
query.minhash.ksize,
sourmash_args.get_moltype(query))
notify('loaded query: {}... (k={}, {})', str(query)[:30],
query.minhash.ksize, sourmash_args.get_moltype(query))

# verify signature was computed right.
if query.minhash.max_hash == 0:
Expand Down Expand Up @@ -910,7 +909,7 @@ def do_search():
else:
results.sort(key=lambda x: -x[0]) # take best
similarity, found_sig = results[0]
print_results('FOUND: {}, at {:.3f}', found_sig.name(),
print_results('FOUND: {}, at {:.3f}', found_sig,
similarity)

if args.output:
Expand Down
2 changes: 1 addition & 1 deletion sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def gather(self, query, *args, **kwargs):
if cont and cont >= threshold:
results.append((cont, ss, self.filename))

results.sort(reverse=True, key=lambda x: (x[0], x[1].name()))
results.sort(reverse=True, key=lambda x: (x[0], x[1].md5sum()))

return results

Expand Down
6 changes: 3 additions & 3 deletions sourmash/lca/command_classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,9 @@ def classify(args):
for query_sig in load_file_as_signatures(query_filename,
ksize=ksize):
notify(u'\r\033[K', end=u'')
notify('... classifying {} (file {} of {})', query_sig.name(),
notify('... classifying {} (file {} of {})', query_sig,
n, total_n, end='\r')
debug('classifying', query_sig.name())
debug('classifying', query_sig)
total_count += 1

# make sure we're looking at the same scaled value as database
Expand All @@ -142,7 +142,7 @@ def classify(args):
debug(lineage)

# output each classification to the spreadsheet
row = [query_sig.name(), status]
row = [query_sig.name, status]
row += lca_utils.zip_lineage(lineage)

# when outputting to stdout, make output intelligible
Expand Down
4 changes: 2 additions & 2 deletions sourmash/lca/command_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def gather_signature(query_sig, dblist, ignore_abundance):
"""
Decompose 'query_sig' using the given list of databases.
"""
notify('loaded query: {}... (k={})', query_sig.name()[:30],
notify('loaded query: {}... (k={})', str(query_sig)[:30],
query_sig.minhash.ksize)

# extract the basic set of mins
Expand Down Expand Up @@ -196,7 +196,7 @@ def gather_main(args):
# for each query, gather all the matches across databases
moltype = dblist[0].moltype
query_sig = sourmash_args.load_query_signature(args.query, ksize, moltype)
debug('classifying', query_sig.name())
debug('classifying', query_sig)

# make sure we're looking at the same scaled value as database
query_sig.minhash = query_sig.minhash.downsample(scaled=scaled)
Expand Down
10 changes: 5 additions & 5 deletions sourmash/lca/command_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,19 +195,19 @@ def index(args):
yield_all_files=args.force)
for sig in it:
notify(u'\r\033[K', end=u'')
notify('\r... loading signature {} ({} of {}); skipped {} so far', sig.name()[:30], n, total_n, n_skipped, end='')
debug(filename, sig.name())
notify('\r... loading signature {} ({} of {}); skipped {} so far', str(sig)[:30], n, total_n, n_skipped, end='')
debug(filename, sig)

# block off duplicates.
if sig.md5sum() in md5_to_name:
debug('WARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum())
record_duplicates.add(filename)
continue

md5_to_name[sig.md5sum()] = sig.name()
md5_to_name[sig.md5sum()] = str(sig)

# parse identifier, potentially with splitting
ident = sig.name()
ident = sig.name
if args.split_identifiers: # hack for NCBI-style names, etc.
# split on space...
ident = ident.split(' ')[0]
Expand All @@ -227,7 +227,7 @@ def index(args):
db.insert(sig, ident=ident, lineage=lineage)
except ValueError as e:
error("ERROR: cannot insert signature '{}' (md5 {}, loaded from '{}') into database.",
sig.name(), sig.md5sum()[:8], filename)
sig, sig.md5sum()[:8], filename)
error("ERROR: {}", str(e))
sys.exit(-1)

Expand Down
6 changes: 3 additions & 3 deletions sourmash/lca/command_summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def load_singletons_and_count(filenames, ksize, scaled, ignore_abundance):
for query_sig in sourmash_args.load_file_as_signatures(query_filename,
ksize=ksize):
notify(u'\r\033[K', end=u'')
notify('... loading {} (file {} of {})', query_sig.name(), n,
notify('... loading {} (file {} of {})', query_sig, n,
total_n, end='\r')
total_count += 1

Expand Down Expand Up @@ -122,7 +122,7 @@ def output_results(lineage_counts, total_counts, filename=None, sig=None):
p = '{:.1f}%'.format(p)

if filename and sig:
print_results('{:5} {:>5} {} {}:{} {}'.format(p, count, lineage, filename, sig.md5sum()[:8], sig.name()))
print_results('{:5} {:>5} {} {}:{} {}'.format(p, count, lineage, filename, sig.md5sum()[:8], sig))
else:
print_results('{:5} {:>5} {}'.format(p, count, lineage))

Expand All @@ -145,7 +145,7 @@ def output_csv(lineage_counts, csv_fp, filename, sig, write_header=True):
debug('lineage:', lineage)
row = [count] + lca_utils.zip_lineage(lineage, truncate_empty=False)
if filename:
row += [filename, sig.name(), sig.md5sum()]
row += [filename, sig.name, sig.md5sum()]
w.writerow(row)


Expand Down
8 changes: 5 additions & 3 deletions sourmash/lca/lca_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def insert(self, sig, ident=None, lineage=None):
Takes optional arguments 'ident' and 'lineage'.
'ident' must be a unique string identifier across this database;
if not specified, the signature name (sig.name()) is used.
if not specified, the signature name (sig.name) is used.
'lineage', if specified, must contain a tuple of LineagePair objects.
"""
Expand All @@ -130,7 +130,9 @@ def insert(self, sig, ident=None, lineage=None):
raise ValueError("cannot downsample signature; is it a scaled signature?")

if ident is None:
ident = sig.name()
ident = sig.name
if not ident:
ident = sig.filename

if ident in self.ident_to_name:
raise ValueError("signature {} is already in this LCA db.".format(ident))
Expand All @@ -139,7 +141,7 @@ def insert(self, sig, ident=None, lineage=None):
self._invalidate_cache()

# store full name
self.ident_to_name[ident] = sig.name()
self.ident_to_name[ident] = sig.name

# identifier -> integer index (idx)
idx = self._get_ident_index(ident, fail_on_duplicate=True)
Expand Down
6 changes: 3 additions & 3 deletions sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def search_databases(query, databases, threshold, do_containment, best_only,
match=match,
md5=match.md5sum(),
filename=filename,
name=match.name()))
name=match.name))
return x

###
Expand Down Expand Up @@ -89,7 +89,7 @@ def _find_best(dblist, query, threshold_bp):
assert cont # all matches should be nonzero.

# note, break ties based on name, to ensure consistent order.
if (cont == best_cont and match.name() < best_match.name()) or \
if (cont == best_cont and str(match) < str(best_match)) or \
cont > best_cont:
# update best match.
best_cont = cont
Expand Down Expand Up @@ -205,7 +205,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
std_abund=std_abund,
filename=filename,
md5=best_match.md5sum(),
name=best_match.name(),
name=best_match.name,
match=best_match)

# construct a new query, subtracting hashes found in previous one.
Expand Down
16 changes: 8 additions & 8 deletions sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def describe(args):
if mh.track_abundance:
with_abundance = 1
md5 = sig.md5sum()
name = sig.name()
name = sig.name or "** no name **"
filename = sig.filename
license = sig.license

Expand Down Expand Up @@ -279,8 +279,8 @@ def overlap(args):
sig1_file = args.signature1
sig2_file = args.signature2

name1 = sig1.name()
name2 = sig2.name()
name1 = sig1.name
name2 = sig2.name

md5_1 = sig1.md5sum()
md5_2 = sig2.md5sum()
Expand Down Expand Up @@ -370,7 +370,7 @@ def merge(args):
mh.merge(sigobj_mh)
except:
error("ERROR when merging signature '{}' ({}) from file {}",
sigobj.name(), sigobj.md5sum()[:8], sigfile)
sigobj, sigobj.md5sum()[:8], sigfile)
raise

this_n += 1
Expand Down Expand Up @@ -565,7 +565,7 @@ def extract(args):
if args.md5 is not None:
siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
if args.name is not None:
siglist = [ ss for ss in siglist if args.name in ss.name() ]
siglist = [ ss for ss in siglist if args.name in str(ss) ]

outlist.extend(siglist)

Expand Down Expand Up @@ -606,7 +606,7 @@ def filter(args):
if args.md5 is not None:
siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
if args.name is not None:
siglist = [ ss for ss in siglist if args.name in ss.name() ]
siglist = [ ss for ss in siglist if args.name in str(ss) ]

for ss in siglist:
mh = ss.minhash
Expand Down Expand Up @@ -663,7 +663,7 @@ def flatten(args):
if args.md5 is not None:
siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
if args.name is not None:
siglist = [ ss for ss in siglist if args.name in ss.name() ]
siglist = [ ss for ss in siglist if args.name in ss.name ]

for ss in siglist:
ss.minhash = ss.minhash.flatten()
Expand Down Expand Up @@ -799,7 +799,7 @@ def export(args):

with FileOutput(args.output, 'wt') as fp:
print(json.dumps(x), file=fp)
notify("exported signature {} ({})", query.name(), query.md5sum()[:8])
notify("exported signature {} ({})", query, query.md5sum()[:8])


def main(arglist=None):
Expand Down
Loading

0 comments on commit 1e94bde

Please sign in to comment.