diff --git a/sourmash_lib/lca/command_gather.py b/sourmash_lib/lca/command_gather.py index d719e80d89..8235cf05e5 100644 --- a/sourmash_lib/lca/command_gather.py +++ b/sourmash_lib/lca/command_gather.py @@ -6,8 +6,6 @@ CTB TODO: * sort out f_uniq / f_orig -* add in abundance weighting -* CSV output with full taxonomy """ from __future__ import print_function, division import sys @@ -23,7 +21,7 @@ from sourmash_lib.search import format_bp LCAGatherResult = namedtuple('LCAGatherResult', - 'intersect_bp, f_unique_to_query, f_unique_weighted, average_abund, lineage') + 'intersect_bp, f_unique_to_query, f_unique_weighted, average_abund, lineage, f_match') def format_lineage(lineage_tup): @@ -36,9 +34,12 @@ def format_lineage(lineage_tup): if 'genus' in present: genus = d['genus'] - if 'species' in present: + if 'strain' in present: + name = d['strain'] + elif 'species' in present: species = d['species'] - if species.startswith(genus + ' '): + if species.startswith(genus + ' ') or \ + species.startswith(genus + '_'): name = species else: name = '{} {}'.format(genus, species) @@ -79,34 +80,58 @@ def gather_signature(query_sig, dblist, ignore_abundance): orig_abunds = { k: 1 for k in query_mins } sum_abunds = sum(orig_abunds.values()) + # first time through, record FOO. + md5_to_lineage = {} + + # collect all mentioned lineage_ids -> md5s + x = set() + for hashval in query_mins: + for lca_db in dblist: + lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, []) + for lid in lineage_ids: + md5 = lca_db.lineage_id_to_signature[lid] + x.add((lca_db, lid, md5)) + + for lca_db, lid, md5 in x: + md5_to_lineage[md5] = lca_db.lineage_dict[lid] + + # now! do the gather: while 1: # find all of the assignments for the current set of hashes - assignments = lca_utils.gather_assignments(query_mins, dblist) - + assignments = defaultdict(set) + for hashval in query_mins: + for lca_db in dblist: + lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, []) + for lid in lineage_ids: + md5 = lca_db.lineage_id_to_signature[lid] + signature_size = lca_db.lineage_id_counts[lid] + assignments[hashval].add((md5, signature_size)) + # none? quit. if not assignments: break - # count 'em all. note, this will combine identical lineages from - # different source genomes; the only that is counted is the lineage - # assignment. + # count the distinct signatures. counts = Counter() - for hashval, assignment_set in assignments.items(): - for assignment in assignment_set: - counts[assignment] += 1 + for hashval, md5_set in assignments.items(): + for (md5, sigsize) in md5_set: + counts[(md5, sigsize)] += 1 # find the most abundant assignment - top_assignment, top_count = next(iter(counts.most_common())) + (top_md5, top_sigsize), top_count = next(iter(counts.most_common())) # now, remove from query mins. intersect_mins = set() - for hashval, assignment_set in assignments.items(): - if top_assignment in assignment_set: + for hashval, md5_set in assignments.items(): + if (top_md5, sigsize) in md5_set: query_mins.remove(hashval) intersect_mins.add(hashval) # should match! - assert top_count == len(intersect_mins) + # @@CTB assert top_count == len(intersect_mins) + + # calculate size of match (# of hashvals belonging to that sig) + match_size = top_sigsize # construct 'result' object intersect_bp = top_count * query_sig.minhash.scaled @@ -114,14 +139,14 @@ def gather_signature(query_sig, dblist, ignore_abundance): / sum_abunds average_abund = sum((orig_abunds[k] for k in intersect_mins)) \ / len(intersect_mins) + f_match = len(intersect_mins) / match_size result = LCAGatherResult(intersect_bp = intersect_bp, f_unique_to_query= top_count / n_mins, f_unique_weighted=f_unique_weighted, average_abund=average_abund, - lineage=top_assignment) - - + f_match=f_match, + lineage=md5_to_lineage[top_md5]) f_unassigned = len(query_mins) / n_mins est_bp = len(query_mins) * query_sig.minhash.scaled @@ -176,15 +201,17 @@ def gather_main(args): # is this our first time through the loop? print headers, if so. if not len(found): print_results("") - print_results("overlap p_query") - print_results("--------- -------") + print_results("overlap p_query p_match ") + print_results("--------- ------- --------") # output! pct_query = '{:.1f}%'.format(result.f_unique_to_query*100) + pct_match = '{:.1f}%'.format(result.f_match*100) str_bp = format_bp(result.intersect_bp) name = format_lineage(result.lineage) - print_results('{:9} {:>6} {}', str_bp, pct_query, name) + print_results('{:9} {:>6} {:>6} {}', str_bp, pct_query, + pct_match, name) found.append(result) @@ -208,7 +235,7 @@ def gather_main(args): sys.exit(0) if args.output: - fieldnames = ['intersect_bp', 'f_unique_to_query', 'f_unique_weighted', + fieldnames = ['intersect_bp', 'f_match', 'f_unique_to_query', 'f_unique_weighted', 'average_abund'] + list(lca_utils.taxlist()) w = csv.DictWriter(args.output, fieldnames=fieldnames) diff --git a/sourmash_lib/lca/command_index.py b/sourmash_lib/lca/command_index.py index b4b2fa8cea..6c59411d16 100644 --- a/sourmash_lib/lca/command_index.py +++ b/sourmash_lib/lca/command_index.py @@ -163,6 +163,7 @@ def index(args): # load signatures, construct index of hashvals to lineages hashval_to_lineage = defaultdict(set) md5_to_lineage = {} + md5_to_name = {} notify('finding signatures...') if args.traverse_directory: @@ -214,6 +215,7 @@ def index(args): # store md5 -> lineage too md5_to_lineage[sig.md5sum()] = lineage_idx + md5_to_name[sig.md5sum()] = sig.name() notify(u'\r\033[K', end=u'') notify('...found {} genomes with lineage assignments!!', @@ -242,7 +244,8 @@ def index(args): db.hashval_to_lineage_id = hashval_to_lineage db.ksize = int(args.ksize) db.scaled = int(args.scaled) - db.signatures_to_lineage = md5_to_lineage + db.signatures_to_lineage_id = md5_to_lineage + db.signatures_to_name = md5_to_name db.save(db_outfile) diff --git a/sourmash_lib/lca/lca_utils.py b/sourmash_lib/lca/lca_utils.py index c839c13d34..fcdca0dc27 100644 --- a/sourmash_lib/lca/lca_utils.py +++ b/sourmash_lib/lca/lca_utils.py @@ -21,7 +21,7 @@ # ordered list of taxonomic ranks -def taxlist(include_strain=False): +def taxlist(include_strain=True): for k in ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']: yield k @@ -120,14 +120,16 @@ class LCA_Database(object): obj.hashval_to_lineage_id: key 'hashval' => set('lineage_id') obj.ksize: k-mer size obj.scaled: scaled value - obj.signatures_to_lineage: key 'md5sum' => 'lineage_id' + obj.signatures_to_lineage_id: key 'md5sum' => 'lineage_id' + obj.signatures_to_name: key 'md5sum' => 'name' from original signature """ def __init__(self): self.lineage_dict = None self.hashval_to_lineage_id = None self.ksize = None self.scaled = None - self.signatures_to_lineage = None + self.signatures_to_lineage_id = None + self.signatures_to_name = None def load(self, db_name): "Load from a JSON file." @@ -161,16 +163,27 @@ def load(self, db_name): # JSON doesn't have a 64 bit type so stores them as strings) hashval_to_lineage_id_2 = load_d['hashval_assignments'] hashval_to_lineage_id = {} + lineage_id_counts = defaultdict(int) + for k, v in hashval_to_lineage_id_2.items(): hashval_to_lineage_id[int(k)] = v + for vv in v: + lineage_id_counts[vv] += 1 - signatures_to_lineage = load_d['signatures_to_lineage'] + signatures_to_lineage_id = load_d['signatures_to_lineage'] + signatures_to_name = load_d.get('signatures_to_name', None) self.lineage_dict = lineage_dict self.hashval_to_lineage_id = hashval_to_lineage_id self.ksize = ksize self.scaled = scaled - self.signatures_to_lineage = signatures_to_lineage + self.signature_to_lineage_id = signatures_to_lineage_id + self.signature_to_name = signatures_to_name + lineage_id_to_signature = {} + for k, v in signatures_to_lineage_id.items(): + lineage_id_to_signature[v] = k + self.lineage_id_to_signature = lineage_id_to_signature + self.lineage_id_counts = lineage_id_counts def save(self, db_name): "Save to a JSON file." @@ -196,7 +209,8 @@ def save(self, db_name): # convert values from sets to lists, so that JSON knows how to save save_d['hashval_assignments'] = \ dict((k, list(v)) for (k, v) in self.hashval_to_lineage_id.items()) - save_d['signatures_to_lineage'] = self.signatures_to_lineage + save_d['signatures_to_lineage'] = self.signatures_to_lineage_id + save_d['signatures_to_name'] = self.signatures_to_name json.dump(save_d, fp) def downsample_scaled(self, scaled): diff --git a/tests/test-data/lca/classify-by-both.csv b/tests/test-data/lca/classify-by-both.csv index fa4f55dcfc..f003b730c2 100644 --- a/tests/test-data/lca/classify-by-both.csv +++ b/tests/test-data/lca/classify-by-both.csv @@ -1,21 +1,21 @@ -ID,status,superkingdom,phylum,class,order,family,genus,species -TARA_ANE_MAG_00011,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,, -TARA_ANE_MAG_00015,found,Bacteria,Chloroflexi,Dehalococcoidetes,,,, -TARA_ANE_MAG_00041,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,, -TARA_ANE_MAG_00044,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,, -TARA_ANE_MAG_00063,found,Archaea,Euryarchaeota,,,,, -TARA_ANE_MAG_00068,found,Bacteria,Candidatus_Marinimicrobia ,,,,, -TARA_ANE_MAG_00069,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,, -TARA_ANW_MAG_00005,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,, -TARA_ANW_MAG_00020,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,, -TARA_ANW_MAG_00034,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,, -TARA_ANW_MAG_00051,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,Pelagibacteraceae,, -TARA_ANW_MAG_00083,found,Eukaryota,Haptophyta,Prymnesiophyceae,Isochrysidales,Noelaerhabdaceae,Emiliania, -TARA_ANW_MAG_00084,found,Eukaryota,Cryptophyta,Cryptophyceae,Pyrenomonadales,Geminigeraceae,Guillardia, -TARA_ANW_MAG_00085,found,Eukaryota,Haptophyta,Prymnesiophyceae,Isochrysidales,Noelaerhabdaceae,Emiliania, -TARA_ASE_MAG_00007,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,, -TARA_ASE_MAG_00009,found,Archaea,Euryarchaeota,,,,, -TARA_ASE_MAG_00015,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,, -TARA_ASE_MAG_00018,found,Archaea,Euryarchaeota,,,,, -TARA_ASE_MAG_00028,found,Bacteria,Planctomycetes,Planctomycetia,Planctomycetales,Planctomycetaceae,, -TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii +ID,status,superkingdom,phylum,class,order,family,genus,species,strain +TARA_ANE_MAG_00011,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,,, +TARA_ANE_MAG_00015,found,Bacteria,Chloroflexi,Dehalococcoidetes,,,,, +TARA_ANE_MAG_00041,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,,, +TARA_ANE_MAG_00044,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,,, +TARA_ANE_MAG_00063,found,Archaea,Euryarchaeota,,,,,, +TARA_ANE_MAG_00068,found,Bacteria,Candidatus_Marinimicrobia ,,,,,, +TARA_ANE_MAG_00069,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,,, +TARA_ANW_MAG_00005,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,, +TARA_ANW_MAG_00020,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,,, +TARA_ANW_MAG_00034,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,,, +TARA_ANW_MAG_00051,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,Pelagibacteraceae,,, +TARA_ANW_MAG_00083,found,Eukaryota,Haptophyta,Prymnesiophyceae,Isochrysidales,Noelaerhabdaceae,Emiliania,, +TARA_ANW_MAG_00084,found,Eukaryota,Cryptophyta,Cryptophyceae,Pyrenomonadales,Geminigeraceae,Guillardia,, +TARA_ANW_MAG_00085,found,Eukaryota,Haptophyta,Prymnesiophyceae,Isochrysidales,Noelaerhabdaceae,Emiliania,, +TARA_ASE_MAG_00007,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,, +TARA_ASE_MAG_00009,found,Archaea,Euryarchaeota,,,,,, +TARA_ASE_MAG_00015,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,,, +TARA_ASE_MAG_00018,found,Archaea,Euryarchaeota,,,,,, +TARA_ASE_MAG_00028,found,Bacteria,Planctomycetes,Planctomycetia,Planctomycetales,Planctomycetaceae,,, +TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii, diff --git a/tests/test_lca.py b/tests/test_lca.py index 42cd2ae0ee..c2c67f3322 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -310,7 +310,7 @@ def test_single_classify_empty(): print(out) print(err) - assert 'data/GCF_000005845.2_ASM584v2_genomic.fna.gz,nomatch,,,,,,,' in out + assert 'data/GCF_000005845.2_ASM584v2_genomic.fna.gz,nomatch,,,,,,,,' in out assert 'classified 1 signatures total' in err assert 'loaded 1 LCA databases' in err @@ -452,7 +452,7 @@ def test_unassigned_last_index_and_classify(): print(err) assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in out - assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,,\r\n' in out + assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,,,\r\n' in out assert 'classified 1 signatures total' in err assert 'loaded 1 LCA databases' in err @@ -487,7 +487,7 @@ def test_index_and_classify_internal_unassigned_multi(): print(err) assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in out - assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,unassigned,unassigned,Alteromonadaceae,,\r\n' in out + assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,unassigned,unassigned,Alteromonadaceae,,,\r\n' in out assert 'classified 1 signatures total' in err assert 'loaded 1 LCA databases' in err @@ -500,7 +500,7 @@ def test_index_and_classify_internal_unassigned_multi(): print(err) assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in out - assert 'TARA_PSW_MAG_00136,found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus,\r\n' in out + assert 'TARA_PSW_MAG_00136,found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus,,\r\n' in out assert 'classified 1 signatures total' in err assert 'loaded 1 LCA databases' in err @@ -519,7 +519,7 @@ def test_multi_db_classify(): print(err) assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in out - assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,,,' in out + assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,,,,' in out assert 'classified 1 signatures total' in err assert 'loaded 2 LCA databases' in err @@ -719,6 +719,7 @@ def test_rankinfo_on_multi(): lines.remove('family: 695 (19.2%)') lines.remove('genus: 681 (18.8%)') lines.remove('species: 200 (5.5%)') + lines.remove('strain: 0 (0.0%)') assert not lines @@ -742,6 +743,7 @@ def test_rankinfo_on_single(): lines.remove('family: 695 (19.2%)') lines.remove('genus: 681 (18.8%)') lines.remove('species: 200 (5.5%)') + lines.remove('strain: 0 (0.0%)') assert not lines @@ -806,7 +808,7 @@ def test_single_gather(): print(out) print(err) - assert '2.0 Mbp 100.0% Alteromonas Alteromonas_macleodii' in out + assert '2.0 Mbp 100.0% 100.0% Alteromonas_macleodii' in out assert 'Query is completely assigned.' @@ -836,5 +838,5 @@ def test_gather_unknown_hashes(): print(out) print(err) - assert '270.0 kbp 11.5% Archaea; family novelFamily_I' in out + assert '270.0 kbp 11.5% 21.4% Archaea; family novelFamily_I' in out assert '88.5% (2.1 Mbp) have no assignment.' in out