Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update gather with specific signature isolation. #394

Merged
merged 7 commits into from
Feb 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 51 additions & 24 deletions sourmash_lib/lca/command_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@

CTB TODO:
* sort out f_uniq / f_orig
* add in abundance weighting
* CSV output with full taxonomy
"""
from __future__ import print_function, division
import sys
Expand All @@ -23,7 +21,7 @@
from sourmash_lib.search import format_bp

LCAGatherResult = namedtuple('LCAGatherResult',
'intersect_bp, f_unique_to_query, f_unique_weighted, average_abund, lineage')
'intersect_bp, f_unique_to_query, f_unique_weighted, average_abund, lineage, f_match')


def format_lineage(lineage_tup):
Expand All @@ -36,9 +34,12 @@ def format_lineage(lineage_tup):

if 'genus' in present:
genus = d['genus']
if 'species' in present:
if 'strain' in present:
name = d['strain']
elif 'species' in present:
species = d['species']
if species.startswith(genus + ' '):
if species.startswith(genus + ' ') or \
species.startswith(genus + '_'):
name = species
else:
name = '{} {}'.format(genus, species)
Expand Down Expand Up @@ -79,49 +80,73 @@ def gather_signature(query_sig, dblist, ignore_abundance):
orig_abunds = { k: 1 for k in query_mins }
sum_abunds = sum(orig_abunds.values())

# first time through, record FOO.
md5_to_lineage = {}

# collect all mentioned lineage_ids -> md5s
x = set()
for hashval in query_mins:
for lca_db in dblist:
lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
for lid in lineage_ids:
md5 = lca_db.lineage_id_to_signature[lid]
x.add((lca_db, lid, md5))

for lca_db, lid, md5 in x:
md5_to_lineage[md5] = lca_db.lineage_dict[lid]

# now! do the gather:
while 1:
# find all of the assignments for the current set of hashes
assignments = lca_utils.gather_assignments(query_mins, dblist)

assignments = defaultdict(set)
for hashval in query_mins:
for lca_db in dblist:
lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
for lid in lineage_ids:
md5 = lca_db.lineage_id_to_signature[lid]
signature_size = lca_db.lineage_id_counts[lid]
assignments[hashval].add((md5, signature_size))

# none? quit.
if not assignments:
break

# count 'em all. note, this will combine identical lineages from
# different source genomes; the only that is counted is the lineage
# assignment.
# count the distinct signatures.
counts = Counter()
for hashval, assignment_set in assignments.items():
for assignment in assignment_set:
counts[assignment] += 1
for hashval, md5_set in assignments.items():
for (md5, sigsize) in md5_set:
counts[(md5, sigsize)] += 1

# find the most abundant assignment
top_assignment, top_count = next(iter(counts.most_common()))
(top_md5, top_sigsize), top_count = next(iter(counts.most_common()))

# now, remove from query mins.
intersect_mins = set()
for hashval, assignment_set in assignments.items():
if top_assignment in assignment_set:
for hashval, md5_set in assignments.items():
if (top_md5, sigsize) in md5_set:
query_mins.remove(hashval)
intersect_mins.add(hashval)

# should match!
assert top_count == len(intersect_mins)
# @@CTB assert top_count == len(intersect_mins)

# calculate size of match (# of hashvals belonging to that sig)
match_size = top_sigsize

# construct 'result' object
intersect_bp = top_count * query_sig.minhash.scaled
f_unique_weighted = sum((orig_abunds[k] for k in intersect_mins)) \
/ sum_abunds
average_abund = sum((orig_abunds[k] for k in intersect_mins)) \
/ len(intersect_mins)
f_match = len(intersect_mins) / match_size

result = LCAGatherResult(intersect_bp = intersect_bp,
f_unique_to_query= top_count / n_mins,
f_unique_weighted=f_unique_weighted,
average_abund=average_abund,
lineage=top_assignment)


f_match=f_match,
lineage=md5_to_lineage[top_md5])

f_unassigned = len(query_mins) / n_mins
est_bp = len(query_mins) * query_sig.minhash.scaled
Expand Down Expand Up @@ -176,15 +201,17 @@ def gather_main(args):
# is this our first time through the loop? print headers, if so.
if not len(found):
print_results("")
print_results("overlap p_query")
print_results("--------- -------")
print_results("overlap p_query p_match ")
print_results("--------- ------- --------")

# output!
pct_query = '{:.1f}%'.format(result.f_unique_to_query*100)
pct_match = '{:.1f}%'.format(result.f_match*100)
str_bp = format_bp(result.intersect_bp)
name = format_lineage(result.lineage)

print_results('{:9} {:>6} {}', str_bp, pct_query, name)
print_results('{:9} {:>6} {:>6} {}', str_bp, pct_query,
pct_match, name)

found.append(result)

Expand All @@ -208,7 +235,7 @@ def gather_main(args):
sys.exit(0)

if args.output:
fieldnames = ['intersect_bp', 'f_unique_to_query', 'f_unique_weighted',
fieldnames = ['intersect_bp', 'f_match', 'f_unique_to_query', 'f_unique_weighted',
'average_abund'] + list(lca_utils.taxlist())

w = csv.DictWriter(args.output, fieldnames=fieldnames)
Expand Down
5 changes: 4 additions & 1 deletion sourmash_lib/lca/command_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def index(args):
# load signatures, construct index of hashvals to lineages
hashval_to_lineage = defaultdict(set)
md5_to_lineage = {}
md5_to_name = {}

notify('finding signatures...')
if args.traverse_directory:
Expand Down Expand Up @@ -214,6 +215,7 @@ def index(args):

# store md5 -> lineage too
md5_to_lineage[sig.md5sum()] = lineage_idx
md5_to_name[sig.md5sum()] = sig.name()

notify(u'\r\033[K', end=u'')
notify('...found {} genomes with lineage assignments!!',
Expand Down Expand Up @@ -242,7 +244,8 @@ def index(args):
db.hashval_to_lineage_id = hashval_to_lineage
db.ksize = int(args.ksize)
db.scaled = int(args.scaled)
db.signatures_to_lineage = md5_to_lineage
db.signatures_to_lineage_id = md5_to_lineage
db.signatures_to_name = md5_to_name

db.save(db_outfile)

Expand Down
26 changes: 20 additions & 6 deletions sourmash_lib/lca/lca_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@


# ordered list of taxonomic ranks
def taxlist(include_strain=False):
def taxlist(include_strain=True):
for k in ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
'species']:
yield k
Expand Down Expand Up @@ -120,14 +120,16 @@ class LCA_Database(object):
obj.hashval_to_lineage_id: key 'hashval' => set('lineage_id')
obj.ksize: k-mer size
obj.scaled: scaled value
obj.signatures_to_lineage: key 'md5sum' => 'lineage_id'
obj.signatures_to_lineage_id: key 'md5sum' => 'lineage_id'
obj.signatures_to_name: key 'md5sum' => 'name' from original signature
"""
def __init__(self):
self.lineage_dict = None
self.hashval_to_lineage_id = None
self.ksize = None
self.scaled = None
self.signatures_to_lineage = None
self.signatures_to_lineage_id = None
self.signatures_to_name = None

def load(self, db_name):
"Load from a JSON file."
Expand Down Expand Up @@ -161,16 +163,27 @@ def load(self, db_name):
# JSON doesn't have a 64 bit type so stores them as strings)
hashval_to_lineage_id_2 = load_d['hashval_assignments']
hashval_to_lineage_id = {}
lineage_id_counts = defaultdict(int)

for k, v in hashval_to_lineage_id_2.items():
hashval_to_lineage_id[int(k)] = v
for vv in v:
lineage_id_counts[vv] += 1

signatures_to_lineage = load_d['signatures_to_lineage']
signatures_to_lineage_id = load_d['signatures_to_lineage']
signatures_to_name = load_d.get('signatures_to_name', None)

self.lineage_dict = lineage_dict
self.hashval_to_lineage_id = hashval_to_lineage_id
self.ksize = ksize
self.scaled = scaled
self.signatures_to_lineage = signatures_to_lineage
self.signature_to_lineage_id = signatures_to_lineage_id
self.signature_to_name = signatures_to_name
lineage_id_to_signature = {}
for k, v in signatures_to_lineage_id.items():
lineage_id_to_signature[v] = k
self.lineage_id_to_signature = lineage_id_to_signature
self.lineage_id_counts = lineage_id_counts

def save(self, db_name):
"Save to a JSON file."
Expand All @@ -196,7 +209,8 @@ def save(self, db_name):
# convert values from sets to lists, so that JSON knows how to save
save_d['hashval_assignments'] = \
dict((k, list(v)) for (k, v) in self.hashval_to_lineage_id.items())
save_d['signatures_to_lineage'] = self.signatures_to_lineage
save_d['signatures_to_lineage'] = self.signatures_to_lineage_id
save_d['signatures_to_name'] = self.signatures_to_name
json.dump(save_d, fp)

def downsample_scaled(self, scaled):
Expand Down
42 changes: 21 additions & 21 deletions tests/test-data/lca/classify-by-both.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
ID,status,superkingdom,phylum,class,order,family,genus,species
TARA_ANE_MAG_00011,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,,
TARA_ANE_MAG_00015,found,Bacteria,Chloroflexi,Dehalococcoidetes,,,,
TARA_ANE_MAG_00041,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,,
TARA_ANE_MAG_00044,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,,
TARA_ANE_MAG_00063,found,Archaea,Euryarchaeota,,,,,
TARA_ANE_MAG_00068,found,Bacteria,Candidatus_Marinimicrobia ,,,,,
TARA_ANE_MAG_00069,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,,
TARA_ANW_MAG_00005,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,
TARA_ANW_MAG_00020,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,,
TARA_ANW_MAG_00034,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,,
TARA_ANW_MAG_00051,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,Pelagibacteraceae,,
TARA_ANW_MAG_00083,found,Eukaryota,Haptophyta,Prymnesiophyceae,Isochrysidales,Noelaerhabdaceae,Emiliania,
TARA_ANW_MAG_00084,found,Eukaryota,Cryptophyta,Cryptophyceae,Pyrenomonadales,Geminigeraceae,Guillardia,
TARA_ANW_MAG_00085,found,Eukaryota,Haptophyta,Prymnesiophyceae,Isochrysidales,Noelaerhabdaceae,Emiliania,
TARA_ASE_MAG_00007,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,
TARA_ASE_MAG_00009,found,Archaea,Euryarchaeota,,,,,
TARA_ASE_MAG_00015,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,,
TARA_ASE_MAG_00018,found,Archaea,Euryarchaeota,,,,,
TARA_ASE_MAG_00028,found,Bacteria,Planctomycetes,Planctomycetia,Planctomycetales,Planctomycetaceae,,
TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii
ID,status,superkingdom,phylum,class,order,family,genus,species,strain
TARA_ANE_MAG_00011,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,,,
TARA_ANE_MAG_00015,found,Bacteria,Chloroflexi,Dehalococcoidetes,,,,,
TARA_ANE_MAG_00041,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,,,
TARA_ANE_MAG_00044,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,,,
TARA_ANE_MAG_00063,found,Archaea,Euryarchaeota,,,,,,
TARA_ANE_MAG_00068,found,Bacteria,Candidatus_Marinimicrobia ,,,,,,
TARA_ANE_MAG_00069,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,,,,
TARA_ANW_MAG_00005,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,
TARA_ANW_MAG_00020,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,,,
TARA_ANW_MAG_00034,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,,,
TARA_ANW_MAG_00051,found,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,Pelagibacteraceae,,,
TARA_ANW_MAG_00083,found,Eukaryota,Haptophyta,Prymnesiophyceae,Isochrysidales,Noelaerhabdaceae,Emiliania,,
TARA_ANW_MAG_00084,found,Eukaryota,Cryptophyta,Cryptophyceae,Pyrenomonadales,Geminigeraceae,Guillardia,,
TARA_ANW_MAG_00085,found,Eukaryota,Haptophyta,Prymnesiophyceae,Isochrysidales,Noelaerhabdaceae,Emiliania,,
TARA_ASE_MAG_00007,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,
TARA_ASE_MAG_00009,found,Archaea,Euryarchaeota,,,,,,
TARA_ASE_MAG_00015,found,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,,,,
TARA_ASE_MAG_00018,found,Archaea,Euryarchaeota,,,,,,
TARA_ASE_MAG_00028,found,Bacteria,Planctomycetes,Planctomycetia,Planctomycetales,Planctomycetaceae,,,
TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii,
16 changes: 9 additions & 7 deletions tests/test_lca.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def test_single_classify_empty():
print(out)
print(err)

assert 'data/GCF_000005845.2_ASM584v2_genomic.fna.gz,nomatch,,,,,,,' in out
assert 'data/GCF_000005845.2_ASM584v2_genomic.fna.gz,nomatch,,,,,,,,' in out
assert 'classified 1 signatures total' in err
assert 'loaded 1 LCA databases' in err

Expand Down Expand Up @@ -452,7 +452,7 @@ def test_unassigned_last_index_and_classify():
print(err)

assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in out
assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,,\r\n' in out
assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,,,\r\n' in out
assert 'classified 1 signatures total' in err
assert 'loaded 1 LCA databases' in err

Expand Down Expand Up @@ -487,7 +487,7 @@ def test_index_and_classify_internal_unassigned_multi():
print(err)

assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in out
assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,unassigned,unassigned,Alteromonadaceae,,\r\n' in out
assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,unassigned,unassigned,Alteromonadaceae,,,\r\n' in out
assert 'classified 1 signatures total' in err
assert 'loaded 1 LCA databases' in err

Expand All @@ -500,7 +500,7 @@ def test_index_and_classify_internal_unassigned_multi():
print(err)

assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in out
assert 'TARA_PSW_MAG_00136,found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus,\r\n' in out
assert 'TARA_PSW_MAG_00136,found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus,,\r\n' in out
assert 'classified 1 signatures total' in err
assert 'loaded 1 LCA databases' in err

Expand All @@ -519,7 +519,7 @@ def test_multi_db_classify():
print(err)

assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in out
assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,,,' in out
assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,,,,' in out
assert 'classified 1 signatures total' in err
assert 'loaded 2 LCA databases' in err

Expand Down Expand Up @@ -719,6 +719,7 @@ def test_rankinfo_on_multi():
lines.remove('family: 695 (19.2%)')
lines.remove('genus: 681 (18.8%)')
lines.remove('species: 200 (5.5%)')
lines.remove('strain: 0 (0.0%)')

assert not lines

Expand All @@ -742,6 +743,7 @@ def test_rankinfo_on_single():
lines.remove('family: 695 (19.2%)')
lines.remove('genus: 681 (18.8%)')
lines.remove('species: 200 (5.5%)')
lines.remove('strain: 0 (0.0%)')

assert not lines

Expand Down Expand Up @@ -806,7 +808,7 @@ def test_single_gather():
print(out)
print(err)

assert '2.0 Mbp 100.0% Alteromonas Alteromonas_macleodii' in out
assert '2.0 Mbp 100.0% 100.0% Alteromonas_macleodii' in out
assert 'Query is completely assigned.'


Expand Down Expand Up @@ -836,5 +838,5 @@ def test_gather_unknown_hashes():
print(out)
print(err)

assert '270.0 kbp 11.5% Archaea; family novelFamily_I' in out
assert '270.0 kbp 11.5% 21.4% Archaea; family novelFamily_I' in out
assert '88.5% (2.1 Mbp) have no assignment.' in out