Merge pull request #347 from dib-lab/gather/abund

[MRG] add abundance weights in to 'sourmash gather'
sourmash-bio · Feb 7, 2018 · bf91ee2 · bf91ee2
2 parents ae9a32f + 753f086
commit bf91ee2
Show file tree

Hide file tree

Showing 9 changed files with 5,093 additions and 88 deletions.
diff --git a/doc/tutorials.md b/doc/tutorials.md
@@ -132,42 +132,37 @@ sourmash search ecoli-genome.sig ecolidb.sbt.json -n 20
 You should see output like this:
 
 ```
-# running sourmash subcommand: search
+select query k=31 automatically.
 loaded query: /home/ubuntu/data/ecoliMG1655.... (k=31, DNA)
-loaded SBT ecolidb.sbt.json
-Searching SBT ecolidb.sbt.json
+loaded 0 signatures and 1 databases total.                                     
+
 49 matches; showing first 20:
 similarity   match
 ----------   -----
- 88.4%       NZ_GG774190.1 Escherichia coli MS 196-1 Scfld2538, whole gen
- 87.8%       NZ_JMGW01000001.1 Escherichia coli 1-176-05_S4_C2 e117605S4C
- 86.6%       NZ_JMGU01000001.1 Escherichia coli 2-011-08_S3_C2 e201108S3C
- 86.6%       NZ_JH659569.1 Escherichia coli M919 supercont2.1, whole geno
- 81.3%       NZ_JHRU01000001.1 Escherichia coli strain 100854 100854_1, w
- 78.9%       NZ_JHDG01000001.1 Escherichia coli 1-176-05_S3_C1 e117605S3C
- 78.3%       NZ_MOJK01000001.1 Escherichia coli strain 469 Cleandata-BN4_
- 78.1%       NZ_JNLZ01000001.1 Escherichia coli 3-105-05_S1_C1 e310505S1C
- 78.1%       NZ_MOGK01000001.1 Escherichia coli strain 676 BN4_676_1_(pai
- 78.1%       NZ_MIWF01000001.1 Escherichia coli strain AF7759-1 contig_00
- 70.6%       NZ_KE700241.1 Escherichia coli HVH 147 (4-5893887) acYxy-sup
- 69.2%       NZ_CP011331.1 Escherichia coli O104:H4 str. C227-11, complet
- 69.0%       NZ_JHGJ01000001.1 Escherichia coli O45:H2 str. 2009C-4780 co
- 69.0%       NZ_MIWP01000001.1 Escherichia coli strain K6412 contig_0001,
- 68.6%       NZ_JHHE01000001.1 Escherichia coli O103:H2 str. 2009C-3279 c
- 68.0%       NZ_LVOV01000001.1 Escherichia coli strain swine72 swine72_co
- 67.5%       NZ_LQWB01000001.1 Escherichia coli strain GN03624 GCID_ECOLI
- 67.1%       NZ_JHGS01000001.1 Escherichia coli O111:NM str. 2009C-4052 c
- 67.1%       NZ_JHMG01000001.1 Escherichia coli O121:H19 str. 2010EL1058
- 66.7%       NZ_AIGC01000068.1 Escherichia coli DEC7C gecDEC7C.contig.67_
+ 75.4%       NZ_JMGW01000001.1 Escherichia coli 1-176-05_S4_C2 e117605...
+ 72.2%       NZ_GG774190.1 Escherichia coli MS 196-1 Scfld2538, whole ...
+ 71.4%       NZ_JMGU01000001.1 Escherichia coli 2-011-08_S3_C2 e201108...
+ 70.1%       NZ_JHRU01000001.1 Escherichia coli strain 100854 100854_1...
+ 69.0%       NZ_JH659569.1 Escherichia coli M919 supercont2.1, whole g...
+ 64.9%       NZ_JNLZ01000001.1 Escherichia coli 3-105-05_S1_C1 e310505...
+ 63.0%       NZ_MOJK01000001.1 Escherichia coli strain 469 Cleandata-B...
+ 62.9%       NZ_MOGK01000001.1 Escherichia coli strain 676 BN4_676_1_(...
+ 62.0%       NZ_JHDG01000001.1 Escherichia coli 1-176-05_S3_C1 e117605...
+ 59.9%       NZ_MIWF01000001.1 Escherichia coli strain AF7759-1 contig...
+ 52.7%       NZ_KE700241.1 Escherichia coli HVH 147 (4-5893887) acYxy-...
+ 51.7%       NZ_APWY01000001.1 Escherichia coli 178200 gec178200.conti...
+ 49.3%       NZ_LVOV01000001.1 Escherichia coli strain swine72 swine72...
+ 49.3%       NZ_MIWP01000001.1 Escherichia coli strain K6412 contig_00...
+ 49.0%       NZ_LQWB01000001.1 Escherichia coli strain GN03624 GCID_EC...
+ 48.9%       NZ_JHGJ01000001.1 Escherichia coli O45:H2 str. 2009C-4780...
+ 48.1%       NZ_CP011331.1 Escherichia coli O104:H4 str. C227-11, comp...
+ 47.7%       NZ_JHNB01000001.1 Escherichia coli O103:H25 str. 2010C-45...
+ 47.7%       NZ_JHRE01000001.1 Escherichia coli strain 302014 302014_1...
+ 47.6%       NZ_JHHE01000001.1 Escherichia coli O103:H2 str. 2009C-327...
 ```
 
 ## Compare many signatures and build a tree.
 
-Adjust plotting (this is a bug in sourmash :) --
-```
-echo 'backend : Agg' > matplotlibrc
-```
-
 Compare all the things:
 
 ```
@@ -217,7 +212,7 @@ loaded query: /home/ubuntu/data/ecoliMG1655.... (k=31, DNA)
 
 overlap     p_query p_match
 ---------   ------- --------
-4.9 Mbp     100.0%   99.8%      CP011320.1 Escherichia coli strain SQ37,
+4.9 Mbp     100.0%   99.8%      CP011320.1 Escherichia coli strain SQ...
 
 found 1 matches total;
 the recovered matches hit 100.0% of the query
@@ -239,40 +234,41 @@ sourmash gather -k 31 shakya-unaligned-contigs.sig genbank-k31.sbt.json
 
 This should yield:
 ```
-# running sourmash subcommand: gather
 loaded query: mqc500.QC.AMBIGUOUS.99.unalign... (k=31, DNA)
-loaded SBT genbank-k31.sbt.json
+loaded 0 signatures and 1 databases total.
 
-overlap     p_query p_match
+
+overlap     p_query p_match 
 ---------   ------- --------
-1.4 Mbp      11.0%   58.0%      JANA01000001.1 Fusobacterium sp. OBRC1 c
-1.0 Mbp       7.7%   25.9%      CP001957.1 Haloferax volcanii DS2 plasmi
-0.9 Mbp       7.5%   11.8%      BA000019.2 Nostoc sp. PCC 7120 DNA, comp
-0.7 Mbp       5.9%   23.0%      FOVK01000036.1 Proteiniclasticum ruminis
-0.7 Mbp       5.3%   17.6%      AE017285.1 Desulfovibrio vulgaris subsp.
-0.6 Mbp       4.9%   11.1%      CP001252.1 Shewanella baltica OS223, com
-0.6 Mbp       4.8%   27.3%      AP008226.1 Thermus thermophilus HB8 geno
-0.6 Mbp       4.4%   11.2%      CP000031.2 Ruegeria pomeroyi DSS-3, comp
-480.0 kbp     3.8%    7.6%      CP000875.1 Herpetosiphon aurantiacus DSM
-410.0 kbp     3.3%   10.5%      CH959317.1 Sulfitobacter sp. NAS-14.1 sc
-1.4 Mbp      10.9%   11.8%      LN831027.1 Fusobacterium nucleatum subsp
-0.5 Mbp       4.1%    5.3%      CP000753.1 Shewanella baltica OS185, com
-420.0 kbp     3.3%    7.7%      FNDZ01000023.1 Proteiniclasticum ruminis
-150.0 kbp     1.2%    4.5%      CP015081.1 Deinococcus radiodurans R1 ch
-150.0 kbp     1.2%    8.2%      CP000969.1 Thermotoga sp. RQ2, complete
-290.0 kbp     2.3%    4.1%      CH959311.1 Sulfitobacter sp. EE-36 scf_1
-1.2 Mbp       9.4%    5.0%      CP013328.1 Fusobacterium nucleatum subsp
-110.0 kbp     0.9%    3.5%      FREL01000833.1 Enterococcus faecalis iso
-0.6 Mbp       5.0%    2.8%      CP000527.1 Desulfovibrio vulgaris DP4, c
-340.0 kbp     2.7%    3.3%      KQ235732.1 Fusobacterium nucleatum subsp
-70.0 kbp      0.6%    1.2%      CP000850.1 Salinispora arenicola CNS-205
-60.0 kbp      0.5%    0.7%      CP000270.1 Burkholderia xenovorans LB400
-50.0 kbp      0.4%    2.6%      CP001080.1 Sulfurihydrogenibium sp. YO3A
-50.0 kbp      0.4%    3.2%      L77117.1 Methanocaldococcus jannaschii D
+1.4 Mbp      11.0%   58.0%      JANA01000001.1 Fusobacterium sp. OBRC...
+1.0 Mbp       7.7%   25.9%      CP001957.1 Haloferax volcanii DS2 pla...
+0.9 Mbp       7.4%   11.8%      BA000019.2 Nostoc sp. PCC 7120 DNA, c...
+0.7 Mbp       5.9%   23.0%      FOVK01000036.1 Proteiniclasticum rumi...
+0.7 Mbp       5.3%   17.6%      AE017285.1 Desulfovibrio vulgaris sub...
+0.6 Mbp       4.9%   11.1%      CP001252.1 Shewanella baltica OS223, ...
+0.6 Mbp       4.8%   27.3%      AP008226.1 Thermus thermophilus HB8 g...
+0.6 Mbp       4.4%   11.2%      CP000031.2 Ruegeria pomeroyi DSS-3, c...
+480.0 kbp     3.8%    7.6%      CP000875.1 Herpetosiphon aurantiacus ...
+410.0 kbp     3.3%   10.5%      CH959317.1 Sulfitobacter sp. NAS-14.1...
+1.4 Mbp       2.2%   11.8%      LN831027.1 Fusobacterium nucleatum su...
+0.5 Mbp       2.1%    5.3%      CP000753.1 Shewanella baltica OS185, ...
+420.0 kbp     1.9%    7.7%      FNDZ01000023.1 Proteiniclasticum rumi...
+150.0 kbp     1.2%    4.5%      CP015081.1 Deinococcus radiodurans R1...
+150.0 kbp     1.2%    8.2%      CP000969.1 Thermotoga sp. RQ2, comple...
+290.0 kbp     1.1%    4.1%      CH959311.1 Sulfitobacter sp. EE-36 sc...
+1.2 Mbp       1.0%    5.0%      CP013328.1 Fusobacterium nucleatum su...
+110.0 kbp     0.9%    3.5%      FREL01000833.1 Enterococcus faecalis ...
+0.6 Mbp       0.8%    2.8%      CP000527.1 Desulfovibrio vulgaris DP4...
+340.0 kbp     0.6%    3.3%      KQ235732.1 Fusobacterium nucleatum su...
+70.0 kbp      0.6%    1.2%      CP000850.1 Salinispora arenicola CNS-...
+60.0 kbp      0.5%    0.7%      CP000270.1 Burkholderia xenovorans LB...
+50.0 kbp      0.4%    2.6%      CP001080.1 Sulfurihydrogenibium sp. Y...
+50.0 kbp      0.4%    3.2%      L77117.1 Methanocaldococcus jannaschi...
 found less than 40.0 kbp in common. => exiting
 
 found 24 matches total;
-the recovered matches hit 73.4% of the query
+the recovered matches hit 73.1% of the query
+
 
 ```
 

diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py
@@ -869,7 +869,7 @@ def categorize(args):
 
 
 def gather(args):
-    from .search import gather_databases
+    from .search import gather_databases, GatherResult, format_bp
 
     parser = argparse.ArgumentParser()
     parser.add_argument('query', help='query signature')
@@ -889,6 +889,8 @@ def gather(args):
                         help='downsample query to this scaled factor')
     parser.add_argument('-q', '--quiet', action='store_true',
                         help='suppress non-error output')
+    parser.add_argument('--ignore-abundance',  action='store_true',
+                        help='do NOT use k-mer abundances if present')
 
     sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
     sourmash_args.add_moltype_args(parser)
@@ -932,61 +934,46 @@ def gather(args):
         error('Nothing found to search!')
         sys.exit(-1)
 
-    # pretty-printing code.
-    def format_bp(bp):
-        bp = float(bp)
-        if bp < 500:
-            return '{:.0f} bp '.format(bp)
-        elif bp <= 500e3:
-            return '{:.1f} kbp'.format(round(bp / 1e3, 1))
-        elif bp < 500e6:
-            return '{:.1f} Mbp'.format(round(bp / 1e6, 1))
-        elif bp < 500e9:
-            return '{:.1f} Gbp'.format(round(bp / 1e9, 1))
-        return '???'
-
     found = []
-    sum_found = 0
-    for result, n_intersect_mins, new_max_hash, next_query in gather_databases(query, databases,
-                                                     args.threshold_bp):
+    for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
         # print interim result & save in a list for later use
         pct_query = '{:.1f}%'.format(result.f_orig_query*100)
         pct_genome = '{:.1f}%'.format(result.f_match*100)
 
         name = result.leaf._display_name(40)
 
+
         if not len(found):                # first result? print header.
             print_results("")
             print_results("overlap     p_query p_match ")
             print_results("---------   ------- --------")
 
         # print interim result & save in a list for later use
-        pct_query = '{:.1f}%'.format(result.f_orig_query*100)
+        pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
         pct_genome = '{:.1f}%'.format(result.f_match*100)
 
         name = result.leaf._display_name(40)
 
         print_results('{:9}   {:>6}  {:>6}      {}',
                       format_bp(result.intersect_bp), pct_query, pct_genome,
                       name)
-        sum_found += n_intersect_mins
         found.append(result)
 
 
     # basic reporting
     print_results('\nfound {} matches total;', len(found))
 
-    sum_found /= len(query.minhash.get_hashes())
     print_results('the recovered matches hit {:.1f}% of the query',
-           sum_found * 100)
+           (1 - weighted_missed) * 100)
     print_results('')
 
     if not found:
         sys.exit(0)
 
     if args.output:
         fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
-                      'f_unique_to_query', 'name', 'filename', 'md5']
+                      'f_unique_to_query', 'f_unique_weighted',
+                      'average_abund', 'name', 'filename', 'md5']
         w = csv.DictWriter(args.output, fieldnames=fieldnames)
         w.writeheader()
         for result in found:

diff --git a/sourmash_lib/search.py b/sourmash_lib/search.py
@@ -1,3 +1,4 @@
+from __future__ import division
 from collections import namedtuple
 
 import sourmash_lib
@@ -6,10 +7,28 @@
 from .sbtmh import search_minhashes, search_minhashes_containment
 from .sbtmh import SearchMinHashesFindBest
 
+
+
+
 # generic SearchResult across individual signatures + SBTs.
 SearchResult = namedtuple('SearchResult',
                           'similarity, match_sig, md5, filename, name')
 
+
+def format_bp(bp):
+    "Pretty-print bp information."
+    bp = float(bp)
+    if bp < 500:
+        return '{:.0f} bp '.format(bp)
+    elif bp <= 500e3:
+        return '{:.1f} kbp'.format(round(bp / 1e3, 1))
+    elif bp < 500e6:
+        return '{:.1f} Mbp'.format(round(bp / 1e6, 1))
+    elif bp < 500e9:
+        return '{:.1f} Gbp'.format(round(bp / 1e9, 1))
+    return '???'
+
+
 def search_databases(query, databases, threshold, do_containment, best_only):
     # set up the search & score function(s) - similarity vs containment
     search_fn = search_minhashes
@@ -58,12 +77,23 @@ def search_databases(query, databases, threshold, do_containment, best_only):
     return results
 
 
-def gather_databases(query, databases, threshold_bp):
+GatherResult = namedtuple('GatherResult',
+                          'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, filename, name, md5, leaf')
+
+def gather_databases(query, databases, threshold_bp, ignore_abundance):
     from sourmash_lib.sbtmh import SearchMinHashesFindBestIgnoreMaxHash
 
     orig_query = query
     orig_mins = orig_query.minhash.get_hashes()
 
+    if orig_query.minhash.track_abundance and not ignore_abundance:
+        orig_abunds = orig_query.minhash.get_mins(with_abundance=True)
+    else:
+        if orig_query.minhash.track_abundance and ignore_abundance:
+            notify('** ignoring abundance')
+        orig_abunds = { k: 1 for k in orig_query.minhash.get_mins(with_abundance=False) }
+    sum_abunds = sum(orig_abunds.values())
+
     # calculate the band size/resolution R for the genome
     R_metagenome = orig_query.minhash.scaled
 
@@ -91,7 +121,7 @@ def find_best(dblist, query):
             return None, None, None
 
         # take the best result
-        results.sort(key=lambda x: -x[0])   # reverse sort on similarity
+        results.sort(key=lambda x: (-x[0], x[1].md5sum()))   # reverse sort on similarity
         best_similarity, best_leaf = results[0]
         return best_similarity, best_leaf, filename
 
@@ -107,8 +137,7 @@ def build_new_signature(mins, template_sig):
     query = build_new_signature(new_mins, orig_query)
 
     sum_found = 0.
-    GatherResult = namedtuple('GatherResult',
-                               'intersect_bp, f_orig_query, f_match, f_unique_to_query, filename, name, md5, leaf')
+
     while 1:
         best_similarity, best_leaf, filename = find_best(databases, query)
         if not best_leaf:          # no matches at all!
@@ -156,10 +185,18 @@ def build_new_signature(mins, template_sig):
         query_n_mins = len(orig_query.minhash.get_hashes())
         f_unique_to_query = len(intersect_mins) / float(query_n_mins)
 
+        # calculate scores weighted by abundances
+        f_unique_weighted = sum((orig_abunds[k] for k in intersect_mins)) \
+               / sum_abunds
+        average_abund = sum((orig_abunds[k] for k in intersect_mins)) \
+               / len(intersect_mins)
+
         result = GatherResult(intersect_bp=intersect_bp,
                               f_orig_query=f_orig_query,
                               f_match=f_match,
                               f_unique_to_query=f_unique_to_query,
+                              f_unique_weighted=f_unique_weighted,
+                              average_abund=average_abund,
                               filename=filename,
                               md5=best_leaf.md5sum(),
                               name=best_leaf.name(),
@@ -169,4 +206,7 @@ def build_new_signature(mins, template_sig):
         query_mins -= set(found_mins)
         query = build_new_signature(query_mins, orig_query)
 
-        yield result, len(intersect_mins), new_max_hash, query
+        weighted_missed = sum((orig_abunds[k] for k in query_mins)) \
+             / sum_abunds
+
+        yield result, weighted_missed, new_max_hash, query