diff --git a/doc/classifying-signatures.md b/doc/classifying-signatures.md
index 089793c731..eb05de58ff 100644
--- a/doc/classifying-signatures.md
+++ b/doc/classifying-signatures.md
@@ -190,15 +190,17 @@ first as "abundance projection" and the second as "angular similarity".
 
 `sourmash gather` can report approximate abundance information for
 containment queries against genome databases. This will give you
-numbers that (approximately) match what you get from counting mapped
-reads.
+numbers that (approximately) match what you get from estimating the coverage
+of each contig by mapping reads.
 
-If you create your input signatures with `-p abund`,
-`sourmash gather` will use that information
-to calculate an abundance-weighted result. This will weight
-each match to a hash value by the multiplicity of the hash value in
+If you create your query signature with `-p abund`,
+`sourmash gather` will use the resulting k-mer multiplicity information
+to calculate an abundance-weighted result, weighting
+each hash value match by the multiplicity of the hash value in
 the query signature. You can turn off this behavior with
-`--ignore-abundance`.
+`--ignore-abundance`. The abundance is reported in the `avg_abund` column
+of the console output, and in the `average_abund`, `median_abund`, and
+`std_abund` columns of the CSV output.
 
 For example, if you have a metagenome composed of two equal sized genomes
 A and B, with A present at 10 times the abundance of B, `gather` on
@@ -347,7 +349,7 @@ First, we make some synthetic data sets:
 then we make signature s10-s11 with r1 and r3, i.e. 1:1 abundance, and
 make signature s10x10-s11 with r2 and r3, i.e. 10:1 abundance.
 
-### A first experiment: 1:1 abundance.
+### A first experiment: 1:1 abundance ratio.
 
 When we project r1+r3, 1:1 abundance, onto s10, s11, and s12 genomes
 with gather:
@@ -367,10 +369,10 @@ overlap p_query p_match avg_abund
 
 * approximately 50% of each query matching (first column, `p_query`)
 * approximately 80% of subject genome's contents being matched (this is due to the low coverage of 2 used to build queries; `p_match`)
-* approximately 2.0 abundance (third column, `avg_abund`)
+* approximately 2.0 coverage (third column, `avg_abund`)
 * no match to genome s12.
 
-### A second experiment: 10:1 abundance.
+### A second experiment: 10:1 abundance ratio.
 
 When we project r2+r3, 10:1 abundance, onto s10, s11, and s12 genomes
 with gather:
@@ -391,11 +393,11 @@ overlap p_query p_match avg_abund
 
 * approximately 91% of s10 matching
 * approximately 9% of s11 matching
 * approximately 100% of the high coverage genome being matched, with only 80% of the low coverage genome
-* approximately 2.0 abundance (third column, avg_abund) for s11, and (very) approximately 20x abundance for genome s10.
+* approximately 2.0 coverage (third column, avg_abund) for s11, and (very) approximately 20x coverage for genome s10.
 
 The cause of the poor approximation for genome s10 is unclear; it
 could be due to low coverage, or small genome size, or something
-else. More investigation needed.
+else. More investigation is needed.
 
 ## Appendix C: sourmash gather output examples
diff --git a/doc/command-line.md b/doc/command-line.md
index 99eb7d0d4a..ee5421a6be 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -373,9 +373,9 @@ the recovered matches hit 45.6% of the query k-mers (unweighted).
 ```
 
 For each match,
-* 'overlap', the first column, is the estimated number of k-mers shared between the match and the query.
-* 'p_query' is the _percentage_ of the query that overlaps with the match; it is the amount of the metagenome "explained" by this match.
-* 'p_match' is the percentage of the _match_ that overlaps with the query; it is the "detection" of the match in the metagenome.
+* 'overlap', the first column, is the estimated number of base pairs shared between the match and the query, based on the number of shared hashes.
+* 'p_query' is the _percentage_ of the query that overlaps with the match; it is the amount of the metagenome "explained" by this match. It is typically a lower bound on the percent of metagenome reads that will map to this genome.
+* 'p_match' is the percentage of the _match_ that overlaps with the query; it is the "detection" of the match in the metagenome. It is typically a lower bound on the fraction of the match's base pairs that will be covered by read mapping.
 
 Quite a bit more information per match row is available in the CSV
 output saved with `-o`; for details, see
diff --git a/src/sourmash/cli/gather.py b/src/sourmash/cli/gather.py
index 732fef52bf..0b0115efd2 100644
--- a/src/sourmash/cli/gather.py
+++ b/src/sourmash/cli/gather.py
@@ -24,7 +24,7 @@
 sourmash gather query.sig [ list of signatures or SBTs ]
 ```
 
-Example output:
+Example output for an unweighted (no-abundance) query:
 ```
 overlap     p_query p_match
 ---------   ------- --------
@@ -35,6 +35,17 @@
 0.7 Mbp       5.3%%  17.6%%    AE017285.1 Desulfovibrio vulgaris sub...
 ```
 
+Example output for a weighted query:
+```
+overlap     p_query p_match avg_abund
+---------   ------- ------- ---------
+9.3 Mbp       0.8%%  97.5%%      6.7    NC_007951.1 Burkholderia xenovorans ...
+7.3 Mbp       2.3%%  99.9%%     23.9    NC_003272.1 Nostoc sp. PCC 7120 DNA,...
+7.0 Mbp       8.9%% 100.0%%     94.5    BX119912.1 Rhodopirellula baltica SH...
+6.6 Mbp       1.4%% 100.0%%     16.3    NC_009972.1 Herpetosiphon aurantiacu...
+...
+```
+
 The command line option `--threshold-bp` sets the threshold below
 which matches are no longer reported; by default, this is set to 50kb.
 see the Appendix in Classifying Signatures [1] for details.
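To make the abundance weighting described in the documentation changes above concrete, here is a small self-contained sketch of how a weighted fraction and an `avg_abund`-style summary differ from plain unweighted containment. The hash values and multiplicities are invented for illustration; they stand in for the hash-to-abundance mapping that a `-p abund` signature carries, and this is not sourmash's actual API.

```
# toy query: hash value -> multiplicity, as recorded by a `-p abund` sketch
query_abunds = {1: 10, 2: 10, 3: 1, 4: 1, 5: 1, 6: 1}

# hashes shared between the query and one match
matched = {1, 2, 3}

# unweighted: fraction of distinct query hashes explained by the match
f_unique = len(matched) / len(query_abunds)                      # 0.50

# weighted: each matched hash counts as many times as it occurs in the query
total_weight = sum(query_abunds.values())                        # 24
found_weight = sum(query_abunds[h] for h in matched)             # 21
f_unique_weighted = found_weight / total_weight                  # 0.875

# avg_abund-style summary: mean multiplicity of the matched hashes
avg_abund = found_weight / len(matched)                          # 7.0

print(f"p_query (unweighted): {f_unique:.1%}")
print(f"p_query (weighted):   {f_unique_weighted:.1%}")
print(f"avg_abund:            {avg_abund:.1f}")
```

When every multiplicity is equal, the weighted and unweighted fractions coincide, which is why turning abundance tracking off (`--ignore-abundance`) simply falls back to the unweighted calculation.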
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index a019c2bbc2..7b2db8008f 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -433,18 +433,27 @@ class GatherResult(PrefetchResult):
     sum_weighted_found: int = None
     total_weighted_hashes: int = None
 
-    gather_write_cols = ['intersect_bp', 'f_orig_query', 'f_match', 'f_unique_to_query',
-                         'f_unique_weighted','average_abund', 'median_abund', 'std_abund', 'filename', # here we use 'filename'
-                         'name', 'md5', 'f_match_orig', 'unique_intersect_bp', 'gather_result_rank',
-                         'remaining_bp', 'query_filename', 'query_name', 'query_md5', 'query_bp', 'ksize',
-                         'moltype', 'scaled', 'query_n_hashes', 'query_abundance', 'query_containment_ani',
-                         'match_containment_ani', 'average_containment_ani', 'max_containment_ani',
+    gather_write_cols = ['intersect_bp', 'f_orig_query', 'f_match',
+                         'f_unique_to_query',
+                         'f_unique_weighted', 'average_abund',
+                         'median_abund', 'std_abund', 'filename',
+                         'name', 'md5',
+                         'f_match_orig', 'unique_intersect_bp',
+                         'gather_result_rank', 'remaining_bp',
+                         'query_filename', 'query_name', 'query_md5',
+                         'query_bp', 'ksize', 'moltype', 'scaled',
+                         'query_n_hashes', 'query_abundance',
+                         'query_containment_ani',
+                         'match_containment_ani',
+                         'average_containment_ani',
+                         'max_containment_ani',
                          'potential_false_negative',
-                         'n_unique_weighted_found', 'sum_weighted_found',
+                         'n_unique_weighted_found',
+                         'sum_weighted_found',
                          'total_weighted_hashes']
 
     ci_cols = ["query_containment_ani_low", "query_containment_ani_high",
-                "match_containment_ani_low", "match_containment_ani_high"]
+               "match_containment_ani_low", "match_containment_ani_high"]
 
     gather_write_cols_ci = gather_write_cols + ci_cols
 
@@ -671,9 +680,10 @@ def __init__(self, query, counters, *,
         # do we pay attention to abundances?
         query_mh = query.minhash
         query_hashes = query_mh.hashes
-        orig_query_abunds = { k: 1 for k in query_hashes }
         if track_abundance:
             orig_query_abunds = query_hashes
+        else:
+            orig_query_abunds = { k: 1 for k in query_hashes }
 
         # adjust for not found...
         if noident_mh is None:     # create empty
@@ -721,9 +731,9 @@ def _update_scaled(self, scaled):
             orig_query_abunds = self.orig_query_abunds
             self.noident_query_sum_abunds = sum(( orig_query_abunds[k] \
                                                   for k in self.noident_mh.hashes ))
-            self.sum_abunds = sum(( orig_query_abunds[k] \
-                                    for k in self.orig_query_mh.hashes ))
-            self.sum_abunds += self.noident_query_sum_abunds
+            self.total_weighted_hashes = sum(( orig_query_abunds[k] \
+                                    for k in self.orig_query_mh.hashes ))
+            self.total_weighted_hashes += self.noident_query_sum_abunds
 
         if max_scaled != scaled:
             return max_scaled
@@ -746,7 +756,6 @@ def __next__(self):
         cmp_scaled = self.cmp_scaled
 
         # will not be changed::
-        track_abundance = self.track_abundance
         threshold_bp = self.threshold_bp
         orig_query_abunds = self.orig_query_abunds
 
@@ -770,7 +779,7 @@ def __next__(self):
         # retrieve various saved things, after potential downsampling
         orig_query_mh = self.orig_query_mh
-        sum_abunds = self.sum_abunds
+        total_weighted_hashes = self.total_weighted_hashes
         noident_mh = self.noident_mh
 
         orig_query_len = len(orig_query_mh) + len(noident_mh)
 
@@ -784,10 +793,10 @@ def __next__(self):
         new_query = SourmashSignature(new_query_mh)
 
         # compute weighted information for remaining query hashes
-        query_hashes = set(query_mh.hashes) - set(found_mh.hashes)
+        query_hashes = set(new_query_mh.hashes)
         n_weighted_missed = sum((orig_query_abunds[k] for k in query_hashes))
         n_weighted_missed += self.noident_query_sum_abunds
-        sum_weighted_found = sum_abunds - n_weighted_missed
+        sum_weighted_found = total_weighted_hashes - n_weighted_missed
 
         # build a GatherResult
         result = GatherResult(self.orig_query, best_match,
@@ -795,18 +804,17 @@ def __next__(self):
                               filename=filename,
                               gather_result_rank=self.result_n,
                               gather_querymh=query.minhash,
-                              ignore_abundance= not track_abundance,
+                              ignore_abundance=not self.track_abundance,
                               threshold_bp=threshold_bp,
                               orig_query_len=orig_query_len,
-                              orig_query_abunds = self.orig_query_abunds,
+                              orig_query_abunds=self.orig_query_abunds,
                               estimate_ani_ci=self.estimate_ani_ci,
                               sum_weighted_found=sum_weighted_found,
-                              total_weighted_hashes=sum_abunds,
+                              total_weighted_hashes=total_weighted_hashes,
                               )
 
         self.result_n += 1
         self.query = new_query
-        self.orig_query_mh = orig_query_mh
 
         return result
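Read together, the renamed quantities above obey a simple identity: `sum_weighted_found` is `total_weighted_hashes` minus the weight of whatever the gather loop has not yet explained. The sketch below restates that bookkeeping on plain dicts and sets; the function names and numbers are invented for illustration, and the real code works on `MinHash` objects (`orig_query_mh`, `noident_mh`) rather than Python sets.

```
# invented stand-in for orig_query_abunds: hash -> multiplicity for the
# whole original query; with track_abundance off, every weight is 1
def query_weights(query_hashes, track_abundance):
    if track_abundance:
        return dict(query_hashes)
    return {k: 1 for k in query_hashes}


def weighted_found(abunds, identified, noident, remaining):
    # weight of hashes that matched nothing anywhere (the noident_mh leftovers)
    noident_weight = sum(abunds[k] for k in noident)

    # total weight of the query: identified hashes plus the leftovers
    total_weighted_hashes = sum(abunds[k] for k in identified) + noident_weight

    # weight still unexplained after this step: hashes not yet assigned to a
    # match, plus the never-identified hashes
    n_weighted_missed = sum(abunds[k] for k in remaining) + noident_weight

    # weight explained by all matches reported so far
    return total_weighted_hashes, total_weighted_hashes - n_weighted_missed


# toy example: hashes 1-4 occur somewhere in the database, 5 does not;
# hash 1 has multiplicity 5, and the matches so far cover hashes 1 and 2
abunds = query_weights({1: 5, 2: 1, 3: 1, 4: 1, 5: 1}, track_abundance=True)
total, found = weighted_found(abunds, identified={1, 2, 3, 4},
                              noident={5}, remaining={3, 4})
print(total, found)   # 9 6
```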