From efc700bcdacd0e76a3f46e58938c14e461d0446e Mon Sep 17 00:00:00 2001
From: Tessa Pierce Ward
Date: Fri, 15 Apr 2022 18:38:11 -0700
Subject: [PATCH] [MRG] add some search/gather/prefetch columns to enable ANI
 estimation (#1952)

* add some search/gather/prefetch columns to enable ANI estimation
* fix introduced err
* use query_n_hashes; remove num
---
 src/sourmash/commands.py | 34 ++++++----------------------------
 src/sourmash/search.py   | 36 +++++++++++++++++++++++++++++-------
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 2489d13852..b736ec53c9 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -16,7 +16,7 @@
 from .logging import notify, error, print_results, set_quiet
 from .sourmash_args import (FileOutput, FileOutputCSV,
                             SaveSignaturesToLocation)
-from .search import prefetch_database, PrefetchResult, calculate_prefetch_info
+from .search import SearchResult, prefetch_database, PrefetchResult, GatherResult, calculate_prefetch_info
 from .index import LazyLinearIndex
 
 WATERMARK_SIZE = 10000
@@ -533,8 +533,7 @@ def search(args):
         notify("** reporting only one match because --best-only was set")
 
     if args.output:
-        fieldnames = ['similarity', 'name', 'filename', 'md5',
-                      'query_filename', 'query_name', 'query_md5']
+        fieldnames = SearchResult._fields
 
         with FileOutputCSV(args.output) as fp:
             w = csv.DictWriter(fp, fieldnames=fieldnames)
@@ -689,11 +688,7 @@ def gather(args):
     prefetch_csvout_fp = None
     prefetch_csvout_w = None
     if args.save_prefetch_csv:
-        fieldnames = ['intersect_bp', 'jaccard',
-                      'max_containment', 'f_query_match', 'f_match_query',
-                      'match_filename', 'match_name', 'match_md5', 'match_bp',
-                      'query_filename', 'query_name', 'query_md5', 'query_bp']
-
+        fieldnames = PrefetchResult._fields
         prefetch_csvout_fp = FileOutput(args.save_prefetch_csv, 'wt').open()
         prefetch_csvout_w = csv.DictWriter(prefetch_csvout_fp, fieldnames=fieldnames)
         prefetch_csvout_w.writeheader()
@@ -808,13 +803,7 @@ def gather(args):
 
     # save CSV?
     if found and args.output:
-        fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
-                      'f_unique_to_query', 'f_unique_weighted',
-                      'average_abund', 'median_abund', 'std_abund', 'name',
-                      'filename', 'md5', 'f_match_orig', 'unique_intersect_bp',
-                      'gather_result_rank', 'remaining_bp',
-                      'query_filename', 'query_name', 'query_md5', 'query_bp']
-
+        fieldnames = GatherResult._fields
         with FileOutputCSV(args.output) as fp:
             w = csv.DictWriter(fp, fieldnames=fieldnames)
             w.writeheader()
@@ -981,14 +970,7 @@ def multigather(args):
 
         output_base = os.path.basename(query_filename)
         output_csv = output_base + '.csv'
-
-        fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
-                      'f_unique_to_query', 'f_unique_weighted',
-                      'average_abund', 'median_abund', 'std_abund', 'name',
-                      'filename', 'md5', 'f_match_orig',
-                      'unique_intersect_bp', 'gather_result_rank',
-                      'remaining_bp', 'query_filename', 'query_name',
-                      'query_md5', 'query_bp']
+        fieldnames = GatherResult._fields
         with FileOutputCSV(output_csv) as fp:
             w = csv.DictWriter(fp, fieldnames=fieldnames)
             w.writeheader()
@@ -1192,11 +1174,7 @@ def prefetch(args):
     csvout_fp = None
     csvout_w = None
     if args.output:
-        fieldnames = ['intersect_bp', 'jaccard',
-                      'max_containment', 'f_query_match', 'f_match_query',
-                      'match_filename', 'match_name', 'match_md5', 'match_bp',
-                      'query_filename', 'query_name', 'query_md5', 'query_bp']
-
+        fieldnames = PrefetchResult._fields
         csvout_fp = FileOutput(args.output, 'wt').open()
         csvout_w = csv.DictWriter(csvout_fp, fieldnames=fieldnames)
         csvout_w.writeheader()
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index 9867f9f697..c84cb7a4ad 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -160,7 +160,9 @@ def collect(self, score, match):
 
 # generic SearchResult tuple.
 SearchResult = namedtuple('SearchResult',
-                          'similarity, match, md5, filename, name, query, query_filename, query_name, query_md5')
+                          ['similarity', 'match', 'md5', 'filename', 'name',
+                           'query', 'query_filename', 'query_name', 'query_md5',
+                           'ksize'])
 
 
 def format_bp(bp):
@@ -193,6 +195,7 @@ def search_databases_with_flat_query(query, databases, **kwargs):
     results.sort(key=lambda x: -x[0])
 
     x = []
+    ksize = query.minhash.ksize
     for (score, match, filename) in results:
         x.append(SearchResult(similarity=score,
                               match=match,
@@ -202,7 +205,8 @@ def search_databases_with_flat_query(query, databases, **kwargs):
                               query=query,
                               query_filename=query.filename,
                               query_name=query.name,
-                              query_md5=query.md5sum()[:8]
+                              query_md5=query.md5sum()[:8],
+                              ksize=ksize,
                               ))
 
     return x
@@ -235,7 +239,8 @@ def search_databases_with_abund_query(query, databases, **kwargs):
                               query=query,
                               query_filename=query.filename,
                               query_name=query.name,
-                              query_md5=query.md5sum()[:8]
+                              query_md5=query.md5sum()[:8],
+                              ksize=query.minhash.ksize,
                               ))
 
     return x
@@ -243,8 +248,11 @@ def search_databases_with_abund_query(query, databases, **kwargs):
 
 ### gather code
 ###
-GatherResult = namedtuple('GatherResult',
-                          'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match, f_match_orig, unique_intersect_bp, gather_result_rank, remaining_bp, query_filename, query_name, query_md5, query_bp')
+GatherResult = namedtuple('GatherResult', ['intersect_bp', 'f_orig_query', 'f_match', 'f_unique_to_query',
+                                           'f_unique_weighted', 'average_abund', 'median_abund', 'std_abund', 'filename',
+                                           'name', 'md5', 'match', 'f_match_orig', 'unique_intersect_bp', 'gather_result_rank',
+                                           'remaining_bp', 'query_filename', 'query_name', 'query_md5', 'query_bp', 'ksize',
+                                           'moltype', 'scaled', 'query_n_hashes', 'query_abundance'])
 
 
 def _find_best(counters, query, threshold_bp):
@@ -453,6 +461,11 @@ def __next__(self):
                               query_filename=self.orig_query_filename,
                               query_name=self.orig_query_name,
                               query_md5=self.orig_query_md5,
+                              ksize=self.orig_query_mh.ksize,
+                              moltype=self.orig_query_mh.moltype,
+                              scaled=scaled,
+                              query_n_hashes=len(self.orig_query_mh),
+                              query_abundance=self.orig_query_mh.track_abundance,
                               )
         self.result_n += 1
         self.query = new_query
@@ -466,7 +479,11 @@ def __next__(self):
 ###
 
 PrefetchResult = namedtuple('PrefetchResult',
-                            'intersect_bp, jaccard, max_containment, f_query_match, f_match_query, match, match_filename, match_name, match_md5, match_bp, query, query_filename, query_name, query_md5, query_bp')
+                            ['intersect_bp', 'jaccard', 'max_containment', 'f_query_match',
+                             'f_match_query', 'match', 'match_filename', 'match_name',
+                             'match_md5', 'match_bp', 'query', 'query_filename', 'query_name',
+                             'query_md5', 'query_bp', 'ksize', 'moltype', 'scaled',
+                             'query_n_hashes', 'query_abundance'])
 
 
 def calculate_prefetch_info(query, match, scaled, threshold_bp):
@@ -505,7 +522,12 @@ def calculate_prefetch_info(query, match, scaled, threshold_bp):
                             query=query,
                             query_filename=query.filename,
                             query_name=query.name,
-                            query_md5=query.md5sum()[:8]
+                            query_md5=query.md5sum()[:8],
+                            ksize=query_mh.ksize,
+                            moltype=query_mh.moltype,
+                            scaled=scaled,
+                            query_n_hashes=len(query_mh),
+                            query_abundance=query_mh.track_abundance,
                             )
 
     return result
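
A note on the commands.py side of the change: csv.DictWriter accepts any sequence
of field names, and a namedtuple's _fields attribute is a tuple of its field names
in declaration order, so replacing the hand-maintained fieldname lists with
SearchResult._fields / GatherResult._fields / PrefetchResult._fields means the CSV
headers now track the result tuples automatically whenever a column is added. A
minimal standalone sketch of that pattern (Point is an illustrative stand-in, not
a sourmash type):

    from collections import namedtuple
    import csv
    import io

    # _fields preserves declaration order, so the CSV header
    # stays in sync with the namedtuple definition.
    Point = namedtuple('Point', ['x', 'y'])

    buf = io.StringIO()
    w = csv.DictWriter(buf, fieldnames=Point._fields)
    w.writeheader()
    w.writerow(Point(x=1, y=2)._asdict())   # _asdict() maps field -> value
    print(buf.getvalue())                   # -> "x,y" then "1,2"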
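For context on why the new ksize, scaled, and query_n_hashes columns matter:
together with the containment fractions these CSVs already report, they are enough
to estimate ANI downstream via the standard point estimate ANI ~= C ** (1/k). A
minimal sketch under those assumptions, not part of this patch: the prefetch.csv
filename is hypothetical, and no scaled-hash correction or confidence interval is
applied here.

    import csv

    def containment_to_ani(containment, ksize):
        # standard point estimate: ANI ~= C ** (1/k)
        if containment <= 0:
            return 0.0
        return containment ** (1.0 / ksize)

    # hypothetical usage against a CSV written by `gather --save-prefetch-csv`
    # or `prefetch -o prefetch.csv ...` after this patch:
    with open('prefetch.csv', newline='') as fp:
        for row in csv.DictReader(fp):
            ani = containment_to_ani(float(row['max_containment']),
                                     int(row['ksize']))
            print(row['match_name'], f"{ani:.4f}")

The query_n_hashes and query_abundance columns carry the remaining per-query inputs
a fuller estimator would need, which is the stated goal of this patch.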