From ec09882989b7f2f70e78e2ebe1b66b760d706a3a Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 14 Apr 2022 18:01:14 -0700 Subject: [PATCH 1/3] add some search/gather/prefetch columns to enable ANI estimation --- src/sourmash/commands.py | 34 ++++++---------------------------- src/sourmash/search.py | 38 +++++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index baf7653a04..7b75a19380 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -16,7 +16,7 @@ from .logging import notify, error, print_results, set_quiet from .sourmash_args import (FileOutput, FileOutputCSV, SaveSignaturesToLocation) -from .search import prefetch_database, PrefetchResult, calculate_prefetch_info +from .search import SearchResult, prefetch_database, PrefetchResult, GatherResult, calculate_prefetch_info from .index import LazyLinearIndex WATERMARK_SIZE = 10000 @@ -529,8 +529,7 @@ def search(args): notify("** reporting only one match because --best-only was set") if args.output: - fieldnames = ['similarity', 'name', 'filename', 'md5', - 'query_filename', 'query_name', 'query_md5'] + fieldnames = SearchResult._fields with FileOutputCSV(args.output) as fp: w = csv.DictWriter(fp, fieldnames=fieldnames) @@ -685,11 +684,7 @@ def gather(args): prefetch_csvout_fp = None prefetch_csvout_w = None if args.save_prefetch_csv: - fieldnames = ['intersect_bp', 'jaccard', - 'max_containment', 'f_query_match', 'f_match_query', - 'match_filename', 'match_name', 'match_md5', 'match_bp', - 'query_filename', 'query_name', 'query_md5', 'query_bp'] - + fieldnames = PrefetchResult._fields prefetch_csvout_fp = FileOutput(args.save_prefetch_csv, 'wt').open() prefetch_csvout_w = csv.DictWriter(prefetch_csvout_fp, fieldnames=fieldnames) prefetch_csvout_w.writeheader() @@ -804,13 +799,7 @@ def gather(args): # save CSV? if found and args.output: - fieldnames = ['intersect_bp', 'f_orig_query', 'f_match', - 'f_unique_to_query', 'f_unique_weighted', - 'average_abund', 'median_abund', 'std_abund', 'name', - 'filename', 'md5', 'f_match_orig', 'unique_intersect_bp', - 'gather_result_rank', 'remaining_bp', - 'query_filename', 'query_name', 'query_md5', 'query_bp'] - + fieldnames = GatherResult._fields with FileOutputCSV(args.output) as fp: w = csv.DictWriter(fp, fieldnames=fieldnames) w.writeheader() @@ -977,14 +966,7 @@ def multigather(args): output_base = os.path.basename(query_filename) output_csv = output_base + '.csv' - - fieldnames = ['intersect_bp', 'f_orig_query', 'f_match', - 'f_unique_to_query', 'f_unique_weighted', - 'average_abund', 'median_abund', 'std_abund', 'name', - 'filename', 'md5', 'f_match_orig', - 'unique_intersect_bp', 'gather_result_rank', - 'remaining_bp', 'query_filename', 'query_name', - 'query_md5', 'query_bp'] + fieldnames = GatherResult._fields with FileOutputCSV(output_csv) as fp: w = csv.DictWriter(fp, fieldnames=fieldnames) w.writeheader() @@ -1188,11 +1170,7 @@ def prefetch(args): csvout_fp = None csvout_w = None if args.output: - fieldnames = ['intersect_bp', 'jaccard', - 'max_containment', 'f_query_match', 'f_match_query', - 'match_filename', 'match_name', 'match_md5', 'match_bp', - 'query_filename', 'query_name', 'query_md5', 'query_bp'] - + fieldnames = PrefetchResult._fields csvout_fp = FileOutput(args.output, 'wt').open() csvout_w = csv.DictWriter(csvout_fp, fieldnames=fieldnames) csvout_w.writeheader() diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 9867f9f697..e59d55179d 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -160,7 +160,9 @@ def collect(self, score, match): # generic SearchResult tuple. SearchResult = namedtuple('SearchResult', - 'similarity, match, md5, filename, name, query, query_filename, query_name, query_md5') + ['similarity', 'match', 'md5', 'filename', 'name', + 'query', 'query_filename', 'query_name', 'query_md5', + 'ksize']) def format_bp(bp): @@ -193,6 +195,7 @@ def search_databases_with_flat_query(query, databases, **kwargs): results.sort(key=lambda x: -x[0]) x = [] + ksize = query_mh.ksize for (score, match, filename) in results: x.append(SearchResult(similarity=score, match=match, @@ -202,7 +205,8 @@ def search_databases_with_flat_query(query, databases, **kwargs): query=query, query_filename=query.filename, query_name=query.name, - query_md5=query.md5sum()[:8] + query_md5=query.md5sum()[:8], + ksize=ksize, )) return x @@ -235,7 +239,8 @@ def search_databases_with_abund_query(query, databases, **kwargs): query=query, query_filename=query.filename, query_name=query.name, - query_md5=query.md5sum()[:8] + query_md5=query.md5sum()[:8], + ksize=query.minhash.ksize, )) return x @@ -243,8 +248,11 @@ def search_databases_with_abund_query(query, databases, **kwargs): ### gather code ### -GatherResult = namedtuple('GatherResult', - 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match, f_match_orig, unique_intersect_bp, gather_result_rank, remaining_bp, query_filename, query_name, query_md5, query_bp') +GatherResult = namedtuple('GatherResult', ['intersect_bp', 'f_orig_query', 'f_match', 'f_unique_to_query', + 'f_unique_weighted','average_abund', 'median_abund', 'std_abund', 'filename', + 'name', 'md5', 'match', 'f_match_orig', 'unique_intersect_bp', 'gather_result_rank', + 'remaining_bp', 'query_filename', 'query_name', 'query_md5', 'query_bp', 'ksize', + 'moltype', 'num', 'scaled', 'query_nhashes', 'query_abundance']) def _find_best(counters, query, threshold_bp): @@ -453,6 +461,12 @@ def __next__(self): query_filename=self.orig_query_filename, query_name=self.orig_query_name, query_md5=self.orig_query_md5, + ksize = self.orig_query_mh.ksize, + moltype = self.orig_query_mh.moltype, + num = self.orig_query_mh.num, + scaled = scaled, + query_nhashes=len(self.orig_query_mh), + query_abundance=self.orig_query_mh.track_abundance, ) self.result_n += 1 self.query = new_query @@ -466,7 +480,11 @@ def __next__(self): ### PrefetchResult = namedtuple('PrefetchResult', - 'intersect_bp, jaccard, max_containment, f_query_match, f_match_query, match, match_filename, match_name, match_md5, match_bp, query, query_filename, query_name, query_md5, query_bp') + ['intersect_bp', 'jaccard', 'max_containment', 'f_query_match', + 'f_match_query', 'match', 'match_filename', 'match_name', + 'match_md5', 'match_bp', 'query', 'query_filename', 'query_name', + 'query_md5', 'query_bp', 'ksize', 'moltype', 'num', 'scaled', + 'query_nhashes', 'query_abundance']) def calculate_prefetch_info(query, match, scaled, threshold_bp): @@ -505,7 +523,13 @@ def calculate_prefetch_info(query, match, scaled, threshold_bp): query=query, query_filename=query.filename, query_name=query.name, - query_md5=query.md5sum()[:8] + query_md5=query.md5sum()[:8], + ksize = query_mh.ksize, + moltype = query_mh.moltype, + num = query_mh.num, + scaled = scaled, + query_nhashes=len(query_mh), + query_abundance=query_mh.track_abundance, ) return result From 479d315c7f5daec8d09030e5140a99a92c2bdf07 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Fri, 15 Apr 2022 11:33:12 -0700 Subject: [PATCH 2/3] fix introduced err --- src/sourmash/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/search.py b/src/sourmash/search.py index e59d55179d..ef3925c855 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -195,7 +195,7 @@ def search_databases_with_flat_query(query, databases, **kwargs): results.sort(key=lambda x: -x[0]) x = [] - ksize = query_mh.ksize + ksize = query.minhash.ksize for (score, match, filename) in results: x.append(SearchResult(similarity=score, match=match, From 644a4dd83eb7f6f48ad11c54122ddeecb35a9faa Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Fri, 15 Apr 2022 17:56:01 -0700 Subject: [PATCH 3/3] use query_n_hashes; remove num --- src/sourmash/search.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/sourmash/search.py b/src/sourmash/search.py index ef3925c855..c84cb7a4ad 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -252,7 +252,7 @@ def search_databases_with_abund_query(query, databases, **kwargs): 'f_unique_weighted','average_abund', 'median_abund', 'std_abund', 'filename', 'name', 'md5', 'match', 'f_match_orig', 'unique_intersect_bp', 'gather_result_rank', 'remaining_bp', 'query_filename', 'query_name', 'query_md5', 'query_bp', 'ksize', - 'moltype', 'num', 'scaled', 'query_nhashes', 'query_abundance']) + 'moltype', 'scaled', 'query_n_hashes', 'query_abundance']) def _find_best(counters, query, threshold_bp): @@ -463,9 +463,8 @@ def __next__(self): query_md5=self.orig_query_md5, ksize = self.orig_query_mh.ksize, moltype = self.orig_query_mh.moltype, - num = self.orig_query_mh.num, scaled = scaled, - query_nhashes=len(self.orig_query_mh), + query_n_hashes=len(self.orig_query_mh), query_abundance=self.orig_query_mh.track_abundance, ) self.result_n += 1 @@ -483,8 +482,8 @@ def __next__(self): ['intersect_bp', 'jaccard', 'max_containment', 'f_query_match', 'f_match_query', 'match', 'match_filename', 'match_name', 'match_md5', 'match_bp', 'query', 'query_filename', 'query_name', - 'query_md5', 'query_bp', 'ksize', 'moltype', 'num', 'scaled', - 'query_nhashes', 'query_abundance']) + 'query_md5', 'query_bp', 'ksize', 'moltype', 'scaled', + 'query_n_hashes', 'query_abundance']) def calculate_prefetch_info(query, match, scaled, threshold_bp): @@ -526,9 +525,8 @@ def calculate_prefetch_info(query, match, scaled, threshold_bp): query_md5=query.md5sum()[:8], ksize = query_mh.ksize, moltype = query_mh.moltype, - num = query_mh.num, scaled = scaled, - query_nhashes=len(query_mh), + query_n_hashes=len(query_mh), query_abundance=query_mh.track_abundance, )