Skip to content

Commit

Permalink
[MRG] add some search/gather/prefetch columns to enable ANI estimation (#1952)
Browse files Browse the repository at this point in the history

* add some search/gather/prefetch columns to enable ANI estimation

* fix introduced err

* use query_n_hashes; remove num
  • Loading branch information
bluegenes committed Apr 16, 2022
1 parent 4080b5d commit efc700b
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 35 deletions.
34 changes: 6 additions & 28 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .logging import notify, error, print_results, set_quiet
from .sourmash_args import (FileOutput, FileOutputCSV,
SaveSignaturesToLocation)
from .search import prefetch_database, PrefetchResult, calculate_prefetch_info
from .search import SearchResult, prefetch_database, PrefetchResult, GatherResult, calculate_prefetch_info
from .index import LazyLinearIndex

WATERMARK_SIZE = 10000
Expand Down Expand Up @@ -533,8 +533,7 @@ def search(args):
notify("** reporting only one match because --best-only was set")

if args.output:
fieldnames = ['similarity', 'name', 'filename', 'md5',
'query_filename', 'query_name', 'query_md5']
fieldnames = SearchResult._fields

with FileOutputCSV(args.output) as fp:
w = csv.DictWriter(fp, fieldnames=fieldnames)
Expand Down Expand Up @@ -689,11 +688,7 @@ def gather(args):
prefetch_csvout_fp = None
prefetch_csvout_w = None
if args.save_prefetch_csv:
fieldnames = ['intersect_bp', 'jaccard',
'max_containment', 'f_query_match', 'f_match_query',
'match_filename', 'match_name', 'match_md5', 'match_bp',
'query_filename', 'query_name', 'query_md5', 'query_bp']

fieldnames = PrefetchResult._fields
prefetch_csvout_fp = FileOutput(args.save_prefetch_csv, 'wt').open()
prefetch_csvout_w = csv.DictWriter(prefetch_csvout_fp, fieldnames=fieldnames)
prefetch_csvout_w.writeheader()
Expand Down Expand Up @@ -808,13 +803,7 @@ def gather(args):

# save CSV?
if found and args.output:
fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
'f_unique_to_query', 'f_unique_weighted',
'average_abund', 'median_abund', 'std_abund', 'name',
'filename', 'md5', 'f_match_orig', 'unique_intersect_bp',
'gather_result_rank', 'remaining_bp',
'query_filename', 'query_name', 'query_md5', 'query_bp']

fieldnames = GatherResult._fields
with FileOutputCSV(args.output) as fp:
w = csv.DictWriter(fp, fieldnames=fieldnames)
w.writeheader()
Expand Down Expand Up @@ -981,14 +970,7 @@ def multigather(args):

output_base = os.path.basename(query_filename)
output_csv = output_base + '.csv'

fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
'f_unique_to_query', 'f_unique_weighted',
'average_abund', 'median_abund', 'std_abund', 'name',
'filename', 'md5', 'f_match_orig',
'unique_intersect_bp', 'gather_result_rank',
'remaining_bp', 'query_filename', 'query_name',
'query_md5', 'query_bp']
fieldnames = GatherResult._fields
with FileOutputCSV(output_csv) as fp:
w = csv.DictWriter(fp, fieldnames=fieldnames)
w.writeheader()
Expand Down Expand Up @@ -1192,11 +1174,7 @@ def prefetch(args):
csvout_fp = None
csvout_w = None
if args.output:
fieldnames = ['intersect_bp', 'jaccard',
'max_containment', 'f_query_match', 'f_match_query',
'match_filename', 'match_name', 'match_md5', 'match_bp',
'query_filename', 'query_name', 'query_md5', 'query_bp']

fieldnames = PrefetchResult._fields
csvout_fp = FileOutput(args.output, 'wt').open()
csvout_w = csv.DictWriter(csvout_fp, fieldnames=fieldnames)
csvout_w.writeheader()
Expand Down
36 changes: 29 additions & 7 deletions src/sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,9 @@ def collect(self, score, match):

# generic SearchResult tuple.
SearchResult = namedtuple('SearchResult',
'similarity, match, md5, filename, name, query, query_filename, query_name, query_md5')
['similarity', 'match', 'md5', 'filename', 'name',
'query', 'query_filename', 'query_name', 'query_md5',
'ksize'])


def format_bp(bp):
Expand Down Expand Up @@ -193,6 +195,7 @@ def search_databases_with_flat_query(query, databases, **kwargs):
results.sort(key=lambda x: -x[0])

x = []
ksize = query.minhash.ksize
for (score, match, filename) in results:
x.append(SearchResult(similarity=score,
match=match,
Expand All @@ -202,7 +205,8 @@ def search_databases_with_flat_query(query, databases, **kwargs):
query=query,
query_filename=query.filename,
query_name=query.name,
query_md5=query.md5sum()[:8]
query_md5=query.md5sum()[:8],
ksize=ksize,
))
return x

Expand Down Expand Up @@ -235,16 +239,20 @@ def search_databases_with_abund_query(query, databases, **kwargs):
query=query,
query_filename=query.filename,
query_name=query.name,
query_md5=query.md5sum()[:8]
query_md5=query.md5sum()[:8],
ksize=query.minhash.ksize,
))
return x

###
### gather code
###

GatherResult = namedtuple('GatherResult',
'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match, f_match_orig, unique_intersect_bp, gather_result_rank, remaining_bp, query_filename, query_name, query_md5, query_bp')
GatherResult = namedtuple('GatherResult', ['intersect_bp', 'f_orig_query', 'f_match', 'f_unique_to_query',
'f_unique_weighted','average_abund', 'median_abund', 'std_abund', 'filename',
'name', 'md5', 'match', 'f_match_orig', 'unique_intersect_bp', 'gather_result_rank',
'remaining_bp', 'query_filename', 'query_name', 'query_md5', 'query_bp', 'ksize',
'moltype', 'scaled', 'query_n_hashes', 'query_abundance'])


def _find_best(counters, query, threshold_bp):
Expand Down Expand Up @@ -453,6 +461,11 @@ def __next__(self):
query_filename=self.orig_query_filename,
query_name=self.orig_query_name,
query_md5=self.orig_query_md5,
ksize = self.orig_query_mh.ksize,
moltype = self.orig_query_mh.moltype,
scaled = scaled,
query_n_hashes=len(self.orig_query_mh),
query_abundance=self.orig_query_mh.track_abundance,
)
self.result_n += 1
self.query = new_query
Expand All @@ -466,7 +479,11 @@ def __next__(self):
###

PrefetchResult = namedtuple('PrefetchResult',
'intersect_bp, jaccard, max_containment, f_query_match, f_match_query, match, match_filename, match_name, match_md5, match_bp, query, query_filename, query_name, query_md5, query_bp')
['intersect_bp', 'jaccard', 'max_containment', 'f_query_match',
'f_match_query', 'match', 'match_filename', 'match_name',
'match_md5', 'match_bp', 'query', 'query_filename', 'query_name',
'query_md5', 'query_bp', 'ksize', 'moltype', 'scaled',
'query_n_hashes', 'query_abundance'])


def calculate_prefetch_info(query, match, scaled, threshold_bp):
Expand Down Expand Up @@ -505,7 +522,12 @@ def calculate_prefetch_info(query, match, scaled, threshold_bp):
query=query,
query_filename=query.filename,
query_name=query.name,
query_md5=query.md5sum()[:8]
query_md5=query.md5sum()[:8],
ksize = query_mh.ksize,
moltype = query_mh.moltype,
scaled = scaled,
query_n_hashes=len(query_mh),
query_abundance=query_mh.track_abundance,
)

return result
Expand Down

0 comments on commit efc700b

Please sign in to comment.