Skip to content

Commit

Permalink
[MRG] fix a bug in gather when high-resolution queries are used (#399)
Browse files Browse the repository at this point in the history
* cleanup a bunch of pyflakes stuff

* put in a test for queries with too low a sampling rate

* fix bug where sum_abunds was not adjusted as scaled changed in gather

* py27 compat
  • Loading branch information
ctb committed Feb 23, 2018
1 parent 9071b1e commit 27dad7c
Show file tree
Hide file tree
Showing 11 changed files with 55 additions and 38 deletions.
2 changes: 1 addition & 1 deletion sourmash_lib/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.0a3
2.0.0a4
2 changes: 0 additions & 2 deletions sourmash_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
An implementation of a MinHash bottom sketch, applied to k-mers in DNA.
"""
from __future__ import print_function
import re
import math
import os

from ._minhash import (MinHash, get_minhash_default_seed, get_minhash_max_hash)
Expand Down
2 changes: 1 addition & 1 deletion sourmash_lib/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys
import argparse

from .logging import notify, error, set_quiet
from .logging import error, set_quiet

from .commands import (categorize, compare, compute, dump, import_csv,
gather, index, sbt_combine, search,
Expand Down
9 changes: 3 additions & 6 deletions sourmash_lib/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,13 @@
import os
import os.path
import sys
from collections import namedtuple
import random

import screed
import sourmash_lib
from . import signature as sig
from . import sourmash_args
from .logging import notify, error, print_results, set_quiet
from .search import format_bp

from .sourmash_args import DEFAULT_LOAD_K
DEFAULT_COMPUTE_K = '21,31,51'
Expand Down Expand Up @@ -252,7 +250,6 @@ def save_siglist(siglist, output_fp, filename=None):
elif args.name_from_first:
name = record.name

s = record.sequence
add_seq(Elist, record.sequence,
args.input_is_protein, args.check_sequence)
notify('')
Expand Down Expand Up @@ -424,7 +421,6 @@ def plot(args):
import matplotlib as mpl
mpl.use('Agg')
import numpy
import scipy
import pylab
import scipy.cluster.hierarchy as sch
from . import fig as sourmash_fig
Expand Down Expand Up @@ -885,7 +881,7 @@ def categorize(args):


def gather(args):
from .search import gather_databases, GatherResult, format_bp
from .search import gather_databases, format_bp

parser = argparse.ArgumentParser()
parser.add_argument('query', help='query signature')
Expand All @@ -907,6 +903,7 @@ def gather(args):
help='suppress non-error output')
parser.add_argument('--ignore-abundance', action='store_true',
help='do NOT use k-mer abundances if present')
parser.add_argument('-d', '--debug', action='store_true')

sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
sourmash_args.add_moltype_args(parser)
Expand Down Expand Up @@ -1017,7 +1014,7 @@ def gather(args):

def watch(args):
"Build a signature from raw FASTA/FASTQ coming in on stdin, search."
from sourmash_lib.sbtmh import search_minhashes, SearchMinHashesFindBest
from sourmash_lib.sbtmh import SearchMinHashesFindBest

parser = argparse.ArgumentParser()
parser.add_argument('sbt_name', help='name of SBT to search')
Expand Down
1 change: 0 additions & 1 deletion sourmash_lib/fig.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from .logging import error, notify
try:
import numpy
import scipy
import pylab
import scipy.cluster.hierarchy as sch
except (RuntimeError, ImportError):
Expand Down
44 changes: 25 additions & 19 deletions sourmash_lib/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .signature import SourmashSignature
from .sbtmh import search_minhashes, search_minhashes_containment
from .sbtmh import SearchMinHashesFindBest

from ._minhash import get_max_hash_for_scaled



Expand Down Expand Up @@ -89,32 +89,34 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):

orig_query = query
orig_mins = orig_query.minhash.get_hashes()
orig_abunds = { k: 1 for k in orig_mins }

# do we pay attention to abundances?o
if orig_query.minhash.track_abundance and not ignore_abundance:
orig_abunds = orig_query.minhash.get_mins(with_abundance=True)
else:
if orig_query.minhash.track_abundance and ignore_abundance:
notify('** ignoring abundance')
orig_abunds = { k: 1 for k in orig_query.minhash.get_mins(with_abundance=False) }
sum_abunds = sum(orig_abunds.values())

# calculate the band size/resolution R for the genome
R_metagenome = orig_query.minhash.scaled

# define a function to do a 'best' search and get only top match.
def find_best(dblist, query):
# CTB: could optimize by sharing scores across searches, i.e.
# a good early score truncates later searches.

results = []
for (sbt_or_siglist, filename, is_sbt) in dblist:
search_fn = SearchMinHashesFindBestIgnoreMaxHash().search

# search a tree
if is_sbt:
tree = sbt_or_siglist
search_fn = SearchMinHashesFindBestIgnoreMaxHash().search

for leaf in tree.find(search_fn, query, 0.0):
leaf_e = leaf.data.minhash
similarity = query.minhash.similarity_ignore_maxhash(leaf_e)
if similarity > 0.0:
results.append((similarity, leaf.data))

# search a signature
else:
for ss in sbt_or_siglist:
similarity = query.minhash.similarity_ignore_maxhash(ss.minhash)
Expand All @@ -131,17 +133,18 @@ def find_best(dblist, query):


# define a function to build new signature object from set of mins
def build_new_signature(mins, template_sig):
def build_new_signature(mins, template_sig, scaled=None):
e = template_sig.minhash.copy_and_clear()
e.add_many(mins)
if scaled:
e = e.downsample_scaled(scaled)
return SourmashSignature(e)

# construct a new query that doesn't have the max_hash attribute set.
new_mins = query.minhash.get_hashes()
query = build_new_signature(new_mins, orig_query)

sum_found = 0.

R_comparison = 0
while 1:
best_similarity, best_leaf, filename = find_best(databases, query)
if not best_leaf: # no matches at all!
Expand All @@ -151,9 +154,7 @@ def build_new_signature(mins, template_sig):
query_mins = set(query.minhash.get_hashes())
found_mins = best_leaf.minhash.get_hashes()

# figure out what the resolution of the banding on the genome is,
# based either on an explicit --scaled parameter, or on genome
# cardinality (deprecated)
# figure out what the resolution of the banding on the subject is
if not best_leaf.minhash.max_hash:
error('Best hash match in sbt_gather has no max_hash')
error('Please prepare database of sequences with --scaled')
Expand All @@ -162,13 +163,16 @@ def build_new_signature(mins, template_sig):
R_genome = best_leaf.minhash.scaled

# pick the highest R / lowest resolution
R_comparison = max(R_metagenome, R_genome)
R_comparison = max(R_comparison, R_metagenome, R_genome)

# CTB: these could probably be replaced by minhash.downsample_scaled.
new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
# eliminate mins under this new resolution.
# (CTB note: this means that if a high scaled/low res signature is
# found early on, resolution will be low from then on.)
new_max_hash = get_max_hash_for_scaled(R_comparison)
query_mins = set([ i for i in query_mins if i < new_max_hash ])
found_mins = set([ i for i in found_mins if i < new_max_hash ])
orig_mins = set([ i for i in orig_mins if i < new_max_hash ])
sum_abunds = sum([ v for (k,v) in orig_abunds.items() if k < new_max_hash ])

# calculate intersection:
intersect_mins = query_mins.intersection(found_mins)
Expand All @@ -186,7 +190,8 @@ def build_new_signature(mins, template_sig):
f_orig_query = len(intersect_orig_mins) / float(len(orig_mins))

# calculate fractions wrt second denominator - metagenome size
query_n_mins = len(orig_query.minhash.get_hashes())
orig_mh = orig_query.minhash.downsample_scaled(R_comparison)
query_n_mins = len(orig_mh)
f_unique_to_query = len(intersect_mins) / float(query_n_mins)

# calculate scores weighted by abundances
Expand All @@ -195,6 +200,7 @@ def build_new_signature(mins, template_sig):
average_abund = sum((orig_abunds[k] for k in intersect_mins)) \
/ len(intersect_mins)

# build a result namedtuple
result = GatherResult(intersect_bp=intersect_bp,
f_orig_query=f_orig_query,
f_match=f_match,
Expand All @@ -208,7 +214,7 @@ def build_new_signature(mins, template_sig):

# construct a new query, minus the previous one.
query_mins -= set(found_mins)
query = build_new_signature(query_mins, orig_query)
query = build_new_signature(query_mins, orig_query, R_comparison)

weighted_missed = sum((orig_abunds[k] for k in query_mins)) \
/ sum_abunds
Expand Down
6 changes: 2 additions & 4 deletions sourmash_lib/signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
Save and load MinHash sketches in a JSON format, along with some metadata.
"""
from __future__ import print_function
import sys
import hashlib
import sourmash_lib
from . import signature_json
from .logging import notify, error
from .logging import error

import io
import gzip
Expand Down Expand Up @@ -239,7 +237,7 @@ def load_one_signature(data, ksize=None, select_moltype=None,
raise ValueError("no signatures to load")

try:
next_sig = next(sigiter)
next(sigiter)
except StopIteration:
return first_sig

Expand Down
1 change: 0 additions & 1 deletion sourmash_lib/signature_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ def load_signatureset_json_iter(data, ksize=None, ignore_md5sum=False, ijson=ijs
prefix, event, value = next(parser)
assert prefix == '' and event == 'start_array' and value is None

siglist = []
n = 0
while True:
try:
Expand Down
6 changes: 3 additions & 3 deletions sourmash_lib/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,9 @@ def traverse_find_sigs(dirnames, yield_all_files=False):


def filter_compatible_signatures(query, siglist, force=False):
for sig in siglist:
if check_signatures_are_compatible(query, sig):
yield sig
for ss in siglist:
if check_signatures_are_compatible(query, ss):
yield ss
else:
if not force:
raise ValueError("incompatible signature")
Expand Down
1 change: 1 addition & 0 deletions tests/test-data/GCF_000006945.2-s500.sig

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -2588,6 +2588,25 @@ def test_gather_metagenome_downsample():
'4.1 Mbp 4.4% 17.1%' in out))


def test_gather_query_downsample():
with utils.TempDirectory() as location:
testdata_glob = utils.get_test_data('gather/GCF*.sig')
testdata_sigs = glob.glob(testdata_glob)

query_sig = utils.get_test_data('GCF_000006945.2-s500.sig')

status, out, err = utils.runscript('sourmash',
['gather', '-k', '31',
query_sig] + testdata_sigs,
in_directory=location)

print(out)
print(err)

assert 'loaded 12 signatures' in err
assert '4.9 Mbp 100.0% 100.0% NC_003197.2' in out


def test_gather_save_matches():
with utils.TempDirectory() as location:
testdata_glob = utils.get_test_data('gather/GCF*.sig')
Expand Down

0 comments on commit 27dad7c

Please sign in to comment.