Skip to content

Commit

Permalink
Merge pull request #171 from dib-lab/tr-update-sourmash
Browse files Browse the repository at this point in the history
[MRG] update to sourmash 4.1.1
  • Loading branch information
taylorreiter authored May 24, 2021
2 parents 89eac59 + 16bc5bc commit 06e1c35
Show file tree
Hide file tree
Showing 16 changed files with 57 additions and 53 deletions.
20 changes: 11 additions & 9 deletions charcoal/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -287,22 +287,23 @@ rule contigs_sig:
"""

# run a search, query.x.database.
rule search_all:
rule prefetch_all:
input:
query = stage1_dir + '/{filename}.sig',
databases = config['gather_db']
output:
csv = stage1_dir + '/{filename}.matches.csv',
matches = stage1_dir + '/{filename}.matches.sig',
matches = stage1_dir + '/{filename}.matches.zip',
txt = stage1_dir + '/{filename}.matches.txt'
params:
moltype = "--{}".format(config['moltype'].lower()),
gather_scaled = config['gather_scaled']
gather_scaled = config['gather_scaled'],
threshold_bp = config['gather_scaled']*3
conda: 'conf/env-sourmash.yml'
shell: """
sourmash search {input.query} {input.databases} -o {output.csv} \
--containment {params.moltype} --scaled {params.gather_scaled} \
--save-matches {output.matches} --threshold=0.001 >& {output.txt}
sourmash prefetch {input.query} {input.databases} -o {output.csv} \
{params.moltype} --scaled {params.gather_scaled} \
--save-matches {output.matches} --threshold-bp {params.threshold_bp} >& {output.txt}
cat {output.txt}
touch {output.csv} {output.matches}
"""
Expand All @@ -312,7 +313,8 @@ rule make_contigs_taxonomy_json:
input:
genome = genome_dir + '/{f}',
genome_sig = stage1_dir + '/{f}.sig',
matches = stage1_dir + '/{f}.matches.sig',
matches = stage1_dir + '/{f}.matches.zip',

lineages = config['lineages_csv']
output:
json = stage1_dir + '/{f}.contigs-tax.json',
Expand All @@ -333,7 +335,7 @@ rule make_contigs_taxonomy_json:
rule compare_taxonomy_single:
input:
json = stage1_dir + '/{g}.contigs-tax.json',
sig = stage1_dir + '/{g}.matches.sig',
sig = stage1_dir + '/{g}.matches.zip',
lineages = config['lineages_csv'],
provided_lineages = provided_lineages_file,
genome_list = genome_list_file
Expand Down Expand Up @@ -443,7 +445,7 @@ checkpoint hitlist_make_contigs_matches:
input:
genome = genome_dir + '/{g}',
genome_sig = stage1_dir + '/{g}.sig',
matches = stage1_dir + '/{g}.matches.sig',
matches = stage1_dir + '/{g}.matches.zip',
lineages = config['lineages_csv'],
hitlist = output_dir + '/stage1_hitlist.csv'
output:
Expand Down
24 changes: 13 additions & 11 deletions charcoal/compare_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,12 @@ def guess_tax_by_gather(entire_mh, lca_db, lin_db, match_rank, report_fp):
first_count = count

sum_ident += count

f_ident = sum_ident / len(entire_mh)
f_major = first_count / sum_ident

f_ident = 0
f_major = 0
if sum_ident:
f_ident = sum_ident / len(entire_mh)
f_major = first_count / sum_ident

return first_lin, f_ident, f_major

Expand Down Expand Up @@ -119,11 +122,10 @@ def choose_genome_lineage(guessed_genome_lineage, provided_lineage, match_rank,

def get_genome_taxonomy(matches_filename, genome_sig_filename, provided_lineage,
tax_assign, match_rank, min_f_ident, min_f_major):
with open(matches_filename, 'rt') as fp:
try:
siglist = list(sourmash.load_signatures(fp, do_raise=True, quiet=True))
except sourmash.exceptions.SourmashError:
siglist = None
try:
siglist = list(sourmash.load_file_as_signatures(matches_filename))
except (ValueError, AssertionError) as e:
siglist = None

if not siglist:
comment = 'no matches for this genome.'
Expand Down Expand Up @@ -233,7 +235,7 @@ def main(args):
detected_contam = {}

summary_d = {}
matches_filename = os.path.join(dirname, genome_name + '.matches.sig')
matches_filename = os.path.join(dirname, genome_name + '.matches.zip')
genome_sig = os.path.join(dirname, genome_name + '.sig')
lineage = provided_lineages.get(genome_name, '')
contigs_json = os.path.join(dirname, genome_name + '.contigs-tax.json')
Expand Down Expand Up @@ -367,8 +369,8 @@ def main(args):
vals['bad_order_bp'],
vals['bad_family_bp'],
vals['bad_genus_bp'],
f'{vals["f_ident"]:.03}',
f'{vals["f_major"]:.03}',
f'{vals["f_ident"]:.03f}',
f'{vals["f_major"]:.03f}',
vals["lineage"],
vals["comment"]])

Expand Down
2 changes: 1 addition & 1 deletion charcoal/conf/env-reporting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ dependencies:
- pip
- pip:
- git+https://github.com/dib-lab/charcoal.git
- sourmash==4.0.0a3
- sourmash>=4.1.0,<5
- pyinterval
2 changes: 1 addition & 1 deletion charcoal/conf/env-sourmash.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ dependencies:
- pip
- pip:
- git+https://github.com/dib-lab/charcoal.git
- sourmash==4.0.0a3
- sourmash>=4.1.0,<5
- pyinterval
17 changes: 10 additions & 7 deletions charcoal/contigs_list_contaminants.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ def get_matches(mh, lca_db, lin_db, match_rank, threshold_bp):

# do the gather:
while 1:
results = lca_db.gather(query_sig, threshold_bp=threshold_bp)
try:
results = lca_db.gather(query_sig, threshold_bp=threshold_bp)
except ValueError:
break

if not results:
break

Expand Down Expand Up @@ -78,12 +82,10 @@ def main(args):
genome_sig = sourmash.load_one_signature(args.genome_sig)

# load all of the matches from search --containment in the database
with open(args.matches_sig, 'rt') as fp:
try:
siglist = list(sourmash.load_signatures(fp, do_raise=True,
quiet=False))
except sourmash.exceptions.SourmashError:
siglist = []
try:
siglist = list(sourmash.load_file_as_signatures(args.matches_sig))
except ValueError:
siglist = []
print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")

# Hack for examining members of our search database: remove exact matches.
Expand Down Expand Up @@ -132,6 +134,7 @@ def main(args):

screed_iter = screed.open(args.genome)
threshold_bp = empty_mh.scaled * GATHER_MIN_MATCHES

n = - 1
for n, record in enumerate(screed_iter):
# look at each contig individually
Expand Down
21 changes: 9 additions & 12 deletions charcoal/contigs_search_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,10 @@ def main(args):
genome_sig = sourmash.load_one_signature(args.genome_sig)

# load all of the matches from search --containment in the database
with open(args.matches_sig, 'rt') as fp:
try:
siglist = list(sourmash.load_signatures(fp, do_raise=True,
quiet=False))
except sourmash.exceptions.SourmashError:
siglist = []
try:
siglist = list(sourmash.load_file_as_signatures(args.matches_sig))
except (ValueError, AssertionError) as e:
siglist = []
print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")

# Hack for examining members of our search database: remove exact matches.
Expand Down Expand Up @@ -88,14 +86,13 @@ def main(args):
# look at each contig individually
mh = empty_mh.copy_and_clear()
mh.add_sequence(record.sequence, force=True)

# collect all the gather results at genus level, together w/counts;
# here, results is a list of (lineage, count) tuples.
results = list(gather_at_rank(mh, lca_db, lin_db, match_rank))

# store together with size of sequence.
info = ContigGatherInfo(len(record.sequence), len(mh), results)
contigs_tax[record.name] = info
if len(mh):
results = list(gather_at_rank(mh, lca_db, lin_db, match_rank))
# store together with size of sequence.
info = ContigGatherInfo(len(record.sequence), len(mh), results)
contigs_tax[record.name] = info

print(f"Processed {len(contigs_tax)} contigs.")

Expand Down
3 changes: 1 addition & 2 deletions charcoal/just_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,8 +339,7 @@ def main(args):
start_column=2)
print(f'loaded {len(tax_assign)} tax assignments.')

with open(args.matches_sig, 'rt') as fp:
siglist = list(sourmash.load_signatures(fp))
siglist = list(sourmash.load_file_as_signatures(args.matches_sig))

if not siglist:
print('no matches for this genome, exiting.')
Expand Down
7 changes: 4 additions & 3 deletions charcoal/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,15 @@ def close(self):

def gather_at_rank(mh, lca_db, lin_db, match_rank):
"Run gather, and aggregate at given rank."
import copy
minhash = copy.copy(mh)
minhash = mh.to_mutable()
query_sig = sourmash.SourmashSignature(minhash)

# do the gather:
counts = Counter()
while 1:
while query_sig.minhash:

results = lca_db.gather(query_sig, threshold_bp=0)

if not results:
break

Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ dependencies:
- click
- pip
- pip:
- sourmash==4.0.0a3
- sourmash>=4.1.0,<5
Binary file added tests/test-data/2.fa.gz.gather-matches.zip
Binary file not shown.

Large diffs are not rendered by default.

Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_compare_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_basic(location):
args.genome = 'loomba'

shutil.copyfile(utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.sig"), os.path.join(location, "loomba.sig"))
shutil.copyfile(utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.sig"), os.path.join(location, "loomba.matches.sig"))
shutil.copyfile(utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.zip"), os.path.join(location, "loomba.matches.zip"))
shutil.copyfile(utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.contigs-tax.json"), os.path.join(location, "loomba.contigs-tax.json"))

status = compare_taxonomy.main(args)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_contigs_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_1(location):
args = utils.Args()
args.genome = utils.relative_file("tests/test-data/genomes/2.fa.gz")
args.genome_sig = utils.relative_file("tests/test-data/genomes/2.fa.gz.sig")
args.matches_sig = utils.relative_file("tests/test-data/2.fa.gz.gather-matches.sig.gz")
args.matches_sig = utils.relative_file("tests/test-data/2.fa.gz.gather-matches.zip")
args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv")
args.json_out = os.path.join(location, 'tax.json')
args.match_rank = 'genus'
Expand All @@ -32,7 +32,7 @@ def test_2_loomba(location):
args = utils.Args()
args.genome = utils.relative_file("demo/genomes/LoombaR_2017__SID1050_bax__bin.11.fa.gz")
args.genome_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.sig")
args.matches_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.sig")
args.matches_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.zip")
args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv")
args.json_out = os.path.join(location, 'tax.json')
args.match_rank = 'genus'
Expand Down
2 changes: 1 addition & 1 deletion tests/test_decontam.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def make_lca_and_lineages(match_files, lineages_csv, scaled, ksize,
# load matches into a list
siglist = []
for filename in match_files:
siglist += list(sourmash.load_signatures(filename))
siglist += list(sourmash.load_file_as_signatures(filename))

# pull in taxonomic assignments
tax_assign, _ = load_taxonomy_assignments(lineages_csv,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_snakemake.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_make_sig(genome_file):
@pytest.mark.parametrize("genome_file", demo_genomes)
def test_make_gather_matches(request, genome_file):
depends(request, [f"test_make_sig[{g}]" for g in demo_genomes])
target = f'stage1/{genome_file}.matches.sig'
target = f'stage1/{genome_file}.matches.zip'
status, out, err = _run_snakemake_test('demo/demo.conf', target)

assert status == 0
Expand Down

0 comments on commit 06e1c35

Please sign in to comment.