Skip to content

Commit

Permalink
Merge pull request #4 from Arcadia-Science/update_subtract
Browse files Browse the repository at this point in the history
Update subtraction code and add notebook with first round of results
  • Loading branch information
taylorreiter committed Sep 21, 2022
2 parents 73d061a + f43caec commit a33e96e
Show file tree
Hide file tree
Showing 2 changed files with 599 additions and 56 deletions.
94 changes: 38 additions & 56 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ MGX = metadata['mgx_run_accession'].unique().tolist() # make
RUN_ACCESSIONS = MGX + MTX # combine mtx and mgx into one list with all run accessions
SAMPLES = metadata['sample_name'].unique().tolist() # make a list of sample names
KSIZES = [21, 31, 51] # create a list of k-mer sizes for the workflow
MTX_MINUS_MGX = [x + '-minus-' + y for x, y in zip(MTX, MGX)] # create list that binds mtx to mgx

rule all:
input: expand("outputs/sourmash_sketch_subtract/{sample}_k{ksize}.sig", sample = SAMPLES, ksize = KSIZES)
input:
expand("outputs/sourmash_sketch_subtract_describe/{mtx_minus_mgx}-k{ksize}.csv", mtx_minus_mgx = MTX_MINUS_MGX, ksize = KSIZES),
expand("outputs/sourmash_sketch_describe/{run_accession}.csv", run_accession = RUN_ACCESSIONS)

rule sourmash_sketch:
"""
Expand All @@ -28,64 +31,43 @@ rule sourmash_sketch:

rule calculate_mtx_not_in_mgx:
"""
Use the sourmash python API to subtract metagenome fracminhash sketches from metatranscriptome fracminhash sketches.
This for loop re-implements a simplified veresion of the the CLI of sourmash sig subtract.
https://github.com/sourmash-bio/sourmash/blob/latest/src/sourmash/sig/__main__.py
Doing this in a python script is the easiest way to map pairs of samples to each other within the snakemake workflow.
The run directive doesn't allow a run to have a conda environment;
therefore the run environment for this code is specified in environment.yml
Use the sourmash CLI to subtract a metagenome sketch from its paired metatranscriptome, retaining metatranscriptome abundances.
Snakemake will auto-parse the wildcard mtx_minus_mgx on the "-minus-" string to back-propagate the correct wildcards to m*x_run_accession.
Providing the accessions in this way limits the number of ways they can combine;
If MTX solved for mtx_run_accession and MGX solved for mgx_run_accession, then snakemake would execute this rule for all combinations of MTX and MGX.
Instead, by binding pairs together in MTX_MINUS_MGX, only pairs of metatranscriptomes and metagenomes are subtracted.
"""
input:
metadata = 'inputs/metadata-paired-mgx-mtx.tsv',
sigs = expand('outputs/sourmash_sketch/{run_accession}.sig', run_accession = RUN_ACCESSIONS)
output:
sigs = expand("outputs/sourmash_sketch_subtract/{sample}_k{{ksize}}.sig", sample = SAMPLES)
run:
ksize = wildcards.ksize
# read in metadata dataframe to derive sample pairs
metadata = metadata.reset_index() # make sure indexes pair with number of rows

for index, row in metadata.iterrows():
# grab the run accessions for a given paired metagenome and metatranscriptome
mtx_run_accession = row['mtx_run_accession']
mgx_run_accession = row['mgx_run_accession']
# using the run assession, create paths of the signatures
mtx_sigfile = os.path.join('outputs/sourmash_sketch', mtx_run_accession + '.sig')
mgx_sigfile = os.path.join('outputs/sourmash_sketch', mgx_run_accession + '.sig')

# read in mtx signature and grab both the minhash object and the hashes in that object
mtx_sigobj = sourmash.load_one_signature(mtx_sigfile, ksize=ksize, select_moltype='DNA')
mtx_mh = from_sigobj.minhash
# turn the mtx hashes into a set
mtx_subtract_mins = set(mtx_mh.hashes)

# read in mgx signature
mgx_sigobj = sourmash.load_one_signature(mgx_sig_path, ksize=ksize, select_moltype='DNA')

# do the subtraction
mtx_subtract_mins -= set(mgx_sigobj.minhash.hashes)

# make a new signature that only contains the mtx hashes that were not in the mgx
mtx_subtract_mh = mtx_sigobj.minhash.copy_and_clear().flatten()
mtx_subtract_mh.add_many(mtx_subtract_mins)
mtx_sig = 'outputs/sourmash_sketch/{mtx_run_accession}.sig',
mgx_sig = 'outputs/sourmash_sketch/{mgx_run_accession}.sig',
output: "outputs/sourmash_sketch_subtract/{mtx_run_accession}-minus-{mgx_run_accession}-k{ksize}.sig"
conda: "envs/sourmash.yml"
shell:'''
sourmash sig subtract -k {wildcards.ksize} -o {output} -A {input.mtx_sig} {input.mtx_sig} {input.mgx_sig}
'''

# re-establish hash abundances from mtx sig
# re-read in mtx sig just in case abundances were removed in place
abund_sig = sourmash.load_one_signature(mtx_sig_path, ksize=ksize, select_moltype='DNA')
mtx_subtract_mh = mtx_subtract_mh.inflate(abund_sig.minhash)

# create a new sig object that can be written to file
mtx_subtract_sigobj = sourmash.SourmashSignature(mtx_subtract_mh)
rule sourmash_sig_describe_sketches:
"""
Use the sourmash CLI to report detailed information about all sketches, including number of hashes.
Output the information as a csv file.
"""
input: "outputs/sourmash_sketch/{run_accession}.sig"
output: "outputs/sourmash_sketch_describe/{run_accession}.csv"
conda: 'envs/sourmash.yml'
shell:'''
sourmash sig describe --csv {output} {input}
'''

# create a name that reflects the signature contents
name = row['sample_name']
mtx_subtract_sigobj.name = name

# create output sig file name
sig_filename = os.path.join('outputs/sourmash_sketch_subtract/' + name + "_k" + ksize + ".sig")
# write sig to file
with sourmash.sourmash_args.FileOutput(sig_filename, 'wt') as fp:
sourmash.save_signatures([mtx_subtract_sigobj], fp=fp)

rule sourmash_sig_describe_subtracted_sketches:
"""
Use the sourmash CLI to report detailed information about all sketches, including number of hashes.
Output the information as a csv file.
"""
input: "outputs/sourmash_sketch_subtract/{mtx_minus_mgx}-k{ksize}.sig"
output: "outputs/sourmash_sketch_subtract_describe/{mtx_minus_mgx}-k{ksize}.csv"
conda: 'envs/sourmash.yml'
shell:'''
sourmash sig describe --csv {output} {input}
'''

Loading

0 comments on commit a33e96e

Please sign in to comment.