diff --git a/README.md b/README.md index baac250..f3294fe 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Identifying sequences that are in a metatranscriptome but not in a metagenome This repository curates a set of paired metagenomes and metatranscriptomes and provides a pipeline to rapidly identify the fraction of sequences in a metatranscriptome that are not in a metagenome. -The pipeline is shaped around a metadata file, 'inputs/metadata-paired-mgx-mtx.tsv', that contains a sample name (`sample_name`), metagenome SRA run accession (`mgx_run_accession`; `SRR*`, `ERR*`, `DRR*`), metatranscriptome SRA run accession (`mtx_run_accession`), and a sample type (`sample_type`). +The pipeline is shaped around a metadata file, `inputs/metadata-paired-mgx-mtx.tsv`, that contains a sample name (`sample_name`), metagenome SRA run accession (`mgx_run_accession`; `SRR*`, `ERR*`, `DRR*`), metatranscriptome SRA run accession (`mtx_run_accession`), and a sample type (`sample_type`). Using the run accessions, it downloads the sequencing data from the SRA and generates a [FracMinHash sketch](https://www.biorxiv.org/content/10.1101/2022.01.11.475838v2.abstract) of each run. Then, it uses the paired information encoded in the metadata table to subtract the metagenome sketch from the metatranscriptome sketch. This produces an estimate of the fraction metatranscriptome sequences not found in the paired metagenome. @@ -12,7 +12,46 @@ Some metagenome and metatranscripome pairs are true pairs that were extracted fr ## Getting started with this repository -TBD +This repository uses snakemake to run the pipeline and conda to manage software environments and installations. +You can find operating system-specific instructions for installing miniconda [here](https://docs.conda.io/en/latest/miniconda.html). +After installing conda and [mamba](https://mamba.readthedocs.io/en/latest/), run the following command to create the pipeline run environment. +We installaed Miniconda3 version `py39_4.12.0` and mamba version `0.15.3`. + +``` +mamba env create -n mtx_mgx --file environment.yml +conda activate mtx_mgx +``` + +To start the pipeline, run: +``` +snakemake --use-conda -j 2 +``` + +### Running this repository on AWS + +This repository was executed on an AWS EC2 instance (Ubuntu 22.04 LTS ami-085284d24fe829cd0, t2.large, 500 GiB EBS `gp2` root storage). +The instance was configured using the following commands: + +``` +curl -JLO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh # download the miniconda installation script +bash Miniconda3-latest-Linux-x86_64.sh # run the miniconda installation script. Accept the license and follow the defaults. +source ~/.bashrc # source the .bashrc for miniconda to be available in the environment + +# configure miniconda channel order +conda config --add channels defaults +conda config --add channels bioconda +conda config --add channels conda-forge +conda config --set channel_priority strict +conda install mamba # install mamba for faster software installation. +``` + +Once miniconda is configured, clone the repository and `cd` into it, then follow the instructions in the above section. +``` +git clone https://github.com/Arcadia-Science/2022-mtx-not-in-mgx-pairs.git +cd 2022-mtx-not-in-mgx-pairs +``` + +Note that there is a [known issue](https://github.com/ncbi/sra-tools/issues/645) with the conda package `sra-tools=2.11.0` that will prevent this workflow from running on Mac operating systems. ## Next steps diff --git a/Snakefile b/Snakefile new file mode 100644 index 0000000..c1f5aa0 --- /dev/null +++ b/Snakefile @@ -0,0 +1,91 @@ +import pandas as pd +import sourmash +import os + +# create wildcard variables for workflow +metadata = pd.read_csv("inputs/metadata-paired-mgx-mtx.tsv", sep = "\t") # read in metadata as a pandas dataframe +MTX = metadata['mtx_run_accession'].unique().tolist() # make run accession in mtx col into list +MGX = metadata['mgx_run_accession'].unique().tolist() # make run accession in mgx col into list +RUN_ACCESSIONS = MGX + MTX # combine mtx and mgx into one list with all run accessions +SAMPLES = metadata['sample_name'].unique().tolist() # make a list of sample names +KSIZES = [21, 31, 51] # create a list of k-mer sizes for the workflow + +rule all: + input: expand("outputs/sourmash_sketch_subtract/{sample}_k{ksize}.sig", sample = SAMPLES, ksize = KSIZES) + +rule sourmash_sketch: + """ + Use fastq-dump to download sequencing data for a given accession. + Pipe the sequencing data directly to a sketch without writing to disk. + Using fastq-dump like this dumps both single end and paired end reads into std out, which can then be piped into sourmash sketch + """ + output: "outputs/sourmash_sketch/{run_accession}.sig" + conda: 'envs/sourmash.yml' + shell:''' + fastq-dump --disable-multithreading --fasta 0 --skip-technical --readids --read-filter pass --dumpbase --split-spot --clip -Z {wildcards.run_accession} | + sourmash sketch dna -p k=21,k=31,k=51,scaled=200,abund --name {wildcards.run_accession} -o {output} - + ''' + +rule calculate_mtx_not_in_mgx: + """ + Use the sourmash python API to subtract metagenome fracminhash sketches from metatranscriptome fracminhash sketches. + This for loop re-implements a simplified veresion of the the CLI of sourmash sig subtract. + https://github.com/sourmash-bio/sourmash/blob/latest/src/sourmash/sig/__main__.py + Doing this in a python script is the easiest way to map pairs of samples to each other within the snakemake workflow. + + The run directive doesn't allow a run to have a conda environment; + therefore the run environment for this code is specified in environment.yml + """ + input: + metadata = 'inputs/metadata-paired-mgx-mtx.tsv', + sigs = expand('outputs/sourmash_sketch/{run_accession}.sig', run_accession = RUN_ACCESSIONS) + output: + sigs = expand("outputs/sourmash_sketch_subtract/{sample}_k{{ksize}}.sig", sample = SAMPLES) + run: + ksize = wildcards.ksize + # read in metadata dataframe to derive sample pairs + metadata = metadata.reset_index() # make sure indexes pair with number of rows + + for index, row in metadata.iterrows(): + # grab the run accessions for a given paired metagenome and metatranscriptome + mtx_run_accession = row['mtx_run_accession'] + mgx_run_accession = row['mgx_run_accession'] + # using the run assession, create paths of the signatures + mtx_sigfile = os.path.join('outputs/sourmash_sketch', mtx_run_accession + '.sig') + mgx_sigfile = os.path.join('outputs/sourmash_sketch', mgx_run_accession + '.sig') + + # read in mtx signature and grab both the minhash object and the hashes in that object + mtx_sigobj = sourmash.load_one_signature(mtx_sigfile, ksize=ksize, select_moltype='DNA') + mtx_mh = from_sigobj.minhash + # turn the mtx hashes into a set + mtx_subtract_mins = set(mtx_mh.hashes) + + # read in mgx signature + mgx_sigobj = sourmash.load_one_signature(mgx_sig_path, ksize=ksize, select_moltype='DNA') + + # do the subtraction + mtx_subtract_mins -= set(mgx_sigobj.minhash.hashes) + + # make a new signature that only contains the mtx hashes that were not in the mgx + mtx_subtract_mh = mtx_sigobj.minhash.copy_and_clear().flatten() + mtx_subtract_mh.add_many(mtx_subtract_mins) + + # re-establish hash abundances from mtx sig + # re-read in mtx sig just in case abundances were removed in place + abund_sig = sourmash.load_one_signature(mtx_sig_path, ksize=ksize, select_moltype='DNA') + mtx_subtract_mh = mtx_subtract_mh.inflate(abund_sig.minhash) + + # create a new sig object that can be written to file + mtx_subtract_sigobj = sourmash.SourmashSignature(mtx_subtract_mh) + + # create a name that reflects the signature contents + name = row['sample_name'] + mtx_subtract_sigobj.name = name + + # create output sig file name + sig_filename = os.path.join('outputs/sourmash_sketch_subtract/' + name + "_k" + ksize + ".sig") + # write sig to file + with sourmash.sourmash_args.FileOutput(sig_filename, 'wt') as fp: + sourmash.save_signatures([mtx_subtract_sigobj], fp=fp) + + diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..b3083fa --- /dev/null +++ b/environment.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - snakemake-minimal=7.12.1 + - sourmash-minimal=4.4.3 + - pandas=1.4.3 diff --git a/envs/sourmash.yml b/envs/sourmash.yml new file mode 100644 index 0000000..ee025ae --- /dev/null +++ b/envs/sourmash.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - sourmash-minimal=4.4.3 + - sra-tools=2.11.0