diff --git a/.circleci/config.yml b/.circleci/config.yml
index 8f1da02..c1ec32e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -71,7 +71,8 @@ jobs:
             sed -i 's/kraken2: False/kraken2: True/' config.yaml
             sed -i 's/metaphlan2: False/metaphlan2: True/' config.yaml
             sed -i 's/humann2: False/humann2: True/' config.yaml
-            sed -i 's/antibiotic_resistance: False/antibiotic_resistance: True/' config.yaml
+            sed -i 's/groot: False/groot: True/' config.yaml
+            sed -i 's/amrplusplus: False/amrplusplus: True/' config.yaml
             sed -i 's/assembly: False/assembly: True/' config.yaml
             sed -i 's/binning: False/binning: True/' config.yaml
             sed -i 's|db_path: \"\"|db_path: \"db/hg19\"|' config.yaml
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0674e73..bef22fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,36 @@
 files), and the patch version is typically incremented for any set of changes
 committed to the master branch that does not trigger any of the aforementioned
 situations.
 
+## [0.4.1] Unreleased
+### Added
+- Created Singularity images for all conda environments. Run with
+  `--use-singularity` (do not combine with `--use-conda`).
+- New cluster profile pseudo-rules for the anonymous mapper rules: `bbmap`
+  and `bowtie2` now take their thread count from `n` in the cluster profile.
+  They still use the time allocation of the `__default__` rule, however.
+- Added the possibility to use `extra:` to define additional arguments passed
+  on to Slurm submissions. Useful for requesting e.g. fat nodes with `extra: "-C fat"`.
+- Added a custom reimplementation of AMRPlusPlus v2.0, which can be executed
+  with either `--use-singularity` or `--use-conda`.
+
+### Fixed
+- The host removal module now correctly respects the `host_removal: False`
+  setting in the config file. Thank you chrsb!
+
+### Changed
+- Do not combine `--use-singularity` with `--use-conda` anymore. The new
+  Singularity images already contain all dependencies.
+- All rules now take their thread count from cluster_config if defined. Old
+  defaults are still used for local execution.
+- The shebang of `area_plot.py` has been changed to work in more environments.
+- Implemented a workaround for an error caused by automatic report generation
+  when using Singularity.
+- Disabled the taxonomic area plot for Kaiju outputs due to issues processing
+  the output files.
+
+### Removed
+
+
 ## [0.4.0] 2020-02-18
 ### Added
 - Added resource limiter for HUMAnN2 due to its intense use of huge temporary
diff --git a/README.md b/README.md
index e7c348c..7160fed 100644
--- a/README.md
+++ b/README.md
@@ -20,8 +20,11 @@ Go to https://stag-mwc.readthedocs.org for the full documentation.
 [Snakemake](https://snakemake.readthedocs.io) are required to be able to use
 StaG-mwc. Most people would probably want to install
 [Miniconda](https://conda.io/miniconda.html) and install Snakemake into their
-base environment. Conda will automatically install the required versions of
-all tools required to run StaG-mwc.
+base environment. When running StaG with the `--use-conda` or
+`--use-singularity` flag, all dependencies are managed automatically: conda
+installs the required versions of all tools needed to run StaG-mwc, and the
+Singularity images already contain them. There is no need to combine the two
+flags.
 
 ### Step 1: Clone workflow
 To use StaG-mwc, you need a local copy of the workflow repository. Start by
@@ -58,11 +61,16 @@ Make sure you edit the Slurm project account in
 documentation](https://snakemake.readthedocs.io) for further details on how to
 run Snakemake workflows on other types of cluster resources.
 
+Note that in all examples above, `--use-conda` can essentially be replaced
+with `--use-singularity` to run in Singularity containers instead of using a
+locally installed conda. Read more about this under the Running section in
+the docs.
 
 ## Testing
 A very basic continuous integration test is currently in place. It merely
 validates the syntax by trying to let Snakemake build the dependency graph if
-all outputs are activated.
+all outputs are activated. Suggestions for how to improve the automated
+testing of StaG-mwc are very welcome!
 
 ## Contributing
diff --git a/Snakefile b/Snakefile
index fbedfee..a0590f5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -16,8 +16,7 @@ min_version("5.5.4")
 
 from rules.publications import publications
 
-stag_version = "0.4.0"
-singularity: "docker://continuumio/miniconda3:4.7.10"
+stag_version = "0.4.1"
 
 onstart:
     print("\n".join([
@@ -76,6 +75,7 @@ include: "rules/functional_profiling/humann2.smk"
 # Antibiotic resistance
 #############################
 include: "rules/antibiotic_resistance/groot.smk"
+include: "rules/antibiotic_resistance/amrplusplus.smk"
 
 #############################
 # Mappers
@@ -153,12 +153,10 @@ onsuccess:
         Path("citations.rst").unlink()
         Path("citations.rst").symlink_to(citation_filename)
 
-    snakemake_call = " ".join(argv)
-    shell("{snakemake_call} --unlock".format(snakemake_call=snakemake_call))
+    shell("{snakemake_call} --unlock".format(snakemake_call=argv[0]))
     shell("{snakemake_call} --report {report}-{datetime}.html".format(
-            snakemake_call=snakemake_call,
+            snakemake_call=argv[0],
             report=config["report"],
             datetime=report_datetime,
         )
     )
-
diff --git a/cluster_configs/ctmr_gandalf/config.yaml b/cluster_configs/ctmr_gandalf/config.yaml
new file mode 100644
index 0000000..7f08cd1
--- /dev/null
+++ b/cluster_configs/ctmr_gandalf/config.yaml
@@ -0,0 +1,8 @@
+jobscript: "cluster_configs/ctmr_gandalf/slurm-jobscript.sh"
+cluster: "cluster_configs/ctmr_gandalf/slurm-submit.py --time {cluster.time} --error {cluster.stderr} --output {cluster.stdout} --job-name '{cluster.jobname}' {cluster.extra}"
+cluster-status: "cluster_configs/ctmr_gandalf/slurm-status.py"
+cluster-config: "cluster_configs/ctmr_gandalf/ctmr_gandalf.yaml"
+max-jobs-per-second: 10
+max-status-checks-per-second: 10
+local-cores: 1
+jobs: 999
diff --git a/cluster_configs/ctmr_gandalf/ctmr_gandalf.yaml b/cluster_configs/ctmr_gandalf/ctmr_gandalf.yaml
new file mode 100644
index 0000000..2c540bc
--- /dev/null
+++ b/cluster_configs/ctmr_gandalf/ctmr_gandalf.yaml
@@ -0,0 +1,90 @@
+# Cluster config file for StaG-mwc for use on CTMR UCP
+__default__:
+    account: "bio"
+    partition: "ctmr"
+    extra: ""
+    time: "03:00:00"
+    n: 2
+    stderr: "slurm_logs/slurm-{rule}-{wildcards}.stderr"
+    stdout: "slurm_logs/slurm-{rule}-{wildcards}.stdout"
+    jobname: "[{rule}]: {wildcards}"
+
+
+#############################
+# Pre-processing
+#############################
+fastp:
+    n: 8
+    time: "01:00:00"
+remove_host:
+    n: 8
+    time: "01:00:00"
+bbcountunique:
+    n: 4
+    time: "00:45:00"
+
+#############################
+# Naive comparisons
+#############################
+sketch:
+    n: 8
+    time: "00:20:00"
+
+#############################
+# Taxonomic profiling
+#############################
+kaiju:
+    n: 10
+    time: "02:00:00"
+kraken2:
+    n: 10
+    time: "02:00:00"
+metaphlan2:
+    n: 8
+    time: "01:30:00"
+bracken:
+    n: 2
+    time: "01:00:00"
+
+#############################
+# Functional profiling
+#############################
+humann2:
+    n: 12
+    time: "12:00:00"
+
+#############################
+# Antibiotic resistance
+#############################
+groot_align:
+    n: 8
+    time: "01:00:00"
+align_to_amr:
+    n: 10
+    time: "04:00:00"
+
+#############################
+# Mappers
+#############################
+bbmap:
+    n: 10
+    time: "02:00:00"
+bowtie2:
+    n: 10
+    time: "02:00:00"
+
+#############################
+# Assembly
+#############################
+assembly:
+    n: 20
+    time: "05:00:00"
+binning:
+    n: 20
+    time: "05:00:00"
+consolidate_bins:
+    n: 20
+    time: "05:00:00"
+blobology:
+    n: 20
+    time: "05:00:00"
diff --git a/cluster_configs/ctmr_gandalf/slurm-jobscript.sh b/cluster_configs/ctmr_gandalf/slurm-jobscript.sh
new file mode 100755
index 0000000..391741e
--- /dev/null
+++ b/cluster_configs/ctmr_gandalf/slurm-jobscript.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# properties = {properties}
+{exec_job}
diff --git a/cluster_configs/ctmr_gandalf/slurm-status.py b/cluster_configs/ctmr_gandalf/slurm-status.py
new file mode 100755
index 0000000..ad98697
--- /dev/null
+++ b/cluster_configs/ctmr_gandalf/slurm-status.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+import re
+import subprocess as sp
+import shlex
+import sys
+import time
+import logging
+logger = logging.getLogger(__name__)
+
+STATUS_ATTEMPTS = 20
+
+jobid = sys.argv[1]
+
+for i in range(STATUS_ATTEMPTS):
+    try:
+        sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid)))
+        res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")}
+        break
+    except sp.CalledProcessError as e:
+        logger.error("sacct process error")
+        logger.error(e)
+    except IndexError as e:
+        pass
+    # Try getting job status with scontrol instead, in case sacct is misconfigured
+    try:
+        sctrl_res = sp.check_output(shlex.split("scontrol -o show job {}".format(jobid)))
+        m = re.search(r"JobState=(\w+)", sctrl_res.decode())
+        res = {jobid: m.group(1)}
+        break
+    except sp.CalledProcessError as e:
+        logger.error("scontrol process error")
+        logger.error(e)
+    if i >= STATUS_ATTEMPTS - 1:
+        print("failed")
+        exit(0)
+    else:
+        time.sleep(1)
+
+status = res[jobid]
+
+if status.startswith("BOOT_FAIL"):
+    print("failed")
+elif status.startswith("CANCELLED"):
+    print("failed")
+elif status.startswith("COMPLETED"):
+    print("success")
+elif status.startswith("DEADLINE"):
+    print("failed")
+elif status.startswith("FAILED"):
+    print("failed")
+elif status.startswith("NODE_FAIL"):
+    print("failed")
+elif status.startswith("PREEMPTED"):
+    print("failed")
+elif status.startswith("TIMEOUT"):
+    print("failed")
+# Unclear whether SUSPENDED should be treated as running or failed
+elif status.startswith("SUSPENDED"):
+    print("failed")
+else:
+    print("running")
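Snakemake invokes the ``cluster-status`` script with a single job id argument and expects exactly one of ``running``, ``success`` or ``failed`` on stdout. A quick way to sanity-check the script outside of Snakemake (with a hypothetical Slurm job id) is::

    import subprocess

    # "12345678" is a hypothetical job id; substitute one from `squeue`.
    result = subprocess.run(
        ["cluster_configs/ctmr_gandalf/slurm-status.py", "12345678"],
        capture_output=True, text=True,
    )
    print(result.stdout.strip())  # one of: running / success / failed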
diff --git a/cluster_configs/ctmr_gandalf/slurm-submit.py b/cluster_configs/ctmr_gandalf/slurm-submit.py
new file mode 100755
index 0000000..3050c19
--- /dev/null
+++ b/cluster_configs/ctmr_gandalf/slurm-submit.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+import sys
+import re
+import argparse
+import subprocess
+from pathlib import Path
+
+from snakemake.utils import read_job_properties
+
+parser = argparse.ArgumentParser(add_help=False)
+parser.add_argument(
+    "--help", help="Display help message.", action="store_true")
+parser.add_argument(
+    "positional", action="append",
+    nargs="?", metavar="POS",
+    help="additional arguments not in slurm parser group to pass to sbatch")
+
+# A subset of SLURM-specific arguments
+slurm_parser = parser.add_argument_group("slurm-specific arguments")
+slurm_parser.add_argument(
+    "-a", "--array", help="job array index values")
+slurm_parser.add_argument(
+    "-A", "--account", help="charge job to specified account")
+slurm_parser.add_argument(
+    "--begin", help="defer job until HH:MM MM/DD/YY")
+slurm_parser.add_argument(
+    "-c", "--cpus-per-task", help="number of cpus required per task")
+slurm_parser.add_argument(
+    "-d", "--dependency",
+    help="defer job until condition on jobid is satisfied")
+slurm_parser.add_argument(
+    "-D", "--workdir", help="set working directory for batch script")
+slurm_parser.add_argument(
+    "-e", "--error", help="file for batch script's standard error")
+slurm_parser.add_argument(
+    "-J", "--job-name", help="name of job")
+slurm_parser.add_argument(
+    "--mail-type", help="notify on state change: BEGIN, END, FAIL or ALL")
+slurm_parser.add_argument(
+    "--mail-user", help="who to send email notification for job state changes")
+slurm_parser.add_argument(
+    "-n", "--ntasks", help="number of tasks to run")
+slurm_parser.add_argument(
+    "-N", "--nodes", help="number of nodes on which to run (N = min[-max])")
+slurm_parser.add_argument(
+    "-o", "--output", help="file for batch script's standard output")
+slurm_parser.add_argument(
+    "-p", "--partition", help="partition requested")
+slurm_parser.add_argument(
+    "-q", "--qos", help="quality of service")
+slurm_parser.add_argument(
+    "-Q", "--quiet", help="quiet mode (suppress informational messages)")
+slurm_parser.add_argument(
+    "-t", "--time", help="time limit")
+slurm_parser.add_argument(
+    "--wrap", help="wrap command string in a sh script and submit")
+slurm_parser.add_argument(
+    "-C", "--constraint", help="specify a list of constraints")
+slurm_parser.add_argument(
+    "--mem", help="minimum amount of real memory")
+
+args = parser.parse_args()
+
+if args.help:
+    parser.print_help()
+    sys.exit(0)
+
+jobscript = sys.argv[-1]
+job_properties = read_job_properties(jobscript)
+
+extras = ""
+if args.positional:
+    for m in args.positional:
+        if m is not None:
+            extras = extras + " " + m
+
+arg_dict = dict(args.__dict__)
+
+
+# Process resources
+if "resources" in job_properties:
+    resources = job_properties["resources"]
+    if arg_dict["time"] is None:
+        if "runtime" in resources:
+            arg_dict["time"] = resources["runtime"]
+        elif "walltime" in resources:
+            arg_dict["time"] = resources["walltime"]
+    if "mem" in resources and arg_dict["mem"] is None:
+        arg_dict["mem"] = resources["mem"]
+
+# Threads
+if "threads" in job_properties:
+    arg_dict["ntasks"] = job_properties["threads"]
+
+opt_keys = {"array", "account", "begin", "cpus_per_task",
+            "dependency", "workdir", "error", "job_name", "mail_type",
+            "mail_user", "ntasks", "nodes", "output", "partition",
+            "quiet", "time", "wrap", "constraint", "mem"}
+
+# Set default partition
+if arg_dict["partition"] is None:
+    # partitions and SLURM - If not specified, the default behavior is to
+    # allow the slurm controller to select the default partition as
+    # designated by the system administrator.
+    opt_keys.remove("partition")
+
+# NOT APPLICABLE FOR CTMR UCP
+#### Set default account
+###if arg_dict["account"] is None:
+###    raise Exception("Cannot submit Slurm jobs without account!")
+
+# Ensure output folders for Slurm log files exist.
+# This is a bit hacky; it will try to create the folder
+# for every Slurm submission...
+if arg_dict["output"]:
+    stdout_folder = Path(arg_dict["output"]).parent
+    stdout_folder.mkdir(exist_ok=True, parents=True)
+if arg_dict["error"]:
+    stderr_folder = Path(arg_dict["error"]).parent
+    stderr_folder.mkdir(exist_ok=True, parents=True)
+
+opts = ""
+for k, v in arg_dict.items():
+    if k not in opt_keys:
+        continue
+    if v is not None:
+        opts += " --{} \"{}\" ".format(k.replace("_", "-"), v)
+
+if arg_dict["wrap"] is not None:
+    cmd = "sbatch {opts}".format(opts=opts)
+else:
+    cmd = "sbatch {opts} {extras}".format(opts=opts, extras=extras)
+
+try:
+    res = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE)
+except subprocess.CalledProcessError as e:
+    raise
+
+# Get jobid
+res = res.stdout.decode()
+try:
+    m = re.search(r"Submitted batch job (\d+)", res)
+    jobid = m.group(1)
+    print(jobid)
+except Exception as e:
+    print(e)
+    raise
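The submit script receives the generated jobscript as its final argument. Snakemake embeds a ``# properties = {...}`` JSON line in that script, which ``read_job_properties`` parses so that e.g. a rule's thread count can be forwarded to ``sbatch`` as ``--ntasks``. A minimal self-contained sketch of that mechanism (with hypothetical job properties)::

    import json
    import tempfile

    from snakemake.utils import read_job_properties

    # Mimic the "# properties = ..." line Snakemake writes into each jobscript.
    props = {"rule": "kraken2", "threads": 10, "resources": {"mem": 16000}}
    with tempfile.NamedTemporaryFile("w", suffix=".sh", delete=False) as js:
        js.write("#!/bin/bash\n# properties = " + json.dumps(props) + "\n")

    job_properties = read_job_properties(js.name)
    print(job_properties["threads"])  # 10, forwarded to sbatch as --ntasks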
+if "output" in arg_dict: + stdout_folder = Path(arg_dict["output"]).parent + stdout_folder.mkdir(exist_ok=True, parents=True) +if "error" in arg_dict: + stdout_folder = Path(arg_dict["error"]).parent + stdout_folder.mkdir(exist_ok=True, parents=True) + +opts = "" +for k, v in arg_dict.items(): + if k not in opt_keys: + continue + if v is not None: + opts += " --{} \"{}\" ".format(k.replace("_", "-"), v) + +if arg_dict["wrap"] is not None: + cmd = "sbatch {opts}".format(opts=opts) +else: + cmd = "sbatch {opts} {extras}".format(opts=opts, extras=extras) + +try: + res = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE) +except subprocess.CalledProcessError as e: + raise e + +# Get jobid +res = res.stdout.decode() +try: + m = re.search("Submitted batch job (\d+)", res) + jobid = m.group(1) + print(jobid) +except Exception as e: + print(e) + raise diff --git a/cluster_configs/rackham/config.yaml b/cluster_configs/rackham/config.yaml index 3e0d6ed..239f492 100644 --- a/cluster_configs/rackham/config.yaml +++ b/cluster_configs/rackham/config.yaml @@ -1,5 +1,5 @@ jobscript: "cluster_configs/rackham/slurm-jobscript.sh" -cluster: "cluster_configs/rackham/slurm-submit.py --account {cluster.account} --partition {cluster.partition} --ntasks {cluster.n} --time {cluster.time} --error {cluster.stderr} --output {cluster.stdout} --job-name '{cluster.jobname}'" +cluster: "cluster_configs/rackham/slurm-submit.py --account {cluster.account} --partition {cluster.partition} --ntasks {cluster.n} --time {cluster.time} --error {cluster.stderr} --output {cluster.stdout} --job-name '{cluster.jobname}' {cluster.extra}" cluster-status: "cluster_configs/rackham/slurm-status.py" cluster-config: "cluster_configs/rackham/rackham.yaml" max-jobs-per-second: 10 diff --git a/cluster_configs/rackham/rackham.yaml b/cluster_configs/rackham/rackham.yaml index 45298e4..a836de3 100644 --- a/cluster_configs/rackham/rackham.yaml +++ b/cluster_configs/rackham/rackham.yaml @@ -30,16 +30,6 @@ sketch: n: 2 time: "00:20:00" -############################# -# Mappers -############################# -bbmap: - n: 8 - time: "02:00:00" -bowtie2: - n: 8 - time: "02:00:00" - ############################# # Taxonomic profiling ############################# @@ -52,6 +42,9 @@ kraken2: metaphlan2: n: 8 time: "01:30:00" +bracken: + n: 2 + time: "01:00:00" ############################# # Functional profiling @@ -66,10 +59,32 @@ humann2: groot_align: n: 6 time: "01:00:00" +align_to_amr: + n: 10 + time: "04:00:00" + +############################# +# Mappers +############################# +bbmap: + n: 8 + time: "02:00:00" +bowtie2: + n: 8 + time: "02:00:00" ############################# # Assembly ############################# -megahit: +assembly: + n: 20 + time: "05:00:00" +assembly: + n: 20 + time: "05:00:00" +consolidate_bins: + n: 20 + time: "05:00:00" +blobology: n: 20 time: "05:00:00" diff --git a/config.yaml b/config.yaml index 42a0d56..5a04898 100644 --- a/config.yaml +++ b/config.yaml @@ -35,7 +35,9 @@ taxonomic_profile: metaphlan2: False functional_profile: humann2: False -antibiotic_resistance: False +antibiotic_resistance: + groot: False + amrplusplus: True mappers: bbmap: False bowtie2: False @@ -124,6 +126,17 @@ groot: covcutoff: 0.97 # Coverage cutoff for groot report lowcov: False # Report ARGs with no 5' or 3' coverage. Overrides covcutoff. 
+amrplusplus: + megares: + fasta: "" # Leave as "" to run with megares_modified_database_v2.00.fasta or specify database path + annotation: "" # Leave as "" to run with megares_modified_annotations_v2.00.csv or specify annotations path + rarefaction: + min: "5" # Starting rarefaction level + max: "100" # Ending rarefaction level + skip: "5" # Number of levels to skip + samples: "1" # Number of iterations to sample at + resistome: + threshold: "80" # Resistome threshold ######################### # Mappers @@ -162,4 +175,3 @@ metawrap: universal: "--universal" # Use universal marker genes minimum_completion: 70 maximum_contamination: 10 - diff --git a/docs/source/conf.py b/docs/source/conf.py index f32c1f8..21e0f43 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -56,9 +56,9 @@ # built documents. # # The short X.Y version. -version = '0.4.0' +version = '0.4.1' # The full version, including alpha/beta/rc tags. -release = '0.4.0-beta1' +release = '0.4.1' # reStructuredText prolog contains a string of reStructuredText that will be # included at the beginning of every source file that is read. diff --git a/docs/source/faq.rst b/docs/source/faq.rst index d080afe..f944aec 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -13,8 +13,8 @@ both the read QC step and the host removal steps if you need to. In order to do so, we must "trick" Snakemake into thinking that those rules have already been performed. The rules for read QC and host removal are configured with bypasses so that if the user sets ``host_removal: False`` or ``qc_reads: False``, those -steps will be bypassed by creating symlinks to directly to the input files in -the respective output directories. +steps will be bypassed by creating symlinks directly to the input files in the +respective output directories. Skip host removal diff --git a/docs/source/index.rst b/docs/source/index.rst index 5fc0ad7..cc8e420 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,7 +8,7 @@ The StaG metagenomic workflow collaboration (mwc) is a Snakemake implementation of a general metagenomic analysis workflow. It started out as a joint project between The Center for Translational Microbiome Research (`CTMR`_) at Karolinska Institutet in Stockholm and Fredrik Bäckhed's `research group`_ at -Sahlgrenska in Gothenburg, but has since evolved into the default workflow for +Sahlgrenska in Gothenburg, and has since evolved into the default workflow for metagenomics analyses used at CTMR. .. _CTMR: https://ki.se/en/research/centre-for-translational-microbiome-research-ctmr diff --git a/docs/source/installation.rst b/docs/source/installation.rst index c01624c..6bd4325 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -55,13 +55,13 @@ This will clone the repo into a folder called ``stag-mwc`` inside your current working directory. You will manage all workflow-related business from inside this folder (i.e. configuring and running the workflow). +The intended way of working with |full_name| is that you download/clone a +complete copy of the repository for each analysis you intend to make. That way, +the local copy you have will remain after the analysis has been run, so you can +go back and see exactly what was run, and how. This forms the basis of how +Snakemake enables traceability and reproducibility. Congratulations *************** You have now installed |full_name|. 
-The intended way of working with |full_name|, is that you download/clone a
-complete copy of the repository for each analysis you intend to make. That way,
-the local copy you have will remain after the analysis has been run, so you can
-go back and see exactly what was run, and how. This forms the basis of how
-Snakemake enables traceability and reproducibility.
diff --git a/docs/source/modules.rst b/docs/source/modules.rst
index ea1a7cc..30ed873 100644
--- a/docs/source/modules.rst
+++ b/docs/source/modules.rst
@@ -13,13 +13,15 @@
 .. _SAF format: http://bioinf.wehi.edu.au/featureCounts/
 .. _MEGAHIT: https://github.com/voutcn/megahit
 .. _MultiQC: https://multiqc.info/
+.. _amrplusplus: https://megares.meglab.org/amrplusplus/latest/html/what_AMR++_produces.html
+.. _megares: https://megares.meglab.org/
 
 Modules
 =======
 |full_name| is a workflow framework that connects several other tools. The
 basic assumption is that all analyses start with a quality control of the
-sequencing reads (using `FastQC`_), followed by host sequence removal (using
-`BBMap`_). This section of the documentation aims to describe useful details
+sequencing reads (using `FastP`_), followed by host sequence removal (using
+`Kraken2`_). This section of the documentation aims to describe useful details
 about the separate tools that are used in |full_name|.
 
 The following subsections describe the function of each module included in
@@ -252,6 +254,39 @@ The read lengths input to `groot`_ must conform to the settings used during
 `groot`_ database construction. The length window can be configured in the
 config file.
 
+AMRPlusPlus_v2
+--------------
+:Tool: `amrplusplus`_
+:Output folder: ``amrplusplus``
+
+`amrplusplus`_ aligns reads to the `megares`_ antibiotic resistance gene
+database to produce antibiotic resistance gene profiles. Output is structured
+as::
+
+    ├ AlignToAMR
+    │   └ .amr.alignment.sam
+    ├ RunResistome
+    │   ├ .class.tsv
+    │   ├ .gene.tsv
+    │   ├ .group.tsv
+    │   ├ .mechanism.tsv
+    │   └ .type.tsv
+    ├ ResistomeResults
+    │   └ AMR_analytic_matrix.csv
+    ├ RunRarefaction
+    │   ├ .class.tsv
+    │   ├ .gene.tsv
+    │   ├ .group.tsv
+    │   ├ .mechanism.tsv
+    │   └ .type.tsv
+
+``AMR_analytic_matrix.csv`` contains aggregated gene counts for all samples
+aligned against `megares`_, based on the threshold set in ``config.yaml``.
+Searching for a gene name or accession number on the `megares`_ website
+provides detailed information and links to published papers.
+
+`amrplusplus`_ can be executed with either the ``--use-singularity`` or
+``--use-conda`` flag.
 
 Mappers
 *******
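The matrix is a plain CSV with one row per MEGARes gene header and one column per sample, so it is easy to inspect downstream. For example, with pandas (the path shown assumes the default ``output_dir``; adjust to your ``outdir`` setting)::

    import pandas as pd

    # The header row holds only the sample names, one field fewer than the
    # data rows, so pandas infers the first column (gene names) as the index.
    amr = pd.read_csv("output_dir/amrplusplus/ResistomeResults/AMR_analytic_matrix.csv")
    print(amr.head())       # one row per gene, one column per sample
    print(amr.sum(axis=0))  # total resistome hits per sample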
diff --git a/docs/source/running.rst b/docs/source/running.rst
index 47b92ad..14e1aa5 100644
--- a/docs/source/running.rst
+++ b/docs/source/running.rst
@@ -56,10 +56,11 @@ is::
 
     snakemake --use-conda --cores N
 
-where ``N`` is the maximum number of cores you want to allow for the workflow.
-Snakemake will automatically reduce the number of cores available to individual
-steps to this limit. Another variant of ``--cores`` is called ``--jobs``, they
-are equivalent.
+where ``N`` is the maximum number of cores you want to allow for the
+workflow. Snakemake will automatically reduce the number of cores available
+to individual steps to this limit. Another variant of ``--cores`` is called
+``--jobs``, which you might encounter occasionally. The two options are
+equivalent.
 
 .. note::
 
@@ -81,6 +82,26 @@ instruct snakemake to keep going even if a job should fail, e.g. maybe the
 taxonomic profiling step will fail for a sample if the sample contains no
 assignable reads after quality filtering (extreme example).
 
+If you are having trouble running |full_name| with conda, try with
+Singularity. There are pre-built Singularity images that are ready to use
+with |full_name|. To avoid exhausting the project's quota on Singularity Hub,
+use ``--singularity-prefix`` to specify a folder where Snakemake can download
+and re-use the Singularity images. The command to run |full_name| with
+Singularity instead of conda is::
+
+    snakemake --use-singularity --singularity-prefix /path/to/prefix/folder
+
+There are some additional details that need to be considered when using
+Singularity instead of conda, most notably that you will have to specify bind
+paths (specifying-bind-paths_) so that your reference databases are
+accessible from within the containers when running |full_name|. It might look
+something like this::
+
+    snakemake --use-singularity --singularity-prefix /path/to/prefix/folder --singularity-args "-B /home/username/databases"
+
+The above example assumes you have entered paths to your databases in
+``config.yaml`` with a base path like the one shown in the above command
+(e.g. ``/home/username/databases/kraken2/kraken2_human/``).
 
 Running on cluster resources
 ****************************
@@ -93,10 +114,35 @@ Slurm project account), as well as the number of CPUs, time, and memory
 requirements for each individual step. Snakemake uses this information when
 submitting jobs to the cluster scheduler.
 
+When running on a cluster, it will likely work best to run StaG using
+Singularity. The workflow comes preconfigured to download and use containers
+from Singularity Hub. To use Singularity, launch Snakemake with the
+``--use-singularity`` argument.
+
+.. _specifying-bind-paths: https://sylabs.io/guides/3.5/user-guide/bind_paths_and_mounts.html#specifying-bind-paths
+
+.. note::
+
+    Do not combine ``--use-conda`` with ``--use-singularity``.
+
+    It is important that you do not download the Singularity images from
+    Singularity Hub too often, due to quota limitations. To prevent
+    |full_name| from downloading the images again for each new project, use
+    ``--singularity-prefix`` to specify a directory where Snakemake can store
+    the downloaded images for reuse between projects.
+
+    Paths to databases need to be accessible from inside the Singularity
+    containers. It's easiest if they are all available from the same folder,
+    so you can bind the main database folder into the Singularity container
+    with e.g. ``--singularity-args "-B /db"``. Note that database paths need
+    to be specified in the config file so that the paths are correct from
+    inside the Singularity container. Read more about specifying bind paths
+    in the official Singularity docs: specifying-bind-paths_.
+
 To run |full_name| on e.g. UPPMAX's Rackham, run the following command from
 inside the workflow repository directory::
 
-    snakemake --use-conda --profile cluster_configs/rackham
+    snakemake --use-singularity --singularity-prefix /path/to/prefix/folder --singularity-args "-B /proj/uppstore2017086/db" --profile cluster_configs/rackham
 
 This will make Snakemake submit each workflow step as a separate cluster job
 using the CPU and time requirements specified in ``rackham.yaml`` inside the
@@ -119,12 +165,6 @@ files before restarting pipeline using::
 
     (base)$ rm -rfv .snakemake/metadata
 
-.. note::
-
-    When running on a cluster it can speed up node initialization if you run
-    using Singularity containers and conda combined. This is done by running
-    snakemake with both the `--use-conda` and `--use-singularity` arguments.
-
 Execution report
 ****************
 
diff --git a/envs/Singularity.amrplusplus b/envs/Singularity.amrplusplus
new file mode 100644
index 0000000..0b7a457
--- /dev/null
+++ b/envs/Singularity.amrplusplus
@@ -0,0 +1,55 @@
+Bootstrap: docker
+From: debian:jessie-slim
+
+# Includes trimmomatic, samtools, bwa, bedtools, vcftools, htslib, kraken2, SNPfinder, freebayes, bbmap
+
+%environment
+    export LC_ALL=C
+
+%post
+    apt update \
+        && apt install -y --no-install-recommends \
+        build-essential ca-certificates sudo tcsh \
+        git make automake autoconf openjdk-7-jre wget gzip unzip sed \
+        zlib1g-dev curl libbz2-dev locales libncurses5-dev liblzma-dev libcurl4-openssl-dev software-properties-common apt-transport-https \
+        python3-pip python3-docopt python3-pytest python-dev python3-dev \
+        libcurl4-openssl-dev libssl-dev zlib1g-dev fonts-texgyre \
+        gcc g++ gfortran libblas-dev liblapack-dev dos2unix libstdc++6 \
+        r-base-core r-recommended hmmer \
+        && rm -rf /var/lib/apt/lists/*
+
+
+    wget -c https://repo.continuum.io/archive/Anaconda3-2020.02-Linux-x86_64.sh
+    sh Anaconda3-2020.02-Linux-x86_64.sh -bfp /usr/local
+
+    # add bioconda channels
+    conda config --add channels defaults
+    conda config --add channels conda-forge
+    conda config --add channels bioconda
+
+    # install bulk of bioinformatic tools using conda
+    conda create -n AmrPlusPlus_env python=3 trimmomatic bwa samtools bedtools freebayes bbmap vcftools htslib kraken2
+
+    . /usr/local/bin/activate AmrPlusPlus_env
+
+    #ln -s /usr/local/envs/AmrPlusPlus_env/bin/* /usr/local/bin/
+
+    # Still experimenting with how best to modify $PATH.
+    echo 'export PATH=$PATH:/usr/local/envs/AmrPlusPlus_env/bin/' >> $SINGULARITY_ENVIRONMENT
+
+    # SNPfinder
+    cd /usr/local
+    git clone https://github.com/cdeanj/snpfinder.git
+    cd snpfinder
+    make
+    cp snpfinder /usr/local/bin
+    cd /
+
+    # Make sure all users have the permissions required to use the tools
+    chmod -R 777 /usr/local/
+
+%test
+
+%labels
+    Author meglab-metagenomics
+    version amrplusplus_v2
diff --git a/envs/Singularity.assembly b/envs/Singularity.assembly
new file mode 100644
index 0000000..7d29927
--- /dev/null
+++ b/envs/Singularity.assembly
@@ -0,0 +1,30 @@
+Bootstrap: docker
+From: continuumio/miniconda3
+
+%help
+    Singularity image containing the conda environment defined in
+    assembly.yaml.
+
+    To build, run:
+    sudo singularity build stag-mwc-assembly.simg Singularity.assembly
+
+%files
+    assembly.yaml
+
+%labels
+    AUTHOR boulund
+    VERSION 1.0
+    CONDA_ENV /opt/conda/envs/stag-mwc-assembly
+
+%environment
+    PATH=/opt/conda/envs/stag-mwc-assembly/bin:/opt/conda/bin:$PATH
+    export PATH
+
+%post
+    # Install stag-mwc-assembly conda environment
+    /opt/conda/bin/conda env create -f assembly.yaml --force
+    /opt/conda/bin/conda clean --yes --all
+
+%runscript
+    exec echo "$@"
+
diff --git a/envs/Singularity.biobakery b/envs/Singularity.biobakery
new file mode 100644
index 0000000..df16f22
--- /dev/null
+++ b/envs/Singularity.biobakery
@@ -0,0 +1,30 @@
+Bootstrap: docker
+From: continuumio/miniconda3
+
+%help
+    Singularity image containing the conda environment defined in
+    humann2.yaml.
+
+    To build, run:
+    sudo singularity build stag-mwc-biobakery.simg Singularity.biobakery
+
+%files
+    humann2.yaml
+
+%labels
+    AUTHOR boulund
+    VERSION 1.0
+    CONDA_ENV /opt/conda/envs/stag-mwc-biobakery
+
+%environment
+    PATH=/opt/conda/envs/stag-mwc-biobakery/bin:/opt/conda/bin:$PATH
+    export PATH
+
+%post
+    # Install stag-mwc-biobakery conda environment
+    /opt/conda/bin/conda env create -f humann2.yaml --force
+    /opt/conda/bin/conda clean --yes --all
+
+%runscript
+    exec echo "$@"
+
diff --git a/envs/Singularity.stag-mwc b/envs/Singularity.stag-mwc
new file mode 100644
index 0000000..a86cb05
--- /dev/null
+++ b/envs/Singularity.stag-mwc
@@ -0,0 +1,50 @@
+Bootstrap: docker
+From: continuumio/miniconda3
+
+%help
+    Singularity image containing the conda environment defined in
+    stag-mwc.yaml.
+
+    To build, run:
+    sudo singularity build stag-mwc.simg Singularity.stag-mwc
+
+    Note that the Kraken2 package installed through conda is ignored in favor
+    of one built and installed from source, due to issues with the conda
+    package giving segfaults.
+
+%files
+    stag-mwc.yaml
+
+%labels
+    AUTHOR boulund
+    VERSION 1.0
+    CONDA_ENV /opt/conda/envs/stag-mwc-main
+    KRAKEN2_BUILD SOURCE_v2.0.8-beta
+
+%environment
+    PATH=/opt/kraken2-2.0.8-beta:/opt/conda/envs/stag-mwc-main/bin:/opt/conda/bin:$PATH
+    export PATH
+
+%post
+    # Install stag-mwc-main conda environment
+    /opt/conda/bin/conda env create -f stag-mwc.yaml --force
+    /opt/conda/bin/conda clean --yes --all
+
+    # Install Kraken2 from source; the conda version segfaults.
+    apt-get update && apt-get -y install \
+        wget \
+        zlib1g-dev \
+        make \
+        g++
+    wget \
+        https://github.com/DerrickWood/kraken2/archive/v2.0.8-beta.tar.gz \
+        --output-document kraken2-2.0.8-beta.tar.gz
+    tar -xvf kraken2-2.0.8-beta.tar.gz
+    mkdir -pv /opt/kraken2-2.0.8-beta
+    cd kraken2-2.0.8-beta
+    ./install_kraken2.sh /opt/kraken2-2.0.8-beta
+    rm -rfv kraken2-2.0.8-beta*
+
+%runscript
+    exec echo "$@"
+
diff --git a/envs/amrplusplus.yaml b/envs/amrplusplus.yaml
new file mode 100644
index 0000000..7375e3e
--- /dev/null
+++ b/envs/amrplusplus.yaml
@@ -0,0 +1,8 @@
+name: amrplusplus
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - samtools=1.10
+  - bwa=0.7.17
diff --git a/envs/metaphlan2.yaml b/envs/metaphlan2.yaml
index 1c3fe98..2315e06 100644
--- a/envs/metaphlan2.yaml
+++ b/envs/metaphlan2.yaml
@@ -1,4 +1,4 @@
-name: stag-mwc-biobakery
+name: stag-mwc-metaphlan2
 channels:
   - bioconda
   - conda-forge
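The amrplusplus module below resolves the ``""`` defaults from the amrplusplus section of ``config.yaml``: an empty string means "download and use the bundled MEGARes files", while a non-empty path must actually exist. A minimal standalone sketch of that validation pattern (function name hypothetical)::

    from pathlib import Path

    def resolve_db_path(user_path: str, default_path: str) -> str:
        """Return the user-supplied database path, or the default when it is ""."""
        if user_path and not Path(user_path).exists():
            raise ValueError(
                f"No database exists at: '{user_path}'! "
                'Specify a valid path, or leave it as "" to use the default.'
            )
        return user_path or default_path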
diff --git a/rules/antibiotic_resistance/amrplusplus.smk b/rules/antibiotic_resistance/amrplusplus.smk
new file mode 100644
index 0000000..7a1be6a
--- /dev/null
+++ b/rules/antibiotic_resistance/amrplusplus.smk
@@ -0,0 +1,232 @@
+# vim: syntax=python expandtab tabstop=4
+# AMRPlusPlus v2.0 snakemake integration
+# Aron Arzoomand 2020-08-12
+
+from pathlib import Path, PurePath
+from snakemake.exceptions import WorkflowError
+
+
+
+localrules:
+    get_local_megares,
+    build_amr_index,
+
+
+amrplusplus_config = config["amrplusplus"]
+if config["antibiotic_resistance"]["amrplusplus"]:
+    if amrplusplus_config["megares"]["fasta"] and not Path(amrplusplus_config["megares"]["fasta"]).exists():
+        err_message = "No database exists at: '{}'!\n".format(amrplusplus_config["megares"]["fasta"])
+        err_message += "Specify the path in the amrplusplus section of config.yaml.\n"
+        err_message += "If you want to run with the default megares_modified_database_v2.00.fasta, leave it as \"\""
+        raise WorkflowError(err_message)
+    if amrplusplus_config["megares"]["annotation"] and not Path(amrplusplus_config["megares"]["annotation"]).exists():
+        err_message = "No annotation file exists at: '{}'!\n".format(amrplusplus_config["megares"]["annotation"])
+        err_message += "Specify the path in the amrplusplus section of config.yaml.\n"
+        err_message += "If you want to run with the default megares_modified_annotations_v2.00.csv, leave it as \"\""
+        raise WorkflowError(err_message)
+
+    amrplusplus_outputs = f"{OUTDIR}/amrplusplus/ResistomeResults/AMR_analytic_matrix.csv"
+    all_outputs.append(amrplusplus_outputs)
+
+    citations.add(publications["AMRPlusPlus2"])
+
+
+MEGARES_FASTA = f"{config['dbdir']}/amrplusplus/megares_modified_database_v2.00.fasta"
+MEGARES_ANNOT = f"{config['dbdir']}/amrplusplus/megares_modified_annotations_v2.00.csv"
+
+
+rule get_local_megares:
+    """Get the Megares_2.0 database and annotations"""
+    output:
+        megares_db_local=f"{MEGARES_FASTA}",
+        megares_annot_local=f"{MEGARES_ANNOT}",
+    log:
+        stdout=f"{LOGDIR}/amrplusplus/get_local_megares.stdout.log",
+        stderr=f"{LOGDIR}/amrplusplus/get_local_megares.stderr.log",
+    shadow:
+        "shallow"
+    singularity:
+        "shub://meglab-metagenomics/amrplusplus_v2"
+    params:
+        megares_db=amrplusplus_config["megares"]["fasta"] or f"{MEGARES_FASTA}"
+    shell:
+        """
+        wget -O {output.megares_db_local} https://raw.githubusercontent.com/meglab-metagenomics/amrplusplus_v2/master/data/amr/megares_modified_database_v2.00.fasta
+        wget -O {output.megares_annot_local} https://raw.githubusercontent.com/meglab-metagenomics/amrplusplus_v2/master/data/amr/megares_modified_annotations_v2.00.csv
+        """
+
+rule build_amr_index:
+    """Build megares index"""
+    input:
+        megares_db=amrplusplus_config["megares"]["fasta"] or f"{MEGARES_FASTA}"
+    output:
+        amb=f"{amrplusplus_config['megares']['fasta']}.amb" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.amb",
+        ann=f"{amrplusplus_config['megares']['fasta']}.ann" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.ann",
+        bwt=f"{amrplusplus_config['megares']['fasta']}.bwt" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.bwt",
+        pac=f"{amrplusplus_config['megares']['fasta']}.pac" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.pac",
+        sa=f"{amrplusplus_config['megares']['fasta']}.sa" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.sa",
+    log:
+        stdout=f"{LOGDIR}/amrplusplus/build_amr.index.stdout.log",
+        stderr=f"{LOGDIR}/amrplusplus/build_amr.index.stderr.log",
+    shadow:
+        "shallow"
+    conda:
+        "../../envs/amrplusplus.yaml"
+    singularity:
+        "shub://meglab-metagenomics/amrplusplus_v2"
+    shell:
+        """
+        bwa index {input.megares_db} \
+            2> {log.stderr} \
+            > {log.stdout}
+        """
+
+rule align_to_amr:
+    """BWA to align host-filtered reads to the Megares_2.0 database"""
+    input:
+        R1=f"{OUTDIR}/host_removal/{{sample}}_1.fq.gz",
+        R2=f"{OUTDIR}/host_removal/{{sample}}_2.fq.gz",
+        amb=f"{amrplusplus_config['megares']['fasta']}.amb" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.amb",
+        ann=f"{amrplusplus_config['megares']['fasta']}.ann" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.ann",
+        bwt=f"{amrplusplus_config['megares']['fasta']}.bwt" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.bwt",
+        pac=f"{amrplusplus_config['megares']['fasta']}.pac" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.pac",
+        sa=f"{amrplusplus_config['megares']['fasta']}.sa" if amrplusplus_config["megares"]["fasta"] else f"{MEGARES_FASTA}.sa",
+    output:
+        alignment=f"{OUTDIR}/amrplusplus/AlignToAMR/{{sample}}.amr.alignment.sam"
+    log:
+        stdout=f"{LOGDIR}/amrplusplus/{{sample}}.align_to_amr.stdout.log",
+        stderr=f"{LOGDIR}/amrplusplus/{{sample}}.align_to_amr.stderr.log",
+    shadow:
+        "shallow"
+    conda:
+        "../../envs/amrplusplus.yaml"
+    singularity:
+        "shub://meglab-metagenomics/amrplusplus_v2"
+    threads:
+        cluster_config["align_to_amr"]["n"] if "align_to_amr" in cluster_config else 10
+    params:
+        megares_db=amrplusplus_config["megares"]["fasta"] or f"{MEGARES_FASTA}"
+    shell:
+        """
+        bwa mem \
+            {params.megares_db} \
+            {input.R1} \
+            {input.R2} \
+            -t {threads} \
+            -R '@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}' > {output.alignment} 2> {log.stderr}
+        """
+
+rule run_resistome:
+    """Annotate alignments"""
+    input:
+        alignment=f"{OUTDIR}/amrplusplus/AlignToAMR/{{sample}}.amr.alignment.sam",
+    output:
+        gene=f"{OUTDIR}/amrplusplus/RunResistome/{{sample}}.gene.tsv",
+        group=f"{OUTDIR}/amrplusplus/RunResistome/{{sample}}.group.tsv",
+        mech=f"{OUTDIR}/amrplusplus/RunResistome/{{sample}}.mechanism.tsv",
+        klass=f"{OUTDIR}/amrplusplus/RunResistome/{{sample}}.class.tsv",
+        type=f"{OUTDIR}/amrplusplus/RunResistome/{{sample}}.type.tsv",
+    log:
+        stdout=f"{LOGDIR}/amrplusplus/{{sample}}.run_resistome.stdout.log",
+        stderr=f"{LOGDIR}/amrplusplus/{{sample}}.run_resistome.stderr.log",
+    shadow:
+        "shallow"
+    conda:
+        "../../envs/amrplusplus.yaml"
+    singularity:
+        "shub://meglab-metagenomics/amrplusplus_v2"
+    params:
+        script="scripts/amrplusplus/resistome",
+        threshold=amrplusplus_config["resistome"]["threshold"],
+        megares_db=amrplusplus_config["megares"]["fasta"] or f"{MEGARES_FASTA}",
+        megares_annot=amrplusplus_config["megares"]["annotation"] or f"{MEGARES_ANNOT}",
+    shell:
+        """
+        {params.script} \
+            -ref_fp {params.megares_db} \
+            -annot_fp {params.megares_annot} \
+            -sam_fp {input.alignment} \
+            -gene_fp {output.gene} \
+            -group_fp {output.group} \
+            -mech_fp {output.mech} \
+            -class_fp {output.klass} \
+            -type_fp {output.type} \
+            -t {params.threshold} \
+            2> {log.stderr} \
+            > {log.stdout}
+        """
+
+rule run_rarefaction:
+    """Run rarefaction"""
+    input:
+        alignment=f"{OUTDIR}/amrplusplus/AlignToAMR/{{sample}}.amr.alignment.sam",
+    output:
+        gene=f"{OUTDIR}/amrplusplus/RunRarefaction/{{sample}}.gene.tsv",
+        group=f"{OUTDIR}/amrplusplus/RunRarefaction/{{sample}}.group.tsv",
+        mech=f"{OUTDIR}/amrplusplus/RunRarefaction/{{sample}}.mechanism.tsv",
+        klass=f"{OUTDIR}/amrplusplus/RunRarefaction/{{sample}}.class.tsv",
+        type=f"{OUTDIR}/amrplusplus/RunRarefaction/{{sample}}.type.tsv",
+    log:
+        stdout=f"{LOGDIR}/amrplusplus/{{sample}}.run_rarefaction.stdout.log",
+        stderr=f"{LOGDIR}/amrplusplus/{{sample}}.run_rarefaction.stderr.log",
+    shadow:
+        "shallow"
+    conda:
+        "../../envs/amrplusplus.yaml"
+    singularity:
+        "shub://meglab-metagenomics/amrplusplus_v2"
+    params:
+        megares_db=amrplusplus_config["megares"]["fasta"] or f"{MEGARES_FASTA}",
+        megares_annot=amrplusplus_config["megares"]["annotation"] or f"{MEGARES_ANNOT}",
+        script="scripts/amrplusplus/rarefaction",
+        min=amrplusplus_config["rarefaction"]["min"],
+        max=amrplusplus_config["rarefaction"]["max"],
+        skip=amrplusplus_config["rarefaction"]["skip"],
+        samples=amrplusplus_config["rarefaction"]["samples"],
+        threshold=amrplusplus_config["resistome"]["threshold"],
+    shell:
+        """
+        {params.script} \
+            -ref_fp {params.megares_db} \
+            -sam_fp {input.alignment} \
+            -annot_fp {params.megares_annot} \
+            -gene_fp {output.gene} \
+            -group_fp {output.group} \
+            -mech_fp {output.mech} \
+            -class_fp {output.klass} \
+            -type_fp {output.type} \
+            -min {params.min} \
+            -max {params.max} \
+            -skip {params.skip} \
+            -samples {params.samples} \
+            -t {params.threshold} \
+            2> {log.stderr} \
+            > {log.stdout}
+        """
+
+rule resistome_results:
+    """Create a matrix of gene hits for all samples"""
+    input:
+        gene_resistome=expand(f"{OUTDIR}/amrplusplus/RunResistome/{{sample}}.gene.tsv", sample=SAMPLES),
+        gene_rarefaction=expand(f"{OUTDIR}/amrplusplus/RunRarefaction/{{sample}}.gene.tsv", sample=SAMPLES),
+    output:
+        AMR_matrix=f"{OUTDIR}/amrplusplus/ResistomeResults/AMR_analytic_matrix.csv"
+    log:
+        stdout=f"{LOGDIR}/amrplusplus/resistome_results.stdout.log",
+        stderr=f"{LOGDIR}/amrplusplus/resistome_results.stderr.log",
+    shadow:
+        "shallow"
+    conda:
+        "../../envs/amrplusplus.yaml"
+    singularity:
+        "shub://meglab-metagenomics/amrplusplus_v2"
+    params:
+        script="scripts/amrplusplus/amr_long_to_wide.py"
+    shell:
+        """
+        python3 {params.script} \
+            -i {input.gene_resistome} \
+            -o {output.AMR_matrix} \
+            2> {log.stderr} \
+            > {log.stdout}
+        """
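``resistome_results`` gathers the per-sample gene tables with Snakemake's ``expand()`` helper, which simply fills the ``{sample}`` wildcard with every entry in ``SAMPLES``. A minimal illustration (hypothetical sample names and output directory)::

    from snakemake.io import expand

    OUTDIR = "output_dir"             # hypothetical; comes from config in StaG
    SAMPLES = ["sampleA", "sampleB"]  # hypothetical sample names

    print(expand(f"{OUTDIR}/amrplusplus/RunResistome/{{sample}}.gene.tsv", sample=SAMPLES))
    # ['output_dir/amrplusplus/RunResistome/sampleA.gene.tsv',
    #  'output_dir/amrplusplus/RunResistome/sampleB.gene.tsv']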
diff --git a/rules/antibiotic_resistance/groot.smk b/rules/antibiotic_resistance/groot.smk
index 614f363..54c7532 100644
--- a/rules/antibiotic_resistance/groot.smk
+++ b/rules/antibiotic_resistance/groot.smk
@@ -10,7 +10,7 @@ localrules:
 
 groot_db_path = Path(config["groot"]["index"])
-if config["antibiotic_resistance"]:
+if config["antibiotic_resistance"]["groot"]:
     if not Path(groot_db_path).exists():
         err_message = "No groot database found at: '{}'!\n".format(groot_db_path)
         err_message += "Specify the DB path in the groot section of config.yaml.\n"
@@ -37,6 +37,8 @@ rule create_groot_index:
         "shallow"
     conda:
         "../../envs/stag-mwc.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:stag-mwc"
     params:
         dbdir=DBDIR/"groot/",
         db=groot_config["db"],
@@ -76,8 +78,10 @@ rule groot_align:
         "shallow"
     conda:
         "../../envs/stag-mwc.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:stag-mwc"
     threads:
-        8
+        cluster_config["groot_align"]["n"] if "groot_align" in cluster_config else 8
     params:
         index=groot_config["index"],
         minlength=groot_config["minlength"],
@@ -115,6 +119,8 @@ rule groot_report:
         "shallow"
     conda:
         "../../envs/stag-mwc.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:stag-mwc"
     threads:
         1
     params:
diff --git a/rules/assembly/metawrap.smk b/rules/assembly/metawrap.smk
index b9ef959..878bf54 100644
--- a/rules/assembly/metawrap.smk
+++ b/rules/assembly/metawrap.smk
@@ -46,8 +46,10 @@ rule assembly:
         "shallow"
     conda:
         "../../envs/assembly.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:assembly"
     threads:
-        20
+        cluster_config["assembly"]["n"] if "assembly" in cluster_config else 20
     params:
         outdir=lambda w: f"{OUTDIR}/metawrap/assembly/{mw_config['assembler']}/{w.sample}",
         assembler=mw_config["assembler"],
@@ -60,7 +62,7 @@ rule assembly:
             -o {params.outdir} \
             -t {threads} \
             -m {params.memory} \
-            --use-{params.assembler} \
+            --{params.assembler} \
             2> {log.stderr} \
             > {log.stdout}
         """
@@ -83,8 +85,10 @@ rule binning:
         "shallow"
     conda:
         "../../envs/assembly.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:assembly"
     threads:
-        20
+        cluster_config["binning"]["n"] if "binning" in cluster_config else 20
     params:
         outdir=lambda w: f"{OUTDIR}/metawrap/binning/{mw_config['assembler']}/{w.sample}",
         universal=mw_config["universal"],
@@ -122,8 +126,10 @@ rule consolidate_bins:
         "shallow"
     conda:
         "../../envs/assembly.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:assembly"
     threads:
-        20
+        cluster_config["consolidate_bins"]["n"] if "consolidate_bins" in cluster_config else 20
    params:
        outdir=lambda w: f"{OUTDIR}/metawrap/consolidated_bins/{w.sample}",
        minimum_completion=mw_config["minimum_completion"],
@@
-158,8 +164,10 @@ rule blobology: "shallow" conda: "../../envs/assembly.yaml" + singularity: + "shub://ctmrbio/stag-mwc:assembly" threads: - 20 + cluster_config["blobology"]["n"] if "blobology" in cluster_config else 20 params: outdir=lambda w: f"{OUTDIR}/metawrap/blobology/{w.sample}", shell: diff --git a/rules/functional_profiling/humann2.smk b/rules/functional_profiling/humann2.smk index b23f379..cf953c1 100644 --- a/rules/functional_profiling/humann2.smk +++ b/rules/functional_profiling/humann2.smk @@ -45,6 +45,8 @@ rule download_humann2_databases: "shallow" conda: "../../envs/humann2.yaml" + singularity: + "shub://ctmrbio/stag-mwc:biobakery" params: dbdir=config["dbdir"]+"/humann2" shell: @@ -70,8 +72,10 @@ rule humann2: "shallow" conda: "../../envs/humann2.yaml" + singularity: + "shub://ctmrbio/stag-mwc:biobakery" threads: - 20 + cluster_config["humann2"]["n"] if "humann2" in cluster_config else 20 resources: humann2=1 params: @@ -118,6 +122,8 @@ rule normalize_humann2_tables: "shallow" conda: "../../envs/humann2.yaml" + singularity: + "shub://ctmrbio/stag-mwc:biobakery" threads: 1 params: @@ -167,6 +173,8 @@ rule join_humann2_tables: "shallow" conda: "../../envs/humann2.yaml" + singularity: + "shub://ctmrbio/stag-mwc:biobakery" threads: 1 params: diff --git a/rules/mappers/bbmap.smk b/rules/mappers/bbmap.smk index 171c3a8..d274a39 100644 --- a/rules/mappers/bbmap.smk +++ b/rules/mappers/bbmap.smk @@ -68,8 +68,10 @@ for bbmap_config in config["bbmap"]: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: - 8 + cluster_config["bbmap"]["n"] if "bbmap" in cluster_config else 8 params: db_path=bbmap_config["db_path"], min_id=bbmap_config["min_id"], @@ -113,6 +115,8 @@ for bbmap_config in config["bbmap"]: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: 1 params: @@ -149,6 +153,8 @@ for bbmap_config in config["bbmap"]: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: 4 params: diff --git a/rules/mappers/bowtie2.smk b/rules/mappers/bowtie2.smk index e332f0f..4d72049 100644 --- a/rules/mappers/bowtie2.smk +++ b/rules/mappers/bowtie2.smk @@ -67,7 +67,7 @@ for bt2_config in config["bowtie2"]: index=bt2_config["db_prefix"], extra=bt2_config["extra"], threads: - 8 + cluster_config["bowtie2"]["n"] if "bowtie2" in cluster_config else 8 wrapper: "0.23.1/bio/bowtie2/align" @@ -87,6 +87,8 @@ for bt2_config in config["bowtie2"]: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ pileup.sh \ @@ -120,6 +122,8 @@ for bt2_config in config["bowtie2"]: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: 1 params: @@ -156,6 +160,8 @@ for bt2_config in config["bowtie2"]: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: 4 params: diff --git a/rules/multiqc/multiqc.smk b/rules/multiqc/multiqc.smk index 4f69f53..f232395 100644 --- a/rules/multiqc/multiqc.smk +++ b/rules/multiqc/multiqc.smk @@ -18,6 +18,8 @@ if config["multiqc_report"]: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: 1 params: diff --git a/rules/naive/bbcountunique.smk b/rules/naive/bbcountunique.smk index e155272..af4d118 100644 --- a/rules/naive/bbcountunique.smk +++ b/rules/naive/bbcountunique.smk @@ -27,9 +27,11 @@ if config["naive"]["assess_depth"]: shadow: 
"shallow" threads: - 2 + cluster_config["bbcountunique"]["n"] if "bbcountunique" in cluster_config else 2 conda: "../../envs/stag-mwc.yaml", + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" params: interval=config["bbcountunique"]["interval"] shell: diff --git a/rules/naive/sketch_compare.smk b/rules/naive/sketch_compare.smk index 313299e..ae5318e 100644 --- a/rules/naive/sketch_compare.smk +++ b/rules/naive/sketch_compare.smk @@ -29,8 +29,10 @@ rule sketch: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: - 4 + cluster_config["sketch"]["n"] if "sketch" in cluster_config else 4 shell: """ sketch.sh \ @@ -54,6 +56,8 @@ rule compare_sketches: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ comparesketch.sh \ @@ -79,6 +83,8 @@ rule plot_sample_similarity: stderr=str(LOGDIR/"sketch_compare/sample_similarity_plot.stderr.log"), conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ scripts/plot_sketch_comparison_heatmap.py \ diff --git a/rules/preproc/host_removal.smk b/rules/preproc/host_removal.smk index e53d951..4d90624 100644 --- a/rules/preproc/host_removal.smk +++ b/rules/preproc/host_removal.smk @@ -6,7 +6,7 @@ from pathlib import Path from snakemake.exceptions import WorkflowError rh_config = config["remove_host"] -if rh_config: +if config["host_removal"]: db_path = Path(config["remove_host"]["db_path"]) if not Path(db_path/"taxo.k2d").is_file(): err_message = "Cannot find database for host sequence removal at: '{}/*.k2d'!\n".format(db_path) @@ -45,8 +45,10 @@ if rh_config: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: - 8 + cluster_config["remove_host"]["n"] if "remove_host" in cluster_config else 8 params: db=rh_config["db_path"], confidence=rh_config["confidence"], @@ -97,6 +99,8 @@ if rh_config: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: 1 shell: diff --git a/rules/preproc/preprocessing_summary.smk b/rules/preproc/preprocessing_summary.smk index de5128b..f3fece9 100644 --- a/rules/preproc/preprocessing_summary.smk +++ b/rules/preproc/preprocessing_summary.smk @@ -34,6 +34,8 @@ rule preprocessing_summary: stderr=str(LOGDIR/"preprocessing_summary.log"), conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: 1 shell: diff --git a/rules/preproc/read_quality.smk b/rules/preproc/read_quality.smk index e021666..a732a29 100644 --- a/rules/preproc/read_quality.smk +++ b/rules/preproc/read_quality.smk @@ -1,6 +1,6 @@ # vim: syntax=python expandtab # Read pre-processing -#TODO: Remove superfluous str conversions of paths in expand and log statements +# TODO: Remove superfluous str conversions of paths in expand and log statements # when Snakemake is pathlib compatible. if config["qc_reads"]: @@ -30,8 +30,10 @@ if config["qc_reads"]: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: - 4 + cluster_config["fastp"]["n"] if "fastp" in cluster_config else 4 params: extra=fastp_config["extra"], shell: diff --git a/rules/publications.py b/rules/publications.py index cfe7896..01641ec 100644 --- a/rules/publications.py +++ b/rules/publications.py @@ -6,6 +6,7 @@ "Bracken": "Lu J, Breitwieser FP, Thielen P, Salzberg SL (2017). Bracken: estimating species abundance in metagenomics data. PeerJ Computer Science 3:e104. 
https://doi.org/10.7717/peerj-cs.104",
     "CONCOCT": "Alneberg, J., Bjarnason, B.S., de Bruijn, I., Schirmer, M., Quick, J., Ijaz, U.Z., Lahti, L., Loman, N.J, Andersson, A.F, Quince, C. (2014). Binning metagenomic contigs by coverage and composition. Nature Methods, https://doi.org/10.1038/nmeth.3103",
     "GROOT": "Rowe WPM, Winn MD (2018). Indexed variation graphs for efficient and accurate resistome profiling. Bioinformatics, Volume 34, Issue 21. https://doi.org/10.1093/bioinformatics/bty387",
+    "AMRPlusPlus2": "Doster E, Lakin S, Dean C, Wolfe C, Young J, Boucher C, et al. (2020). MEGARes 2.0: a database for classification of antimicrobial drug, biocide and metal resistance determinants in metagenomic sequence data. Nucleic Acids Research, Volume 48, Issue D1, 08 January 2020, Pages D561–D569. https://doi.org/10.1093/nar/gkz1010",
     "HUMAnN2": "Franzosa EA*, McIver LJ*, et al. (2018). Species-level functional profiling of metagenomes and metatranscriptomes. Nat Methods 15: 962-968. https://doi.org/10.1038/s41592-018-0176-y",
     "Kaiju": "Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature communications, 7, 11257. Available online at: https://github.com/bioinformatics-centre/kaiju",
     "Kraken2": "Wood, D.E., Lu, J., & Langmead, B. (2019). Improved metagenomic analysis with Kraken 2. Genome biology, 20, 257. https://doi.org/10.1186/s13059-019-1891-0",
diff --git a/rules/taxonomic_profiling/kaiju.smk b/rules/taxonomic_profiling/kaiju.smk
index de44c00..90a618e 100644
--- a/rules/taxonomic_profiling/kaiju.smk
+++ b/rules/taxonomic_profiling/kaiju.smk
@@ -35,7 +35,7 @@ if config["taxonomic_profile"]["kaiju"]:
     all_outputs.extend(kaiju_reports)
     all_outputs.append(kaiju_krona)
     all_outputs.append(kaiju_joined_table)
-    all_outputs.append(kaiju_area_plot)
+    #all_outputs.append(kaiju_area_plot)  # Buggy in stag-mwc 0.4.1
 
     citations.add(publications["Kaiju"])
     citations.add(publications["Krona"])
@@ -70,9 +70,11 @@ rule kaiju:
     shadow:
         "shallow"
     threads:
-        4
+        cluster_config["kaiju"]["n"] if "kaiju" in cluster_config else 4
     conda:
         "../../envs/stag-mwc.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:stag-mwc"
     params:
         db=kaiju_config["db"],
         nodes=kaiju_config["nodes"],
@@ -112,6 +114,8 @@ rule kaiju2krona:
         names=kaiju_config["names"],
     conda:
         "../../envs/stag-mwc.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:stag-mwc"
     shell:
         """
         kaiju2krona \
@@ -131,6 +135,8 @@ rule create_kaiju_krona_plot:
         "shallow"
     conda:
         "../../envs/stag-mwc.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:stag-mwc"
     shell:
         """
         ktImportText \
@@ -152,6 +158,8 @@ rule kaiju_report:
         names=kaiju_config["names"],
     conda:
         "../../envs/stag-mwc.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:stag-mwc"
     shell:
         """
         kaiju2table \
@@ -181,6 +189,8 @@ rule join_kaiju_reports:
         value_column=kaiju_config["value_column"],
     conda:
         "../../envs/stag-mwc.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:stag-mwc"
     shell:
         """
         scripts/join_tables.py \
@@ -203,6 +213,8 @@ rule kaiju_area_plot:
         str(LOGDIR/"kaiju/area_plot.log")
     conda:
         "../../envs/stag-mwc.yaml"
+    singularity:
+        "shub://ctmrbio/stag-mwc:stag-mwc"
     shell:
         """
         scripts/area_plot.py \
diff --git a/rules/taxonomic_profiling/kraken2.smk b/rules/taxonomic_profiling/kraken2.smk
index dc6d52f..78e11c6 100644
--- a/rules/taxonomic_profiling/kraken2.smk
+++ b/rules/taxonomic_profiling/kraken2.smk
@@ -85,9 +85,11 @@ rule kraken2:
     shadow:
         "shallow"
     threads:
-        4
+        cluster_config["kraken2"]["n"] if "kraken2" in cluster_config else 4
     conda:
"../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" params: db=kraken2_config["db"], confidence=kraken2_config["confidence"], @@ -116,9 +118,11 @@ rule kraken_mpa_style: log: str(LOGDIR/"kraken2/{sample}.mpa_style.log") threads: - 2 + 1 conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ kreport2mpa.py \ @@ -140,9 +144,11 @@ rule join_kraken2_mpa: log: str(LOGDIR/"kraken2/join_kraken2_mpa_tables.log") threads: - 2 + 1 conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" params: value_column="reads", feature_column="taxon_name", @@ -168,6 +174,8 @@ rule kraken2_area_plot: str(LOGDIR/"kraken2/area_plot.kraken2.log") conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ scripts/area_plot.py \ @@ -191,6 +199,8 @@ rule combine_kreports: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ scripts/KrakenTools/combine_kreports.py \ @@ -214,6 +224,8 @@ rule kreport2krona: 1 conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ scripts/KrakenTools/kreport2krona.py \ @@ -234,6 +246,8 @@ rule create_kraken2_krona_plot: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ ktImportText \ @@ -283,11 +297,13 @@ rule bracken_kreport: log: str(LOGDIR/"kraken2/{sample}.bracken.log") threads: - 2 + cluster_config["bracken"]["n"] if "bracken" in cluster_config else 2 shadow: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" params: kmer_distrib=kraken2_config["bracken"]["kmer_distrib"], thresh=kraken2_config["bracken"]["thresh"], @@ -314,9 +330,11 @@ rule bracken_all_levels: shadow: # shadow required because est_abundance.py always creates the "shallow" # sample-level output file with fixed filename: {sample}_bracken.kreport threads: - 2 + cluster_config["bracken"]["n"] if "bracken" in cluster_config else 2 conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" params: kmer_distrib=kraken2_config["bracken"]["kmer_distrib"], thresh=kraken2_config["bracken"]["thresh"], @@ -340,9 +358,11 @@ rule bracken_mpa_style: log: str(LOGDIR/"kraken2/{sample}.bracken.mpa_style.log") threads: - 2 + 1 conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ kreport2mpa.py \ @@ -364,9 +384,11 @@ rule join_bracken_mpa: log: str(LOGDIR/"kraken2/join_bracken_mpa_tables.log") threads: - 2 + 1 conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" params: value_column="reads", feature_column="taxon_name", @@ -392,6 +414,8 @@ rule bracken_area_plot: str(LOGDIR/"kraken2/area_plot.bracken.log") conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ scripts/area_plot.py \ @@ -412,9 +436,11 @@ rule join_bracken: log: str(LOGDIR/"kraken2/join_bracken_tables.{level}.log") threads: - 2 + 1 conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" params: value_column="fraction_total_reads", feature_column="name", @@ -442,6 +468,8 @@ rule bracken2krona: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ scripts/KrakenTools/kreport2krona.py \ @@ -462,6 +490,8 @@ rule create_bracken_krona_plot: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + 
"shub://ctmrbio/stag-mwc:stag-mwc" shell: """ ktImportText \ @@ -478,9 +508,11 @@ rule filter_bracken: log: str(LOGDIR/"kraken2/{sample}.{level}.filter_bracken.log") threads: - 2 + 1 conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" params: filter_bracken="scripts/KrakenTools/filter_bracken.out.py", include=kraken2_config["filter_bracken"]["include"], @@ -506,9 +538,11 @@ rule join_bracken_filtered: log: str(LOGDIR/"kraken2/join_bracken_tables.{level}.log") threads: - 2 + 1 conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" params: value_column="fraction_total_reads", feature_column="name", diff --git a/rules/taxonomic_profiling/metaphlan2.smk b/rules/taxonomic_profiling/metaphlan2.smk index d75fa3a..ad27e97 100644 --- a/rules/taxonomic_profiling/metaphlan2.smk +++ b/rules/taxonomic_profiling/metaphlan2.smk @@ -51,8 +51,10 @@ rule metaphlan2: "shallow" conda: "../../envs/metaphlan2.yaml" + singularity: + "shub://ctmrbio/stag-mwc:biobakery" threads: - 5 + cluster_config["metaphlan2"]["n"] if "metaphlan2" in cluster_config else 5 params: bt2_db_dir=mpa_config["bt2_db_dir"], bt2_index=mpa_config["bt2_index"], @@ -117,6 +119,8 @@ rule combine_metaphlan2_tables: "shallow" conda: "../../envs/metaphlan2.yaml" + singularity: + "shub://ctmrbio/stag-mwc:biobakery" threads: 1 shell: @@ -137,6 +141,8 @@ rule metaphlan2_area_plot: str(LOGDIR/"metaphlan2/area_plot.log") conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" shell: """ scripts/area_plot.py \ @@ -161,6 +167,8 @@ rule plot_metaphlan2_heatmap: "shallow" conda: "../../envs/stag-mwc.yaml" + singularity: + "shub://ctmrbio/stag-mwc:stag-mwc" threads: 1 params: @@ -196,6 +204,8 @@ rule create_metaphlan2_krona_plots: "shallow" conda: "../../envs/metaphlan2.yaml" + singularity: + "shub://ctmrbio/stag-mwc:biobakery" threads: 1 shell: diff --git a/scripts/amrplusplus/LICENSE b/scripts/amrplusplus/LICENSE new file mode 100644 index 0000000..f0f3790 --- /dev/null +++ b/scripts/amrplusplus/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Chris Dean + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/scripts/amrplusplus/amr_long_to_wide.py b/scripts/amrplusplus/amr_long_to_wide.py
new file mode 100755
index 0000000..0d4d1ca
--- /dev/null
+++ b/scripts/amrplusplus/amr_long_to_wide.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+__author__ = "Steven Lakin"
+__copyright__ = ""
+__credits__ = ["Steven Lakin"]
+__version__ = ""
+__maintainer__ = "lakinsm"
+__email__ = "lakinsm@colostate.edu"
+__status__ = "Cows go moo."
+
+import argparse
+import sys
+
+amr_level_names = {0: 'Class', 1: 'Mechanism', 2: 'Group'}
+
+def parse_cmdline_params(cmdline_params):
+    info = "Convert long-format resistome tables into a wide AMR analytic matrix (CSV)."
+    parser = argparse.ArgumentParser(description=info)
+    parser.add_argument('-i', '--input_files', nargs='+', required=True,
+                        help='Use globstar to pass a list of files, (Ex: *.tsv)')
+    parser.add_argument('-o', '--output_file', required=True,
+                        help='Output file name for writing the AMR_analytic_matrix.csv file')
+    return parser.parse_args(cmdline_params)
+
+def amr_load_data(file_name_list):
+    samples = {}
+    labels = set()
+    for file in file_name_list:
+        with open(file, 'r') as f:
+            data = f.read().split('\n')[1:]  # skip the header line
+            for entry in data:
+                if not entry:
+                    continue
+                entry = entry.split('\t')
+                sample = entry[0].split('.')[0]
+                count = float(entry[2])
+                gene_name = entry[1]
+                # samples maps sample name -> {gene label: count}; create the
+                # inner dict the first time a sample is seen.
+                samples.setdefault(sample, {})[gene_name] = count
+                labels.add(gene_name)
+    return samples, labels
+
+def output_amr_analytic_data(outfile, S, L):
+    with open(outfile, 'w') as amr:
+        local_sample_names = list(S.keys())
+        # The header row carries only the sample names, one field fewer than
+        # the data rows, matching the row-name convention of R's read.csv().
+        amr.write(','.join(local_sample_names) + '\n')
+        for label in sorted(L):  # sorted for reproducible row order
+            local_counts = []
+            amr.write(label + ',')
+            for local_sample in local_sample_names:
+                if label in S[local_sample]:
+                    local_counts.append(str(S[local_sample][label]))
+                else:
+                    local_counts.append(str(0))  # gene not seen in this sample
+            amr.write(','.join(local_counts) + '\n')
+
+if __name__ == '__main__':
+    opts = parse_cmdline_params(sys.argv[1:])
+    S, L = amr_load_data(opts.input_files)
+    output_amr_analytic_data(opts.output_file, S, L)
diff --git a/scripts/amrplusplus/rarefaction b/scripts/amrplusplus/rarefaction
new file mode 100755
index 0000000..4ad5eca
Binary files /dev/null and b/scripts/amrplusplus/rarefaction differ
diff --git a/scripts/amrplusplus/resistome b/scripts/amrplusplus/resistome
new file mode 100755
index 0000000..d7bc642
Binary files /dev/null and b/scripts/amrplusplus/resistome differ
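
The new amr_long_to_wide.py script merges per-sample long-format tables (tab
separated, one row per gene with sample name, gene label and count, preceded
by a header line that is skipped) into a single wide sample-by-gene count
matrix, filling in zeros for genes absent from a sample. A hypothetical
invocation, with placeholder paths that are not defined anywhere in this
patch:

    scripts/amrplusplus/amr_long_to_wide.py \
        --input_files output_dir/amrplusplus/*.tsv \
        --output_file output_dir/amrplusplus/AMR_analytic_matrix.csv
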