-
Notifications
You must be signed in to change notification settings - Fork 185
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add wrapper for bwa-meme (#555)
* bwa-meme initial commit * wrapper path * Update bio/bwa-meme/index/test/Snakefile Co-authored-by: Johannes Köster <johannes.koester@uni-due.de> * Update bio/bwa-meme/mem/test/Snakefile_samtools Co-authored-by: Johannes Köster <johannes.koester@uni-due.de> * samtools test to normal test Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>
- Loading branch information
1 parent
06329dd
commit 92579c4
Showing
13 changed files
with
303 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
channels: | ||
- conda-forge | ||
- bioconda | ||
- nodefaults | ||
dependencies: | ||
- bwa-meme ==1.0.4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
name: "bwa-mem2 index" | ||
description: "Creates a bwa-meme index." | ||
authors: | ||
- Christopher Schröder | ||
- Patrik Smeds |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
rule bwa_mem2_index: | ||
input: | ||
"{genome}", | ||
output: | ||
multiext( | ||
"{genome}", | ||
".0123", | ||
".amb", | ||
".ann", | ||
".pac", | ||
".pos_packed", | ||
".suffixarray_uint64", | ||
".suffixarray_uint64_L0_PARAMETERS", | ||
".suffixarray_uint64_L1_PARAMETERS", | ||
".suffixarray_uint64_L2_PARAMETERS" | ||
) | ||
log: | ||
"logs/bwa-meme_index/{genome}.log", | ||
threads: 8 | ||
wrapper: | ||
"master/bio/bwa-meme/index" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
>Sheila | ||
GCTAGCTCAGAAAAAAAAAA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
__author__ = "Christopher Schröder, Patrik Smeds" | ||
__copyright__ = "Copyright 2022, Christopher Schröder, Patrik Smeds" | ||
__email__ = "christopher.schroeder@tu-dortmund.de, patrik.smeds@gmail.com" | ||
__license__ = "MIT" | ||
|
||
from os import path | ||
|
||
from snakemake.shell import shell | ||
|
||
log = snakemake.log_fmt_shell(stdout=True, stderr=True) | ||
|
||
# Check inputs/arguments. | ||
if len(snakemake.input) == 0: | ||
raise ValueError("A reference genome has to be provided.") | ||
elif len(snakemake.input) > 1: | ||
raise ValueError("Please provide exactly one reference genome as input.") | ||
|
||
valid_suffixes = { | ||
".0123", | ||
".amb", | ||
".ann", | ||
".pac", | ||
".pos_packed", | ||
".suffixarray_uint64", | ||
".suffixarray_uint64_L0_PARAMETERS", | ||
".suffixarray_uint64_L1_PARAMETERS", | ||
".suffixarray_uint64_L2_PARAMETERS", | ||
} | ||
|
||
|
||
def get_valid_suffix(path): | ||
for suffix in valid_suffixes: | ||
if path.endswith(suffix): | ||
return suffix | ||
|
||
|
||
prefixes = set() | ||
for s in snakemake.output: | ||
suffix = get_valid_suffix(s) | ||
if suffix is None: | ||
raise ValueError(f"{s} cannot be generated by bwa-meme index (invalid suffix).") | ||
prefixes.add(s[: -len(suffix)]) | ||
|
||
if len(prefixes) != 1: | ||
raise ValueError("Output files must share common prefix up to their file endings.") | ||
(prefix,) = prefixes | ||
|
||
shell( | ||
"(bwa-meme index -a meme -p {prefix} {snakemake.input[0]} -t {snakemake.threads} && build_rmis_dna.sh {snakemake.input[0]}) {log}" | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
channels: | ||
- conda-forge | ||
- bioconda | ||
- nodefaults | ||
dependencies: | ||
- bwa-meme ==1.0.4 | ||
- samtools ==1.15 | ||
- samblaster == 0.1.26 | ||
- mbuffer == 20160228 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
name: "bwa-meme" | ||
description: BWA-MEME is a practical and efficient seeding algorithm based on a suffix array search algorithm that solves the challenges in utilizing learned indices for SMEM search which is extensively used in the seeding phase. It achieves up to 3.45× speedup in seeding throughput over BWA-MEM2 by reducing the number of instructions by 4.60×, memory accesses by 8.77× and LLC misses by 2.21×, while ensuring the identical SAM output to BWA-MEM2. | ||
authors: | ||
- Christopher Schröder | ||
- Johannes Köster | ||
- Julian de Ruiter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
rule bwa_meme_index: #[hide] | ||
input: #[hide] | ||
"genome.fasta", #[hide] | ||
output: #[hide] | ||
"{genome}.0123", #[hide] | ||
"{genome}.amb", #[hide] | ||
"{genome}.ann", #[hide] | ||
"{genome}.pac", #[hide] | ||
"{genome}.pos_packed", #[hide] | ||
"{genome}.suffixarray_uint64", #[hide] | ||
"{genome}.suffixarray_uint64_L0_PARAMETERS", #[hide] | ||
"{genome}.suffixarray_uint64_L1_PARAMETERS", #[hide] | ||
"{genome}.suffixarray_uint64_L2_PARAMETERS", #[hide] | ||
threads: 8 #[hide] | ||
log: #[hide] | ||
"logs/bwa_meme_index/{genome}.log", #[hide] | ||
wrapper: #[hide] | ||
"master/bio/bwa-meme/index" #[hide] | ||
#[hide] | ||
#[hide] | ||
rule bwa_meme_mem: | ||
input: | ||
reads=["reads/{sample}.1.fastq", "reads/{sample}.2.fastq"], | ||
# Index can be a list of (all) files created by bwa, or one of them | ||
reference="genome.fasta", | ||
idx=multiext( | ||
"genome.fasta", | ||
".0123", | ||
".amb", | ||
".ann", | ||
".pac", | ||
".pos_packed", | ||
".suffixarray_uint64", | ||
".suffixarray_uint64_L0_PARAMETERS", | ||
".suffixarray_uint64_L1_PARAMETERS", | ||
".suffixarray_uint64_L2_PARAMETERS", | ||
), | ||
output: | ||
"mapped/{sample}.cram", # Output can be .cram, .bam, or .sam | ||
log: | ||
"logs/bwa_meme/{sample}.log", | ||
params: | ||
extra=r"-R '@RG\tID:{sample}\tSM:{sample}' -M", | ||
sort="samtools", # Can be 'none' or 'samtools'. | ||
sort_order="coordinate", # Can be 'coordinate' (default) or 'queryname'. | ||
sort_extra="", # Extra args for samtools. | ||
dedup="mark", # Can be 'none' (default), 'mark' or 'remove'. | ||
dedup_extra="-M", # Extra args for samblaster. | ||
exceed_thread_limit=True, # Set threads als for samtools sort / view (total used CPU may exceed threads!) | ||
embed_ref=True, # Embed reference when writing cram. | ||
threads: 8 | ||
wrapper: | ||
"master/bio/bwa-meme/mem" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
>Sheila | ||
GCTAGCTCAGAAAAAAAAAA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
@1 | ||
ACGGCAT | ||
+ | ||
!!!!!!! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
@1 | ||
ACGGCAT | ||
+ | ||
!!!!!!! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
__author__ = "Christopher Schröder, Johannes Köster, Julian de Ruiter" | ||
__copyright__ = ( | ||
"Copyright 2020, Christopher Schröder, Johannes Köster and Julian de Ruiter" | ||
) | ||
__email__ = "christopher.schroeder@tu-dortmund.de koester@jimmy.harvard.edu, julianderuiter@gmail.com" | ||
__license__ = "MIT" | ||
|
||
|
||
from os import path | ||
|
||
from snakemake.shell import shell | ||
|
||
|
||
# Extract arguments. | ||
extra = snakemake.params.get("extra", "") | ||
|
||
sort = snakemake.params.get("sort", "none") | ||
sort_order = snakemake.params.get("sort_order", "coordinate") | ||
sort_extra = snakemake.params.get("sort_extra", "") | ||
embed_ref = snakemake.params.get("embed_ref", False) | ||
|
||
# Option to set the threads of samtools sort and view to the snakemake limit. | ||
# In theory, bwa and alternate and samtools view starts only when sort is | ||
# finished, so that never more threads are used than the limit. But it can | ||
# not always be guaranteed. | ||
exceed_thread_limit = snakemake.params.get("exceed_thread_limit", False) | ||
dedup = snakemake.params.get("dedup", "none") | ||
dedup_extra = snakemake.params.get("dedup_extra", "") | ||
|
||
# Detect output format. | ||
if snakemake.output[0].endswith(".sam"): | ||
output_format = "cram" | ||
elif snakemake.output[0].endswith(".bam"): | ||
output_format = "bam" | ||
elif snakemake.output[0].endswith(".cram"): | ||
output_format = "cram" | ||
else: | ||
raise ValueError("output file format must be .sam, .bam or .cram") | ||
|
||
if embed_ref: | ||
output_format += ",embed_ref" | ||
|
||
if exceed_thread_limit: | ||
samtools_threads = snakemake.threads | ||
else: | ||
samtools_threads = 1 | ||
|
||
reference = snakemake.input.get("reference") | ||
|
||
log = snakemake.log_fmt_shell(stdout=False, stderr=True) | ||
|
||
# Check inputs/arguments. | ||
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in { | ||
1, | ||
2, | ||
}: | ||
raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements") | ||
|
||
if sort_order not in {"coordinate", "queryname"}: | ||
raise ValueError("Unexpected value for sort_order ({})".format(sort_order)) | ||
|
||
# Determine which pipe command to use for converting to bam or sorting. | ||
if sort == "none": | ||
# Simply convert to bam using samtools view. | ||
pipe_cmd = "samtools view -h -O {output_format} -o {snakemake.output[0]} -T {reference} -@ {samtools_threads} -" | ||
|
||
elif sort == "samtools": | ||
|
||
pipe_cmd = "samtools sort {sort_extra} -O {output_format} -o {snakemake.output[0]} --reference {reference} -@ {samtools_threads} -" | ||
|
||
# Add name flag if needed. | ||
if sort_order == "queryname": | ||
sort_extra += " -n" | ||
|
||
prefix = path.splitext(snakemake.output[0])[0] | ||
sort_extra += " -T " + prefix + ".tmp" | ||
|
||
# Sort alignments using samtools sort. | ||
|
||
# elif sort == "picard": | ||
# # Sort alignments using picard SortSam. | ||
# pipe_cmd = ( | ||
# "picard SortSam {sort_extra} INPUT=/dev/stdin" | ||
# " OUTPUT={snakemake.output[0]} SORT_ORDER={sort_order}" | ||
# ) | ||
|
||
else: | ||
raise ValueError("Unexpected value for params.sort ({})".format(sort)) | ||
|
||
|
||
# Determine which pipe command to use for converting to bam or sorting. | ||
if dedup == "none": | ||
# Do not detect duplicates. | ||
dedup_cmd = "" | ||
|
||
elif dedup == "mark": | ||
# Mark duplicates using samblaster. | ||
dedup_cmd = "samblaster -q {dedup_extra} | " | ||
|
||
elif dedup == "remove": | ||
dedup_cmd = "samblaster -q -r {dedup_extra} | " | ||
|
||
# elif sort == "picard": | ||
# # Sort alignments using picard SortSam. | ||
# pipe_cmd = ( | ||
# "picard SortSam {sort_extra} INPUT=/dev/stdin" | ||
# " OUTPUT={snakemake.output[0]} SORT_ORDER={sort_order}" | ||
# ) | ||
|
||
else: | ||
raise ValueError("Unexpected value for params.dedup ({})".format(dedup)) | ||
|
||
|
||
duplicates = snakemake.params.get("mark_duplicates", False) | ||
|
||
|
||
shell( | ||
"(bwa-meme mem -7" | ||
" -t {snakemake.threads}" | ||
" {extra}" | ||
" {reference}" | ||
" {snakemake.input.reads}" | ||
" | mbuffer -q -m 10G " | ||
" | " + dedup_cmd + pipe_cmd + ") {log}" | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters