Skip to content

Commit

Permalink
Merge pull request #24 from kircherlab/develop
Browse files Browse the repository at this point in the history
Update: rule optimization; add ichorCNA
  • Loading branch information
sroener authored Nov 17, 2022
2 parents f366e32 + 9466a70 commit 808f646
Show file tree
Hide file tree
Showing 11 changed files with 183 additions and 29 deletions.
7 changes: 1 addition & 6 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,7 @@ indent_size = 4
indent_style = tab
indent_size = 1

# Tab indentation (no size specified)
[Makefile]
indent_style = tab


# Matches the exact files either package.json or .travis.yml
[{package.json,.travis.yml}]
indent_style = space
indent_size = 2
indent_size = 2
1 change: 1 addition & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ include: "rules/bam_to_fastq.smk"
include: "rules/trimming.smk"
include: "rules/filtering.smk"
include: "rules/mapping.smk"
include: "rules/ichorCNA.smk"
include: "rules/QualityControl.smk"


Expand Down
9 changes: 9 additions & 0 deletions workflow/envs/icorCNA.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
name: icorCNA
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- r-ichorcna
- hmmcopy

6 changes: 4 additions & 2 deletions workflow/rules/QualityControl.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ rule fastqc:
zip="results/{ID}/qc/fastqc/{SAMPLE}.{GENOME}_fastqc.zip",
log:
"results/logs/{ID}/fastqc/{SAMPLE}_all.{GENOME}.log",
threads: 8
wrapper:
"0.75.0/bio/fastqc"

Expand All @@ -18,6 +19,7 @@ rule samtools_stats:
"results/{ID}/qc/samtools-stats/{SAMPLE}.{GENOME}.txt",
log:
"results/logs/{ID}/fastqc/{SAMPLE}.{GENOME}.log",
threads: 8
wrapper:
"0.75.0/bio/samtools/stats"

Expand All @@ -33,12 +35,12 @@ rule mosdepth:
log:
"results/logs/{ID}/mosdepth/{SAMPLE}.{GENOME}.log",
params:
extra="--no-per-base --fast-mode", # optional
extra="--no-per-base", # optional
by="500",
# additional decompression threads through `--threads`
threads: 32 # This value - 1 will be sent to `--threads`
wrapper:
"v1.1.0/bio/mosdepth"
"v1.19.1/bio/mosdepth"


rule multiqc:
Expand Down
6 changes: 3 additions & 3 deletions workflow/rules/bam_to_fastq.smk
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ rule bam_to_fastq:
shell:
"""set +o pipefail;
(
samtools sort -T {params.TMPDIR} -n -@ {threads} {input.bam} | \
samtools fastq -@ {threads} -t \
samtools sort -T {params.TMPDIR} -n -@ $(({threads}/2)) {input.bam} | \
samtools fastq -@ $(({threads}/2)) -t \
-1 {output.r1} \
-2 {output.r2} \
-0 {output.s1} \
)2> {log}"""
)2> {log}"""
24 changes: 23 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,27 @@ def get_final_output():
final_output.extend(
expand("results/{ID}/qc/multiqc.html", ID=samples["ID"].unique())
)
final_output.extend(
expand(
"results/{ID}/icorCNA/readcounts/{SAMPLE}_processed.{GENOME}.wig",
zip,
ID=samples["ID"],
SAMPLE=samples["sample"],
GENOME=samples["genome_build"],
)
)
final_output.extend(
expand(
"results/{ID}/icorCNA/{SAMPLE}_processed_{GENOME}",
zip,
ID=samples["ID"],
SAMPLE=samples["sample"],
GENOME=samples["genome_build"],
)
)




return final_output

Expand Down Expand Up @@ -215,6 +236,7 @@ def get_mapping_input(wildcards):
elif trimming_algorithm.lower() == "trimmomatic":
mapping_input["reads"] = ["results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.fastq.gz", "results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.fastq.gz",]
if all_data:
mapping_input["non_merged"] = ["results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz","results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz"]
mapping_input["noadapter_R1"] = "results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz"
mapping_input["noadapter_R2"] = "results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz"

return mapping_input
93 changes: 93 additions & 0 deletions workflow/rules/ichorCNA.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
def get_chroms_readcounter(chromfile):
if config["read_counter"]["chroms"]["autoselect_chroms"]:
with open(chromfile,"r") as f:
chrom = f.read()
return chrom
else:
return config["read_counter"]["chroms"]["chrom_list"]




rule get_chroms:
input:
bam="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam",
bai="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam.bai",
output:
chroms="results/{ID}/icorCNA/chroms/{SAMPLE}_processed.{GENOME}.chromosomes.txt",
log:
"results/logs/{ID}/get_chroms/{SAMPLE}_processed.{GENOME}.log",
conda:
"../envs/cfDNA_prep.yaml"
shell:
"""samtools idxstats {input.bam} | \
cut -f 1 | grep -w 'chr[1-9]\|chr[1-2][0-9]\|chr[X,Y]\|^[1-2][0-9]\|^[0-9]\|^[X,Y]' \
| tr '\n' ','| sed 's/,*\r*$//' \
1> {output.chroms} 2>{log}"""


rule read_counter:
input:
bam="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam",
bai="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam.bai",
chroms="results/{ID}/icorCNA/chroms/{SAMPLE}_processed.{GENOME}.chromosomes.txt",
output:
wig="results/{ID}/icorCNA/readcounts/{SAMPLE}_processed.{GENOME}.wig",
log:
"results/logs/{ID}/mapping/{SAMPLE}_all.{GENOME}.log",
params:
window = config["read_counter"]["window"],
quality = config["read_counter"]["quality"],
chroms= lambda wc: get_chroms_readcounter(f"test/{wc.ID}/icorCNA/chroms/{wc.SAMPLE}_processed.{wc.GENOME}.chromosomes.txt"),
conda:
"../envs/icorCNA.yaml"
shell:
"""
readCounter --window {params.window} --quality {params.quality} \
--chromosome {params.chroms} {input.bam} 1>{output.wig} 2>{log}
"""


rule ichorCNA:
input:
bam="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam",
bai="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam.bai",
wig="results/{ID}/icorCNA/readcounts/{SAMPLE}_processed.{GENOME}.wig",
resources="resources/ichorCNA/"
output:
outDir=directory("results/{ID}/icorCNA/{SAMPLE}_processed_{GENOME}/"),
summary="results/{ID}/icorCNA/{SAMPLE}_processed_{GENOME}/{SAMPLE}_processed.params.txt"
params:
sID="{SAMPLE}_processed",
ploidy = config["ichorCNA"]["ploidy"],
normal = config["ichorCNA"]["normal"],
maxCN = config["ichorCNA"]["maxCN"],
gcWIG = lambda wc: config["ichorCNA"]["gcWIG"][wc.GENOME],
mapWIG = lambda wc: config["ichorCNA"]["mapWIG"][wc.GENOME],
centro = lambda wc: config["ichorCNA"]["centro"][wc.GENOME],
normalPanel = config["ichorCNA"]["normalPanel"],
includeHOMD = config["ichorCNA"]["includeHOMD"],
chrs = config["ichorCNA"]["chrs"],
chrTrain = config["ichorCNA"]["chrTrain"],
estimateNormal = config["ichorCNA"]["estimateNormal"],
estimatePloidy = config["ichorCNA"]["estimatePloidy"],
estimateScPrevalence = config["ichorCNA"]["estimateScPrevalence"],
scStates = config["ichorCNA"]["scStates"],
txnE = config["ichorCNA"]["txnE"],
txnStrength = config["ichorCNA"]["txnStrength"],
log:
"results/logs/{ID}/mapping/{SAMPLE}_all.{GENOME}.log",
conda:
"../envs/icorCNA.yaml"
shell:
"""
runIchorCNA.R --id {params.sID} --WIG {input.wig} \
--ploidy {params.ploidy} --normal {params.normal} --maxCN {params.maxCN} \
--gcWig {params.gcWIG} --mapWig {params.mapWIG} \
--centromere {params.centro} --normalPanel {params.normalPanel} \
--includeHOMD {params.includeHOMD} --chrs {params.chrs} \
--chrTrain {params.chrTrain} --estimateNormal {params.estimateNormal} \
--estimatePloidy {params.estimatePloidy} \
--estimateScPrevalence {params.estimateScPrevalence} --scStates {params.scStates} \
--txnE {params.txnE} --txnStrength {params.txnStrength} --outDir {output.outDir}
"""
14 changes: 7 additions & 7 deletions workflow/rules/mapping.smk
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@ rule mark_duplicates:
threads: 64
shell:
#"set +o pipefail;"
"(samtools fixmate -u -@ {threads} -m {input.mapped_reads} - | "
"samtools sort -u -@ {threads} -T {params.TMPDIR} - | "
"samtools markdup -@ {threads} - {output.processed_reads}) 2>{log}"
"(samtools fixmate -u -@ $(({threads}/3)) -m {input.mapped_reads} - | "
"samtools sort -u -@ $(({threads}/3)) -T {params.TMPDIR} - | "
"samtools markdup -@ $(({threads}/3)) - {output.processed_reads}) 2>{log}"

rule index_bam:
input:
"results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam"
"{path}.bam"
output:
"results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam.bai"
log:"results/logs/{ID}/index_bam/{SAMPLE}.{GENOME}.log",
"{path}.bam.bai"
conda: "../envs/cfDNA_prep.yaml"
threads: 32
shell:
"samtools index -@ {threads} {input} {output} 2> {log}"
"samtools index -@ {threads} {input} {output}"

2 changes: 1 addition & 1 deletion workflow/rules/raw_input_QC.smk
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ rule flagstat_pre_conversion:
## samtools stats for bam files


## FastQC: BAM, SAM or FastQ files
## FastQC: BAM, SAM or FastQ files
17 changes: 13 additions & 4 deletions workflow/rules/reference.smk
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@

#! wrapper rules können auch anders benannte inputs und outputs haben -> output based on configs!


rule get_reference:
output:
"resources/reference/{GENOME}.fa",
Expand Down Expand Up @@ -50,4 +47,16 @@ rule get_trimmomatic_adapters:
log:
"logs/get_trimmomatic_adapter.log"
shell:
"wget -P {params.prefix} {params.URLs} -o {log}"
"wget -P {params.prefix} {params.URLs} -o {log}"

rule get_ichorCNA_files:
output:
zip="ichorCNA_master.zip",
dir=directory("resources/ichorCNA")
params:
prefix="resources/",
URL="https://github.com/broadinstitute/ichorCNA/archive/refs/heads/master.zip"
log:
"logs/get_ichorCNA_files.log"
shell:
'(wget -P {params.prefix} -c {params.URL} -O {output.zip}; unzip -j {output.zip} "ichorCNA-master/inst/*" -d {output.dir}) 2> {log}'
33 changes: 28 additions & 5 deletions workflow/scripts/bwa-mem2_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,24 @@


from os import path
import sys
from wsgiref.handlers import BaseCGIHandler

from snakemake.shell import shell

inputs = snakemake.input
params = snakemake.params
threads = snakemake.threads

print(threads)

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

samtools_CPU = max(1,threads//5)
remaining_CPU = threads - samtools_CPU



# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
1,
Expand All @@ -28,17 +37,31 @@
non_merged_cmd = ""
singleton_cmd = ""

if snakemake.input.get("non_merged"):
non_merged_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.noadapter_R1} {snakemake.input.noadapter_R2}| grep -v \"^@\" || true ; "

### it is okay, that all bwa-mem2 commands get the same number of CPUs, as they are run sequentially.
### Samtools after the pipe is run in a separate subshell, therefore being run in an additional process


if snakemake.input.get("noadapter_R1") and snakemake.input.get("noadapter_R2"):
non_merged_cmd = f"bwa-mem2 mem -t {remaining_CPU} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.noadapter_R1}} {{snakemake.input.noadapter_R2}}| grep -v \"^@\" || true ; "
# non_merged_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.noadapter_R1} {snakemake.input.noadapter_R2}| grep -v \"^@\" || true ; "

if snakemake.input.get("single_reads"):
singleton_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.single_reads} | grep -v \"^@\" || true ; "
singleton_cmd = f"bwa-mem2 mem -t {remaining_CPU} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.single_reads}} | grep -v \"^@\" || true ; "
#singleton_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.single_reads} | grep -v \"^@\" || true ; "

base_cmd = f"((bwa-mem2 mem -t {{snakemake.threads}} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.reads}}; \

base_cmd = f"((bwa-mem2 mem -t {remaining_CPU} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.reads}}; \
{non_merged_cmd}\
{singleton_cmd}\
) | samtools view -b -o {{snakemake.output.mapped_reads}} - ) 2>{{snakemake.log}}"
) | samtools view -b -@ {samtools_CPU} -o {{snakemake.output.mapped_reads}} - ) 2>{{snakemake.log}}"

#base_cmd = f"((bwa-mem2 mem -t {{snakemake.threads}} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.reads}}; \
#{non_merged_cmd}\
#{singleton_cmd}\
#) | samtools view -b -@ {{snakemake.threads}} -o {{snakemake.output.mapped_reads}} - ) 2>{{snakemake.log}}"

print(f"Executing command:\n {base_cmd}", file=sys.stderr)

shell(base_cmd)

0 comments on commit 808f646

Please sign in to comment.