diff --git a/.editorconfig b/.editorconfig index 453226c..f719031 100644 --- a/.editorconfig +++ b/.editorconfig @@ -23,12 +23,7 @@ indent_size = 4 indent_style = tab indent_size = 1 -# Tab indentation (no size specified) -[Makefile] -indent_style = tab - - # Matches the exact files either package.json or .travis.yml [{package.json,.travis.yml}] indent_style = space -indent_size = 2 \ No newline at end of file +indent_size = 2 diff --git a/workflow/Snakefile b/workflow/Snakefile index ae65c0a..b07852e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -12,6 +12,7 @@ include: "rules/bam_to_fastq.smk" include: "rules/trimming.smk" include: "rules/filtering.smk" include: "rules/mapping.smk" +include: "rules/ichorCNA.smk" include: "rules/QualityControl.smk" diff --git a/workflow/envs/icorCNA.yaml b/workflow/envs/icorCNA.yaml new file mode 100644 index 0000000..2553821 --- /dev/null +++ b/workflow/envs/icorCNA.yaml @@ -0,0 +1,9 @@ +name: icorCNA +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - r-ichorcna + - hmmcopy + diff --git a/workflow/rules/QualityControl.smk b/workflow/rules/QualityControl.smk index 6595903..1544192 100644 --- a/workflow/rules/QualityControl.smk +++ b/workflow/rules/QualityControl.smk @@ -7,6 +7,7 @@ rule fastqc: zip="results/{ID}/qc/fastqc/{SAMPLE}.{GENOME}_fastqc.zip", log: "results/logs/{ID}/fastqc/{SAMPLE}_all.{GENOME}.log", + threads: 8 wrapper: "0.75.0/bio/fastqc" @@ -18,6 +19,7 @@ rule samtools_stats: "results/{ID}/qc/samtools-stats/{SAMPLE}.{GENOME}.txt", log: "results/logs/{ID}/fastqc/{SAMPLE}.{GENOME}.log", + threads: 8 wrapper: "0.75.0/bio/samtools/stats" @@ -33,12 +35,12 @@ rule mosdepth: log: "results/logs/{ID}/mosdepth/{SAMPLE}.{GENOME}.log", params: - extra="--no-per-base --fast-mode", # optional + extra="--no-per-base", # optional by="500", # additional decompression threads through `--threads` threads: 32 # This value - 1 will be sent to `--threads` wrapper: - "v1.1.0/bio/mosdepth" + "v1.19.1/bio/mosdepth" rule multiqc: diff --git a/workflow/rules/bam_to_fastq.smk b/workflow/rules/bam_to_fastq.smk index 574353e..56403fd 100644 --- a/workflow/rules/bam_to_fastq.smk +++ b/workflow/rules/bam_to_fastq.smk @@ -19,9 +19,9 @@ rule bam_to_fastq: shell: """set +o pipefail; ( - samtools sort -T {params.TMPDIR} -n -@ {threads} {input.bam} | \ - samtools fastq -@ {threads} -t \ + samtools sort -T {params.TMPDIR} -n -@ $(({threads}/2)) {input.bam} | \ + samtools fastq -@ $(({threads}/2)) -t \ -1 {output.r1} \ -2 {output.r2} \ -0 {output.s1} \ - )2> {log}""" \ No newline at end of file + )2> {log}""" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 8f2eb67..b6fcac1 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -48,6 +48,27 @@ def get_final_output(): final_output.extend( expand("results/{ID}/qc/multiqc.html", ID=samples["ID"].unique()) ) + final_output.extend( + expand( + "results/{ID}/icorCNA/readcounts/{SAMPLE}_processed.{GENOME}.wig", + zip, + ID=samples["ID"], + SAMPLE=samples["sample"], + GENOME=samples["genome_build"], + ) + ) + final_output.extend( + expand( + "results/{ID}/icorCNA/{SAMPLE}_processed_{GENOME}", + zip, + ID=samples["ID"], + SAMPLE=samples["sample"], + GENOME=samples["genome_build"], + ) + ) + + + return final_output @@ -215,6 +236,7 @@ def get_mapping_input(wildcards): elif trimming_algorithm.lower() == "trimmomatic": mapping_input["reads"] = ["results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.fastq.gz", "results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.fastq.gz",] if all_data: - mapping_input["non_merged"] = ["results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz","results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz"] + mapping_input["noadapter_R1"] = "results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz" + mapping_input["noadapter_R2"] = "results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz" return mapping_input diff --git a/workflow/rules/ichorCNA.smk b/workflow/rules/ichorCNA.smk new file mode 100644 index 0000000..80482a5 --- /dev/null +++ b/workflow/rules/ichorCNA.smk @@ -0,0 +1,93 @@ +def get_chroms_readcounter(chromfile): + if config["read_counter"]["chroms"]["autoselect_chroms"]: + with open(chromfile,"r") as f: + chrom = f.read() + return chrom + else: + return config["read_counter"]["chroms"]["chrom_list"] + + + + +rule get_chroms: + input: + bam="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam", + bai="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam.bai", + output: + chroms="results/{ID}/icorCNA/chroms/{SAMPLE}_processed.{GENOME}.chromosomes.txt", + log: + "results/logs/{ID}/get_chroms/{SAMPLE}_processed.{GENOME}.log", + conda: + "../envs/cfDNA_prep.yaml" + shell: + """samtools idxstats {input.bam} | \ + cut -f 1 | grep -w 'chr[1-9]\|chr[1-2][0-9]\|chr[X,Y]\|^[1-2][0-9]\|^[0-9]\|^[X,Y]' \ + | tr '\n' ','| sed 's/,*\r*$//' \ + 1> {output.chroms} 2>{log}""" + + +rule read_counter: + input: + bam="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam", + bai="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam.bai", + chroms="results/{ID}/icorCNA/chroms/{SAMPLE}_processed.{GENOME}.chromosomes.txt", + output: + wig="results/{ID}/icorCNA/readcounts/{SAMPLE}_processed.{GENOME}.wig", + log: + "results/logs/{ID}/mapping/{SAMPLE}_all.{GENOME}.log", + params: + window = config["read_counter"]["window"], + quality = config["read_counter"]["quality"], + chroms= lambda wc: get_chroms_readcounter(f"test/{wc.ID}/icorCNA/chroms/{wc.SAMPLE}_processed.{wc.GENOME}.chromosomes.txt"), + conda: + "../envs/icorCNA.yaml" + shell: + """ + readCounter --window {params.window} --quality {params.quality} \ + --chromosome {params.chroms} {input.bam} 1>{output.wig} 2>{log} + """ + + +rule ichorCNA: + input: + bam="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam", + bai="results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam.bai", + wig="results/{ID}/icorCNA/readcounts/{SAMPLE}_processed.{GENOME}.wig", + resources="resources/ichorCNA/" + output: + outDir=directory("results/{ID}/icorCNA/{SAMPLE}_processed_{GENOME}/"), + summary="results/{ID}/icorCNA/{SAMPLE}_processed_{GENOME}/{SAMPLE}_processed.params.txt" + params: + sID="{SAMPLE}_processed", + ploidy = config["ichorCNA"]["ploidy"], + normal = config["ichorCNA"]["normal"], + maxCN = config["ichorCNA"]["maxCN"], + gcWIG = lambda wc: config["ichorCNA"]["gcWIG"][wc.GENOME], + mapWIG = lambda wc: config["ichorCNA"]["mapWIG"][wc.GENOME], + centro = lambda wc: config["ichorCNA"]["centro"][wc.GENOME], + normalPanel = config["ichorCNA"]["normalPanel"], + includeHOMD = config["ichorCNA"]["includeHOMD"], + chrs = config["ichorCNA"]["chrs"], + chrTrain = config["ichorCNA"]["chrTrain"], + estimateNormal = config["ichorCNA"]["estimateNormal"], + estimatePloidy = config["ichorCNA"]["estimatePloidy"], + estimateScPrevalence = config["ichorCNA"]["estimateScPrevalence"], + scStates = config["ichorCNA"]["scStates"], + txnE = config["ichorCNA"]["txnE"], + txnStrength = config["ichorCNA"]["txnStrength"], + log: + "results/logs/{ID}/mapping/{SAMPLE}_all.{GENOME}.log", + conda: + "../envs/icorCNA.yaml" + shell: + """ + runIchorCNA.R --id {params.sID} --WIG {input.wig} \ + --ploidy {params.ploidy} --normal {params.normal} --maxCN {params.maxCN} \ + --gcWig {params.gcWIG} --mapWig {params.mapWIG} \ + --centromere {params.centro} --normalPanel {params.normalPanel} \ + --includeHOMD {params.includeHOMD} --chrs {params.chrs} \ + --chrTrain {params.chrTrain} --estimateNormal {params.estimateNormal} \ + --estimatePloidy {params.estimatePloidy} \ + --estimateScPrevalence {params.estimateScPrevalence} --scStates {params.scStates} \ + --txnE {params.txnE} --txnStrength {params.txnStrength} --outDir {output.outDir} + """ diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 7c8cf5f..3c383ff 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -27,17 +27,17 @@ rule mark_duplicates: threads: 64 shell: #"set +o pipefail;" - "(samtools fixmate -u -@ {threads} -m {input.mapped_reads} - | " - "samtools sort -u -@ {threads} -T {params.TMPDIR} - | " - "samtools markdup -@ {threads} - {output.processed_reads}) 2>{log}" + "(samtools fixmate -u -@ $(({threads}/3)) -m {input.mapped_reads} - | " + "samtools sort -u -@ $(({threads}/3)) -T {params.TMPDIR} - | " + "samtools markdup -@ $(({threads}/3)) - {output.processed_reads}) 2>{log}" rule index_bam: input: - "results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam" + "{path}.bam" output: - "results/{ID}/mapped_reads/{SAMPLE}_processed.{GENOME}.bam.bai" - log:"results/logs/{ID}/index_bam/{SAMPLE}.{GENOME}.log", + "{path}.bam.bai" conda: "../envs/cfDNA_prep.yaml" threads: 32 shell: - "samtools index -@ {threads} {input} {output} 2> {log}" + "samtools index -@ {threads} {input} {output}" + diff --git a/workflow/rules/raw_input_QC.smk b/workflow/rules/raw_input_QC.smk index b7e411b..839438d 100644 --- a/workflow/rules/raw_input_QC.smk +++ b/workflow/rules/raw_input_QC.smk @@ -19,4 +19,4 @@ rule flagstat_pre_conversion: ## samtools stats for bam files -## FastQC: BAM, SAM or FastQ files \ No newline at end of file +## FastQC: BAM, SAM or FastQ files diff --git a/workflow/rules/reference.smk b/workflow/rules/reference.smk index 52a17b7..7a3aaaf 100644 --- a/workflow/rules/reference.smk +++ b/workflow/rules/reference.smk @@ -1,7 +1,4 @@ -#! wrapper rules können auch anders benannte inputs und outputs haben -> output based on configs! - - rule get_reference: output: "resources/reference/{GENOME}.fa", @@ -50,4 +47,16 @@ rule get_trimmomatic_adapters: log: "logs/get_trimmomatic_adapter.log" shell: - "wget -P {params.prefix} {params.URLs} -o {log}" \ No newline at end of file + "wget -P {params.prefix} {params.URLs} -o {log}" + +rule get_ichorCNA_files: + output: + zip="ichorCNA_master.zip", + dir=directory("resources/ichorCNA") + params: + prefix="resources/", + URL="https://github.com/broadinstitute/ichorCNA/archive/refs/heads/master.zip" + log: + "logs/get_ichorCNA_files.log" + shell: + '(wget -P {params.prefix} -c {params.URL} -O {output.zip}; unzip -j {output.zip} "ichorCNA-master/inst/*" -d {output.dir}) 2> {log}' diff --git a/workflow/scripts/bwa-mem2_wrapper.py b/workflow/scripts/bwa-mem2_wrapper.py index 490d465..0700b87 100644 --- a/workflow/scripts/bwa-mem2_wrapper.py +++ b/workflow/scripts/bwa-mem2_wrapper.py @@ -8,15 +8,24 @@ from os import path +import sys from wsgiref.handlers import BaseCGIHandler from snakemake.shell import shell inputs = snakemake.input params = snakemake.params +threads = snakemake.threads + +print(threads) log = snakemake.log_fmt_shell(stdout=False, stderr=True) +samtools_CPU = max(1,threads//5) +remaining_CPU = threads - samtools_CPU + + + # Check inputs/arguments. if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in { 1, @@ -28,17 +37,31 @@ non_merged_cmd = "" singleton_cmd = "" -if snakemake.input.get("non_merged"): - non_merged_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.noadapter_R1} {snakemake.input.noadapter_R2}| grep -v \"^@\" || true ; " + +### it is okay, that all bwa-mem2 commands get the same number of CPUs, as they are run sequentially. +### Samtools after the pipe is run in a separate subshell, therefore being run in an additional process + + +if snakemake.input.get("noadapter_R1") and snakemake.input.get("noadapter_R2"): + non_merged_cmd = f"bwa-mem2 mem -t {remaining_CPU} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.noadapter_R1}} {{snakemake.input.noadapter_R2}}| grep -v \"^@\" || true ; " +# non_merged_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.noadapter_R1} {snakemake.input.noadapter_R2}| grep -v \"^@\" || true ; " if snakemake.input.get("single_reads"): - singleton_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.single_reads} | grep -v \"^@\" || true ; " + singleton_cmd = f"bwa-mem2 mem -t {remaining_CPU} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.single_reads}} | grep -v \"^@\" || true ; " + #singleton_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.single_reads} | grep -v \"^@\" || true ; " -base_cmd = f"((bwa-mem2 mem -t {{snakemake.threads}} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.reads}}; \ + +base_cmd = f"((bwa-mem2 mem -t {remaining_CPU} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.reads}}; \ {non_merged_cmd}\ {singleton_cmd}\ -) | samtools view -b -o {{snakemake.output.mapped_reads}} - ) 2>{{snakemake.log}}" +) | samtools view -b -@ {samtools_CPU} -o {{snakemake.output.mapped_reads}} - ) 2>{{snakemake.log}}" + +#base_cmd = f"((bwa-mem2 mem -t {{snakemake.threads}} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.reads}}; \ +#{non_merged_cmd}\ +#{singleton_cmd}\ +#) | samtools view -b -@ {{snakemake.threads}} -o {{snakemake.output.mapped_reads}} - ) 2>{{snakemake.log}}" +print(f"Executing command:\n {base_cmd}", file=sys.stderr) shell(base_cmd)