Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor quality control rules #712

Merged
merged 5 commits into from
Aug 23, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 36 additions & 37 deletions BALSAMIC/snakemake_rules/quality_control/GATK.rule
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,41 @@
# coding: utf-8

rule PreparePopVCF:
input:
bam = bam_dir + "tumor.merged.bam",
ref1kg = config["reference"]["1kg_snps_all"],
output:
popvcf = result_dir + "popvcf.vcf"
params:
conda = config["bioinfo_tools"].get("bcftools"),
anno_str1 = "FORMAT/GT,FORMAT/GL,FORMAT/DS,^INFO/AC,^INFO/AF,^INFO/AN,^INFO/",
popcode = "EUR"
singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix()
benchmark:
benchmark_dir + "PreparePopVCF_" + "tumor_prepare_pop_vcf.tsv"
shell:
"source activate {params.conda}; "
"readlink -f {input.bam}; "
"bcftools annotate "
"-x {params.anno_str1}{params.popcode}_AF "
"{input.ref1kg} "
" | "
"bcftools annotate "
"-i INFO/{params.popcode}_AF!=0.0 "
" | "
"awk -v OFS=\"\\t\" "
"'$1~/^#/ {{ print; }} "
" $1!~/^#/ {{ "
"split($8,INFO,\";\"); "
"newINFO=\"\";"
"for (i in INFO) {{ "
"if (INFO[i]~\"{params.popcode}_AF\") {{ "
"split(INFO[i],AF,\"=\"); "
"P=substr(AF[1], 1, length(AF[1])-3); "
"INFO[i]=P\"={{\"$4\"*=\"AF[2]\",\"$5\"=\"1-AF[2]\"}}\"; "
"INFO[i]=INFO[i]\";set=\"P;}} "
"newINFO=INFO[i] \";\" newINFO; "
"}} "
"$8=substr(newINFO, 1, length(newINFO)-1); "
"print; }}' > {output.popvcf}; "
input:
bam = bam_dir + "tumor.merged.bam",
ref1kg = config["reference"]["1kg_snps_all"],
output:
popvcf = result_dir + "popvcf.vcf"
benchmark:
Path(benchmark_dir, "PreparePopVCF_" + "tumor.tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix()
params:
conda = config["bioinfo_tools"].get("bcftools"),
anno_str1 = "FORMAT/GT,FORMAT/GL,FORMAT/DS,^INFO/AC,^INFO/AF,^INFO/AN,^INFO/",
popcode = "EUR"
message:
"Generate intermediate pop vcf file for gatk analysis"
shell:
"""
source activate {params.conda};
readlink -f {input.bam};

bcftools annotate \
-x {params.anno_str1}{params.popcode}_AF \
{input.ref1kg} \
| bcftools annotate \
-i INFO/{params.popcode}_AF!=0.0 \
| awk -v OFS=\"\\t\" '$1~/^#/ {{ print; }} $1!~/^#/ {{ split($8,INFO,\";\"); newINFO=\"\";"

for (i in INFO) {{ \
if (INFO[i]~\"{params.popcode}_AF\") {{ \
split(INFO[i],AF,\"=\"); P=substr(AF[1], 1, length(AF[1])-3); \
INFO[i]=P\"={{\"$4\"*=\"AF[2]\",\"$5\"=\"1-AF[2]\"}}\"; \
INFO[i]=INFO[i]\";set=\"P; }} \
newINFO=INFO[i] \";\" newINFO; }} \
$8=sustr(newINFO, 1, length(newINFO)-1); print; }}' \
> {output.popvcf};
"""


49 changes: 24 additions & 25 deletions BALSAMIC/snakemake_rules/quality_control/contest.rule
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,31 @@
# coding: utf-8

rule gatk_contest:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hassanfa: Are we using this rule?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it is used in panel TN workflow, and related to #300

input:
bamN = bam_dir + "normal.merged.bam",
bamT = bam_dir + "tumor.merged.bam",
fa = config["reference"]["reference_genome"],
popvcf = result_dir + "popvcf.vcf",
output:
N_vs_T = bam_dir + "normal_tumor.contest",
T_vs_N = bam_dir + "tumor_normal.contest",
benchmark:
benchmark_dir + "gatk_contest_" + config["analysis"]["case_id"] + "tsv"
singularity:
Path(singularity_image, config["bioinfo_tools"].get("gatk") + ".sif").as_posix()
params:
conda = config["bioinfo_tools"].get("gatk"),
min_genotype_ratio="0.95",
popcode = "EUR",
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
message:
"Running gatk contamination estimation between normal and tumor"
shell:
"""
input:
bamN = bam_dir + "normal.merged.bam",
bamT = bam_dir + "tumor.merged.bam",
fa = config["reference"]["reference_genome"],
popvcf = result_dir + "popvcf.vcf",
output:
N_vs_T = bam_dir + "normal_tumor.contest",
T_vs_N = bam_dir + "tumor_normal.contest",
benchmark:
Path(benchmark_dir, "gatk_contest_" + config["analysis"]["case_id"] + ".tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("gatk") + ".sif").as_posix()
params:
conda = config["bioinfo_tools"].get("gatk"),
min_genotype_ratio="0.95",
popcode = "EUR",
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
message:
"Running gatk contamination estimation between normal and tumor"
shell:
"""
source activate {params.conda};

mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};

java -jar -Djava.io.tmpdir={params.tmpdir} \
-Xms8G -Xmx16G \
$CONDA_PREFIX/opt/gatk-3.8/GenomeAnalysisTK.jar \
Expand All @@ -39,7 +38,7 @@ $CONDA_PREFIX/opt/gatk-3.8/GenomeAnalysisTK.jar \
--population {params.popcode} \
--min_genotype_ratio {params.min_genotype_ratio} \
-o {output.N_vs_T};

java -jar -Djava.io.tmpdir={params.tmpdir} \
-Xms8G -Xmx16G \
$CONDA_PREFIX/opt/gatk-3.8/GenomeAnalysisTK.jar \
Expand All @@ -51,4 +50,4 @@ $CONDA_PREFIX/opt/gatk-3.8/GenomeAnalysisTK.jar \
--population {params.popcode} \
--min_genotype_ratio {params.min_genotype_ratio} \
-o {output.T_vs_N};
"""
"""
10 changes: 6 additions & 4 deletions BALSAMIC/snakemake_rules/quality_control/fastp.rule
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ rule fastp_umi:
json = qc_dir + "fastp/{sample}_fastp_umi.json",
html = qc_dir + "fastp/{sample}_fastp_umi.html",
benchmark:
benchmark_dir + "fastp_umi" + "{sample}_fastp.tsv"
Path(benchmark_dir, "fastp_umi" + "{sample}.tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix()
params:
Expand All @@ -44,7 +44,8 @@ rule fastp_umi:
adapter = " ".join(fastp_param_adapter),
sample_name = "{sample}",
conda = config["bioinfo_tools"].get("fastp")
threads: get_threads(cluster_config, 'fastp')
threads:
get_threads(cluster_config, 'fastp')
message:
"Quality control and trimming input fastq for {params.sample_name}"
shell:
Expand Down Expand Up @@ -76,7 +77,7 @@ rule fastp:
json = qc_dir + "fastp/{sample}_fastp.json",
html = qc_dir + "fastp/{sample}_fastp.html"
benchmark:
benchmark_dir + "fastp_" + "{sample}_fastp.tsv"
Path(benchmark_dir, "fastp_" + "{sample}.tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix()
params:
Expand All @@ -86,7 +87,8 @@ rule fastp:
minimum_length = config["QC"]["min_seq_length"],
sample_name = "{sample}",
conda = config["bioinfo_tools"].get("fastp")
threads: get_threads(cluster_config, 'fastp')
threads:
get_threads(cluster_config, 'fastp')
message:
"Quality control and trimming of umi optimized fastq file for {params.sample_name}"
shell:
Expand Down
45 changes: 22 additions & 23 deletions BALSAMIC/snakemake_rules/quality_control/fastqc.rule
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,34 @@

# Following rule will take input fastq files, align them using bwa mem, and convert the output to sam format
rule fastqc:
input:
read1 = fastq_dir + "{sample}_1.fastq.gz",
read2 = fastq_dir + "{sample}_2.fastq.gz",
output:
read1 = fastqc_dir + "{sample}_1_fastqc.zip",
read2 = fastqc_dir + "{sample}_2_fastqc.zip"
benchmark:
benchmark_dir + "fastqc_{sample}.tsv"
singularity:
Path(singularity_image, config["bioinfo_tools"].get("fastqc") + ".sif").as_posix()
params:
conda = config["bioinfo_tools"].get("fastqc"),
fastqc_dir = fastqc_dir,
sample = "{sample}",
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
message:
"Running FastQC on {params.sample}"
shell:
"""
input:
read1 = fastq_dir + "{sample}_1.fastq.gz",
read2 = fastq_dir + "{sample}_2.fastq.gz",
output:
read1 = fastqc_dir + "{sample}_1_fastqc.zip",
read2 = fastqc_dir + "{sample}_2_fastqc.zip"
benchmark:
Path(benchmark_dir, "fastqc_{sample}.tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("fastqc") + ".sif").as_posix()
params:
conda = config["bioinfo_tools"].get("fastqc"),
fastqc_dir = fastqc_dir,
sample = "{sample}",
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
message:
"Running FastQC on {params.sample}"
shell:
"""
source activate {params.conda};

mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};

fastqc {input.read1} \
--dir {params.tmpdir} \
--outdir {params.fastqc_dir};

fastqc {input.read2} \
--dir {params.tmpdir} \
--outdir {params.fastqc_dir};
"""
"""
57 changes: 28 additions & 29 deletions BALSAMIC/snakemake_rules/quality_control/mosdepth.rule
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,38 @@
# coding: utf-8

rule mosdepth_coverage:
input:
bam = bam_dir + "{sample}" + ".sorted." + picarddup + ".bam",
bed = config["panel"]["capture_kit"]
output:
bam_dir + "{sample}.mosdepth.global.dist.txt",
bam_dir + "{sample}.mosdepth.region.dist.txt",
bam_dir + "{sample}.mosdepth.summary.txt",
bam_dir + "{sample}.per-base.bed.gz",
bam_dir + "{sample}.regions.bed.gz"
benchmark:
benchmark_dir + "mosdepth_covearge_" + "{sample}.tsv"
singularity:
Path(singularity_image, config["bioinfo_tools"].get("mosdepth") + ".sif").as_posix()
params:
mapq='20',
samflag='1796',
quantize='0:1:50:150:',
sample_name='{sample}',
output_dir=bam_dir,
conda = config["bioinfo_tools"].get("mosdepth")
threads:
get_threads(cluster_config, "mosdepth")
message:
"Running mosdepth for coveage on {params.sample_name}"
shell:
"""
input:
bam = bam_dir + "{sample}" + ".sorted." + picarddup + ".bam",
bed = config["panel"]["capture_kit"]
output:
bam_dir + "{sample}.mosdepth.global.dist.txt",
bam_dir + "{sample}.mosdepth.region.dist.txt",
bam_dir + "{sample}.mosdepth.summary.txt",
bam_dir + "{sample}.per-base.bed.gz",
bam_dir + "{sample}.regions.bed.gz"
benchmark:
Path(benchmark_dir, "mosdepth_coverage_" + "{sample}.tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("mosdepth") + ".sif").as_posix()
params:
mapq = '20',
samflag = '1796',
quantize = '0:1:50:150:',
sample_name = '{sample}',
output_dir = bam_dir,
conda = config["bioinfo_tools"].get("mosdepth")
threads:
get_threads(cluster_config, "mosdepth_coverage")
message:
"Calculate coverage using mosdepth for sample {params.sample_name}"
ashwini06 marked this conversation as resolved.
Show resolved Hide resolved
shell:
"""
source activate {params.conda};

export MOSDEPTH_Q0=NO_COVERAGE;
export MOSDEPTH_Q1=LOW_COVERAGE;
export MOSDEPTH_Q2=CALLABLE;
export MOSDEPTH_Q3=HIGH_COVERAGE;

mosdepth \
--by {input.bed} \
--mapq {params.mapq} \
Expand All @@ -43,4 +42,4 @@ mosdepth \
--threads {threads} \
{params.output_dir}/{params.sample_name} \
{input.bam};
"""
"""
25 changes: 16 additions & 9 deletions BALSAMIC/snakemake_rules/quality_control/multiqc.rule
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ if config['analysis']['analysis_type'] == "paired":
if config["analysis"]["sequencing_type"] == 'wgs':
picard_metrics_wildcard = ["alignment_summary_metrics", "base_distribution_by_cycle_metrics",
"base_distribution_by_cycle.pdf", "insert_size_histogram.pdf", "insert_size_metrics",
"quality_by_cycle_metrics",
"quality_by_cycle.pdf", "quality_distribution_metrics", "quality_distribution.pdf"]
"quality_by_cycle_metrics", "quality_by_cycle.pdf",
"quality_distribution_metrics", "quality_distribution.pdf"]
# fastqc metrics
multiqc_input.extend(expand(fastqc_dir + "{sample}_{read_num}_fastqc.zip", sample=config["samples"], read_num=[1, 2]))

Expand Down Expand Up @@ -66,16 +66,23 @@ rule multiqc:
output:
html = qc_dir + "multiqc_report.html",
json = qc_dir + "multiqc_data/multiqc_data.json",
benchmark:
Path(benchmark_dir, "multiqc_" + config["analysis"]["case_id"] + ".multiqc.tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("multiqc") + ".sif").as_posix()
params:
housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "multiqc"},
dir_list = result_dir,
qc_dir = qc_dir,
conda = config["bioinfo_tools"].get("multiqc"),
singularity: Path(singularity_image, config["bioinfo_tools"].get("multiqc") + ".sif").as_posix()
benchmark:
benchmark_dir + "multiqc_" + config["analysis"]["case_id"] + ".multiqc.tsv"
case_name = config["analysis"]["case_id"]
message:
"Aggregrate quality metrics results using multiqc for sample {params.case_name}"
shell:
"source activate {params.conda};"
"echo -e \"{params.dir_list}\" > {params.qc_dir}/dir_list; "
"multiqc --force --outdir {params.qc_dir} --data-format json -l {params.qc_dir}/dir_list; "
"chmod -R 777 {params.qc_dir};"
"""
source activate {params.conda};

echo -e \"{params.dir_list}\" > {params.qc_dir}/dir_list;
multiqc --force --outdir {params.qc_dir} --data-format json -l {params.qc_dir}/dir_list;
chmod -R 777 {params.qc_dir};
"""
Loading