diff --git a/BALSAMIC/assets/scripts/edit_vcf_info.py b/BALSAMIC/assets/scripts/edit_vcf_info.py index e1769b494..c1f1609dc 100644 --- a/BALSAMIC/assets/scripts/edit_vcf_info.py +++ b/BALSAMIC/assets/scripts/edit_vcf_info.py @@ -44,7 +44,7 @@ def edit_vcf_info(input_vcf, output_vcf, variant_caller): new_vcf = Writer(output_vcf, vcf) - with gzip.open(output_vcf, "wb"): + with open(output_vcf, "wb"): for variant in vcf: variant.INFO["FOUND_IN"] = variant_caller + "|" + output_vcf new_vcf.write_record(variant) diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule index 59e826187..b51b5f584 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule @@ -31,20 +31,22 @@ rule bcftools_filter_vardict_research_tumor_normal: bcftools view {input.vcf_snv_research} | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view -f PASS -o {output.vcf_pass_vardict}.temp -O z; - -tabix -p vcf -f {output.vcf_pass_vardict}.temp; +bcftools view -f PASS -o {output.vcf_pass_vardict}.temp1 -O z; python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_vardict}.temp \ ---output_vcf {output.vcf_pass_vardict} \ +--input_vcf {output.vcf_pass_vardict}.temp1 \ +--output_vcf {output.vcf_pass_vardict}.temp2 \ --variant_caller {params.variant_caller}; +bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict} + tabix -p vcf -f {output.vcf_pass_vardict}; bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_research}; -rm 
{output.vcf_pass_vardict}.temp; +rm {output.vcf_pass_vardict}.temp1; + +rm {output.vcf_pass_vardict}.temp2; """ @@ -107,20 +109,22 @@ rule bcftools_filter_TNscope_umi_research_tumor_normal: bcftools view -f PASS,triallelic_site --threads {threads} {input.vcf_snv_research} | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp -O z; - -tabix -p vcf -f {output.vcf_pass_tnscope_umi}.temp; +bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp1 -O z; python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_tnscope_umi}.temp \ ---output_vcf {output.vcf_pass_tnscope_umi} \ +--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ +--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ --variant_caller {params.variant_caller}; +bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} + tabix -p vcf -f {output.vcf_pass_tnscope_umi}; bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_research}; -rm {output.vcf_pass_tnscope_umi}.temp; +rm {output.vcf_pass_tnscope_umi}.temp1; + +rm {output.vcf_pass_tnscope_umi}.temp2; """ @@ -153,20 +157,22 @@ bcftools view {input.vcf_snv_clinical} | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= 
{params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp -O z; - -tabix -p vcf -f {output.vcf_pass_vardict}.temp; +bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z; python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_vardict}.temp \ ---output_vcf {output.vcf_pass_vardict} \ +--input_vcf {output.vcf_pass_vardict}.temp1 \ +--output_vcf {output.vcf_pass_vardict}.temp2 \ --variant_caller {params.variant_caller}; +bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict} + tabix -p vcf -f {output.vcf_pass_vardict}; bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_clinical}; -rm {output.vcf_pass_vardict}.temp; +rm {output.vcf_pass_vardict}.temp1; + +rm {output.vcf_pass_vardict}.temp2; """ @@ -199,18 +205,20 @@ bcftools view -f PASS,triallelic_site --threads {threads} {input.vcf_snv_clinic bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp -O z; - -tabix -p vcf -f {output.vcf_pass_tnscope_umi}.temp; +bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp1 -O z; python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_tnscope_umi}.temp \ ---output_vcf {output.vcf_pass_tnscope_umi} \ +--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ 
+--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ --variant_caller {params.variant_caller}; +bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} + tabix -p vcf -f {output.vcf_pass_tnscope_umi}; bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_clinical}; -rm {output.vcf_pass_tnscope_umi}.temp; +rm {output.vcf_pass_tnscope_umi}.temp1; + +rm {output.vcf_pass_tnscope_umi}.temp2; """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule index 955ea2894..141585c60 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule @@ -31,20 +31,22 @@ rule bcftools_filter_vardict_research_tumor_only: bcftools view {input.vcf_snv_research} | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp -O z; - -tabix -p vcf -f {output.vcf_pass_vardict}.temp; +bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z; python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_vardict}.temp \ ---output_vcf {output.vcf_pass_vardict} \ +--input_vcf {output.vcf_pass_vardict}.temp1 \ +--output_vcf {output.vcf_pass_vardict}.temp2 \ --variant_caller {params.variant_caller}; +bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict} + tabix -p vcf -f {output.vcf_pass_vardict}; bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_research}; -rm {output.vcf_pass_vardict}.temp; +rm 
{output.vcf_pass_vardict}.temp1; + +rm {output.vcf_pass_vardict}.temp2; """ @@ -107,20 +109,22 @@ rule bcftools_filter_TNscope_umi_research_tumor_only: bcftools view {input.vcf_snv_research} | \ bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp -O z; - -tabix -p vcf -f {output.vcf_pass_tnscope_umi}.temp; +bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp1 -O z; python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_tnscope_umi}.temp \ ---output_vcf {output.vcf_pass_tnscope_umi} \ +--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ +--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ --variant_caller {params.variant_caller}; +bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} + tabix -p vcf -f {output.vcf_pass_tnscope_umi}; bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_research}; -rm {output.vcf_pass_tnscope_umi}.temp; +rm {output.vcf_pass_tnscope_umi}.temp1; + +rm {output.vcf_pass_tnscope_umi}.temp2; """ @@ -153,20 +157,22 @@ bcftools view {input.vcf_snv_clinical} | \ bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools 
view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp -O z; - -tabix -p vcf -f {output.vcf_pass_vardict}.temp; +bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z; python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_vardict}.temp \ ---output_vcf {output.vcf_pass_vardict} \ +--input_vcf {output.vcf_pass_vardict}.temp1 \ +--output_vcf {output.vcf_pass_vardict}.temp2 \ --variant_caller {params.variant_caller}; +bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict} + tabix -p vcf -f {output.vcf_pass_vardict}; bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_clinical}; -rm {output.vcf_pass_vardict}.temp; +rm {output.vcf_pass_vardict}.temp1; + +rm {output.vcf_pass_vardict}.temp2; """ @@ -199,18 +205,20 @@ bcftools view {input.vcf_snv_clinical} | \ bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp -O z; - -tabix -p vcf -f {output.vcf_pass_tnscope_umi}.temp; +bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp1 -O z; python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_tnscope_umi}.temp \ ---output_vcf {output.vcf_pass_tnscope_umi} \ +--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ +--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ --variant_caller {params.variant_caller}; +bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} 
+ tabix -p vcf -f {output.vcf_pass_tnscope_umi}; bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_clinical}; -rm {output.vcf_pass_tnscope_umi}.temp; +rm {output.vcf_pass_tnscope_umi}.temp1; + +rm {output.vcf_pass_tnscope_umi}.temp2; """ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e99ba6ca6..7da99d0d2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,7 @@ Changed: Fixed: ^^^^^^ * triallelic_site in quality filter for SNV https://github.com/Clinical-Genomics/BALSAMIC/pull/1052 +* Compression of research and clinical SNV VCF files https://github.com/Clinical-Genomics/BALSAMIC/pull/1060 * `test_write_json` failing locally https://github.com/Clinical-Genomics/BALSAMIC/pull/1063 * Container build and push via github actions by setting buildx `provenance` flag to false https://github.com/Clinical-Genomics/BALSAMIC/pull/1071 * Added buildx to the submodule workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1072