Skip to content

Commit

Permalink
fix: VCF compression (#1060)
Browse files Browse the repository at this point in the history
* changelog

* add bgzip

* changelog

* changelog
  • Loading branch information
khurrammaqbool authored Feb 3, 2023
1 parent d7edf47 commit fca0fac
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 49 deletions.
2 changes: 1 addition & 1 deletion BALSAMIC/assets/scripts/edit_vcf_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def edit_vcf_info(input_vcf, output_vcf, variant_caller):

new_vcf = Writer(output_vcf, vcf)

with gzip.open(output_vcf, "wb"):
with open(output_vcf, "wb"):
for variant in vcf:
variant.INFO["FOUND_IN"] = variant_caller + "|" + output_vcf
new_vcf.write_record(variant)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,22 @@ rule bcftools_filter_vardict_research_tumor_normal:
bcftools view {input.vcf_snv_research} | \
bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \
bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\
bcftools view -f PASS -o {output.vcf_pass_vardict}.temp -O z;
tabix -p vcf -f {output.vcf_pass_vardict}.temp;
bcftools view -f PASS -o {output.vcf_pass_vardict}.temp1 -O z;
python {params.edit_vcf_script} \
--input_vcf {output.vcf_pass_vardict}.temp \
--output_vcf {output.vcf_pass_vardict} \
--input_vcf {output.vcf_pass_vardict}.temp1 \
--output_vcf {output.vcf_pass_vardict}.temp2 \
--variant_caller {params.variant_caller};
bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict}
tabix -p vcf -f {output.vcf_pass_vardict};
bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_research};
rm {output.vcf_pass_vardict}.temp;
rm {output.vcf_pass_vardict}.temp1;
rm {output.vcf_pass_vardict}.temp2;
"""


Expand Down Expand Up @@ -107,20 +109,22 @@ rule bcftools_filter_TNscope_umi_research_tumor_normal:
bcftools view -f PASS,triallelic_site --threads {threads} {input.vcf_snv_research} | \
bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \
bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\
bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp -O z;
tabix -p vcf -f {output.vcf_pass_tnscope_umi}.temp;
bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp1 -O z;
python {params.edit_vcf_script} \
--input_vcf {output.vcf_pass_tnscope_umi}.temp \
--output_vcf {output.vcf_pass_tnscope_umi} \
--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \
--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \
--variant_caller {params.variant_caller};
bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi}
tabix -p vcf -f {output.vcf_pass_tnscope_umi};
bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_research};
rm {output.vcf_pass_tnscope_umi}.temp;
rm {output.vcf_pass_tnscope_umi}.temp1;
rm {output.vcf_pass_tnscope_umi}.temp2;
"""


Expand Down Expand Up @@ -153,20 +157,22 @@ bcftools view {input.vcf_snv_clinical} | \
bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \
bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\
bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\
bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp -O z;
tabix -p vcf -f {output.vcf_pass_vardict}.temp;
bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z;
python {params.edit_vcf_script} \
--input_vcf {output.vcf_pass_vardict}.temp \
--output_vcf {output.vcf_pass_vardict} \
--input_vcf {output.vcf_pass_vardict}.temp1 \
--output_vcf {output.vcf_pass_vardict}.temp2 \
--variant_caller {params.variant_caller};
bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict}
tabix -p vcf -f {output.vcf_pass_vardict};
bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_clinical};
rm {output.vcf_pass_vardict}.temp;
rm {output.vcf_pass_vardict}.temp1;
rm {output.vcf_pass_vardict}.temp2;
"""


Expand Down Expand Up @@ -199,18 +205,20 @@ bcftools view -f PASS,triallelic_site --threads {threads} {input.vcf_snv_clinic
bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \
bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\
bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\
bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp -O z;
tabix -p vcf -f {output.vcf_pass_tnscope_umi}.temp;
bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp1 -O z;
python {params.edit_vcf_script} \
--input_vcf {output.vcf_pass_tnscope_umi}.temp \
--output_vcf {output.vcf_pass_tnscope_umi} \
--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \
--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \
--variant_caller {params.variant_caller};
bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi}
tabix -p vcf -f {output.vcf_pass_tnscope_umi};
bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_clinical};
rm {output.vcf_pass_tnscope_umi}.temp;
rm {output.vcf_pass_tnscope_umi}.temp1;
rm {output.vcf_pass_tnscope_umi}.temp2;
"""
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,22 @@ rule bcftools_filter_vardict_research_tumor_only:
bcftools view {input.vcf_snv_research} | \
bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \
bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\
bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp -O z;
tabix -p vcf -f {output.vcf_pass_vardict}.temp;
bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z;
python {params.edit_vcf_script} \
--input_vcf {output.vcf_pass_vardict}.temp \
--output_vcf {output.vcf_pass_vardict} \
--input_vcf {output.vcf_pass_vardict}.temp1 \
--output_vcf {output.vcf_pass_vardict}.temp2 \
--variant_caller {params.variant_caller};
bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict}
tabix -p vcf -f {output.vcf_pass_vardict};
bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_research};
rm {output.vcf_pass_vardict}.temp;
rm {output.vcf_pass_vardict}.temp1;
rm {output.vcf_pass_vardict}.temp2;
"""


Expand Down Expand Up @@ -107,20 +109,22 @@ rule bcftools_filter_TNscope_umi_research_tumor_only:
bcftools view {input.vcf_snv_research} | \
bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \
bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\
bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp -O z;
tabix -p vcf -f {output.vcf_pass_tnscope_umi}.temp;
bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp1 -O z;
python {params.edit_vcf_script} \
--input_vcf {output.vcf_pass_tnscope_umi}.temp \
--output_vcf {output.vcf_pass_tnscope_umi} \
--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \
--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \
--variant_caller {params.variant_caller};
bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi}
tabix -p vcf -f {output.vcf_pass_tnscope_umi};
bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_research};
rm {output.vcf_pass_tnscope_umi}.temp;
rm {output.vcf_pass_tnscope_umi}.temp1;
rm {output.vcf_pass_tnscope_umi}.temp2;
"""


Expand Down Expand Up @@ -153,20 +157,22 @@ bcftools view {input.vcf_snv_clinical} | \
bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \
bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\
bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\
bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp -O z;
tabix -p vcf -f {output.vcf_pass_vardict}.temp;
bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z;
python {params.edit_vcf_script} \
--input_vcf {output.vcf_pass_vardict}.temp \
--output_vcf {output.vcf_pass_vardict} \
--input_vcf {output.vcf_pass_vardict}.temp1 \
--output_vcf {output.vcf_pass_vardict}.temp2 \
--variant_caller {params.variant_caller};
bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict}
tabix -p vcf -f {output.vcf_pass_vardict};
bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_clinical};
rm {output.vcf_pass_vardict}.temp;
rm {output.vcf_pass_vardict}.temp1;
rm {output.vcf_pass_vardict}.temp2;
"""


Expand Down Expand Up @@ -199,18 +205,20 @@ bcftools view {input.vcf_snv_clinical} | \
bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \
bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\
bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\
bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp -O z;
tabix -p vcf -f {output.vcf_pass_tnscope_umi}.temp;
bcftools view --threads {threads} -f PASS,triallelic_site -o {output.vcf_pass_tnscope_umi}.temp1 -O z;
python {params.edit_vcf_script} \
--input_vcf {output.vcf_pass_tnscope_umi}.temp \
--output_vcf {output.vcf_pass_tnscope_umi} \
--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \
--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \
--variant_caller {params.variant_caller};
bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi}
tabix -p vcf -f {output.vcf_pass_tnscope_umi};
bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_clinical};
rm {output.vcf_pass_tnscope_umi}.temp;
rm {output.vcf_pass_tnscope_umi}.temp1;
rm {output.vcf_pass_tnscope_umi}.temp2;
"""
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Changed:
Fixed:
^^^^^^
* triallelic_site in quality filter for SNV https://github.com/Clinical-Genomics/BALSAMIC/pull/1052
* Compression of research and clinical SNV VCF files https://github.com/Clinical-Genomics/BALSAMIC/pull/1060
* `test_write_json` failing locally https://github.com/Clinical-Genomics/BALSAMIC/pull/1063
* Container build and push via github actions by setting buildx `provenance` flag to false https://github.com/Clinical-Genomics/BALSAMIC/pull/1071
* Added buildx to the submodule workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1072
Expand Down

0 comments on commit fca0fac

Please sign in to comment.