From f2421158d5303e2a0d11b03cb0474d222f99ced8 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Fri, 22 Jan 2016 13:57:36 -0500 Subject: [PATCH] added the stabilizing changes for pooled VCF formation --- scripts/launch_set.pl | 3 ++- scripts/launch_varscan.pl | 4 +++- scripts/mergeVcf.sh | 18 ++++++++++++++++-- scripts/set_fixVcf.pl | 21 +++++++++++++++++---- 4 files changed, 38 insertions(+), 8 deletions(-) diff --git a/scripts/launch_set.pl b/scripts/launch_set.pl index 60bcc84..89dfc0f 100755 --- a/scripts/launch_set.pl +++ b/scripts/launch_set.pl @@ -703,8 +703,9 @@ sub indexAndCompressVcf{ my($vcf,$holdjid,$settings)=@_; my $j={}; eval{ + # removing fixVcf because it is done internally in launch_vcf.pl + #set_fixVcf.pl --fail-sites --fail-samples --min_alt_frac $$settings{min_alt_frac} --min_coverage $$settings{min_coverage} '$vcf' > $vcf.reevaluated && mv '$vcf.reevaluated' '$vcf' && \ $j=$sge->pleaseExecute(" - set_fixVcf.pl --min_alt_frac $$settings{min_alt_frac} --min_coverage $$settings{min_coverage} '$vcf' > $vcf.reevaluated && mv '$vcf.reevaluated' '$vcf' && \ vcf-sort < '$vcf' > '$vcf.sorted.tmp' && mv '$vcf.sorted.tmp' '$vcf' && \ bgzip -f '$vcf' && tabix '$vcf.gz' ",{qsubxopts=>"-hold_jid $holdjid",jobname=>"sortAndCompress",numcpus=>1}); diff --git a/scripts/launch_varscan.pl b/scripts/launch_varscan.pl index baa1a94..09fe2de 100755 --- a/scripts/launch_varscan.pl +++ b/scripts/launch_varscan.pl @@ -115,7 +115,9 @@ sub varscanWorker{ system("bgzip $vcf.tmp.vcf && tabix $vcf.tmp.vcf.gz"); die if $?; - # Fix the VCF and rename the sample + # Fix the VCF and rename the sample. + # Just do one CPU since this is already being + # called in a worker thread. system("set_fixVcf.pl --numcpus 1 --min_coverage $$settings{coverage} --min_alt_frac $$settings{altFreq} --fail-samples --fail-sites --rename-sample $samplename $vcf.tmp.vcf.gz > $vcf"); die if $?; diff --git a/scripts/mergeVcf.sh b/scripts/mergeVcf.sh index 55591f5..ebf06fd 100755 --- a/scripts/mergeVcf.sh +++ b/scripts/mergeVcf.sh @@ -44,6 +44,7 @@ if [ "$TEMPDIR" == "" ]; then TEMPDIR=$(mktemp --directory /tmp/mergeVcf.XXXXXX); fi mkdir -pv "$TEMPDIR"; # just in case +export TEMPDIR; REGIONSFILE="$TEMPDIR/regions.txt"; touch $REGIONSFILE; #make sure this file is accessible @@ -75,7 +76,14 @@ echo "$script: temporary directory is $TEMPDIR"; echo "$script: Running bcftools merge"; export IN; export script; -echo "$REGION" | xargs -P $NUMCPUS -n 1 -I {} bash -c 'echo "$script: merging SNPs in {}"; out='$TEMPDIR'/merged.$$.vcf; bcftools merge --merge all --regions "{}" --force-samples -o $out $IN && bgzip $out && tabix $out.gz && echo "$script: finished with region {}";' +echo "$REGION" | xargs -P $NUMCPUS -n 1 -I {} bash -c ' + echo "$script: merging SNPs in {}"; + out='$TEMPDIR'/merged.$$.vcf; + bcftools merge --merge all --regions "{}" --force-samples -o $out $IN && \ + bgzip $out && \ + tabix $out.gz && \ + echo "$script: finished with region {}"; +' if [ $? -gt 0 ]; then echo "$script: ERROR with bcftools merge" rm -rvf $TEMPDIR; @@ -91,6 +99,12 @@ if [ $? -gt 0 ]; then exit 1; fi +# Run fixVcf.pl so that if all the samples in a site failed, +# then the site fails. +#bgzip $TEMPDIR/concat.vcf && tabix $TEMPDIR/concat.vcf.gz && \ +#set_fixVcf.pl $TEMPDIR/concat.vcf > $TEMPDIR/fixed.vcf && \ +#mv $TEMPDIR/fixed.vcf $TEMPDIR/concat.vcf + # Generate a SNPs-only merged file, if requested if [ "$ALSOSNPS" == 1 ]; then echo "$script: parsing $TEMPDIR/concat.vcf for SNPs" @@ -111,7 +125,7 @@ for VCF in $TEMPDIR/concat.vcf $TEMPDIR/hqPos.vcf; do fi; echo "$script: Sorting $VCF and removing indels"; - bcftools annotate --include '%TYPE!="indel" && %FILTER!="isIndel"' < $VCF > $VCF.tmp && mv $VCF.tmp $VCF; + bcftools annotate --include '%TYPE!="indel"' < $VCF > $VCF.tmp && mv $VCF.tmp $VCF; echo "$script: compressing $VCF"; bgzip $VCF diff --git a/scripts/set_fixVcf.pl b/scripts/set_fixVcf.pl index 060f180..0604ac2 100755 --- a/scripts/set_fixVcf.pl +++ b/scripts/set_fixVcf.pl @@ -275,6 +275,12 @@ sub fixVcfLine{ # Innocent until proven guilty if($$settings{'pass-until-fail'}){ $y{FILTER}=['PASS']; + } else { + # If we are not assuming innocent until guilty, + # and if there is no value set yet, then it is FAIL + if(!defined($y{FILTER}[0]) || $y{FILTER}[0] eq '.'){ + $y{FILTER}=['FAIL']; + } } # Fix up the samples' tags @@ -290,7 +296,7 @@ sub fixVcfLine{ if(!$y{gtypes}{$samplename}{FT}){ push(@{ $y{FORMAT} }, 'FT'); # must set a default value in case it doesn't get set - $y{gtypes}{$samplename}{FT}='.'; + $y{gtypes}{$samplename}{FT}='FAIL'; } # Convert FILTER to FT tag if($y{gtypes}{$samplename}{FT} eq '.'){ @@ -300,6 +306,12 @@ sub fixVcfLine{ # Innocent until proven guilty if($$settings{'pass-until-fail'}){ $y{gtypes}{$samplename}{FT}='PASS'; + }else{ + # If we are not assuming innocent until guilty, + # and if there is no value set yet, then it is FAIL + if($y{gtypes}{$samplename}{FT} eq '.'){ + $y{gtypes}{$samplename}{FT}='FAIL'; + } } } @@ -334,9 +346,10 @@ sub fixVcfLine{ } sub usage{ - "Fixes a given VCF by adding, removing, or editing fields. This script - will also reevaluate sites on whether they have passed filters using the FT tag. - However, the FILTER field will simply be set to 'PASS'. + "Fixes a given VCF by adding, removing, or editing fields. + Without any options, useless INFO tags and ALT nts are + removed. Additionally, each site will be failed if all samples + have failed in the FT tag field Usage: $0 file.vcf.gz > fixed.vcf --numcpus 1 Num CPUS currently has no effect on this script. --tempdir /tmp/ Choose a temporary directory (optional)