Skip to content

Commit

Permalink
Fixed a bug in ReblockGVCF that could cause the first position on a c…
Browse files Browse the repository at this point in the history
…ontig to be dropped.
  • Loading branch information
samuelklee committed Sep 21, 2022
1 parent ef50c08 commit 10a2a68
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ private void regenotypeVC(final VariantContext originalVC) {
//variants with PL[0] less than threshold get turned to homRef with PL=[0,0,0], shouldn't get INFO attributes
//make sure we can call het variants with GQ >= rgqThreshold in joint calling downstream
if(shouldBeReblocked(result)) {
if (vcfWriter.getVcfOutputEnd() != null && result.getEnd() <= vcfWriter.getVcfOutputEnd().getEnd()) {
if (vcfWriter.getVcfOutputEnd() != null && result.contigsMatch(vcfWriter.getVcfOutputEnd()) && result.getEnd() <= vcfWriter.getVcfOutputEnd().getEnd()) {
//variant is entirely overlapped by variants already output to the VCF, so drop it
return;
}
Expand Down Expand Up @@ -488,7 +488,7 @@ public VariantContextBuilder lowQualVariantToGQ0HomRef(final VariantContext lowQ
return null;
}

final Map<String, Object> attrMap = new HashMap<>();
final Map<String, Object> attrMap = new LinkedHashMap<>();

//this method does a lot of things, including fixing alleles and adding the END key
final GenotypeBuilder gb = changeCallToHomRefVersusNonRef(lowQualityVariant, attrMap); //note that gb has all zero PLs
Expand All @@ -497,7 +497,7 @@ public VariantContextBuilder lowQualVariantToGQ0HomRef(final VariantContext lowQ

final Genotype newG = gb.make();
builder.alleles(Arrays.asList(newG.getAlleles().get(0), Allele.NON_REF_ALLELE)).genotypes(newG);
if (vcfWriter.getVcfOutputEnd() != null && lowQualityVariant.getStart() <= vcfWriter.getVcfOutputEnd().getStart()) {
if (vcfWriter.getVcfOutputEnd() != null && lowQualityVariant.contigsMatch(vcfWriter.getVcfOutputEnd()) && lowQualityVariant.getStart() <= vcfWriter.getVcfOutputEnd().getStart()) {
final int newStart = vcfWriter.getVcfOutputEnd().getEnd() + 1;
if (newStart > lowQualityVariant.getEnd()) {
return null;
Expand Down Expand Up @@ -578,7 +578,7 @@ protected GenotypeBuilder changeCallToHomRefVersusNonRef(final VariantContext lo
*/
@VisibleForTesting
VariantContext cleanUpHighQualityVariant(final VariantContext variant) {
final Map<String, Object> attrMap = new HashMap<>();
final Map<String, Object> attrMap = new LinkedHashMap<>();

final Genotype genotype = getCalledGenotype(variant);
VariantContextBuilder builder = new VariantContextBuilder(variant); //QUAL from result is carried through
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -479,4 +479,28 @@ public void testTreeScoreThreshold() {
}
}
}

/**
* Regression test for https://github.com/broadinstitute/gatk/issues/7884 using 2-record snippet of gVCF discussed there.
*/
@Test
public void testFirstPositionOnContigNotDropped() {
final File input = getTestFile("testFirstPositionOnContigNotDropped.g.vcf");
final File output = createTempFile("reblockedgvcf", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(hg38Reference))
.add("V", input)
.add("L", "chr12:18282464") // in the original gVCF, 18282464 is the first variant position on chr12 that is greater than the dropped position 18173860 on chr13
.add("L", "chr13:18173860")
.addOutput(output);
runCommandLine(args);

// we only check that records at both positions are retained
final List<VariantContext> outVCs = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()).getRight();
Assert.assertEquals(outVCs.get(0).getContig(), "chr12");
Assert.assertEquals(outVCs.get(0).getStart(), 18282464);
Assert.assertEquals(outVCs.get(1).getContig(), "chr13");
Assert.assertEquals(outVCs.get(1).getStart(), 18173860);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phasing set (typically the position of the first variant in the set)">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=HaplotypeCaller,CommandLine="HaplotypeCaller --contamination-fraction-to-filter 4.2690533333333337E-4 --gvcf-gq-bands 10 --gvcf-gq-bands 20 --gvcf-gq-bands 30 --gvcf-gq-bands 40 --gvcf-gq-bands 50 --gvcf-gq-bands 60 --gvcf-gq-bands 70 --gvcf-gq-bands 80 --gvcf-gq-bands 90 --emit-ref-confidence GVCF --output GNH-15001801323685.g.vcf.gz --intervals /cromwell_root/broad-gotc-prod-execution10/ExomeGermlineSingleSample/662e7597-4f3f-45e7-a6fb-94ca8fd9b606/call-BamToGvcf/VariantCalling/60e2aec7-70d8-43a3-9a5e-5cd44c9aee3e/call-ScatterIntervalList/glob-cb4648beeaff920acb03de7603c06f98/1scattered.interval_list --input gs://broad-gotc-prod-execution10/ExomeGermlineSingleSample/662e7597-4f3f-45e7-a6fb-94ca8fd9b606/call-UnmappedBamToAlignedBam/UnmappedBamToAlignedBam/b2e61416-9623-4aa6-bf22-f060551088fe/call-GatherBamFiles/GNH-15001801323685.bam --reference /cromwell_root/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta --annotation-group StandardAnnotation --annotation-group StandardHCAnnotation --annotation-group AS_StandardAnnotation --use-posteriors-to-calculate-qual false --dont-use-dragstr-priors false --use-new-qual-calculator true --annotate-with-num-discovered-alleles false --heterozygosity 0.001 --indel-heterozygosity 1.25E-4 --heterozygosity-stdev 0.01 --standard-min-confidence-threshold-for-calling 30.0 --max-alternate-alleles 6 --max-genotype-count 1024 --sample-ploidy 2 --num-reference-samples-if-no-call 0 --genotype-assignment-method USE_PLS_TO_ASSIGN --output-mode EMIT_VARIANTS_ONLY --all-site-pls false --floor-blocks false --indel-size-to-eliminate-in-ref-model 10 --disable-optimizations false --dragen-mode false --apply-bqd false --apply-frd false --disable-spanning-event-genotyping false --transform-dragen-mapping-quality false --mapping-quality-threshold-for-genotyping 20 --max-effective-depth-adjustment-for-frd 0 --just-determine-active-regions false --dont-genotype false --do-not-run-physical-phasing false --do-not-correct-overlapping-quality false --use-filtered-reads-for-annotations false --adaptive-pruning false --do-not-recover-dangling-branches false --recover-dangling-heads false --kmer-size 10 --kmer-size 25 --dont-increase-kmer-sizes-for-cycles false --allow-non-unique-kmers-in-ref false --num-pruning-samples 1 --min-dangling-branch-length 4 --recover-all-dangling-branches false --max-num-haplotypes-in-population 128 --min-pruning 2 --adaptive-pruning-initial-error-rate 0.001 --pruning-lod-threshold 2.302585092994046 --pruning-seeding-lod-threshold 9.210340371976184 --max-unpruned-variants 100 --linked-de-bruijn-graph false --disable-artificial-haplotype-recovery false --enable-legacy-graph-cycle-detection false --debug-assembly false --debug-graph-transformations false --capture-assembly-failure-bam false --num-matching-bases-in-dangling-end-to-recover -1 --error-correction-log-odds -Infinity --error-correct-reads false --kmer-length-for-read-error-correction 25 --min-observations-for-kmer-to-be-solid 20 --base-quality-score-threshold 18 --dragstr-het-hom-ratio 2 --dont-use-dragstr-pair-hmm-scores false --pair-hmm-gap-continuation-penalty 10 --expected-mismatch-rate-for-read-disqualification 0.02 --pair-hmm-implementation FASTEST_AVAILABLE --pcr-indel-model CONSERVATIVE --phred-scaled-global-read-mismapping-rate 45 --disable-symmetric-hmm-normalizing false --disable-cap-base-qualities-to-map-quality false --enable-dynamic-read-disqualification-for-genotyping false --dynamic-read-disqualification-threshold 1.0 --native-pair-hmm-threads 4 --native-pair-hmm-use-double-precision false --bam-writer-type CALLED_HAPLOTYPES --dont-use-soft-clipped-bases false --min-base-quality-score 10 --smith-waterman JAVA --max-mnp-distance 0 --force-call-filtered-alleles false --soft-clip-low-quality-ends false --allele-informative-reads-overlap-margin 2 --smith-waterman-dangling-end-match-value 25 --smith-waterman-dangling-end-mismatch-penalty -50 --smith-waterman-dangling-end-gap-open-penalty -110 --smith-waterman-dangling-end-gap-extend-penalty -6 --smith-waterman-haplotype-to-reference-match-value 200 --smith-waterman-haplotype-to-reference-mismatch-penalty -150 --smith-waterman-haplotype-to-reference-gap-open-penalty -260 --smith-waterman-haplotype-to-reference-gap-extend-penalty -11 --smith-waterman-read-to-haplotype-match-value 10 --smith-waterman-read-to-haplotype-mismatch-penalty -15 --smith-waterman-read-to-haplotype-gap-open-penalty -30 --smith-waterman-read-to-haplotype-gap-extend-penalty -5 --min-assembly-region-size 50 --max-assembly-region-size 300 --active-probability-threshold 0.002 --max-prob-propagation-distance 50 --force-active false --assembly-region-padding 100 --padding-around-indels 75 --padding-around-snps 20 --padding-around-strs 75 --max-extension-into-assembly-region-padding-legacy 25 --max-reads-per-alignment-start 50 --enable-legacy-assembly-region-trimming false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --max-variants-per-shard 0 --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false --minimum-mapping-quality 20 --disable-tool-default-annotations false --enable-all-annotations false --allow-old-rms-mapping-quality-annotation-data false",Version="4.2.6.1",Date="June 2, 2022 6:32:43 PM GMT">
##GVCFBlock0-10=minGQ=0(inclusive),maxGQ=10(exclusive)
##GVCFBlock10-20=minGQ=10(inclusive),maxGQ=20(exclusive)
##GVCFBlock20-30=minGQ=20(inclusive),maxGQ=30(exclusive)
##GVCFBlock30-40=minGQ=30(inclusive),maxGQ=40(exclusive)
##GVCFBlock40-50=minGQ=40(inclusive),maxGQ=50(exclusive)
##GVCFBlock50-60=minGQ=50(inclusive),maxGQ=60(exclusive)
##GVCFBlock60-70=minGQ=60(inclusive),maxGQ=70(exclusive)
##GVCFBlock70-80=minGQ=70(inclusive),maxGQ=80(exclusive)
##GVCFBlock80-90=minGQ=80(inclusive),maxGQ=90(exclusive)
##GVCFBlock90-100=minGQ=90(inclusive),maxGQ=100(exclusive)
##INFO=<ID=AS_InbreedingCoeff,Number=A,Type=Float,Description="Allele-specific inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=AS_QD,Number=A,Type=Float,Description="Allele-specific Variant Confidence/Quality by Depth">
##INFO=<ID=AS_RAW_BaseQRankSum,Number=1,Type=String,Description="raw data for allele specific rank sum test of base qualities">
##INFO=<ID=AS_RAW_MQ,Number=1,Type=String,Description="Allele-specfic raw data for RMS Mapping Quality">
##INFO=<ID=AS_RAW_MQRankSum,Number=1,Type=String,Description="Allele-specfic raw data for Mapping Quality Rank Sum">
##INFO=<ID=AS_RAW_ReadPosRankSum,Number=1,Type=String,Description="allele specific raw data for rank sum test of read position bias">
##INFO=<ID=AS_SB_TABLE,Number=1,Type=String,Description="Allele-specific forward/reverse read counts for strand bias tests. Includes the reference and alleles separated by |.">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=RAW_MQandDP,Number=2,Type=Integer,Description="Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation.">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=chr12,length=133275309,assembly=38>
##contig=<ID=chr13,length=114364328,assembly=38>
##source=HaplotypeCaller
##bcftools_viewVersion=1.10.2-105-g7cd83b7+htslib-1.10.2-110-gda59588
##bcftools_viewCommand=view -r chr12:18282464,chr13:18173860 test.g.vcf.gz; Date=Wed Sep 21 14:52:31 2022
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test
chr12 18282464 . GCCC G,<NON_REF> 1827.6 . AS_RAW_BaseQRankSum=|-0.6,1|NaN;AS_RAW_MQ=165600.00|169200.00|0.00;AS_RAW_MQRankSum=|0.0,1|NaN;AS_RAW_ReadPosRankSum=|0.4,1|NaN;AS_SB_TABLE=18,28|18,29|0,0;BaseQRankSum=-0.55;DP=95;ExcessHet=0;MLEAC=1,0;MLEAF=0.5,0;MQRankSum=0;RAW_MQandDP=342000,95;ReadPosRankSum=0.449 GT:AD:DP:GQ:PL:SB 0/1:46,47,0:93:99:1835,0,1790,1974,1932,3905:18,28,18,29
chr13 18173860 . A C,<NON_REF> 0 . AS_RAW_BaseQRankSum=|-4.9,1|NaN;AS_RAW_MQ=51256.00|4709.00|0.00;AS_RAW_MQRankSum=|0.5,1|NaN;AS_RAW_ReadPosRankSum=|1.2,1|NaN;AS_SB_TABLE=23,2|1,1|0,0;BaseQRankSum=-4.896;DP=28;ExcessHet=0;MLEAC=0,0;MLEAF=0,0;MQRankSum=0.564;RAW_MQandDP=59565,28;ReadPosRankSum=1.252 GT:AD:DP:GQ:PL:SB 0/0:25,2,0:27:61:0,61,946,75,951,965:23,2,1,1
Binary file not shown.

0 comments on commit 10a2a68

Please sign in to comment.