Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added regularization to covariance in GMM maximization step to fix convergence issues in VQSR. #7709

Merged
merged 4 commits into from
Mar 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ class MultivariateGaussian {
private Matrix cachedSigmaInverse;
final private double[] pVarInGaussian;
int pVarInGaussianIndex;
final private static double EPSILON = 1e-200;
private static final double EPSILON = 1e-200;
private static final double COVARIANCE_REGULARIZATION_EPSILON = 1E-6;

public MultivariateGaussian( final int numVariants, final int numAnnotations ) {
mu = new double[numAnnotations];
Expand Down Expand Up @@ -187,9 +188,9 @@ public void maximizeGaussian(final List<VariantDatum> data, final double[] empir
for( final VariantDatum datum : data ) {
final double prob = pVarInGaussian[datumIndex++];
for( int iii = 0; iii < mu.length; iii++ ) {
double deltaMu = prob * (datum.annotations[iii]-mu[iii]);
for( int jjj = 0; jjj < mu.length; jjj++ ) {
pVarSigma.set(iii, jjj, deltaMu * (datum.annotations[jjj]-mu[jjj]));
final double regCovar = iii == jjj ? COVARIANCE_REGULARIZATION_EPSILON : 0.;
pVarSigma.set(iii, jjj, prob * (datum.annotations[iii]-mu[iii]) * (datum.annotations[jjj]-mu[jjj]) + regCovar);
}
}
sigma.plusEquals( pVarSigma );
Expand Down Expand Up @@ -228,7 +229,8 @@ public void evaluateFinalModelParameters( final List<VariantDatum> data ) {
final double prob = pVarInGaussian[datumIndex++];
for( int iii = 0; iii < mu.length; iii++ ) {
for( int jjj = 0; jjj < mu.length; jjj++ ) {
pVarSigma.set(iii, jjj, prob * (datum.annotations[iii]-mu[iii]) * (datum.annotations[jjj]-mu[jjj]));
final double regCovar = iii == jjj ? COVARIANCE_REGULARIZATION_EPSILON : 0.;
pVarSigma.set(iii, jjj, prob * (datum.annotations[iii]-mu[iii]) * (datum.annotations[jjj]-mu[jjj]) + regCovar);
}
}
sigma.plusEquals( pVarSigma );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public class GatherTranchesIntegrationTest extends CommandLineProgramTest {
@Test
public void testCombine2Shards() throws Exception {
final File recal1 = new File(testDir + "snpTranches.scattered.txt"); //this is the output of VariantRecalibratorIntegrationTest.testVariantRecalibratorSNPscattered
final File recal2 = new File(testDir + "snpTranches.scattered.2.txt"); //this is the output of running the equivalent commandline as above outside of the integration test :-/
samuelklee marked this conversation as resolved.
Show resolved Hide resolved
final File recal2 = new File(testDir + "snpTranches.scattered.2.txt"); //this is a copy of the above

final File recal_original = new File(testDir + "expected/snpTranches.gathered.txt");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
* from the command line, in order to ensure that the initial state of the random number generator is the same
* between versions. In all cases, the variants in the expected files are identical to those produced by GATK3,
* though the VCF headers were hand modified to account for small differences in the metadata lines.
*
* UPDATE: The expected results from GATK3 were updated in https://github.com/broadinstitute/gatk/pull/7709.
* However, we left the original comments referencing GATK3 below untouched.
*/
public class VariantRecalibratorIntegrationTest extends CommandLineProgramTest {

Expand Down Expand Up @@ -496,8 +499,14 @@ public void testVariantRecalibratorSNPscattered(final String[] params) throws IO
doSNPTest(params, getLargeVQSRTestDataDir() + "/snpTranches.scattered.txt", getLargeVQSRTestDataDir() + "snpRecal.vcf"); //tranches file isn't in the expected/ directory because it's input to GatherTranchesIntegrationTest
}

//One of the Gaussians has a covariance matrix with a determinant of zero, (can be confirmed that the entries of sigma for the row and column with the index of the constant annotation are zero) which leads to all +Inf LODs if we don't throw
@Test(expectedExceptions={UserException.VQSRPositiveModelFailure.class})
// One of the Gaussians has a covariance matrix with a determinant of zero,
samuelklee marked this conversation as resolved.
Show resolved Hide resolved
// (can be confirmed that the entries of sigma for the row and column with the index of the constant annotation are zero)
// which leads to all +Inf LODs if we don't throw.
//
// UPDATE: Originally, this test checked for expectedExceptions = {UserException.VQSRPositiveModelFailure.class}.
// However, the convergence failure was fixed in https://github.com/broadinstitute/gatk/pull/7709,
// so we now expect the test to complete and can instead treat it as a regression test.
@Test
public void testAnnotationsWithNoVarianceSpecified() throws IOException {
// use an ArrayList - ArgumentBuilder tokenizes using the "=" in the resource args
List<String> args = new ArrayList<>(alleleSpecificVQSRParamsTooManyAnnotations.length);
Expand All @@ -511,7 +520,7 @@ public void testAnnotationsWithNoVarianceSpecified() throws IOException {
Assert.assertEquals(varRecalTool.instanceMain(args.toArray(new String[args.size()])), true);
}

@Test(expectedExceptions={UserException.VQSRNegativeModelFailure.class})
@Test(expectedExceptions = {UserException.VQSRNegativeModelFailure.class})
public void testNoNegativeTrainingData() throws IOException {
// use an ArrayList - ArgumentBuilder tokenizes using the "=" in the resource args
List<String> args = new ArrayList<>(alleleSpecificVQSRParamsNoNegativeData.length);
Expand Down
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/bothRecal.vcf
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/bothRecalWithAggregate.vcf
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/expected/bothTranches.txt
Git LFS file not shown
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/expected/expected.AS.recal.vcf
Git LFS file not shown
Git LFS file not shown
2 changes: 1 addition & 1 deletion src/test/resources/large/VQSR/expected/indelTranches.txt
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/expected/snpApplyResult.vcf
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/expected/snpSampledRecal.vcf
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
2 changes: 1 addition & 1 deletion src/test/resources/large/VQSR/indelRecal.vcf
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/snpRecal.vcf
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/snpRecal.vcf.idx
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/snpTranches.scattered.2.txt
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/snpTranches.scattered.txt
Git LFS file not shown
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Variant quality score tranches file
# Version number 5
targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,model,accessibleTruthSites,callsAtTruthSites,truthSensitivity
90.00,0,2507,0.0000,3.2205,-1.4017,VQSRTrancheSNP0.00to90.00,SNP,2786,2507,0.8999
97.00,0,2702,0.0000,3.2351,-2.1306,VQSRTrancheSNP90.00to97.00,SNP,2786,2702,0.9698
98.00,0,2730,0.0000,3.2457,-2.4533,VQSRTrancheSNP97.00to98.00,SNP,2786,2730,0.9799
99.00,0,2758,0.0000,3.2496,-3.2164,VQSRTrancheSNP98.00to99.00,SNP,2786,2758,0.9899
99.30,0,2766,0.0000,3.2488,-4.0653,VQSRTrancheSNP99.00to99.30,SNP,2786,2766,0.9928
99.40,0,2769,0.0000,3.2469,-5.1648,VQSRTrancheSNP99.30to99.40,SNP,2786,2769,0.9939
99.50,0,2772,0.0000,3.2450,-5.9262,VQSRTrancheSNP99.40to99.50,SNP,2786,2772,0.9950
99.60,0,2774,0.0000,3.2416,-6.5841,VQSRTrancheSNP99.50to99.60,SNP,2786,2774,0.9957
99.80,0,2780,0.0000,3.2314,-58913.3494,VQSRTrancheSNP99.60to99.80,SNP,2786,2780,0.9978
99.90,0,2783,0.0000,3.2295,-59210.6252,VQSRTrancheSNP99.80to99.90,SNP,2786,2783,0.9989
100.00,0,2786,0.0000,3.2340,-Infinity,VQSRTrancheSNP99.90to100.00,SNP,2786,2786,1.0000
samuelklee marked this conversation as resolved.
Show resolved Hide resolved
99.95,0,2786,0.0000,3.2340,-Infinity,VQSRTrancheSNP100.00to99.95,SNP,2786,2786,1.0000
90.00,0,2507,0.0000,3.2134,935.3268,VQSRTrancheSNP0.00to90.00,SNP,2786,2507,0.8999
97.00,0,2702,0.0000,3.2484,758.5107,VQSRTrancheSNP90.00to97.00,SNP,2786,2702,0.9698
98.00,0,2730,0.0000,3.2523,710.0883,VQSRTrancheSNP97.00to98.00,SNP,2786,2730,0.9799
99.00,0,2758,0.0000,3.2562,626.3145,VQSRTrancheSNP98.00to99.00,SNP,2786,2758,0.9899
99.30,0,2766,0.0000,3.2423,573.6691,VQSRTrancheSNP99.00to99.30,SNP,2786,2766,0.9928
99.40,0,2769,0.0000,3.2469,511.3402,VQSRTrancheSNP99.30to99.40,SNP,2786,2769,0.9939
99.50,0,2772,0.0000,3.2515,342.2903,VQSRTrancheSNP99.40to99.50,SNP,2786,2772,0.9950
99.60,0,2774,0.0000,3.2481,250.6941,VQSRTrancheSNP99.50to99.60,SNP,2786,2774,0.9957
99.80,0,2780,0.0000,3.2314,-5.7548,VQSRTrancheSNP99.60to99.80,SNP,2786,2780,0.9978
99.90,0,2783,0.0000,3.2295,-12.9282,VQSRTrancheSNP99.80to99.90,SNP,2786,2783,0.9989
99.95,0,2784,0.0000,3.2310,-22753.7269,VQSRTrancheSNP99.90to99.95,SNP,2786,2784,0.9993
100.00,0,2786,0.0000,3.2340,-39326.4087,VQSRTrancheSNP99.95to100.00,SNP,2786,2786,1.0000
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Variant quality score tranches file
# Version number 5
targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,model,accessibleTruthSites,callsAtTruthSites,truthSensitivity
90.00,0,2507,0.0000,3.2205,-1.4017,VQSRTrancheSNP0.00to90.00,SNP,2786,2507,0.8999
99.00,0,2758,0.0000,3.2496,-3.2164,VQSRTrancheSNP90.00to99.00,SNP,2786,2758,0.9899
99.90,0,2783,0.0000,3.2295,-59210.6252,VQSRTrancheSNP99.00to99.90,SNP,2786,2783,0.9989
100.00,0,2786,0.0000,3.2340,-Infinity,VQSRTrancheSNP99.90to100.00,SNP,2786,2786,1.0000
90.00,0,2507,0.0000,3.2134,935.3268,VQSRTrancheSNP0.00to90.00,SNP,2786,2507,0.8999
99.00,0,2758,0.0000,3.2562,626.3145,VQSRTrancheSNP90.00to99.00,SNP,2786,2758,0.9899
99.90,0,2783,0.0000,3.2295,-12.9282,VQSRTrancheSNP99.00to99.90,SNP,2786,2783,0.9989
100.00,0,2786,0.0000,3.2340,-39326.4087,VQSRTrancheSNP99.90to100.00,SNP,2786,2786,1.0000