From d155f66038a242e4b9fce8b40f4d0b10ed07d3ff Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Wed, 16 Nov 2016 12:20:29 -0600 Subject: [PATCH 1/2] Increasing unit test coverage for VariantContextConverter --- .../converters/VariantContextConverter.scala | 18 +++ .../VariantContextConverterSuite.scala | 111 ++++++++++++++++++ .../adam/rdd/ADAMContextSuite.scala | 16 +++ .../rdd/variant/VariantContextRDDSuite.scala | 22 ++-- 4 files changed, 159 insertions(+), 8 deletions(-) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala index b951556481..f7d8fa2668 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala @@ -397,6 +397,9 @@ private[adam] class VariantContextConverter(dict: Option[SequenceDictionary] = N if (vc.isFiltered) { builder.setFiltersFailed(new java.util.ArrayList(vc.getFilters)); } + if (vc.getAttributeAsBoolean("SOMATIC", false)) { + builder.setSomatic(true) + } builder.build } @@ -607,6 +610,21 @@ private[adam] class VariantContextConverter(dict: Option[SequenceDictionary] = N case Some(s) => vcb.id(s) } + val filtersApplied = Option(variant.getFiltersApplied).getOrElse(false) + val filtersPassed = Option(variant.getFiltersPassed).getOrElse(false) + + (filtersApplied, filtersPassed) match { + case (false, false) => vcb.unfiltered + case (false, true) => vcb.passFilters // log warning? + case (true, false) => vcb.filters(new java.util.HashSet(variant.getFiltersFailed())) + case (true, true) => vcb.passFilters + } + + val somatic: java.lang.Boolean = Option(variant.getSomatic).getOrElse(false) + if (somatic) { + vcb.attribute("SOMATIC", true) + } + // TODO: Extract provenance INFO fields try { vcb.genotypes(vc.genotypes.map(g => { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala index df3d562b28..49860e1397 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala @@ -86,6 +86,27 @@ class VariantContextConverterSuite extends ADAMFunSuite { assert(variant.getReferenceAllele === "A") assert(variant.getStart === 0L) + assert(variant.getSomatic === false) + } + + test("Convert somatic htsjdk site-only SNV to ADAM") { + val converter = new VariantContextConverter + + // not sure why this doesn't work + //htsjdkSNVBuilder.attribute("SOMATIC", true) + //val adamVCs = converter.convert(htsjdkSNVBuilder.make) + + val vcb: VariantContextBuilder = new VariantContextBuilder() + .alleles(List(Allele.create("A", true), Allele.create("T"))) + .start(1L) + .stop(1L) + .chr("1") + .attribute("SOMATIC", true) + + val adamVCs = converter.convert(vcb.make) + val adamVC = adamVCs.head + val variant = adamVC.variant.variant + assert(variant.getSomatic === true) } test("Convert htsjdk site-only SNV to ADAM with contig conversion") { @@ -413,4 +434,94 @@ class VariantContextConverterSuite extends ADAMFunSuite { assert(htsjdkVC.hasID) assert(htsjdkVC.getID === "rs3131972;rs201888535") } + + test("Convert ADAM variant context with null filters applied to htsjdk") { + val variant = adamSNVBuilder() + .setFiltersApplied(null) + .build + + val converter = new VariantContextConverter + + val htsjdkVC = converter.convert(ADAMVariantContext(variant)) + assert(!htsjdkVC.filtersWereApplied) + assert(!htsjdkVC.isFiltered) + assert(htsjdkVC.getFilters.isEmpty) + } + + test("Convert ADAM variant context with no filters applied to htsjdk") { + val variant = adamSNVBuilder() + .setFiltersApplied(false) + .build + + val converter = new VariantContextConverter + + val htsjdkVC = converter.convert(ADAMVariantContext(variant)) + assert(!htsjdkVC.filtersWereApplied) + assert(!htsjdkVC.isFiltered) + assert(htsjdkVC.getFilters.isEmpty) + } + + test("Convert ADAM variant context with passing filters to htsjdk") { + val variant = adamSNVBuilder() + .setFiltersApplied(true) + .setFiltersPassed(true) + .build + + val converter = new VariantContextConverter + + val htsjdkVC = converter.convert(ADAMVariantContext(variant)) + assert(htsjdkVC.filtersWereApplied) + assert(!htsjdkVC.isFiltered) + assert(htsjdkVC.getFilters.isEmpty) + } + + test("Convert ADAM variant context with failing filters to htsjdk") { + val variant = adamSNVBuilder() + .setFiltersApplied(true) + .setFiltersPassed(false) + .setFiltersFailed(ImmutableList.of("FILTER1", "FILTER2")) + .build + + val converter = new VariantContextConverter + + val htsjdkVC = converter.convert(ADAMVariantContext(variant)) + assert(htsjdkVC.filtersWereApplied) + assert(htsjdkVC.isFiltered) + assert(htsjdkVC.getFilters.contains("FILTER1")) + assert(htsjdkVC.getFilters.contains("FILTER2")) + } + + test("Convert ADAM variant context with null somatic flag to htsjdk") { + val variant = adamSNVBuilder() + .setSomatic(null) + .build + + val converter = new VariantContextConverter + + val htsjdkVC = converter.convert(ADAMVariantContext(variant)) + assert(!htsjdkVC.hasAttribute("SOMATIC")) + } + + test("Convert ADAM variant context with non-somatic variant to htsjdk") { + val variant = adamSNVBuilder() + .setSomatic(false) + .build + + val converter = new VariantContextConverter + + val htsjdkVC = converter.convert(ADAMVariantContext(variant)) + assert(!htsjdkVC.hasAttribute("SOMATIC")) + } + + test("Convert ADAM variant context with somatic variant to htsjdk") { + val variant = adamSNVBuilder() + .setSomatic(true) + .build + + val converter = new VariantContextConverter + + val htsjdkVC = converter.convert(ADAMVariantContext(variant)) + assert(htsjdkVC.hasAttribute("SOMATIC")) + assert(htsjdkVC.getAttributeAsBoolean("SOMATIC", false) === true) + } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index 6bba2a5ebb..fc82d50576 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -154,6 +154,22 @@ class ADAMContextSuite extends ADAMFunSuite { assert(vcs.size === 6) val vc = vcs.head + + /* +1 14397 . CTGT C 139.12 IndelQD AC=2;AF=0.333;AN=6;BaseQRankSum=1.800;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384 GT:AD:DP:FT:GQ:PL 0/1:16,4:20:rd:99:120,0,827 0/1:8,2:10:dp;rd:60:60,0,414 0/0:39,0:39:PASS:99:0,116,2114 + */ + val variant = vc.variant.variant + assert(variant.getContigName === "1") + assert(variant.getStart === 14396L) + assert(variant.getEnd === 14400L) + assert(variant.getReferenceAllele === "CTGT") + assert(variant.getAlternateAllele === "C") + assert(variant.getNames.isEmpty) + assert(variant.getFiltersApplied === true) + assert(variant.getFiltersPassed === false) + assert(variant.getFiltersFailed.contains("IndelQD")) + assert(variant.getSomatic === false) + assert(vc.genotypes.size === 3) val gt = vc.genotypes.head diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala index ddbc72effe..015247bfff 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala @@ -45,6 +45,8 @@ class VariantContextRDDSuite extends ADAMFunSuite { .setReferenceAllele("T") .setAlternateAllele("C") .setNames(ImmutableList.of("rs3131972", "rs201888535")) + .setFiltersApplied(true) + .setFiltersPassed(true) .build val g0 = Genotype.newBuilder().setVariant(v0) @@ -67,14 +69,18 @@ class VariantContextRDDSuite extends ADAMFunSuite { val vcRdd = sc.loadVcf("%s/test.vcf/part-r-00000".format(tempDir)) assert(vcRdd.rdd.count === 1) - val variant = vcRdd.rdd.first.variant - assert(variant.variant.getContigName === "chr11") - assert(variant.variant.getStart === 17409572) - assert(variant.variant.getReferenceAllele === "T") - assert(variant.variant.getAlternateAllele === "C") - assert(variant.variant.getNames.length === 2) - assert(variant.variant.getNames.get(0) === "rs3131972") - assert(variant.variant.getNames.get(1) === "rs201888535") + val variant = vcRdd.rdd.first.variant.variant + assert(variant.getContigName === "chr11") + assert(variant.getStart === 17409572) + assert(variant.getReferenceAllele === "T") + assert(variant.getAlternateAllele === "C") + assert(variant.getNames.length === 2) + assert(variant.getNames.get(0) === "rs3131972") + assert(variant.getNames.get(1) === "rs201888535") + assert(variant.getFiltersApplied === true) + assert(variant.getFiltersPassed === true) + assert(variant.getFiltersFailed.isEmpty) + assert(variant.getSomatic === false) assert(vcRdd.sequences.records.size === 1) assert(vcRdd.sequences.records(0).name === "chr11") From 447ca9a9bed3d7ac9b3f8400ef9ae5e33805cf47 Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Fri, 18 Nov 2016 09:56:51 -0800 Subject: [PATCH 2/2] Use VCF header lines in VCFInFormatter --- .../adam/rdd/variant/VCFInFormatter.scala | 14 +++++--------- adam-core/src/test/resources/sorted.lex.vcf | 12 ++++++------ adam-core/src/test/resources/sorted.vcf | 12 ++++++------ .../converters/VariantContextConverterSuite.scala | 4 ---- .../org/bdgenomics/adam/rdd/ADAMContextSuite.scala | 4 ---- .../adam/rdd/variant/VariantContextRDDSuite.scala | 4 +--- 6 files changed, 18 insertions(+), 32 deletions(-) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VCFInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VCFInFormatter.scala index 14fd6fda5e..ab8452ebf3 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VCFInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VCFInFormatter.scala @@ -23,10 +23,7 @@ import htsjdk.variant.variantcontext.writer.{ } import htsjdk.variant.vcf.{ VCFHeader, VCFHeaderLine } import java.io.OutputStream -import org.bdgenomics.adam.converters.{ - SupportedHeaderLines, - VariantContextConverter -} +import org.bdgenomics.adam.converters.VariantContextConverter import org.bdgenomics.adam.models.{ SequenceDictionary, VariantContext @@ -47,13 +44,14 @@ object VCFInFormatter extends InFormatterCompanion[VariantContext, VariantContex * VCF header. */ def apply(gRdd: VariantContextRDD): VCFInFormatter = { - VCFInFormatter(gRdd.sequences, gRdd.samples.map(_.getSampleId)) + VCFInFormatter(gRdd.sequences, gRdd.samples.map(_.getSampleId), gRdd.headerLines) } } private[variant] case class VCFInFormatter private ( sequences: SequenceDictionary, - samples: Seq[String]) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] { + samples: Seq[String], + headerLines: Seq[VCFHeaderLine]) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] { protected val companion = VCFInFormatter @@ -75,9 +73,7 @@ private[variant] case class VCFInFormatter private ( .unsetOption(Options.INDEX_ON_THE_FLY) .build() - val headerLines: Set[VCFHeaderLine] = (SupportedHeaderLines.infoHeaderLines ++ - SupportedHeaderLines.formatHeaderLines).toSet - val header = new VCFHeader(headerLines, samples) + val header = new VCFHeader(headerLines.toSet, samples) header.setSequenceDictionary(sequences.toSAMSequenceDictionary) writer.writeHeader(header) diff --git a/adam-core/src/test/resources/sorted.lex.vcf b/adam-core/src/test/resources/sorted.lex.vcf index 62a488bebb..efa81538bf 100644 --- a/adam-core/src/test/resources/sorted.lex.vcf +++ b/adam-core/src/test/resources/sorted.lex.vcf @@ -63,9 +63,9 @@ ##contig= ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 -1 14397 . CTGT C . . . GT:AD:DP:FT:GQ:PL 0/1:16,4:20:rd:99:120,0,256 0/1:8,2:10:dp;rd:60:60,0,256 0/0:39,0:39:PASS:99:0,116,256 -1 14522 . G A . . . GT:AD:DP:FT:GQ:PL 0/1:10,5:15:dp:99:99,0,256 0/1:2,5:7:dp;rd:34:128,0,34 0/0:26,0:26:PASS:78:0,78,256 -1 63735 rs201888535 CCTA C . . . GT:AD:DP:FT:GQ:PL 0/0:27,0:27:PASS:79:0,79,256 0/0:40,0:40:PASS:99:0,117,256 0/1:23,74:97:rd:99:256,0,256 -13 752721 rs3131972 A G . . . GT:AD:DP:FT:GQ:PL 1/1:0,27:27:PASS:81:256,81,0 1/1:0,19:19:dp:57:256,57,0 1/1:0,22:22:PASS:66:256,66,0 -13 752791 . A G . . . GT:AD:DP:FT:GQ:PL:SB 1/1:0,27:27:PASS:81:256,81,0:0,1,2,3 1/1:0,19:19:dp:57:256,57,0:4,5,6,7 1/1:0,22:22:PASS:66:256,66,0:2,3,4,5 -2 19190 . GC G . . . GT:AD:DP:FT:GQ:PL 0/1:8,14:22:PASS:99:256,0,256 0/1:18,13:31:PASS:99:256,0,256 0/1:5,15:20:rd:99:256,0,107 +1 14397 . CTGT C . IndelQD . GT:AD:DP:FT:GQ:PL 0/1:16,4:20:rd:99:120,0,256 0/1:8,2:10:dp;rd:60:60,0,256 0/0:39,0:39:PASS:99:0,116,256 +1 14522 . G A . VQSRTrancheSNP99.95to100.00 . GT:AD:DP:FT:GQ:PL 0/1:10,5:15:dp:99:99,0,256 0/1:2,5:7:dp;rd:34:128,0,34 0/0:26,0:26:PASS:78:0,78,256 +1 63735 rs201888535 CCTA C . PASS . GT:AD:DP:FT:GQ:PL 0/0:27,0:27:PASS:79:0,79,256 0/0:40,0:40:PASS:99:0,117,256 0/1:23,74:97:rd:99:256,0,256 +13 752721 rs3131972 A G . PASS . GT:AD:DP:FT:GQ:PL 1/1:0,27:27:PASS:81:256,81,0 1/1:0,19:19:dp:57:256,57,0 1/1:0,22:22:PASS:66:256,66,0 +13 752791 . A G . PASS . GT:AD:DP:FT:GQ:PL:SB 1/1:0,27:27:PASS:81:256,81,0:0,1,2,3 1/1:0,19:19:dp:57:256,57,0:4,5,6,7 1/1:0,22:22:PASS:66:256,66,0:2,3,4,5 +2 19190 . GC G . PASS . GT:AD:DP:FT:GQ:PL 0/1:8,14:22:PASS:99:256,0,256 0/1:18,13:31:PASS:99:256,0,256 0/1:5,15:20:rd:99:256,0,107 diff --git a/adam-core/src/test/resources/sorted.vcf b/adam-core/src/test/resources/sorted.vcf index c5ef3a73fa..d70a65d88e 100644 --- a/adam-core/src/test/resources/sorted.vcf +++ b/adam-core/src/test/resources/sorted.vcf @@ -63,9 +63,9 @@ ##contig= ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 -1 14397 . CTGT C . . . GT:AD:DP:FT:GQ:PL 0/1:16,4:20:rd:99:120,0,256 0/1:8,2:10:dp;rd:60:60,0,256 0/0:39,0:39:PASS:99:0,116,256 -1 14522 . G A . . . GT:AD:DP:FT:GQ:PL 0/1:10,5:15:dp:99:99,0,256 0/1:2,5:7:dp;rd:34:128,0,34 0/0:26,0:26:PASS:78:0,78,256 -1 63735 rs201888535 CCTA C . . . GT:AD:DP:FT:GQ:PL 0/0:27,0:27:PASS:79:0,79,256 0/0:40,0:40:PASS:99:0,117,256 0/1:23,74:97:rd:99:256,0,256 -2 19190 . GC G . . . GT:AD:DP:FT:GQ:PL 0/1:8,14:22:PASS:99:256,0,256 0/1:18,13:31:PASS:99:256,0,256 0/1:5,15:20:rd:99:256,0,107 -13 752721 rs3131972 A G . . . GT:AD:DP:FT:GQ:PL 1/1:0,27:27:PASS:81:256,81,0 1/1:0,19:19:dp:57:256,57,0 1/1:0,22:22:PASS:66:256,66,0 -13 752791 . A G . . . GT:AD:DP:FT:GQ:PL:SB 1/1:0,27:27:PASS:81:256,81,0:0,1,2,3 1/1:0,19:19:dp:57:256,57,0:4,5,6,7 1/1:0,22:22:PASS:66:256,66,0:2,3,4,5 +1 14397 . CTGT C . IndelQD . GT:AD:DP:FT:GQ:PL 0/1:16,4:20:rd:99:120,0,256 0/1:8,2:10:dp;rd:60:60,0,256 0/0:39,0:39:PASS:99:0,116,256 +1 14522 . G A . VQSRTrancheSNP99.95to100.00 . GT:AD:DP:FT:GQ:PL 0/1:10,5:15:dp:99:99,0,256 0/1:2,5:7:dp;rd:34:128,0,34 0/0:26,0:26:PASS:78:0,78,256 +1 63735 rs201888535 CCTA C . PASS . GT:AD:DP:FT:GQ:PL 0/0:27,0:27:PASS:79:0,79,256 0/0:40,0:40:PASS:99:0,117,256 0/1:23,74:97:rd:99:256,0,256 +2 19190 . GC G . PASS . GT:AD:DP:FT:GQ:PL 0/1:8,14:22:PASS:99:256,0,256 0/1:18,13:31:PASS:99:256,0,256 0/1:5,15:20:rd:99:256,0,107 +13 752721 rs3131972 A G . PASS . GT:AD:DP:FT:GQ:PL 1/1:0,27:27:PASS:81:256,81,0 1/1:0,19:19:dp:57:256,57,0 1/1:0,22:22:PASS:66:256,66,0 +13 752791 . A G . PASS . GT:AD:DP:FT:GQ:PL:SB 1/1:0,27:27:PASS:81:256,81,0:0,1,2,3 1/1:0,19:19:dp:57:256,57,0:4,5,6,7 1/1:0,22:22:PASS:66:256,66,0:2,3,4,5 diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala index 49860e1397..3694029274 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala @@ -92,10 +92,6 @@ class VariantContextConverterSuite extends ADAMFunSuite { test("Convert somatic htsjdk site-only SNV to ADAM") { val converter = new VariantContextConverter - // not sure why this doesn't work - //htsjdkSNVBuilder.attribute("SOMATIC", true) - //val adamVCs = converter.convert(htsjdkSNVBuilder.make) - val vcb: VariantContextBuilder = new VariantContextBuilder() .alleles(List(Allele.create("A", true), Allele.create("T"))) .start(1L) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index fc82d50576..c8c3ec193f 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -154,10 +154,6 @@ class ADAMContextSuite extends ADAMFunSuite { assert(vcs.size === 6) val vc = vcs.head - - /* -1 14397 . CTGT C 139.12 IndelQD AC=2;AF=0.333;AN=6;BaseQRankSum=1.800;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384 GT:AD:DP:FT:GQ:PL 0/1:16,4:20:rd:99:120,0,827 0/1:8,2:10:dp;rd:60:60,0,414 0/0:39,0:39:PASS:99:0,116,2114 - */ val variant = vc.variant.variant assert(variant.getContigName === "1") assert(variant.getStart === 14396L) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala index 015247bfff..5abb94686e 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala @@ -122,9 +122,7 @@ class VariantContextRDDSuite extends ADAMFunSuite { } sparkTest("don't lose any variants when piping as VCF") { - val smallVcf = Thread.currentThread() - .getContextClassLoader - .getResource("small.vcf").getFile + val smallVcf = testFile("small.vcf") val rdd: VariantContextRDD = sc.loadVcf(smallVcf) val records = rdd.rdd.count