diff --git a/build.gradle b/build.gradle index 93e849720d7..0aca18e6681 100644 --- a/build.gradle +++ b/build.gradle @@ -63,8 +63,8 @@ final barclayVersion = System.getProperty('barclay.version','2.1.0') final sparkVersion = System.getProperty('spark.version', '2.2.0') final hadoopVersion = System.getProperty('hadoop.version', '2.8.2') final hadoopBamVersion = System.getProperty('hadoopBam.version','7.10.0') -final genomicsdbVersion = System.getProperty('genomicsdb.version','0.9.2-proto-3.0.0-beta-1+uuid-static') final tensorflowVersion = System.getProperty('tensorflow.version','1.4.0') +final genomicsdbVersion = System.getProperty('genomicsdb.version','0.9.2-proto-3.0.0-beta-1+b825ffa6eb47a') final testNGVersion = '6.11' // Using the shaded version to avoid conflicts between its protobuf dependency // and that of Hadoop/Spark (either the one we reference explicitly, or the one diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java index 4d10b57f34a..88b09707239 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java @@ -1,10 +1,12 @@ package org.broadinstitute.hellbender.engine; -import com.intel.genomicsdb.GenomicsDBFeatureReader; +import com.intel.genomicsdb.model.GenomicsDBExportConfiguration; +import com.intel.genomicsdb.reader.GenomicsDBFeatureReader; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.tribble.*; import htsjdk.variant.bcf2.BCF2Codec; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.GenotypeLikelihoods; import htsjdk.variant.vcf.VCFHeader; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -22,9 +24,12 @@ import java.io.File; import java.io.IOException; import java.nio.channels.SeekableByteChannel; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Iterator; import java.util.List; +import java.util.Optional; import java.util.function.Function; /** @@ -378,26 +383,60 @@ private static FeatureReader getGenomicsDBFeatureReader(final St IOUtils.canReadFile(callsetJson); IOUtils.canReadFile(vidmapJson); IOUtils.canReadFile(vcfHeader); - } - catch ( UserException.CouldNotReadInputFile e ) { + } catch ( UserException.CouldNotReadInputFile e ) { throw new UserException("Couldn't connect to GenomicsDB because the vidmap, callset JSON files, or gVCF Header (" + GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME + "," + GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME + "," + GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME + ") could not be read from GenomicsDB workspace " + workspace.getAbsolutePath(), e); } + final GenomicsDBExportConfiguration.ExportConfiguration exportConfigurationBuilder = + createExportConfiguration(reference, workspace, callsetJson, vidmapJson, vcfHeader); + try { - return new GenomicsDBFeatureReader<>(vidmapJson.getAbsolutePath(), - callsetJson.getAbsolutePath(), - workspace.getAbsolutePath(), - GenomicsDBConstants.DEFAULT_ARRAY_NAME, - reference.getAbsolutePath(), - vcfHeader.getAbsolutePath(), - new BCF2Codec()); + return new GenomicsDBFeatureReader<>(exportConfigurationBuilder, new BCF2Codec(), Optional.empty()); } catch (final IOException e) { throw new UserException("Couldn't create GenomicsDBFeatureReader", e); } } + private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace, + final File callsetJson, final File vidmapJson, + final File vcfHeader) { + GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder = + GenomicsDBExportConfiguration.ExportConfiguration.newBuilder() + .setWorkspace(workspace.getAbsolutePath()) + .setReferenceGenome(reference.getAbsolutePath()) + .setVidMappingFile(vidmapJson.getAbsolutePath()) + .setCallsetMappingFile(callsetJson.getAbsolutePath()) + .setVcfHeaderFilename(vcfHeader.getAbsolutePath()) + .setProduceGTField(false) + .setProduceGTWithMinPLValueForSpanningDeletions(false) + .setSitesOnlyQuery(false) + .setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); + Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath(); + + // For the multi-interval support, we create multiple arrays (directories) in a single workspace - + // one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]), + // you end up with 2 directories named chr1$1$100M and chr2$1$100M. So, the array names depend on the + // partition bounds. + + // During the read phase, the user only supplies the workspace. The array names are obtained by scanning + // the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2", + // 50, 50M), then only the second array is queried. + + // In the previous version of the tool, the array name was a constant - genomicsdb_array. The new version + // will be backward compatible with respect to reads. Hence, if a directory named genomicsdb_array is found, + // the array name is passed to the GenomicsDBFeatureReader otherwise the array names are generated from the + // directory entries. + if (Files.exists(arrayFolder)) { + exportConfigurationBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME); + } else { + exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true); + } + + return exportConfigurationBuilder.build(); + } + /** * Returns the sequence dictionary for this source of Features. * Uses the dictionary from the VCF header (if present) for variant inputs, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBConstants.java index 633b83f0d3c..1efc0fedcc1 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBConstants.java @@ -9,7 +9,6 @@ public final class GenomicsDBConstants { public static final String DEFAULT_CALLSETMAP_FILE_NAME = "callset.json"; public static final String DEFAULT_VCFHEADER_FILE_NAME = "vcfheader.vcf"; - /** * Don't instantiate a utility class */ diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java index 4775ee494b7..3e0c8586f47 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java @@ -1,7 +1,13 @@ package org.broadinstitute.hellbender.tools.genomicsdb; import com.google.common.util.concurrent.ThreadFactoryBuilder; -import com.intel.genomicsdb.*; +import com.intel.genomicsdb.importer.GenomicsDBImporter; +import com.intel.genomicsdb.importer.model.ChromosomeInterval; +import com.intel.genomicsdb.model.Coordinates; +import com.intel.genomicsdb.model.GenomicsDBCallsetsMapProto; +import com.intel.genomicsdb.model.GenomicsDBImportConfiguration; +import com.intel.genomicsdb.model.ImportConfig; +import com.intel.genomicsdb.model.BatchCompletionCallbackFunctionArgument; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.Locatable; import htsjdk.tribble.AbstractFeatureReader; @@ -30,7 +36,6 @@ import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import java.nio.channels.SeekableByteChannel; import java.nio.file.Files; @@ -38,6 +43,7 @@ import java.util.*; import java.util.concurrent.*; import java.util.function.Function; +import java.util.stream.Collectors; /** @@ -108,14 +114,13 @@ *

Caveats

* * *

Developer Note

- * To read data from GenomicsDB, use the query interface {@link com.intel.genomicsdb.GenomicsDBFeatureReader} + * To read data from GenomicsDB, use the query interface {@link com.intel.genomicsdb.reader.GenomicsDBFeatureReader} */ @DocumentedFeature @CommandLineProgramProperties( @@ -140,6 +145,7 @@ public final class GenomicsDBImport extends GATKTool { public static final String SAMPLE_NAME_MAP_LONG_NAME = "sample-name-map"; public static final String VALIDATE_SAMPLE_MAP_LONG_NAME = "validate-sample-name-map"; public static final String VCF_INITIALIZER_THREADS_LONG_NAME = "reader-threads"; + public static final String MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL = "max-num-intervals-to-import-in-parallel"; @Argument(fullName = WORKSPACE_ARG_LONG_NAME, doc = "Workspace for GenomicsDB. Must be a POSIX file system path, but can be a relative path." + @@ -229,6 +235,15 @@ public final class GenomicsDBImport extends GATKTool { minValue = 1) private int vcfInitializerThreads = 1; + @Advanced + @Argument(fullName = MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL, + shortName = MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL, + doc = "Max number of intervals to import in parallel; higher values may improve performance, but require more" + + " memory and a higher number of file descriptors open at the same time", + optional = true, + minValue = 1) + private int maxNumIntervalsToImportInParallel = 1; + //executor service used when vcfInitializerThreads > 1 private ExecutorService inputPreloadExecutorService; @@ -287,6 +302,9 @@ public int getDefaultCloudIndexPrefetchBufferSize() { // used to write the callset json file on traversal success private GenomicsDBCallsetsMapProto.CallsetMappingPB callsetMappingPB; + //in-progress batchCount + private int batchCount = 1; + /** * Before traversal starts, create the feature readers * for all the input GVCFs, create the merged header and @@ -451,8 +469,6 @@ public void onTraversalStart() { logger.info("Complete VCF Header will be written to " + vcfHeaderFile); logger.info("Importing to array - " + workspace + "/" + GenomicsDBConstants.DEFAULT_ARRAY_NAME); - //Pass in true here to use the given ordering, since sampleNameToVcfPath is already sorted - callsetMappingPB = GenomicsDBImporter.generateSortedCallSetMap(new ArrayList<>(sampleNameToVcfPath.keySet()), true); initializeInputPreloadExecutorService(); } @@ -468,6 +484,59 @@ private void initializeInputPreloadExecutorService() { } } + private Map> createSampleToReaderMap( + final Map sampleNameToVcfPath, final int batchSize, final int index) { + // TODO: fix casting since it's really ugly + return inputPreloadExecutorService != null ? + getFeatureReadersInParallel((SortedMap) sampleNameToVcfPath, batchSize, index) + : getFeatureReadersSerially(sampleNameToVcfPath, batchSize, index); + } + + private Void logMessageOnBatchCompletion(final BatchCompletionCallbackFunctionArgument arg) { + progressMeter.update(intervals.get(0)); + logger.info("Done importing batch " + arg.batchCount + "/" + arg.totalBatchCount); + this.batchCount = arg.batchCount + 1; + return null; + } + + private List generatePartitionListFromIntervals(List chromosomeIntervals) { + return chromosomeIntervals.stream().map(interval -> { + GenomicsDBImportConfiguration.Partition.Builder partitionBuilder = GenomicsDBImportConfiguration.Partition.newBuilder(); + Coordinates.ContigPosition.Builder contigPositionBuilder = Coordinates.ContigPosition.newBuilder(); + Coordinates.GenomicsDBColumn.Builder columnBuilder = Coordinates.GenomicsDBColumn.newBuilder(); + //begin + contigPositionBuilder.setContig(interval.getContig()).setPosition(interval.getStart()); + columnBuilder.setContigPosition(contigPositionBuilder.build()); + partitionBuilder.setBegin(columnBuilder.build()); + //end + contigPositionBuilder.setPosition(interval.getEnd()); + columnBuilder.setContigPosition(contigPositionBuilder.build()); + partitionBuilder.setEnd(columnBuilder.build()); + partitionBuilder.setWorkspace(workspace); + partitionBuilder.setGenerateArrayNameFromPartitionBounds(true); + return partitionBuilder.build(); + }).collect(Collectors.toList()); + } + + private ImportConfig createImportConfig(final int batchSize) { + final List partitions = generatePartitionListFromIntervals(intervals); + GenomicsDBImportConfiguration.ImportConfiguration.Builder importConfigurationBuilder = + GenomicsDBImportConfiguration.ImportConfiguration.newBuilder(); + importConfigurationBuilder.addAllColumnPartitions(partitions); + importConfigurationBuilder.setSizePerColumnPartition(vcfBufferSizePerSample); + importConfigurationBuilder.setFailIfUpdating(true); + importConfigurationBuilder.setSegmentSize(segmentSize); + importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation); + ImportConfig importConfig = new ImportConfig(importConfigurationBuilder.build(), validateSampleToReaderMap, true, + batchSize, mergedHeaderLines, sampleNameToVcfPath, this::createSampleToReaderMap); + importConfig.setOutputCallsetmapJsonFile(callsetMapJSONFile.getAbsolutePath()); + importConfig.setOutputVidmapJsonFile(vidMapJSONFile.getAbsolutePath()); + importConfig.setOutputVcfHeaderFile(vcfHeaderFile.getAbsolutePath()); + importConfig.setUseSamplesInOrder(true); + importConfig.setFunctionToCallOnBatchCompletion(this::logMessageOnBatchCompletion); + return importConfig; + } + /** * A complete traversal from start to finish. This method will import all samples * specified in the input GVCF files. @@ -479,75 +548,26 @@ public void traverse() { final int sampleCount = sampleNameToVcfPath.size(); final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? sampleCount : batchSize; - final int totalBatchCount = (sampleCount/updatedBatchSize) + (sampleCount%updatedBatchSize==0 ? 0 : 1); + final ImportConfig importConfig = createImportConfig(updatedBatchSize); GenomicsDBImporter importer; - - for (int i = 0, batchCount = 1; i < sampleCount; i += updatedBatchSize, ++batchCount) { - - final SortedMap> sampleToReaderMap = - inputPreloadExecutorService != null - ? getFeatureReadersInParallel(sampleNameToVcfPath, updatedBatchSize, i) - : getFeatureReadersSerially(sampleNameToVcfPath, updatedBatchSize, i); - - logger.info("Importing batch " + batchCount + " with " + sampleToReaderMap.size() + " samples"); - final long variantContextBufferSize = vcfBufferSizePerSample * sampleToReaderMap.size(); - final GenomicsDBImportConfiguration.ImportConfiguration importConfiguration = - createImportConfiguration(workspace, GenomicsDBConstants.DEFAULT_ARRAY_NAME, - variantContextBufferSize, segmentSize, - i, (i+updatedBatchSize-1), - (batchCount == 1)); //Fail if array exists and this is the first batch - - try { - importer = new GenomicsDBImporter(sampleToReaderMap, mergedHeaderLines, intervals.get(0), validateSampleToReaderMap, importConfiguration); - } catch (final IOException e) { - throw new UserException("Error initializing GenomicsDBImporter in batch " + batchCount, e); - } catch (final IllegalArgumentException iae) { - throw new GATKException("Null feature reader found in sampleNameMap file: " + sampleNameMapFile, iae); - } - try { - importer.importBatch(); - } catch (final IOException e) { - throw new UserException("GenomicsDB import failed in batch " + batchCount, e); - } - closeReaders(sampleToReaderMap); - progressMeter.update(intervals.get(0)); - logger.info("Done importing batch " + batchCount + "/" + totalBatchCount); + try { + importer = new GenomicsDBImporter(importConfig); + importer.executeImport(maxNumIntervalsToImportInParallel); + } catch (final IOException e) { + throw new UserException("Error initializing GenomicsDBImporter", e); + } catch (final IllegalArgumentException iae) { + throw new GATKException("Null feature reader found in sampleNameMap file: " + sampleNameMapFile, iae); } } @Override public Object onTraversalSuccess() { - if (batchSize==DEFAULT_ZERO_BATCH_SIZE) { + if (batchSize == DEFAULT_ZERO_BATCH_SIZE) { logger.info("Import completed!"); } else { logger.info("Import of all batches to GenomicsDB completed!"); } - - // Write the vid and callset map JSON files - try { - GenomicsDBImporter.writeVidMapJSONFile(vidMapJSONFile.getAbsolutePath(), mergedHeaderLines); - } catch (final FileNotFoundException fe) { - throw new UserException("Unable to write vid map JSON file " + vidMapJSONFile.getAbsolutePath(), fe); - } - try { - GenomicsDBImporter.writeCallsetMapJSONFile(callsetMapJSONFile.getAbsolutePath(), callsetMappingPB); - } catch (final FileNotFoundException fe) { - throw new UserException("Unable to write callset map JSON file " + callsetMapJSONFile.getAbsolutePath(), fe); - } - try { - GenomicsDBImporter.writeVcfHeaderFile(vcfHeaderFile.getAbsolutePath(), mergedHeaderLines); - } catch (final FileNotFoundException fe) { - throw new UserException("Unable to write VCF Header file " + vcfHeaderFile.getAbsolutePath(), fe); - } - - - if (doConsolidation) { - logger.info("GenomicsDB consolidation started"); - GenomicsDBImporter.consolidateTileDBArray(workspace, GenomicsDBConstants.DEFAULT_ARRAY_NAME); - logger.info("GenomicsDB consolidation completed"); - } - return true; } @@ -560,8 +580,8 @@ public Object onTraversalSuccess() { * @param lowerSampleIndex 0-based Lower bound of sample index -- inclusive * @return Feature readers to be imported in the current batch, sorted by sample name */ - private SortedMap> getFeatureReadersInParallel(final SortedMap sampleNametoPath, - final int batchSize, final int lowerSampleIndex) { + private SortedMap> getFeatureReadersInParallel( + final SortedMap sampleNametoPath, final int batchSize, final int lowerSampleIndex) { final SortedMap> sampleToReaderMap = new TreeMap<>(); logger.info("Starting batch input file preload"); final Map>> futures = new LinkedHashMap<>(); @@ -588,6 +608,7 @@ private SortedMap> getFeatureReadersInPara } }); logger.info("Finished batch preload"); + logger.info("Importing batch " + this.batchCount + " with " + sampleToReaderMap.size() + " samples"); return sampleToReaderMap; } @@ -600,6 +621,7 @@ private SortedMap> getFeatureReadersSerial final AbstractFeatureReader reader = getReaderFromPath(sampleNameToPath.get(sampleName)); sampleToReaderMap.put(sampleName, reader); } + logger.info("Importing batch " + this.batchCount + " with " + sampleToReaderMap.size() + " samples"); return sampleToReaderMap; } @@ -620,75 +642,6 @@ private AbstractFeatureReader getReaderFromPath(fi } } - /** - * Creates a GenomicsDB configuration data structure - * instead of sending a long list of parameters to the constructor call - * - * @param workspace GenomicsDB workspace - * @param arrayName GenomicsDB array - * @param variantContextBufferSize Buffer size to store VCF records for all samples - * @param segmentSize Buffer size to store columnar data to be serialized to disk - * @param lbSampleIndex Lower bound of sample index -- inclusive (0-based) - * @param ubSampleIndex Upper bound of sample index -- inclusive (0-based) - * @return GenomicsDB import configuration object - */ - private static GenomicsDBImportConfiguration.ImportConfiguration createImportConfiguration( - final String workspace, - final String arrayName, - final long variantContextBufferSize, - final long segmentSize, - final long lbSampleIndex, - final long ubSampleIndex, - final boolean failIfArrayExists) { - - final GenomicsDBImportConfiguration.Partition.Builder pBuilder = - GenomicsDBImportConfiguration.Partition.newBuilder(); - - // Since, there is one partition for this import, the - // begin column partition index is 0 - final GenomicsDBImportConfiguration.Partition partition = - pBuilder - .setWorkspace(workspace) - .setArray(arrayName) - .setBegin(0) - .build(); - - final GenomicsDBImportConfiguration.GATK4Integration.Builder gBuilder = - GenomicsDBImportConfiguration.GATK4Integration.newBuilder(); - - final GenomicsDBImportConfiguration.GATK4Integration gatk4Parameters = - gBuilder - .setLowerSampleIndex(lbSampleIndex) - .setUpperSampleIndex(ubSampleIndex) - .build(); - - final GenomicsDBImportConfiguration.ImportConfiguration.Builder cBuilder = - GenomicsDBImportConfiguration.ImportConfiguration.newBuilder(); - - return cBuilder - .addColumnPartitions(0, partition) - .setGatk4IntegrationParameters(gatk4Parameters) - .setSizePerColumnPartition(variantContextBufferSize) - .setSegmentSize(segmentSize) - .setFailIfUpdating(failIfArrayExists) - .build(); - } - - /** - * Close all readers in the current batch - * - * @param sampleToReaderMap Map of sample names to readers - */ - private static void closeReaders(final Map> sampleToReaderMap) { - for (final Map.Entry> reader : sampleToReaderMap.entrySet()) { - try { - reader.getValue().close(); - } catch (final IOException e) { - throw new GATKException("FeatureReader close() failed for " + reader.getKey(), e); - } - } - } - /** * Input argument "overwriteExistingWorkspace" defaults to false. * The tool creates a new workspace if it doesn't exist. Deletes @@ -741,26 +694,17 @@ private static void checkIfValidWorkspace(final File workspaceDir) { */ private void initializeIntervals() { if (intervalArgumentCollection.intervalsSpecified()) { - final SAMSequenceDictionary intervalDictionary = getBestAvailableSequenceDictionary(); + if (intervalDictionary == null) { throw new UserException("We require at least one input source that " + "has a sequence dictionary (reference or reads) when intervals are specified"); } intervals = new ArrayList<>(); - - final List simpleIntervalList = - intervalArgumentCollection.getIntervals(intervalDictionary); - - if (simpleIntervalList.size() > 1) { - throw new UserException("More than one interval specified. The tool takes only one"); - } - - for (final SimpleInterval simpleInterval : simpleIntervalList) { - intervals.add(new ChromosomeInterval(simpleInterval.getContig(), - simpleInterval.getStart(), simpleInterval.getEnd())); - } + final List simpleIntervalList = intervalArgumentCollection.getIntervals(intervalDictionary); + simpleIntervalList.forEach(interval -> intervals.add(new ChromosomeInterval(interval.getContig(), + interval.getStart(), interval.getEnd()))); } else { throw new UserException("No intervals specified"); } @@ -768,7 +712,7 @@ private void initializeIntervals() { @Override public void onShutdown(){ - if( inputPreloadExecutorService != null) { + if(inputPreloadExecutorService != null) { inputPreloadExecutorService.shutdownNow(); } } diff --git a/src/test/java/org/broadinstitute/hellbender/engine/GenomicsDBIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/GenomicsDBIntegrationTest.java index 48624e15c7f..f0f80af5164 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/GenomicsDBIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/GenomicsDBIntegrationTest.java @@ -1,6 +1,6 @@ package org.broadinstitute.hellbender.engine; -import com.intel.genomicsdb.GenomicsDBUtils; +import com.intel.genomicsdb.GenomicsDBLibLoader; import htsjdk.variant.variantcontext.VariantContext; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants; @@ -30,7 +30,7 @@ public String getTestedClassName() { @Test public void testGenomicsDBInClassPath(){ final String path = "/"+System.mapLibraryName("tiledbgenomicsdb"); - Assert.assertNotNull(GenomicsDBUtils.class.getResource(path), "Could not find the genomicsdb binary at " + path); + Assert.assertNotNull(GenomicsDBLibLoader.class.getResource(path), "Could not find the genomicsdb binary at " + path); } @Test diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java index bc3eb30f2f2..825ba300353 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.genomicsdb; -import com.intel.genomicsdb.GenomicsDBFeatureReader; +import com.intel.genomicsdb.model.GenomicsDBExportConfiguration; +import com.intel.genomicsdb.reader.GenomicsDBFeatureReader; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.tribble.AbstractFeatureReader; @@ -30,7 +31,6 @@ import org.testng.annotations.Test; import java.io.File; -import java.io.FileWriter; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; @@ -39,10 +39,14 @@ import java.util.stream.Collectors; public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTest { - private static final String HG_00096 = largeFileTestDir + "gvcfs/HG00096.g.vcf.gz"; private static final String HG_00268 = largeFileTestDir + "gvcfs/HG00268.g.vcf.gz"; private static final String NA_19625 = largeFileTestDir + "gvcfs/NA19625.g.vcf.gz"; + //The following 3 files were obtained by running CombineGVCFs on the above 3 files (separately). This introduces spanning + //deletions in the files. Hence, these files can be used to test for spanning deletions in the input VCF. + private static final String HG_00096_after_combine_gvcfs = largeFileTestDir + "gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz"; + private static final String HG_00268_after_combine_gvcfs = largeFileTestDir + "gvcfs/HG00268_after_combine_gvcfs.g.vcf.gz"; + private static final String NA_19625_after_combine_gvcfs = largeFileTestDir + "gvcfs/NA19625_after_combine_gvcfs.g.vcf.gz"; private static final String NA_24385 = largeFileTestDir + "NA24385.vcf.gz"; private static final String NA_12878_PHASED = largeFileTestDir + "NA12878.phasedData.Chr20.vcf"; //NOTE: this is not phased according to the vcf spec but it reflects phasing currently produced by haplotype caller private static final String MULTIPLOID_DATA_HG37 = largeFileTestDir + "gvcfs/HapMap5plex.ploidy10.b37.g.vcf"; @@ -50,18 +54,56 @@ public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTes private static final String ARTIFICIAL_PHASED = getTestDataDir() + "/ArtificalPhasedData.1.g.vcf"; private static final String HG_00268_WITH_SPACES = largeFileTestDir + "gvcfs/HG00268.spaceInSampleName.g.vcf"; private static final List LOCAL_GVCFS = Arrays.asList(HG_00096, HG_00268, NA_19625); + private static final List LOCAL_GVCFS_AFTER_COMBINE_GVCFS = Arrays.asList(HG_00096_after_combine_gvcfs, + HG_00268_after_combine_gvcfs, + NA_19625_after_combine_gvcfs); private static final String GENOMICSDB_TEST_DIR = toolsTestDir + "GenomicsDBImport/"; private static final String COMBINEGVCFS_TEST_DIR = toolsTestDir + "walkers/CombineGVCFs/"; - private static final String COMBINED = largeFileTestDir + "gvcfs/combined.gatk3.7.g.vcf.gz"; + private static final String COMBINED_WITH_GENOTYPES = largeFileTestDir + "gvcfs/combined_with_genotypes.g.vcf.gz"; + //This file was obtained from combined.gatk3.7.g.vcf.gz by dropping all the samples + private static final String COMBINED_SITES_ONLY = largeFileTestDir + "gvcfs/combined.gatk3.7_sites_only.g.vcf.gz"; + //Consider a gVCF with a REF block chr20:50-150. Importing this data into GenomicsDB using multiple intervals + //-L chr20:1-100 and -L chr20:101-200 will cause the REF block to be imported into both the arrays + //Now, when reading data from the workspace (assume full scan) - the data is split into 2 REF block intervals chr20:50-100 + //and chr20:101-150 one from each array + //The following COMBINED_MULTI_INTERVAL gvcf is identical to the gVCF in the previous line except at the partition break + //position + //The previous file has the following line: + //chr20 17970000 . G . . END=17970001 + // + //while this file has: + //chr20 17970000 . G . . . + //chr20 17970001 . G . . . + // + private static final String COMBINED_MULTI_INTERVAL = largeFileTestDir + "gvcfs/combined_multi_interval.gatk3.7.g.vcf.gz"; private static final String COMBINED_WITHSPACES = largeFileTestDir + "gvcfs/combined.gatk3.7.smaller_interval.g.vcf"; - private static final SimpleInterval INTERVAL = new SimpleInterval("chr20", 17960187, 17981445); - private static final SimpleInterval INTERVAL_3736 = new SimpleInterval("chr6",130365070,146544250); - private static final SimpleInterval INTERVAL_NONDIPLOID = new SimpleInterval("20", 10000000, 10100000); - private static final SimpleInterval SMALLER_INTERVAL = new SimpleInterval("chr20", 17960187, 17961973); + private static final ArrayList INTERVAL = + new ArrayList(Arrays.asList(new SimpleInterval("chr20", 17960187, 17981445))); + private static final ArrayList MULTIPLE_INTERVALS = new ArrayList(Arrays.asList( + new SimpleInterval("chr20", 17960187, 17970000), + new SimpleInterval("chr20", 17970001, 17980000), + new SimpleInterval("chr20", 17980001, 17981445) + )); + private static final ArrayList MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS = + new ArrayList(Arrays.asList( + new SimpleInterval("chr20", 17960187, 17969999), + new SimpleInterval("chr20", 17970000, 17980000), + new SimpleInterval("chr20", 17980001, 17981445) + )); + private static final ArrayList MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS = + new ArrayList(Arrays.asList( + new SimpleInterval("chr20", 17960187, 17969999), + new SimpleInterval("chr20", 17980001, 17981445), + new SimpleInterval("chr21", 29477554, 29486255) + )); + private static final ArrayList INTERVAL_3736 = + new ArrayList(Arrays.asList(new SimpleInterval("chr6",130365070,146544250))); + private static final ArrayList INTERVAL_NONDIPLOID = + new ArrayList(Arrays.asList(new SimpleInterval("20", 10000000, 10100000))); + private static final ArrayList SMALLER_INTERVAL = + new ArrayList(Arrays.asList(new SimpleInterval("chr20", 17960187, 17961973))); private static final VCFHeader VCF_HEADER = VariantContextTestUtils.getCompleteHeader(); - - private static final String SAMPLE_NAME_KEY = "SN"; private static final String ANOTHER_ATTRIBUTE_KEY = "AA"; @@ -86,11 +128,35 @@ public void testGenomicsDBImportFileInputs() throws IOException { testGenomicsDBImporter(LOCAL_GVCFS, INTERVAL, COMBINED, b38_reference_20_21, true); } + @Test + public void testGenomicsDBImportFileInputsWithMultipleIntervals() throws IOException { + testGenomicsDBImporter(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, b38_reference_20_21, true); + } + @Test public void testGenomicsDBImportFileInputsAgainstCombineGVCF() throws IOException { testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, INTERVAL, b38_reference_20_21, new String[0]); } + @Test + public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervals() throws IOException { + testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21, new String[0]); + } + + @Test + public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervals() throws IOException { + testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, + b38_reference_20_21, new String[0]); + } + + @Test + public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervalsForFilesProducedAfterCombineGVCFs() + throws IOException { + //this test covers the scenario where the input vcfs have spanning deletions + testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS_AFTER_COMBINE_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, + b38_reference_20_21, new String[0]); + } + @Test public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithNonDiploidData() throws IOException { testGenomicsDBAgainstCombineGVCFs(Arrays.asList(NA12878_HG37, MULTIPLOID_DATA_HG37), INTERVAL_NONDIPLOID, b37_reference_20_21, new String[0]); @@ -101,20 +167,64 @@ public void testGenomicsDBImportPhasedData() throws IOException { testGenomicsDBImporterWithGenotypes(Arrays.asList(NA_12878_PHASED), INTERVAL, NA_12878_PHASED, b37_reference_20_21); } + @Test + public void testGenomicsDBImportPhasedDataWithMultipleIntervals() throws IOException { + testGenomicsDBImporterWithGenotypes(Arrays.asList(NA_12878_PHASED), MULTIPLE_INTERVALS, NA_12878_PHASED, b37_reference_20_21); + } + @Test public void testGenomicsDBImportArtificialPhasedData() throws IOException { - testGenomicsDBImporterWithGenotypes(Arrays.asList(ARTIFICIAL_PHASED), new SimpleInterval("1", 10109, 10297), ARTIFICIAL_PHASED, b37_reference_20_21); + ArrayList intervals = new ArrayList(Arrays.asList(new SimpleInterval("1", 10109, 10297))); + testGenomicsDBImporterWithGenotypes(Arrays.asList(ARTIFICIAL_PHASED), intervals, ARTIFICIAL_PHASED, b37_reference_20_21); } - private void testGenomicsDBImporterWithGenotypes(final List vcfInputs, final SimpleInterval interval, final String referenceFile, final String expectedCombinedVCF) throws IOException { + @Test + public void testGenomicsDBThreeLargeSamplesWithGenotypes() throws IOException { + ArrayList intervals = new ArrayList(Arrays.asList(new SimpleInterval("chr20", 1, 64444167))); + testGenomicsDBImporterWithGenotypes(LOCAL_GVCFS, intervals, COMBINED_WITH_GENOTYPES, b38_reference_20_21, true, true, false); + } + + @Test + public void testGenomicsDBThreeLargeSamplesSitesOnlyQuery() throws IOException { + ArrayList intervals = new ArrayList(Arrays.asList( + new SimpleInterval("chr20", 1, 64444167), + new SimpleInterval("chr21", 1, 46709983))); + testGenomicsDBImporterWithGenotypes(LOCAL_GVCFS, intervals, COMBINED_SITES_ONLY, b38_reference_20_21, true, true, true); + } + + private void testGenomicsDBImporterWithGenotypes(final List vcfInputs, final List intervals, + final String expectedCombinedVCF, + final String referenceFile) throws IOException { + testGenomicsDBImporterWithGenotypes(vcfInputs, intervals, + expectedCombinedVCF, referenceFile, + false, + true, + false); + } + + private void testGenomicsDBImporterWithGenotypes(final List vcfInputs, final List intervals, + final String expectedCombinedVCF, final String referenceFile, + final boolean testAll) throws IOException { + testGenomicsDBImporterWithGenotypes(vcfInputs, intervals, + expectedCombinedVCF, referenceFile, + testAll, + false, + false); + } + + private void testGenomicsDBImporterWithGenotypes(final List vcfInputs, final List intervals, + final String expectedCombinedVCF, final String referenceFile, + final boolean testAll, + final boolean produceGTField, + final boolean sitesOnlyQuery) throws IOException { final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; - writeToGenomicsDB(vcfInputs, interval, workspace, 0, false, 0, 1); + writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1); checkJSONFilesAreWritten(workspace); - checkGenomicsDBAgainstExpected(workspace, interval, referenceFile, expectedCombinedVCF, false); + checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, produceGTField, sitesOnlyQuery); } - private File runCombineGVCFs(final List inputs, final SimpleInterval interval, final String reference, final String[] extraArgs) { + private File runCombineGVCFs(final List inputs, final List intervals, final String reference, final String[] extraArgs) { final File output = createTempFile("genotypegvcf", ".vcf"); final ArgumentsBuilder args = new ArgumentsBuilder(); @@ -123,7 +233,7 @@ private File runCombineGVCFs(final List inputs, final SimpleInterval int for (String input: inputs) { args.addArgument("V", input); } - args.addArgument("L", interval.toString()); + intervals.forEach(interval -> args.addArgument("L", interval.toString())); Arrays.stream(extraArgs).forEach(args::add); Utils.resetRandomGenerator(); @@ -131,13 +241,17 @@ private File runCombineGVCFs(final List inputs, final SimpleInterval int return output; } - private void testGenomicsDBAgainstCombineGVCFs(final List vcfInputs, final SimpleInterval interval, final String referenceFile, final String[] CombineGVCFArgs) throws IOException { + private void testGenomicsDBAgainstCombineGVCFs(final List vcfInputs, final List intervals, + final String referenceFile, final String[] CombineGVCFArgs) throws IOException { final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; - writeToGenomicsDB(vcfInputs, interval, workspace, 0, false, 0, 1); + writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1); checkJSONFilesAreWritten(workspace); - File expectedCombinedVCF = runCombineGVCFs(vcfInputs, interval, referenceFile, CombineGVCFArgs); - checkGenomicsDBAgainstExpected(workspace, interval, expectedCombinedVCF.getAbsolutePath(), referenceFile, true); + for(SimpleInterval currInterval : intervals) { + List tmpList = new ArrayList(Arrays.asList(currInterval)); + File expectedCombinedVCF = runCombineGVCFs(vcfInputs, tmpList, referenceFile, CombineGVCFArgs); + checkGenomicsDBAgainstExpected(workspace, tmpList, expectedCombinedVCF.getAbsolutePath(), referenceFile, true); + } } @Test(groups = {"bucket"}) @@ -159,7 +273,7 @@ public void testGenomicsDBAbsolutePathDepndency() throws IOException { @Test (enabled = true) public void testGenomicsDBAlleleSpecificAnnotations() throws IOException { testGenomicsDBAgainstCombineGVCFs(Arrays.asList(COMBINEGVCFS_TEST_DIR+"NA12878.AS.chr20snippet.g.vcf", COMBINEGVCFS_TEST_DIR+"NA12892.AS.chr20snippet.g.vcf"), - new SimpleInterval("20", 10433000, 10700000), + new ArrayList(Arrays.asList(new SimpleInterval("20", 10433000, 10700000))), b37_reference_20_21, new String[]{"-G", "StandardAnnotation", "-G", "AS_StandardAnnotation"}); } @@ -184,6 +298,11 @@ public void testGenomicsDBImportFileInputsInBatches(final int batchSize) throws testGenomicsDBImporterWithBatchSize(LOCAL_GVCFS, INTERVAL, COMBINED, batchSize); } + @Test(dataProvider = "batchSizes") + public void testGenomicsDBImportFileInputsInBatchesWithMultipleIntervals(final int batchSize) throws IOException { + testGenomicsDBImporterWithBatchSize(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, batchSize); + } + @Test(groups = {"bucket"}, dataProvider = "batchSizes") public void testGenomicsDBImportGCSInputsInBatches(final int batchSize) throws IOException { testGenomicsDBImporterWithBatchSize(resolveLargeFilesAsCloudURIs(LOCAL_GVCFS), INTERVAL, COMBINED, batchSize); @@ -225,36 +344,40 @@ public void testZeroVCFBufferSize() throws IOException { } - private void testGenomicsDBImporter(final List vcfInputs, final SimpleInterval interval, final String expectedCombinedVCF, final String referenceFile, final boolean testAll) throws IOException { + private void testGenomicsDBImporter(final List vcfInputs, final List intervals, + final String expectedCombinedVCF, final String referenceFile, + final boolean testAll) throws IOException { final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; - writeToGenomicsDB(vcfInputs, interval, workspace, 0, false, 0, 1); + writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1); checkJSONFilesAreWritten(workspace); - checkGenomicsDBAgainstExpected(workspace, interval, expectedCombinedVCF, referenceFile, testAll); + checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll); } - private void testGenomicsDBImporterWithBatchSize(final List vcfInputs, final SimpleInterval interval, final String expectedCombinedVCF, final int batchSize) throws IOException { + private void testGenomicsDBImporterWithBatchSize(final List vcfInputs, final List intervals, + final String expectedCombinedVCF, final int batchSize) throws IOException { final String workspace = createTempDir("genomicsdb-batchsize-tests-").getAbsolutePath() + "/workspace-" + batchSize; - writeToGenomicsDB(vcfInputs, interval, workspace, batchSize, false, 0, 1); + writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, false, 0, 1); checkJSONFilesAreWritten(workspace); - checkGenomicsDBAgainstExpected(workspace, interval, expectedCombinedVCF, b38_reference_20_21, true); + checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true); } - private void testGenomicsDBImportWithZeroBufferSize(final List vcfInputs, final SimpleInterval interval, final String expectedCombinedVCF) throws IOException { + private void testGenomicsDBImportWithZeroBufferSize(final List vcfInputs, final List intervals, + final String expectedCombinedVCF) throws IOException { final String workspace = createTempDir("genomicsdb-buffersize-tests-").getAbsolutePath() + "/workspace"; - writeToGenomicsDB(vcfInputs, interval, workspace, 0, true, 0, 1); + writeToGenomicsDB(vcfInputs, intervals, workspace, 0, true, 0, 1); checkJSONFilesAreWritten(workspace); - checkGenomicsDBAgainstExpected(workspace, interval, expectedCombinedVCF, b38_reference_20_21, true); + checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true); } - private void writeToGenomicsDB(final List vcfInputs, final SimpleInterval interval, final String workspace, + private void writeToGenomicsDB(final List vcfInputs, final List intervals, final String workspace, final int batchSize, final Boolean useBufferSize, final int bufferSizePerSample, int threads) { final ArgumentsBuilder args = new ArgumentsBuilder(); args.addArgument(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); - args.addArgument("L", IntervalUtils.locatableToString(interval)); + intervals.forEach(interval -> args.addArgument("L", IntervalUtils.locatableToString(interval))); vcfInputs.forEach(vcf -> args.addArgument("V", vcf)); args.addArgument("batch-size", String.valueOf(batchSize)); args.addArgument(GenomicsDBImport.VCF_INITIALIZER_THREADS_LONG_NAME, String.valueOf(threads)); @@ -270,40 +393,59 @@ private static void checkJSONFilesAreWritten(final String workspace) { Assert.assertTrue(new File(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME).exists()); } - private static void checkGenomicsDBAgainstExpected(final String workspace, final SimpleInterval interval, final String expectedCombinedVCF, final String referenceFile, final boolean testAll) throws IOException { + private static void checkGenomicsDBAgainstExpected(final String workspace, final List intervals, + final String expectedCombinedVCF, final String referenceFile, + final boolean testAll) throws IOException { + checkGenomicsDBAgainstExpected(workspace, intervals, + expectedCombinedVCF, referenceFile, + testAll, + false, + false); + } + + private static void checkGenomicsDBAgainstExpected(final String workspace, final List intervals, + final String expectedCombinedVCF, final String referenceFile, + final boolean testAll, + final boolean produceGTField, + final boolean sitesOnlyQuery) throws IOException { final GenomicsDBFeatureReader genomicsDBFeatureReader = - getGenomicsDBFeatureReader(workspace, referenceFile, !testAll); + getGenomicsDBFeatureReader(workspace, referenceFile, produceGTField, sitesOnlyQuery); final AbstractFeatureReader combinedVCFReader = AbstractFeatureReader.getFeatureReader(expectedCombinedVCF, new VCFCodec(), true); - try (CloseableTribbleIterator actualVcs = - genomicsDBFeatureReader.query(interval.getContig(), interval.getStart(), interval.getEnd()); - - CloseableTribbleIterator expectedVcs = - combinedVCFReader.query(interval.getContig(), interval.getStart(), interval.getEnd())) { - - BaseTest.assertCondition(actualVcs, expectedVcs, (a, e) -> { - // Test that the VCs match - if (testAll) { - // To correct a discrepancy between genotypeGVCFs which outputs empty genotypes as "./." and GenomicsDB - // which returns them as "." we simply remap the empty ones to be consistent for comparison - List genotypes = a.getGenotypes().stream() - .map(g -> g.getGenotypeString().equals(".")?new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(2)).make():g) - .collect(Collectors.toList()); - a = new VariantContextBuilder(a).genotypes(genotypes).make(); - VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, Collections.emptyList(), VCF_HEADER); - - // Test only that the genotypes match - } else { - List genotypes = e.getGenotypes().stream() - .map(g -> g.getGenotypeString().equals(".")?new GenotypeBuilder(g).alleles(Collections.emptyList()).make():g) - .collect(Collectors.toList()); - e = new VariantContextBuilder(e).genotypes(genotypes).make(); - VariantContextTestUtils.assertVariantContextsHaveSameGenotypes(a, e); - } - }); - } + + intervals.forEach(interval -> { + try (CloseableTribbleIterator actualVcs = + genomicsDBFeatureReader.query(interval.getContig(), interval.getStart(), interval.getEnd()); + + CloseableTribbleIterator expectedVcs = + combinedVCFReader.query(interval.getContig(), interval.getStart(), interval.getEnd())) { + + BaseTest.assertCondition(actualVcs, expectedVcs, (a, e) -> { + // Test that the VCs match + if (testAll) { + // To correct a discrepancy between genotypeGVCFs which outputs empty genotypes as "./." and GenomicsDB + // which returns them as "." we simply remap the empty ones to be consistent for comparison + List genotypes = a.getGenotypes().stream() + .map(g -> g.getGenotypeString().equals(".")?new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(2)).make():g) + .collect(Collectors.toList()); + a = new VariantContextBuilder(a).genotypes(genotypes).make(); + VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, Collections.emptyList(), VCF_HEADER); + + // Test only that the genotypes match + } else { + List genotypes = e.getGenotypes().stream() + .map(g -> g.getGenotypeString().equals(".")?new GenotypeBuilder(g).alleles(Collections.emptyList()).make():g) + .collect(Collectors.toList()); + e = new VariantContextBuilder(e).genotypes(genotypes).make(); + VariantContextTestUtils.assertVariantContextsHaveSameGenotypes(a, e); + } + }); + } catch (IOException e) { + Assert.fail(e.getMessage(), e); + } + }); } @DataProvider @@ -360,7 +502,8 @@ public void testSampleNameWithSpaces() throws IOException { ArgumentsBuilder args = new ArgumentsBuilder() .addArgument(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(2)) - .addFileArgument(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, outOfOrderSampleMap).addArgument("L", IntervalUtils.locatableToString(SMALLER_INTERVAL)) + .addFileArgument(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, outOfOrderSampleMap) + .addArgument("L", IntervalUtils.locatableToString(SMALLER_INTERVAL.get(0))) .addArgument(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); runCommandLine(args); @@ -372,7 +515,7 @@ public void testSampleNameWithSpaces() throws IOException { public void testSampleNameOrdering(final ArgumentsBuilder args) throws IOException { final String workspace = createTempDir("gendbtest").getAbsolutePath() + "/workspace"; - args.addArgument("L", IntervalUtils.locatableToString(INTERVAL)) + args.addArgument("L", IntervalUtils.locatableToString(INTERVAL.get(0))) .addArgument(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, workspace); runCommandLine(args); @@ -464,7 +607,7 @@ public void testRenamingSamples(final Map renamingMap, final int .addArgument(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, new File(workspace).getAbsolutePath()) .addArgument(GenomicsDBImport.VCF_INITIALIZER_THREADS_LONG_NAME, String.valueOf(threads)) .addArgument(GenomicsDBImport.BATCHSIZE_ARG_LONG_NAME, String.valueOf(batchSize)) - .addArgument("L", IntervalUtils.locatableToString(INTERVAL)); + .addArgument("L", IntervalUtils.locatableToString(INTERVAL.get(0))); runCommandLine(args); final Set expectedSampleNames = sampleMap.keySet(); @@ -505,7 +648,7 @@ private static File createInputVCF(final String sampleName) { final Allele Aref = Allele.create("A", true); final Allele C = Allele.create("C"); final List alleles = Arrays.asList(Aref, C); - final VariantContext variant = new VariantContextBuilder("invented", contig, INTERVAL.getStart(), INTERVAL.getStart(), alleles) + final VariantContext variant = new VariantContextBuilder("invented", contig, INTERVAL.get(0).getStart(), INTERVAL.get(0).getStart(), alleles) .genotypes(new GenotypeBuilder(sampleName, alleles).attribute(SAMPLE_NAME_KEY, sampleName) .attribute(ANOTHER_ATTRIBUTE_KEY, 10).make()) .make(); @@ -520,7 +663,7 @@ public void testCantSpecifyVCFAndSampleNameFile(){ .addArgument(GenomicsDBImport.SAMPLE_NAME_MAP_LONG_NAME, createInOrderSampleMap().getAbsolutePath()) .addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, HG_00096) .addArgument(GenomicsDBImport.WORKSPACE_ARG_LONG_NAME, createTempDir("workspace").getAbsolutePath()) - .addArgument("L", IntervalUtils.locatableToString(INTERVAL)); + .addArgument("L", IntervalUtils.locatableToString(INTERVAL.get(0))); runCommandLine(args); } @@ -572,9 +715,9 @@ public void testCommandIncludedInOutputHeader() throws IOException { @Test public void testPreserveContigOrderingInHeader() throws IOException { final String workspace = createTempDir("testPreserveContigOrderingInHeader-").getAbsolutePath() + "/workspace"; - - writeToGenomicsDB(Arrays.asList(GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting1.g.vcf", GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting2.g.vcf"), - new SimpleInterval("chr20", 17959479, 17959479), workspace, 0, false, 0, 1); + ArrayList intervals = new ArrayList(Arrays.asList(new SimpleInterval("chr20", 17959479, 17959479))); + writeToGenomicsDB(Arrays.asList(GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting1.g.vcf", + GENOMICSDB_TEST_DIR + "testHeaderContigLineSorting2.g.vcf"), intervals, workspace, 0, false, 0, 1); try ( final GenomicsDBFeatureReader genomicsDBFeatureReader = getGenomicsDBFeatureReader(workspace, b38_reference_20_21); @@ -589,53 +732,33 @@ public void testPreserveContigOrderingInHeader() throws IOException { } } + private static GenomicsDBFeatureReader getGenomicsDBFeatureReader( + final String workspace, final String reference, + final boolean produceGTField) throws IOException { + return getGenomicsDBFeatureReader(workspace, reference, + produceGTField, false); + } - private static String getQueryJsonForGenomicsDB(String vidMappingFile, String callsetMappingFile, String tiledbWorkspace, - String referenceGenome, boolean produceGTField) throws IOException { - //Produce temporary JSON query config file - String indentString = " "; - String queryJSON = "{\n"; - queryJSON += indentString + "\"scan_full\": true,\n"; - queryJSON += indentString + "\"workspace\": \""+tiledbWorkspace+"\",\n"; - queryJSON += indentString + "\"array\": \""+GenomicsDBConstants.DEFAULT_ARRAY_NAME+"\",\n"; - queryJSON += indentString + "\"vid_mapping_file\": \""+vidMappingFile+"\",\n"; - queryJSON += indentString + "\"callset_mapping_file\": \""+callsetMappingFile+"\",\n"; - queryJSON += indentString + "\"produce_GT_field\": true,\n"; - queryJSON += indentString + "\"reference_genome\": \""+referenceGenome+"\""; - queryJSON += "\n}\n"; - File tmpQueryJSONFile = File.createTempFile("queryJSON", ".json"); - tmpQueryJSONFile.deleteOnExit(); - FileWriter fptr = new FileWriter(tmpQueryJSONFile); - fptr.write(queryJSON); - fptr.close(); - return tmpQueryJSONFile.getAbsolutePath(); - } - //Produce temporary JSON query config file - - - private static GenomicsDBFeatureReader getGenomicsDBFeatureReader(final String workspace, final String reference, boolean produceGTField) throws IOException { - if (produceGTField) { - return new GenomicsDBFeatureReader<>( - "", - getQueryJsonForGenomicsDB(new File(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME).getAbsolutePath(), - new File(workspace, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME).getAbsolutePath(), - workspace, - reference, - produceGTField), - new BCF2Codec()); - } else { - return new GenomicsDBFeatureReader<>( - new File(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME).getAbsolutePath(), - new File(workspace, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME).getAbsolutePath(), - workspace, - GenomicsDBConstants.DEFAULT_ARRAY_NAME, - reference, - new File(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME).getAbsolutePath(), - new BCF2Codec()); - } + private static GenomicsDBFeatureReader getGenomicsDBFeatureReader( + final String workspace, final String reference, + final boolean produceGTField, + final boolean sitesOnlyQuery) throws IOException { + GenomicsDBExportConfiguration.ExportConfiguration exportConfiguration = GenomicsDBExportConfiguration.ExportConfiguration.newBuilder() + .setWorkspace(workspace) + .setReferenceGenome(reference) + .setVidMappingFile(new File(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME).getAbsolutePath()) + .setCallsetMappingFile(new File(workspace, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME).getAbsolutePath()) + .setVcfHeaderFilename(new File(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME).getAbsolutePath()) + .setProduceGTField(produceGTField) + .setSitesOnlyQuery(sitesOnlyQuery) + .setGenerateArrayNameFromPartitionBounds(true) + .build(); + + return new GenomicsDBFeatureReader<>(exportConfiguration, new BCF2Codec(), Optional.empty()); } - private static GenomicsDBFeatureReader getGenomicsDBFeatureReader(final String workspace, final String reference) throws IOException { + private static GenomicsDBFeatureReader getGenomicsDBFeatureReader( + final String workspace, final String reference) throws IOException { return getGenomicsDBFeatureReader(workspace, reference, false); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java index ffed41eb9b2..01bf7669c13 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportUnitTest.java @@ -1,6 +1,6 @@ package org.broadinstitute.hellbender.tools.genomicsdb; -import com.intel.genomicsdb.GenomicsDBImporter; +import com.intel.genomicsdb.importer.GenomicsDBImporter; import htsjdk.tribble.FeatureReader; import htsjdk.variant.variantcontext.VariantContext; import org.broadinstitute.hellbender.exceptions.UserException; @@ -101,11 +101,4 @@ public void testLoadSampleNameMapFileInSortedOrder(final String sampleMapText){ Assert.assertEquals(actual, expected); Assert.assertEquals(actual.keySet().iterator().next(), "Sample1"); } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testNullFeatureReadersToFail() { - final Map> sampleToReaderMap = new LinkedHashMap<>(); - sampleToReaderMap.put("Sample1", null); - GenomicsDBImporter.generateSortedCallSetMap(sampleToReaderMap, true, true, 0L); - } } diff --git a/src/test/resources/large/gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz b/src/test/resources/large/gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz new file mode 100644 index 00000000000..38343348e31 --- /dev/null +++ b/src/test/resources/large/gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ecd33f3c80e8568f52ed8ed45ecdcab74bf3d0c1317a6cb833199556173f80f +size 6086978 diff --git a/src/test/resources/large/gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz.tbi b/src/test/resources/large/gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz.tbi new file mode 100644 index 00000000000..4e7d4f94d4b --- /dev/null +++ b/src/test/resources/large/gvcfs/HG00096_after_combine_gvcfs.g.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:517931164c1057c3846cde0ae891d1c852078d040d02f4fc82486fa3adc687cb +size 5955 diff --git a/src/test/resources/large/gvcfs/HG00268_after_combine_gvcfs.g.vcf.gz b/src/test/resources/large/gvcfs/HG00268_after_combine_gvcfs.g.vcf.gz new file mode 100644 index 00000000000..8d7e5bcb6c6 --- /dev/null +++ b/src/test/resources/large/gvcfs/HG00268_after_combine_gvcfs.g.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa34cade7d948d70059fe356763e9769d27a6cc543e06c5e5d723ff4b56f75b6 +size 4979805 diff --git a/src/test/resources/large/gvcfs/HG00268_after_combine_gvcfs.g.vcf.gz.tbi b/src/test/resources/large/gvcfs/HG00268_after_combine_gvcfs.g.vcf.gz.tbi new file mode 100644 index 00000000000..6468581307e --- /dev/null +++ b/src/test/resources/large/gvcfs/HG00268_after_combine_gvcfs.g.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:359e1d97343d6f0de7fa89d2a3d1991b0861e564b3991d7fff5bb472dce75835 +size 5838 diff --git a/src/test/resources/large/gvcfs/NA19625_after_combine_gvcfs.g.vcf.gz b/src/test/resources/large/gvcfs/NA19625_after_combine_gvcfs.g.vcf.gz new file mode 100644 index 00000000000..86c98f89963 --- /dev/null +++ b/src/test/resources/large/gvcfs/NA19625_after_combine_gvcfs.g.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f63abaa3efd43fe0f777b4f3d2cc5d51c8a8fb9ef9f02a5880c2ceca381ffec +size 19666052 diff --git a/src/test/resources/large/gvcfs/NA19625_after_combine_gvcfs.g.vcf.gz.tbi b/src/test/resources/large/gvcfs/NA19625_after_combine_gvcfs.g.vcf.gz.tbi new file mode 100644 index 00000000000..2a4d14e7248 --- /dev/null +++ b/src/test/resources/large/gvcfs/NA19625_after_combine_gvcfs.g.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f49e647fdd014766fcacade906835efa96f36aa8224cb3bb0dac85e8e30349a +size 6092 diff --git a/src/test/resources/large/gvcfs/combined.gatk3.7_sites_only.g.vcf.gz b/src/test/resources/large/gvcfs/combined.gatk3.7_sites_only.g.vcf.gz new file mode 100644 index 00000000000..e70fe44756b --- /dev/null +++ b/src/test/resources/large/gvcfs/combined.gatk3.7_sites_only.g.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a25a984f07aae0707b3ab7792b580b2feb6d9579c7f5e5de120fbc408aad17ce +size 7716766 diff --git a/src/test/resources/large/gvcfs/combined.gatk3.7_sites_only.g.vcf.gz.tbi b/src/test/resources/large/gvcfs/combined.gatk3.7_sites_only.g.vcf.gz.tbi new file mode 100644 index 00000000000..e6c1ba86249 --- /dev/null +++ b/src/test/resources/large/gvcfs/combined.gatk3.7_sites_only.g.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc73c8d14ca6a9cfdcaec649f1680a76c0361a0db8f2e66af00fe8ed2876fb9a +size 2644 diff --git a/src/test/resources/large/gvcfs/combined_multi_interval.gatk3.7.g.vcf.gz b/src/test/resources/large/gvcfs/combined_multi_interval.gatk3.7.g.vcf.gz new file mode 100644 index 00000000000..33b400d1cc3 --- /dev/null +++ b/src/test/resources/large/gvcfs/combined_multi_interval.gatk3.7.g.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63cd5b9ce2552bb6b9fb2afb447ab168d4ea3b3684c8f7d224f1048e8bfb8cc6 +size 21583684 diff --git a/src/test/resources/large/gvcfs/combined_multi_interval.gatk3.7.g.vcf.gz.tbi b/src/test/resources/large/gvcfs/combined_multi_interval.gatk3.7.g.vcf.gz.tbi new file mode 100644 index 00000000000..552401d932a --- /dev/null +++ b/src/test/resources/large/gvcfs/combined_multi_interval.gatk3.7.g.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:740245ac4c18fa72e3ef88484a2a91543530afb8a4f2b40c2aafc9c7797bc92c +size 4035 diff --git a/src/test/resources/large/gvcfs/combined_with_genotypes.g.vcf.gz b/src/test/resources/large/gvcfs/combined_with_genotypes.g.vcf.gz new file mode 100644 index 00000000000..5009e4fb157 --- /dev/null +++ b/src/test/resources/large/gvcfs/combined_with_genotypes.g.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e73fd6716aaf12f9ec8007bdb197fe187d826a4444dc180749969a0f4042173f +size 13135087 diff --git a/src/test/resources/large/gvcfs/combined_with_genotypes.g.vcf.gz.tbi b/src/test/resources/large/gvcfs/combined_with_genotypes.g.vcf.gz.tbi new file mode 100644 index 00000000000..a8078927f2d --- /dev/null +++ b/src/test/resources/large/gvcfs/combined_with_genotypes.g.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9894e9c08e621a9a6dc531b978113b976dc6de466e6897ba893ae960f3e68ccc +size 2692