GenomicsDB: new version with multi-interval import, new protobuf API, sites-only query support, and bug fixes (#4645)

This PR makes the changes required to use the latest version of GenomicsDB, which exposes new functionality such as:

* Multi-interval import and query support:
    * We create multiple arrays (directories) in a single workspace, one per interval. So, if you import the intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]), you end up with two directories/arrays in the workspace named chr1$1$100M and chr2$1$100M. The array names depend on the partition bounds (see the sketch after this list).
    * During the read phase, the user supplies only the workspace. The array names are obtained by scanning the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2", [ 50, 50M ]), then only the second array is queried. In the previous version of the tool, the array name was the constant genomicsdb_array. The new version is backward compatible with respect to reads: if a directory named genomicsdb_array is found in the workspace directory, it is passed as the array for the GenomicsDBFeatureReader; otherwise, the array names are generated from the directory entry names.
    * Import is parallelized across chromosome intervals. The number of threads to use can be specified as an integer argument to the executeImport call; if no argument is specified, the number of threads is determined by Java's ForkJoinPool (typically equal to the number of cores in the system).
    * The maximum number of intervals to import in parallel can be controlled by the command-line argument --max-num-intervals-to-import-in-parallel (default 1). Note that increasing parallelism increases the number of FeatureReaders opened to feed data to the importer: with N threads and a batch size of B, N*B feature readers will be open.
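A minimal Java sketch of the array-naming scheme described above; buildArrayName is a hypothetical helper written for illustration, not a GenomicsDB API method, and the interval values are placeholders:

```java
public final class ArrayNameSketch {
    // Hypothetical helper (not GenomicsDB API): mirrors the
    // "<contig>$<begin>$<end>" directory/array naming scheme described above.
    static String buildArrayName(final String contig, final long begin, final long end) {
        return contig + "$" + begin + "$" + end;
    }

    public static void main(final String[] args) {
        // The partition ("chr1", [ 1, 100M ]) maps to one directory/array:
        System.out.println(buildArrayName("chr1", 1L, 100_000_000L)); // chr1$1$100000000
        // A read of ("chr2", [ 50, 50M ]) only touches arrays whose partition
        // bounds overlap that interval, so the chr1 array is skipped.
        // With N import threads and batch size B, expect N*B open FeatureReaders.
    }
}
```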

* Protobuf-based API for import and read (#3688, #2687) - see the sketch after this list:
    * Option to produce the GT field
    * Option to produce GT for spanning deletions based on the min PL value
    * Doesn't support #4541 or #3689 yet; planned for the next version
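A hedged sketch of enabling these options through the new protobuf-based export configuration. The builder and setters are the same ones used in the diff further down; the wrapper class and workspace argument are placeholders for illustration:

```java
import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;

public final class ExportConfigSketch {
    // Sketch only: turns on the new GT options via the protobuf builder
    // (the same setters appear in createExportConfiguration below).
    public static GenomicsDBExportConfiguration.ExportConfiguration build(final String workspace) {
        return GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
                .setWorkspace(workspace)                              // path to a GenomicsDB workspace
                .setProduceGTField(true)                              // produce the GT field
                .setProduceGTWithMinPLValueForSpanningDeletions(true) // GT for spanning deletions from the min PL value
                .setSitesOnlyQuery(false)                             // true would return sites without genotypes
                .build();
    }
}
```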

* Bug fixes
    * Fix for #4716
    * More error messages
francares authored and droazen committed Jul 6, 2018
1 parent cee1d46 commit b34b189
Showing 19 changed files with 419 additions and 285 deletions.
build.gradle (2 changes: 1 addition & 1 deletion)
@@ -63,8 +63,8 @@ final barclayVersion = System.getProperty('barclay.version','2.1.0')
final sparkVersion = System.getProperty('spark.version', '2.2.0')
final hadoopVersion = System.getProperty('hadoop.version', '2.8.2')
final hadoopBamVersion = System.getProperty('hadoopBam.version','7.10.0')
-final genomicsdbVersion = System.getProperty('genomicsdb.version','0.9.2-proto-3.0.0-beta-1+uuid-static')
+final genomicsdbVersion = System.getProperty('genomicsdb.version','0.9.2-proto-3.0.0-beta-1+b825ffa6eb47a')
final tensorflowVersion = System.getProperty('tensorflow.version','1.4.0')
final testNGVersion = '6.11'
// Using the shaded version to avoid conflicts between its protobuf dependency
// and that of Hadoop/Spark (either the one we reference explicitly, or the one
@@ -1,11 +1,13 @@
package org.broadinstitute.hellbender.engine;

-import com.intel.genomicsdb.GenomicsDBFeatureReader;
+import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
+import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.*;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -23,9 +25,12 @@
import java.io.File;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
+import java.nio.file.Files;
import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
+import java.util.Optional;
import java.util.function.Function;

/**
@@ -379,26 +384,60 @@ private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final St
        IOUtils.canReadFile(callsetJson);
        IOUtils.canReadFile(vidmapJson);
        IOUtils.canReadFile(vcfHeader);
-        }
-        catch ( UserException.CouldNotReadInputFile e ) {
+        } catch ( UserException.CouldNotReadInputFile e ) {
            throw new UserException("Couldn't connect to GenomicsDB because the vidmap, callset JSON files, or gVCF Header (" +
                    GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME + "," + GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME + "," +
                    GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME + ") could not be read from GenomicsDB workspace " + workspace.getAbsolutePath(), e);
        }

+        final GenomicsDBExportConfiguration.ExportConfiguration exportConfigurationBuilder =
+                createExportConfiguration(reference, workspace, callsetJson, vidmapJson, vcfHeader);
+
        try {
-            return new GenomicsDBFeatureReader<>(vidmapJson.getAbsolutePath(),
-                                                 callsetJson.getAbsolutePath(),
-                                                 workspace.getAbsolutePath(),
-                                                 GenomicsDBConstants.DEFAULT_ARRAY_NAME,
-                                                 reference.getAbsolutePath(),
-                                                 vcfHeader.getAbsolutePath(),
-                                                 new BCF2Codec());
+            return new GenomicsDBFeatureReader<>(exportConfigurationBuilder, new BCF2Codec(), Optional.empty());
        } catch (final IOException e) {
            throw new UserException("Couldn't create GenomicsDBFeatureReader", e);
        }
    }

+    private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
+                                                                                               final File callsetJson, final File vidmapJson,
+                                                                                               final File vcfHeader) {
+        GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder =
+                GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
+                        .setWorkspace(workspace.getAbsolutePath())
+                        .setReferenceGenome(reference.getAbsolutePath())
+                        .setVidMappingFile(vidmapJson.getAbsolutePath())
+                        .setCallsetMappingFile(callsetJson.getAbsolutePath())
+                        .setVcfHeaderFilename(vcfHeader.getAbsolutePath())
+                        .setProduceGTField(false)
+                        .setProduceGTWithMinPLValueForSpanningDeletions(false)
+                        .setSitesOnlyQuery(false)
+                        .setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
+        Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();
+
+        // For the multi-interval support, we create multiple arrays (directories) in a single workspace -
+        // one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]),
+        // you end up with 2 directories named chr1$1$100M and chr2$1$100M. So, the array names depend on the
+        // partition bounds.
+
+        // During the read phase, the user only supplies the workspace. The array names are obtained by scanning
+        // the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2",
+        // [ 50, 50M ]), then only the second array is queried.
+
+        // In the previous version of the tool, the array name was a constant - genomicsdb_array. The new version
+        // will be backward compatible with respect to reads. Hence, if a directory named genomicsdb_array is found,
+        // the array name is passed to the GenomicsDBFeatureReader; otherwise the array names are generated from the
+        // directory entries.
+        if (Files.exists(arrayFolder)) {
+            exportConfigurationBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME);
+        } else {
+            exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true);
+        }
+
+        return exportConfigurationBuilder.build();
+    }

/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
@@ -9,7 +9,6 @@ public final class GenomicsDBConstants {
    public static final String DEFAULT_CALLSETMAP_FILE_NAME = "callset.json";
    public static final String DEFAULT_VCFHEADER_FILE_NAME = "vcfheader.vcf";

-
    /**
     * Don't instantiate a utility class
     */