Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GenomicsDBImport: Modifications in order to address GATK issue #3269 #4645

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ final barclayVersion = System.getProperty('barclay.version','2.1.0')
final sparkVersion = System.getProperty('spark.version', '2.2.0')
final hadoopVersion = System.getProperty('hadoop.version', '2.8.2')
final hadoopBamVersion = System.getProperty('hadoopBam.version','7.10.0')
final genomicsdbVersion = System.getProperty('genomicsdb.version','0.9.2-proto-3.0.0-beta-1+uuid-static')
final tensorflowVersion = System.getProperty('tensorflow.version','1.4.0')
final genomicsdbVersion = System.getProperty('genomicsdb.version','0.9.2-proto-3.0.0-beta-1+b825ffa6eb47a')
final testNGVersion = '6.11'
// Using the shaded version to avoid conflicts between its protobuf dependency
// and that of Hadoop/Spark (either the one we reference explicitly, or the one
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package org.broadinstitute.hellbender.engine;

import com.intel.genomicsdb.GenomicsDBFeatureReader;
import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.tribble.*;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand All @@ -22,9 +24,12 @@
import java.io.File;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;

/**
Expand Down Expand Up @@ -378,26 +383,60 @@ private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final St
IOUtils.canReadFile(callsetJson);
IOUtils.canReadFile(vidmapJson);
IOUtils.canReadFile(vcfHeader);
}
catch ( UserException.CouldNotReadInputFile e ) {
} catch ( UserException.CouldNotReadInputFile e ) {
throw new UserException("Couldn't connect to GenomicsDB because the vidmap, callset JSON files, or gVCF Header (" +
GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME + "," + GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME + "," +
GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME + ") could not be read from GenomicsDB workspace " + workspace.getAbsolutePath(), e);
}

final GenomicsDBExportConfiguration.ExportConfiguration exportConfigurationBuilder =
createExportConfiguration(reference, workspace, callsetJson, vidmapJson, vcfHeader);

try {
return new GenomicsDBFeatureReader<>(vidmapJson.getAbsolutePath(),
callsetJson.getAbsolutePath(),
workspace.getAbsolutePath(),
GenomicsDBConstants.DEFAULT_ARRAY_NAME,
reference.getAbsolutePath(),
vcfHeader.getAbsolutePath(),
new BCF2Codec());
return new GenomicsDBFeatureReader<>(exportConfigurationBuilder, new BCF2Codec(), Optional.empty());
} catch (final IOException e) {
throw new UserException("Couldn't create GenomicsDBFeatureReader", e);
}
}

private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
final File callsetJson, final File vidmapJson,
final File vcfHeader) {
GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder =
GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
.setWorkspace(workspace.getAbsolutePath())
.setReferenceGenome(reference.getAbsolutePath())
.setVidMappingFile(vidmapJson.getAbsolutePath())
.setCallsetMappingFile(callsetJson.getAbsolutePath())
.setVcfHeaderFilename(vcfHeader.getAbsolutePath())
.setProduceGTField(false)
.setProduceGTWithMinPLValueForSpanningDeletions(false)
.setSitesOnlyQuery(false)
.setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();

// For the multi-interval support, we create multiple arrays (directories) in a single workspace -
// one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]),
// you end up with 2 directories named chr1$1$100M and chr2$1$100M. So, the array names depend on the
// partition bounds.

// During the read phase, the user only supplies the workspace. The array names are obtained by scanning
// the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2",
// 50, 50M), then only the second array is queried.

// In the previous version of the tool, the array name was a constant - genomicsdb_array. The new version
// will be backward compatible with respect to reads. Hence, if a directory named genomicsdb_array is found,
// the array name is passed to the GenomicsDBFeatureReader otherwise the array names are generated from the
// directory entries.
if (Files.exists(arrayFolder)) {
exportConfigurationBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME);
} else {
exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true);
}

return exportConfigurationBuilder.build();
}

/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ public final class GenomicsDBConstants {
public static final String DEFAULT_CALLSETMAP_FILE_NAME = "callset.json";
public static final String DEFAULT_VCFHEADER_FILE_NAME = "vcfheader.vcf";


/**
* Don't instantiate a utility class
*/
Expand Down
Loading