Skip to content

Commit

Permalink
Add bundle support for references.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Sep 3, 2024
1 parent c2b6beb commit 6dcdeff
Show file tree
Hide file tree
Showing 11 changed files with 546 additions and 141 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ private StandardArgumentDefinitions(){}
public static final String INTERVALS_SHORT_NAME = "L";
public static final String COMPARISON_SHORT_NAME = "comp";
public static final String READ_INDEX_SHORT_NAME = READ_INDEX_LONG_NAME;
public static final String PRIMARY_INPUT_LONG_NAME = "primary";
public static final String PRIMARY_INPUT_SHORT_NAME = "PI";
public static final String SECONDARY_INPUT_LONG_NAME = "secondaryI";
public static final String SECONDARY_INPUT_SHORT_NAME = "SI";
public static final String PRIMARY_RESOURCE_LONG_NAME = "primary-resource";
public static final String PRIMARY_RESOURCE_SHORT_NAME = "PR";
public static final String SECONDARY_RESOURCE_LONG_NAME = "secondary-resource";
public static final String SECONDARY_RESOURCE_SHORT_NAME = "SR";
public static final String LENIENT_SHORT_NAME = "LE";
public static final String READ_VALIDATION_STRINGENCY_SHORT_NAME = "VS";
public static final String SAMPLE_ALIAS_SHORT_NAME = "ALIAS";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ public final class FeatureInput<T extends Feature> extends GATKPath implements S
private transient Class<FeatureCodec<T, ?>> featureCodecClass;

/**
* retain any containing bundle in case we need to extract other resources from it
* retain the parent (enclosing) bundle from which this feature input is derived, in case we need to extract
* other resources from it
*/
private Bundle parentBundle;

Expand Down Expand Up @@ -148,7 +149,7 @@ public FeatureInput(
final Bundle featureBundle,
final String name) {
super(primaryResourcePath);
// retain the containing bundle for later so we can interrogate it for other resources, like the index
// retain the enclosing bundle for later, so we can interrogate it for other resources such as the index
this.parentBundle = featureBundle;
if (name != null) {
if (primaryResourcePath.getTag() != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,17 @@ protected void initializeDrivingVariants() {
final List<Bundle> bundles = BundleJSON.toBundleList(IOUtils.getStringFromPath(gatkPath), GATKPath::new);
for (final Bundle bundle : bundles) {
if (bundle.getPrimaryContentType().equals(BundleResourceType.CT_VARIANT_CONTEXTS)) {
// use the bundle primary resource as the FeatureInput URI, and tear off and attach the
// individual bundle the bundle to the FI as the parent bundle so downstream code can
// extract other resources from it on demand
// note that if the original value from the user has a tag, we can't use it unless there
// is only one input, since FIs have to be unique
// use the bundle primary resource as the FeatureInput URI, and tear off and attach
// the enclosing bundle as the parent bundle for the FI so downstream code can extract
// other resources from it on demand. note that if the original value from the user has
// a tag, we can't propagate it unless there is only one input, since FIs have to be
// unique
final FeatureInput<VariantContext> bundleFI = new FeatureInput<>(
new GATKPath(bundle.getPrimaryResource().getIOPath().get().getURIString()),
bundle,
bundles.size() > 1 ? gatkPath.getTag() : "drivingVariants"
bundles.size() > 1 ?
gatkPath.getTag() :
"drivingVariants"
);
if (drivingVariantsFeatureInputs.contains(bundleFI)) {
throw new UserException.BadInput("Feature inputs must be unique: " + gatkPath);
Expand Down
330 changes: 266 additions & 64 deletions src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
package org.broadinstitute.hellbender.utils.fasta;

import htsjdk.beta.io.bundle.Bundle;
import htsjdk.beta.io.bundle.BundleJSON;
import htsjdk.beta.plugin.IOUtils;
import htsjdk.io.IOPath;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
Expand Down Expand Up @@ -143,14 +147,24 @@ public CachingIndexedFastaSequenceFile(final Path fasta, boolean preserveAmbigui
* @param preserveIUPAC If true, we will keep the IUPAC bases in the FASTA, otherwise they are converted to Ns
*/
public CachingIndexedFastaSequenceFile(final Path fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) {
// Check the FASTA path:
checkFastaPath(fasta);
Utils.validate(cacheSize > 0, () -> "Cache size must be > 0 but was " + cacheSize);

// Read reference data by creating an IndexedFastaSequenceFile.
try {
final ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fasta, true, true);
sequenceFile = requireIndex(fasta, referenceSequenceFile);
final IOPath fastaPath = new GATKPath(fasta.toUri().toString());
if (fastaPath.hasExtension(BundleJSON.BUNDLE_EXTENSION)) {
final Bundle referenceBundle = BundleJSON.toBundle(IOUtils.getStringFromPath(fastaPath), GATKPath::new);
final ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFileFromBundle(
referenceBundle,
true,
true);
sequenceFile = requireIndex(fasta, referenceSequenceFile);
} else {
// Check the FASTA path:
checkFastaPath(fasta);
final ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fasta, GATKPath::new, true, true);
sequenceFile = requireIndex(fasta, referenceSequenceFile);
}
this.cacheSize = cacheSize;
this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
this.preserveCase = preserveCase;
Expand All @@ -159,9 +173,6 @@ public CachingIndexedFastaSequenceFile(final Path fasta, final long cacheSize, f
catch (final IllegalArgumentException e) {
throw new UserException.CouldNotReadInputFile(fasta, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
}
catch (final Exception e) {
throw new UserException.CouldNotReadInputFile(fasta, e);
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.beta.io.IOPathUtils;
import htsjdk.beta.io.bundle.Bundle;
import htsjdk.beta.io.bundle.BundleJSON;
import htsjdk.io.IOPath;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.IOException;

/**
 * Integration test that checks a previously serialized bundle JSON file can still be parsed,
 * written back out, and read in again without changing its contents.
 */
public class BundleSupportIntegrationTest extends GATKBaseTest {

    // this test uses a serialized bundle file to ensure that we don't unintentionally pick up any
    // code (like, from htsjdk) that introduces backward compatibility issues
    @Test
    public void testReadWriteSerializedReferenceBundle() throws IOException {
        // This test file contains absolute paths to files on a local dev machine, so it shouldn't really be used
        // for anything other than this test, since the absolute paths are unlikely to work on any other machine.
        // But here we just want to make sure we can consume and roundtrip it without error
        final IOPath testBundleFilePath = new GATKPath("src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json");

        // get our test bundle from the file (ensure we can parse it), then write it out to a temp file, read it
        // back in from that temp file, and compare
        final Bundle testBundle = BundleJSON.toBundle(IOPathUtils.getStringFromPath(testBundleFilePath));
        final IOPath roundTrippedBundleFilePath = new GATKPath(
                createTempPath("testReadWriteSerializedReferenceBundle", ".json").toString());
        IOPathUtils.writeStringToPath(roundTrippedBundleFilePath, BundleJSON.toJSON(testBundle));

        // read from the file we just wrote (not the original input); otherwise the serialized output is never
        // actually exercised and the comparison below is trivially true
        final Bundle roundTrippedBundle = BundleJSON.toBundle(IOPathUtils.getStringFromPath(roundTrippedBundleFilePath));
        Assert.assertTrue(Bundle.equalsIgnoreOrder(roundTrippedBundle, testBundle));
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,7 @@ private static IOPath createRemoteBundleForFile(
final File index1,
final File vcf2,
final File index2) throws IOException {
//TODO: replace this path with getGCPTestStaging()
final String remotePath = BucketUtils.randomRemotePath("gs://hellbender/test/staging/remoteBundles", "remote_bundle_test", "dir");
final String remotePath = BucketUtils.randomRemotePath(getGCPTestStaging() + "remoteBundles", "remote_bundle_test", "dir");
final Path remoteDirPath = IOUtils.getPath(remotePath + "/");

Files.createDirectory(remoteDirPath);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.beta.io.IOPathUtils;
import htsjdk.beta.io.bundle.Bundle;
import htsjdk.beta.io.bundle.BundleJSON;
import htsjdk.beta.plugin.registry.HaploidReferenceResolver;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
Expand All @@ -9,6 +13,7 @@
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadsDataSource;
import org.broadinstitute.hellbender.engine.ReadsPathDataSource;
import org.broadinstitute.hellbender.engine.filters.ReadLengthReadFilter;
Expand All @@ -19,6 +24,7 @@
import org.broadinstitute.hellbender.testutils.SamAssertionUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
Expand All @@ -37,14 +43,20 @@ public void doFileToFile(String fileIn, String extOut, String reference, boolean
String samFile = fileIn;
final File outFile = GATKBaseTest.createTempFile(samFile + ".", extOut);
final File ORIG_BAM = new File(TEST_DATA_DIR, samFile);
final File refFile;
final GATKPath refFile;

final ArrayList<String> args = new ArrayList<>();
args.add("--input"); args.add(ORIG_BAM.getAbsolutePath());
args.add("--output"); args.add(outFile.getAbsolutePath());
if (reference != null) {
refFile = new File(TEST_DATA_DIR, reference);
args.add("-R"); args.add(refFile.getAbsolutePath());
if (reference.endsWith(BundleJSON.BUNDLE_EXTENSION)) {
// the test json files are temporary files, not files in TEST_DATA_DIR
refFile = new GATKPath(reference);
args.add("-R"); args.add(reference);
} else {
refFile = new GATKPath(new File(TEST_DATA_DIR, reference).getAbsolutePath());
args.add("-R"); args.add(refFile.toString());
}
}
else {
refFile = null;
Expand All @@ -55,13 +67,33 @@ public void doFileToFile(String fileIn, String extOut, String reference, boolean
}
runCommandLine(args);

SamAssertionUtils.assertSamsEqual(outFile, ORIG_BAM, refFile);
SamAssertionUtils.assertSamsEqual(outFile, ORIG_BAM, refFile == null ? null : refFile.toPath().toFile());

if (testMD5) {
checkMD5asExpected(outFile);
}
}

/**
 * Runs the same checks as {@code doFileToFile}, but supplies the reference through a bundle JSON file
 * rather than directly as a fasta path.
 *
 * @param fileIn input file name, passed through to {@code doFileToFile}
 * @param extOut output file extension, passed through to {@code doFileToFile}
 * @param reference reference file name relative to {@code TEST_DATA_DIR}, or null for no reference
 * @param testMD5 passed through to {@code doFileToFile}
 */
public void doFileToFileUsingReferenceBundle(String fileIn, String extOut, String reference, boolean testMD5) throws Exception {
    if (reference == null) {
        // nothing to wrap in a bundle; run the regular test with the (null) reference as-is
        doFileToFile(fileIn, extOut, reference, testMD5);
        return;
    }

    // build the reference bundle from the fasta, letting the resolver infer the sibling files
    final File fastaFile = new File(TEST_DATA_DIR, reference);
    final GATKPath fastaPath = new GATKPath(fastaFile.toPath().toString());
    final Bundle refBundle = HaploidReferenceResolver.referenceBundleFromFastaPath(fastaPath, GATKPath::new);

    // serialize the bundle to a temporary .json file
    final File bundleFile = IOUtils.createTempFile("printReadsRefBundle", ".json");
    final GATKPath bundlePath = new GATKPath(bundleFile.getAbsolutePath());
    IOPathUtils.writeStringToPath(bundlePath, BundleJSON.toJSON(refBundle));

    // now run the regular test, but using the reference bundle in place of the fasta
    doFileToFile(fileIn, extOut, bundlePath.toString(), testMD5);
}

private void checkMD5asExpected(final File outFile) throws IOException {
final File md5File = new File(outFile.getAbsolutePath() + ".md5");
if (md5File.exists()) {
Expand All @@ -74,8 +106,8 @@ private void checkMD5asExpected(final File outFile) throws IOException {
}

@Test(dataProvider="testingData")
public void testFileToFile(String fileIn, String extOut, String reference) throws Exception {
doFileToFile(fileIn, extOut, reference, false);
public void testFileToFileWithReferenceBundle(String fileIn, String extOut, String reference) throws Exception {
doFileToFileUsingReferenceBundle(fileIn, extOut, reference, false);
}

@DataProvider(name="testingData")
Expand Down Expand Up @@ -120,6 +152,18 @@ public Object[][] testingData() {
};
}

/**
 * Runs the file-to-file test with the reference supplied via a reference bundle. Cases from the
 * data provider that have no reference are skipped, since there is nothing to bundle.
 */
@Test(dataProvider="testingData")
public void testFileToFileUsingReferenceBundle(String fileIn, String extOut, String reference) throws Exception {
    if (reference == null) {
        // no reference for this case, so there is no bundle to exercise
        return;
    }
    doFileToFileUsingReferenceBundle(fileIn, extOut, reference, false);
}

/**
 * Runs the file-to-file test for each case from the "testingData" provider, with MD5 checking disabled.
 */
@Test(dataProvider="testingData")
public void testFileToFile(final String fileIn, final String extOut, final String reference) throws Exception {
    final boolean testMD5 = false;
    doFileToFile(fileIn, extOut, reference, testMD5);
}

@Test
public void testReadThatConsumesNoReferenceBases() throws IOException {
final File zeroRefBasesReadBam = new File(TEST_DATA_DIR, "read_consumes_zero_ref_bases.bam");
Expand Down
Loading

0 comments on commit 6dcdeff

Please sign in to comment.