Skip to content

Commit

Permalink
Support for flow based sequencing (#7876)
Browse files Browse the repository at this point in the history
Added Support for Ultima Genomics flow based sequencing data. The major new features are as follows:
- Added "FlowMode" argument to HaplotypeCaller which better supports flow based calling
       - Added a new Haplotype Filtering step after assembly which removes suspicious haplotypes from the genotyper
       - Added two new likelihoods models, FlowBasedHMM and the FlowBasedAlignmentLikelihoodEngine
- Added "FlowMode" to Mutect2 which better supports flow based calling
- Added support for uncertain read end-positions in MarkDuplicatesSpark
- Added a new tool FlowFeatureMapper for quick heuristic calling of bams for diagnostics
- Added a new tool GroundTruthReadsBuilder to generate ground truth files for Basecalling
- Added a new diagnostic tool HaplotypeBasedVariantRecaller for recalling VCF files using the HaplotypeCallerEngine
- Added a new tool breaking up CRAM files by their blocks, SplitCram

There are a number of code and technical changes as well
- Added a new read interface called `FlowBasedRead` that manages the new features for FlowBased data
- Added a number of flow-specific read filters
- Added a number of flow-specific variant annotations
- Added support for read annotation-clipping as part of ClipReads and GATKRead
- Added a new PartialReadsWalker that supports terminating before traversal is finished


Co-authored-by: James <emeryj@broadinstitute.org>
Co-authored-by: Megan Shand <mshand@broadinstitute.org>
Co-authored-by: Dror Kessler <dror27.kessler@gmail.com>
  • Loading branch information
4 people authored Jul 26, 2022
1 parent 8c348aa commit f1e7265
Show file tree
Hide file tree
Showing 388 changed files with 372,947 additions and 1,144 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.PedigreeAnnotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.flow.FlowAnnotatorBase;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.config.ConfigFactory;
import org.broadinstitute.hellbender.utils.config.GATKConfig;
Expand Down Expand Up @@ -81,6 +82,9 @@ public class GATKAnnotationPluginDescriptor extends CommandLinePluginDescriptor<
@Argument(fullName = StandardArgumentDefinitions.PEDIGREE_FILE_LONG_NAME, shortName = StandardArgumentDefinitions.PEDIGREE_FILE_SHORT_NAME, doc="Pedigree file for determining the population \"founders\"", optional=true)
private GATKPath pedigreeFile;

@Argument(fullName = StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS, doc = "flow order used for this annotations. [readGroup:]flowOrder", optional = true)
private List<String> flowOrder;

/**
* @return the class object for the base class of all plugins managed by this descriptor
*/
Expand Down Expand Up @@ -413,6 +417,23 @@ public void validateAndResolvePlugins() throws CommandLineException {
"founder-id",
allDiscoveredAnnotations.values().stream().filter(PedigreeAnnotation.class::isInstance).map(a -> a.getClass().getSimpleName()).collect(Collectors.joining(", "))));
}

// Populating any discovered flow annotations with the flowOrder arguments from the command line.
if (flowOrder!=null && !flowOrder.isEmpty() && getResolvedInstances().stream()
.filter(FlowAnnotatorBase.class::isInstance)
.map(a -> (FlowAnnotatorBase) a)
.peek(a -> {
a.setFlowOrder(flowOrder);
})
.count() == 0) {
// Throwing an exception if no flow based annotations were found
throw new CommandLineException(
String.format(
"Flow argument \"%s\" was specified without a flow based annotation being requested, (eg: %s))",
StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS,
allDiscoveredAnnotations.values().stream().filter(FlowAnnotatorBase.class::isInstance).map(a -> a.getClass().getSimpleName()).collect(Collectors.joining(", "))));
}

}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,8 @@ public static void setArgValues(final CommandLineParser parser, final String[] a

for ( int i = 0 ; i < argValues.length ; i += 2 ) {
if ( !hasBeenSet(parser, argValues[i]) ) {
String parserMessage = setValue(parser, argValues[i], argValues[i+1]);

if ( StringUtils.isEmpty(parserMessage) ) {
modifiedArgs.put(argValues[i], argValues[i + 1]);
} else {
modifiedArgs.put(argValues[i], argValues[i + 1] + " (" + parserMessage + ")");
}
setValue(parser, argValues[i], argValues[i+1]);
modifiedArgs.put(argValues[i], argValues[i + 1]);
} else {
logger.info("parameter not set by the '" + modeName + "' argument mode, as it was already set on the command line: " + argValues[i]);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ private StandardArgumentDefinitions(){}
public static final String SITES_ONLY_LONG_NAME = "sites-only-vcf-output";
public static final String INVALIDATE_PREVIOUS_FILTERS_LONG_NAME = "invalidate-previous-filters";
public static final String SORT_ORDER_LONG_NAME = "sort-order";
public static final String FLOW_ORDER_FOR_ANNOTATIONS = "flow-order-for-annotations";


public static final String INPUT_SHORT_NAME = "I";
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.cmdline.argumentcollections;

import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
Expand All @@ -20,14 +21,24 @@ public final class MarkDuplicatesSparkArgumentCollection implements Serializable
public static final String REMOVE_ALL_DUPLICATE_READS = "remove-all-duplicates";
public static final String REMOVE_SEQUENCING_DUPLICATE_READS = "remove-sequencing-duplicates";

public static final String FLOW_MD_MODE_LONG_NAME = "flowbased";

public static final String FLOW_QUALITY_SUM_STRATEGY_LONG_NAME = "flow-quality-sum-strategy";
public static final String SINGLE_END_READS_END_POSITION_SIGNIFICANT = "single-end-reads-end-position-significant";
public static final String FLOW_END_POS_UNCERTAINTY_LONG_NAME = "flow-end-pos-uncertainty";
public static final String SINGLE_END_READS_CLIPPING_IS_END_LONG_NAME = "single-end-reads-clipping-is-end";
public static final String FLOW_SKIP_START_HOMOPOLYMERS_LONG_NAME = "flow-skip-start-homopolymers";
public static final String FLOW_Q_IS_KNOWN_END_LONG_NAME = "flow-q-is-known-end";

@Argument(shortName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_SHORT_NAME, fullName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_LONG_NAME, doc = "The scoring strategy for choosing the non-duplicate among candidates.")
public MarkDuplicatesScoringStrategy duplicatesScoringStrategy = MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES;

@Argument(fullName = MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME, doc = "Enabling this option will mean unmapped mates of duplicate marked reads will not be marked as duplicates.")
public boolean dontMarkUnmappedMates = false;


@Argument(fullName = MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, doc = "Determines how duplicate types are recorded in the DT optional attribute.", optional = true,
mutex = {REMOVE_ALL_DUPLICATE_READS, REMOVE_SEQUENCING_DUPLICATE_READS})
mutex = {REMOVE_ALL_DUPLICATE_READS, REMOVE_SEQUENCING_DUPLICATE_READS})
public MarkDuplicates.DuplicateTaggingPolicy taggingPolicy = MarkDuplicates.DuplicateTaggingPolicy.DontTag;

@Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_ALL_DUPLICATE_READS, doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.",
Expand All @@ -37,4 +48,48 @@ public final class MarkDuplicatesSparkArgumentCollection implements Serializable
@Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS, doc = "If true do not write optical/sequencing duplicates to the output file instead of writing them with appropriate flags set.",
mutex = {MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, MarkDuplicatesSparkArgumentCollection.REMOVE_ALL_DUPLICATE_READS}, optional = true)
public boolean removeSequencingDuplicates = false;

@Advanced
@Argument(fullName = FLOW_QUALITY_SUM_STRATEGY_LONG_NAME, doc = "Use specific quality summing strategy for flow based reads. The strategy ensures that the same " +
"(and correct) quality value is used for all bases of the same homopolymer. Default false.", optional = true)
public boolean FLOW_QUALITY_SUM_STRATEGY = false;

@Advanced
@Argument(fullName = SINGLE_END_READS_END_POSITION_SIGNIFICANT, doc = "Make end location of read (fragment) be significant when considering duplicates, " +
"in addition to the start location, which is always significant (should only be applied to flow based reads). Default false.", optional = true)
public boolean FLOW_END_LOCATION_SIGNIFICANT = false;

@Advanced
@Argument(fullName = FLOW_END_POS_UNCERTAINTY_LONG_NAME, doc = "Maximal number of bases of reads (fragment) ends difference that is marked as match (should only be applied to flow based reads). Default 0.", optional = true)
public int ENDS_READ_UNCERTAINTY = 0;

@Advanced
@Argument(fullName = SINGLE_END_READS_CLIPPING_IS_END_LONG_NAME, doc = "Use clipped, rather than unclipped, when considering duplicates (should only be applied to flow based reads). Default false.", optional = true)
public boolean FLOW_USE_CLIPPED_LOCATIONS = false;

@Advanced
@Argument(fullName = FLOW_SKIP_START_HOMOPOLYMERS_LONG_NAME, doc = "Skip first N flows, when considering duplicates (should only be applied to flow based reads). Default 0.", optional = true)
public int FLOW_SKIP_START_HOMOPOLYMERS = 0;

@Advanced
@Argument(fullName = FLOW_Q_IS_KNOWN_END_LONG_NAME, doc = "Treat reads (fragment) clipped on tm:Q as known end position (should only be applied to flow based reads) (default: false)", optional = true)
public boolean FLOW_Q_IS_KNOWN_END = false;

@Advanced
@Argument(fullName = FLOW_MD_MODE_LONG_NAME, optional = true, doc="Single argument for enabling the bulk of flow based features (should only be applied to flow based reads).")
public Boolean useFlowFragments = false;

/**
 * Indicates whether any of the flow-based duplicate-marking options is active.
 * Only these four options participate in the check (the other flow arguments do not).
 *
 * @return true if at least one flow-specific option has been enabled
 */
public boolean isFlowEnabled() {
    final boolean anyFlowFlagSet = FLOW_QUALITY_SUM_STRATEGY
            || FLOW_END_LOCATION_SIGNIFICANT
            || FLOW_USE_CLIPPED_LOCATIONS;
    return anyFlowFlagSet || FLOW_SKIP_START_HOMOPOLYMERS != 0;
}

/**
 * Supplies the argument name/value pairs that the "flowbased" mode expands into.
 * The returned array alternates argument long-name and value.
 *
 * @return the default flow-mode argument/value pairs
 */
public String[] getFlowModeArgValues() {
    final String[] flowModeDefaults = {
            SINGLE_END_READS_END_POSITION_SIGNIFICANT, "true",
            SINGLE_END_READS_CLIPPING_IS_END_LONG_NAME, "true",
            FLOW_END_POS_UNCERTAINTY_LONG_NAME, "1",
            FLOW_SKIP_START_HOMOPOLYMERS_LONG_NAME, "0"
    };
    return flowModeDefaults;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package org.broadinstitute.hellbender.cmdline.programgroups;

import org.broadinstitute.barclay.argparser.CommandLineProgramGroup;
import org.broadinstitute.hellbender.utils.help.HelpConstants;

/**
 * Tools that perform variant calling and genotyping for short variants (SNPs, SNVs and Indels) on
 * flow-based sequencing platforms
 */
public class FlowBasedProgramGroup implements CommandLineProgramGroup {

    /** @return the display name of this program group */
    @Override
    public String getName() {
        return HelpConstants.DOC_CAT_SHORT_FLOW_BASED;
    }

    /** @return the one-line summary shown for this program group */
    @Override
    public String getDescription() {
        return HelpConstants.DOC_CAT_SHORT_FLOW_BASED_SUMMARY;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ public abstract class AssemblyRegionWalker extends WalkerBase {

private List<MultiIntervalLocalReadShard> readShards;

private boolean nonRandomDownsamplingMode = nonRandomDownsamplingMode();

/**
* Initialize data sources for traversal.
*
Expand Down Expand Up @@ -139,7 +141,7 @@ public List<ReadFilter> getDefaultReadFilters() {
}

/**
 * Creates the positional downsampler used during traversal, or null when downsampling is disabled.
 * Downsampling is active only when a positive cap on reads per alignment start has been configured.
 *
 * @return a configured PositionalDownsampler, or null if maxReadsPerAlignmentStart is not positive
 */
protected ReadsDownsampler createDownsampler() {
    if (assemblyRegionArgs.maxReadsPerAlignmentStart <= 0) {
        return null;
    }
    return new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, getHeaderForReads(), nonRandomDownsamplingMode);
}

/**
Expand Down Expand Up @@ -252,4 +254,8 @@ protected final void onShutdown() {
* @param featureContext features overlapping the padded span of the assembly region
*/
public abstract void apply( final AssemblyRegion region, final ReferenceContext referenceContext, final FeatureContext featureContext );

/**
 * Hook allowing subclasses to request non-random (deterministic) positional downsampling.
 * NOTE(review): the result appears to be captured once into a field at construction time,
 * so overrides should return a constant — confirm against the field initializer in this class.
 *
 * @return true to enable non-random downsampling mode; default is false
 */
public boolean nonRandomDownsamplingMode() {
    return false;
}
}
10 changes: 10 additions & 0 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,16 @@ public String getToolName() {
return String.format("%s %s", getToolkitShortName(), getClass().getSimpleName());
}

/**
 * Expose a read-only version of the raw user-supplied intervals. This can be used by tools that need to explicitly
 * traverse the intervals themselves (rather than, for example, walking the reads based on the intervals).
 *
 * @return the raw user-supplied intervals, wrapped as an unmodifiable view
 */
public List<SimpleInterval> getUserSuppliedIntervals() {
    final List<SimpleInterval> readOnlyView = Collections.unmodifiableList(userIntervals);
    return readOnlyView;
}

/**
* Returns the list of intervals to iterate, either limited to the user-supplied intervals or the entire reference genome if none were specified.
* If no reference was supplied, null is returned
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package org.broadinstitute.hellbender.engine;

import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.Spliterator;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.BiConsumer;
import java.util.stream.Stream;

/**
 * A specialized read walker that may be gracefully stopped before the input stream ends
 *
 * A tool derived from this class should implement {@link PartialReadWalker#shouldExitEarly(GATKRead)}
 * to indicate when to stop. This method is called before {@link ReadWalker#apply(GATKRead, ReferenceContext, FeatureContext)}
 */
public abstract class PartialReadWalker extends ReadWalker {

    /**
     * Overridden traversal that consults the subclass, before each read, on whether to stop.
     *
     * Early termination is implemented with a hand-rolled breakable forEach because
     * Java 8 streams lack {@code takeWhile}.
     */
    @Override
    public void traverse() {

        final CountingReadFilter readFilter = makeReadFilter();

        breakableForEach(getTransformedReadStream(readFilter), (read, stopRequested) -> {

            if (shouldExitEarly(read)) {
                // the subclass asked us to stop before processing this read
                stopRequested.set(true);
                return;
            }

            // body of the per-read iteration
            final SimpleInterval readInterval = getReadInterval(read);
            apply(read,
                    new ReferenceContext(reference, readInterval), // Will create an empty ReferenceContext if reference or readInterval == null
                    new FeatureContext(features, readInterval));   // Will create an empty FeatureContext if features or readInterval == null

            progressMeter.update(readInterval);
        });

        logger.info(readFilter.getSummaryLine());
    }

    /**
     * Method to be overridden by the implementation class to determine when to stop the read stream traversal
     * @param read - the read to be processed next (in case it is needed)
     * @return boolean indicator: true means stop!
     */
    protected abstract boolean shouldExitEarly(GATKRead read);

    /**
     * Iterates a stream, handing the consumer an AtomicBoolean "breaker" that, once set to
     * true, ends iteration before the stream is exhausted. Java 8 has no
     * {@code Stream.takeWhile}, hence the manual Spliterator walk.
     *
     * code adapted from: https://www.baeldung.com/java-break-stream-foreach
     */
    private static <T> void breakableForEach(final Stream<T> stream, final BiConsumer<T, AtomicBoolean> consumer) {
        final Spliterator<T> spliterator = stream.spliterator();
        final AtomicBoolean breaker = new AtomicBoolean(false);

        boolean advanced;
        do {
            // tryAdvance returns false when the stream is exhausted (consumer is not invoked)
            advanced = spliterator.tryAdvance(element -> consumer.accept(element, breaker));
        } while (advanced && !breaker.get());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ public ReadFilterBinOp(final ReadFilter lhs, final ReadFilter rhs) {
this.lhs = lhs;
this.rhs = rhs;
}

@Override
public void setHeader(SAMFileHeader samHeader) {
    // propagate the header to this filter and then to both operands of the binary op
    super.setHeader(samHeader);
    for (final ReadFilter operand : new ReadFilter[]{ lhs, rhs }) {
        if (operand != null) {
            operand.setHeader(samHeader);
        }
    }
}

}

@VisibleForTesting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

import htsjdk.samtools.Cigar;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.filters.flow.FlowBasedTPAttributeSymetricReadFilter;
import org.broadinstitute.hellbender.engine.filters.flow.FlowBasedTPAttributeValidReadFilter;
import org.broadinstitute.hellbender.engine.filters.flow.HmerQualitySymetricReadFilter;
import org.broadinstitute.hellbender.engine.filters.flow.ReadGroupHasFlowOrderReadFilter;
import org.broadinstitute.hellbender.tools.AddOriginalAlignmentTags;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
Expand Down Expand Up @@ -328,4 +332,9 @@ public static class MateUnmappedAndUnmappedReadFilter extends ReadFilter {
public static final ValidAlignmentEndReadFilter VALID_ALIGNMENT_END = new ValidAlignmentEndReadFilter();
public static final NonChimericOriginalAlignmentReadFilter NON_CHIMERIC_ORIGINAL_ALIGNMENT_READ_FILTER = new NonChimericOriginalAlignmentReadFilter();
public static final MateUnmappedAndUnmappedReadFilter MATE_UNMAPPED_AND_UNMAPPED_READ_FILTER = new MateUnmappedAndUnmappedReadFilter();

public static final ReadGroupHasFlowOrderReadFilter READ_GROUP_HAS_FLOW_ORDER_READ_FILTER = new ReadGroupHasFlowOrderReadFilter();
public static final HmerQualitySymetricReadFilter HMER_QUALITY_SYMETRIC_READ_FILTER = new HmerQualitySymetricReadFilter();
public static final FlowBasedTPAttributeValidReadFilter FLOW_BASED_TP_ATTRIBUTE_VALID_READ_FILTER = new FlowBasedTPAttributeValidReadFilter();
public static final FlowBasedTPAttributeSymetricReadFilter FLOW_BASED_TP_ATTRIBUTE_SYMETRIC_READ_FILTER = new FlowBasedTPAttributeSymetricReadFilter();
}
Loading

0 comments on commit f1e7265

Please sign in to comment.