Support for flow based sequencing (#7876)

Added Support for Ultima Genomics flow based sequencing data. The major new features are as follows: - Added "FlowMode" argument to HaplotypeCaller which better supports flow based calling - Added a new Haplotype Filtering step after assembly which removes suspicious haplotypes from the genoytper - Added two new likelihoods models, FlowBasedHMM and the FlowBasedAlignmentLkelihoodEngine - Added "FlowMode" to Mutect2 which better supports flow based calling - Added support for uncertain read end-positions in MarkDuplicatesSpark - Added a new tool FlowFeatureMapper for quick heuristic calling of bams for diagnostics - Added a new tool GroundTruthReadsBuilder to generate ground truth files for Basecalling - Added a new diagnostic tool HaplotypeBasedVariantRecaller for recalling VCF files using the HaplotypeCallerEngine - Added a new tool breaking up CRAM files by their blocks, SplitCram There are a number of code and technical changes as well - Added a new read interface called `FlowBasedRead` that manages the new features for FlowBased data - Added a number of flow-specific read filters - Added a number of flow-specific variant annotations - Added support for read annotation-clipping as part of clipreads and GATKRead - Added a new PartialReadsWalker that supports terminating before traversal is finished Co-authored-by: James <emeryj@broadinstitute.org> Co-authored-by: Megan Shand <mshand@broadinstitute.org> Co-authored-by: Dror Kessler <dror27.kessler@gmail.com>
broadinstitute · Jul 26, 2022 · f1e7265 · f1e7265
1 parent 8c348aa
commit f1e7265
Show file tree

Hide file tree

Showing 388 changed files with 372,947 additions and 1,144 deletions.
diff --git a/...java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKAnnotationPluginDescriptor.java b/...java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKAnnotationPluginDescriptor.java
@@ -11,6 +11,7 @@
 import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
 import org.broadinstitute.hellbender.tools.walkers.annotator.PedigreeAnnotation;
+import org.broadinstitute.hellbender.tools.walkers.annotator.flow.FlowAnnotatorBase;
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.config.ConfigFactory;
 import org.broadinstitute.hellbender.utils.config.GATKConfig;
@@ -81,6 +82,9 @@ public class GATKAnnotationPluginDescriptor extends CommandLinePluginDescriptor<
     @Argument(fullName = StandardArgumentDefinitions.PEDIGREE_FILE_LONG_NAME, shortName = StandardArgumentDefinitions.PEDIGREE_FILE_SHORT_NAME, doc="Pedigree file for determining the population \"founders\"", optional=true)
     private GATKPath pedigreeFile;
 
+    @Argument(fullName = StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS,  doc = "flow order used for this annotations. [readGroup:]flowOrder", optional = true)
+    private List<String> flowOrder;
+
     /**
      * @return the class object for the base class of all plugins managed by this descriptor
      */
@@ -413,6 +417,23 @@ public void validateAndResolvePlugins() throws CommandLineException {
                             "founder-id",
                             allDiscoveredAnnotations.values().stream().filter(PedigreeAnnotation.class::isInstance).map(a -> a.getClass().getSimpleName()).collect(Collectors.joining(", "))));
         }
+
+        // Populating any discovered flow annotations with the flowOrder arguments from the command line.
+        if (flowOrder!=null && !flowOrder.isEmpty() && getResolvedInstances().stream()
+                .filter(FlowAnnotatorBase.class::isInstance)
+                .map(a -> (FlowAnnotatorBase) a)
+                .peek(a -> {
+                    a.setFlowOrder(flowOrder);
+                })
+                .count() == 0) {
+            // Throwing an exception if no flow based annotations were found
+            throw new CommandLineException(
+                    String.format(
+                            "Flow argument \"%s\" was specified without a flow based annotation being requested, (eg: %s))",
+                            StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS,
+                            allDiscoveredAnnotations.values().stream().filter(FlowAnnotatorBase.class::isInstance).map(a -> a.getClass().getSimpleName()).collect(Collectors.joining(", "))));
+        }
+
     }
 
     /**

diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/ModeArgumentUtils.java b/src/main/java/org/broadinstitute/hellbender/cmdline/ModeArgumentUtils.java
@@ -56,13 +56,8 @@ public static void setArgValues(final CommandLineParser parser, final String[] a
 
         for ( int i = 0 ; i < argValues.length ; i += 2 ) {
             if ( !hasBeenSet(parser, argValues[i]) ) {
-                String parserMessage = setValue(parser, argValues[i], argValues[i+1]);
-
-                if ( StringUtils.isEmpty(parserMessage) ) {
-                    modifiedArgs.put(argValues[i], argValues[i + 1]);
-                } else {
-                    modifiedArgs.put(argValues[i], argValues[i + 1] + " (" + parserMessage + ")");
-                }
+                setValue(parser, argValues[i], argValues[i+1]);
+                modifiedArgs.put(argValues[i], argValues[i + 1]);
             } else {
                 logger.info("parameter not set by the '" + modeName + "' argument mode, as it was already set on the command line: " + argValues[i]);
             }

diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java
@@ -45,6 +45,7 @@ private StandardArgumentDefinitions(){}
     public static final String SITES_ONLY_LONG_NAME = "sites-only-vcf-output";
     public static final String INVALIDATE_PREVIOUS_FILTERS_LONG_NAME = "invalidate-previous-filters";
     public static final String SORT_ORDER_LONG_NAME = "sort-order";
+    public static final String FLOW_ORDER_FOR_ANNOTATIONS = "flow-order-for-annotations";
 
 
     public static final String INPUT_SHORT_NAME = "I";

diff --git a/...stitute/hellbender/cmdline/argumentcollections/MarkDuplicatesSparkArgumentCollection.java b/...stitute/hellbender/cmdline/argumentcollections/MarkDuplicatesSparkArgumentCollection.java
@@ -1,5 +1,6 @@
 package org.broadinstitute.hellbender.cmdline.argumentcollections;
 
+import org.broadinstitute.barclay.argparser.Advanced;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
@@ -20,14 +21,24 @@ public final class MarkDuplicatesSparkArgumentCollection implements Serializable
     public static final String REMOVE_ALL_DUPLICATE_READS = "remove-all-duplicates";
     public static final String REMOVE_SEQUENCING_DUPLICATE_READS = "remove-sequencing-duplicates";
 
+    public static final String FLOW_MD_MODE_LONG_NAME = "flowbased";
+
+    public static final String FLOW_QUALITY_SUM_STRATEGY_LONG_NAME = "flow-quality-sum-strategy";
+    public static final String SINGLE_END_READS_END_POSITION_SIGNIFICANT = "single-end-reads-end-position-significant";
+    public static final String FLOW_END_POS_UNCERTAINTY_LONG_NAME = "flow-end-pos-uncertainty";
+    public static final String SINGLE_END_READS_CLIPPING_IS_END_LONG_NAME = "single-end-reads-clipping-is-end";
+    public static final String FLOW_SKIP_START_HOMOPOLYMERS_LONG_NAME = "flow-skip-start-homopolymers";
+    public static final String FLOW_Q_IS_KNOWN_END_LONG_NAME = "flow-q-is-known-end";
+
     @Argument(shortName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_SHORT_NAME, fullName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_LONG_NAME, doc = "The scoring strategy for choosing the non-duplicate among candidates.")
     public MarkDuplicatesScoringStrategy duplicatesScoringStrategy = MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES;
 
     @Argument(fullName = MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME, doc = "Enabling this option will mean unmapped mates of duplicate marked reads will not be marked as duplicates.")
     public boolean dontMarkUnmappedMates = false;
 
+
     @Argument(fullName = MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, doc = "Determines how duplicate types are recorded in the DT optional attribute.", optional = true,
-              mutex = {REMOVE_ALL_DUPLICATE_READS, REMOVE_SEQUENCING_DUPLICATE_READS})
+            mutex = {REMOVE_ALL_DUPLICATE_READS, REMOVE_SEQUENCING_DUPLICATE_READS})
     public MarkDuplicates.DuplicateTaggingPolicy taggingPolicy = MarkDuplicates.DuplicateTaggingPolicy.DontTag;
 
     @Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_ALL_DUPLICATE_READS, doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.",
@@ -37,4 +48,48 @@ public final class MarkDuplicatesSparkArgumentCollection implements Serializable
     @Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS, doc = "If true do not write optical/sequencing duplicates to the output file instead of writing them with appropriate flags set.",
             mutex = {MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, MarkDuplicatesSparkArgumentCollection.REMOVE_ALL_DUPLICATE_READS}, optional = true)
     public boolean removeSequencingDuplicates = false;
+
+    @Advanced
+    @Argument(fullName = FLOW_QUALITY_SUM_STRATEGY_LONG_NAME, doc = "Use specific quality summing strategy for flow based reads. The strategy ensures that the same " +
+            "(and correct) quality value is used for all bases of the same homopolymer. Default false.", optional = true)
+    public boolean FLOW_QUALITY_SUM_STRATEGY = false;
+
+    @Advanced
+    @Argument(fullName = SINGLE_END_READS_END_POSITION_SIGNIFICANT, doc = "Make end location of read (fragment) be significant when considering duplicates, " +
+            "in addition to the start location, which is always significant (should only be applied to flow based reads). Default false.", optional = true)
+    public boolean FLOW_END_LOCATION_SIGNIFICANT = false;
+
+    @Advanced
+    @Argument(fullName = FLOW_END_POS_UNCERTAINTY_LONG_NAME, doc = "Maximal number of bases of reads (fragment) ends difference that is marked as match (should only be applied to flow based reads). Default 0.", optional = true)
+    public int ENDS_READ_UNCERTAINTY = 0;
+
+    @Advanced
+    @Argument(fullName = SINGLE_END_READS_CLIPPING_IS_END_LONG_NAME, doc = "Use clipped, rather than unclipped, when considering duplicates (should only be applied to flow based reads). Default false.", optional = true)
+    public boolean FLOW_USE_CLIPPED_LOCATIONS = false;
+
+    @Advanced
+    @Argument(fullName = FLOW_SKIP_START_HOMOPOLYMERS_LONG_NAME, doc = "Skip first N flows, when considering duplicates (should only be applied to flow based reads). Default 0.", optional = true)
+    public int FLOW_SKIP_START_HOMOPOLYMERS = 0;
+
+    @Advanced
+    @Argument(fullName = FLOW_Q_IS_KNOWN_END_LONG_NAME, doc = "Treat reads (fragment) clipped on tm:Q as known end position (should only be applied to flow based reads) (default: false)", optional = true)
+    public boolean FLOW_Q_IS_KNOWN_END = false;
+
+    @Advanced
+    @Argument(fullName = FLOW_MD_MODE_LONG_NAME, optional = true, doc="Single argument for enabling the bulk of flow based features (should only be applied to flow based reads).")
+    public Boolean useFlowFragments = false;
+
+    public boolean isFlowEnabled() {
+        return FLOW_QUALITY_SUM_STRATEGY || FLOW_END_LOCATION_SIGNIFICANT || FLOW_USE_CLIPPED_LOCATIONS || FLOW_SKIP_START_HOMOPOLYMERS != 0;
+    }
+
+    public String[] getFlowModeArgValues() {
+        return new String[] {
+                MarkDuplicatesSparkArgumentCollection.SINGLE_END_READS_END_POSITION_SIGNIFICANT, "true",
+                MarkDuplicatesSparkArgumentCollection.SINGLE_END_READS_CLIPPING_IS_END_LONG_NAME, "true",
+                MarkDuplicatesSparkArgumentCollection.FLOW_END_POS_UNCERTAINTY_LONG_NAME, "1",
+                MarkDuplicatesSparkArgumentCollection.FLOW_SKIP_START_HOMOPOLYMERS_LONG_NAME, "0"
+        };
+
+    }
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/programgroups/FlowBasedProgramGroup.java b/src/main/java/org/broadinstitute/hellbender/cmdline/programgroups/FlowBasedProgramGroup.java
@@ -0,0 +1,17 @@
+package org.broadinstitute.hellbender.cmdline.programgroups;
+
+import org.broadinstitute.barclay.argparser.CommandLineProgramGroup;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+
+/**
+ * Tools that perform variant calling and genotyping for short variants (SNPs, SNVs and Indels) on
+ * flow-based sequencing platforms
+ */
+public class FlowBasedProgramGroup implements CommandLineProgramGroup {
+
+    @Override
+    public String getName() { return HelpConstants.DOC_CAT_SHORT_FLOW_BASED; }
+
+    @Override
+    public String getDescription() { return HelpConstants.DOC_CAT_SHORT_FLOW_BASED_SUMMARY; }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java
@@ -67,6 +67,8 @@ public abstract class AssemblyRegionWalker extends WalkerBase {
 
     private List<MultiIntervalLocalReadShard> readShards;
 
+    private boolean nonRandomDownsamplingMode = nonRandomDownsamplingMode();
+
     /**
      * Initialize data sources for traversal.
      *
@@ -139,7 +141,7 @@ public List<ReadFilter> getDefaultReadFilters() {
     }
 
     protected ReadsDownsampler createDownsampler() {
-        return assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ? new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, getHeaderForReads()) : null;
+        return assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ? new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, getHeaderForReads(), nonRandomDownsamplingMode) : null;
     }
 
     /**
@@ -252,4 +254,8 @@ protected final void onShutdown() {
      * @param featureContext features overlapping the padded span of the assembly region
      */
     public abstract void apply( final AssemblyRegion region, final ReferenceContext referenceContext, final FeatureContext featureContext );
+
+    public boolean nonRandomDownsamplingMode() {
+        return false;
+    }
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
@@ -1021,6 +1021,16 @@ public String getToolName() {
         return String.format("%s %s", getToolkitShortName(), getClass().getSimpleName());
     }
 
+    /**
+     * Expose a read-only version of the raw user-supplied intervals. This can be used by tools that need to explicitly
+     * traverse the intervals themselves (rather than, for example, walking the reads based on the intervals)
+     *
+     * @return - the raw user-supplied intervals, as an unmodifiable list
+     */
+    public List<SimpleInterval> getUserSuppliedIntervals() {
+        return Collections.unmodifiableList(userIntervals);
+    }
+
     /**
      * Returns the list of intervals to iterate, either limited to the user-supplied intervals or the entire reference genome if none were specified.
      * If no reference was supplied, null is returned

diff --git a/src/main/java/org/broadinstitute/hellbender/engine/PartialReadWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/PartialReadWalker.java
@@ -0,0 +1,75 @@
+package org.broadinstitute.hellbender.engine;
+
+import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.util.Spliterator;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.BiConsumer;
+import java.util.stream.Stream;
+
+/**
+ * A specialized read walker that may be gracefully stopped before the input stream ends
+ * 
+ * A tool derived from this class should implement {@link PartialReadWalker#shouldExitEarly(GATKRead)}
+ * to indicate when to stop. This method is called before {@link ReadWalker#apply(GATKRead, ReferenceContext, FeatureContext)}
+ *
+ */
+abstract public class PartialReadWalker extends ReadWalker {
+
+    /**
+     * traverse is overridden to consult the implementation class whether to stop
+     *
+     * The stoppage is implemented using a custom forEach method to compensate for the
+     * lack of .takeWhile() in Java 8
+     */
+
+    @Override
+    public void traverse() {
+
+        final CountingReadFilter countedFilter = makeReadFilter();
+        breakableForEach(getTransformedReadStream(countedFilter), (read, breaker) -> {
+
+            // check if we should stop
+            if ( shouldExitEarly(read) ) {
+                breaker.set(true);
+            } else {
+                // this is the body of the iteration
+                final SimpleInterval readInterval = getReadInterval(read);
+                apply(read,
+                        new ReferenceContext(reference, readInterval), // Will create an empty ReferenceContext if reference or readInterval == null
+                        new FeatureContext(features, readInterval));   // Will create an empty FeatureContext if features or readInterval == null
+
+                progressMeter.update(readInterval);
+            }
+        });
+
+        logger.info(countedFilter.getSummaryLine());
+    }
+
+    /**
+     * Method to be overridden by the implementation class to determine when to stop the read stream traversal
+     * @param read - the read to be processed next (in case it is needed)
+     * @return boolean indicator: true means stop!
+     */
+    protected abstract boolean shouldExitEarly(GATKRead read);
+
+    /**
+     * Java 8 does not have a .takeWhile() on streams. The code below implements a custom forEach to allow
+     * breaking out of a stream prematurely.
+     *
+     * code adapted from: https://www.baeldung.com/java-break-stream-foreach
+     */
+    private static <T> void breakableForEach(Stream<T> stream, BiConsumer<T, AtomicBoolean> consumer) {
+        Spliterator<T> spliterator = stream.spliterator();
+        boolean hadNext = true;
+        AtomicBoolean breaker = new AtomicBoolean();
+
+        while (hadNext && !breaker.get()) {
+            hadNext = spliterator.tryAdvance(elem -> {
+                consumer.accept(elem, breaker);
+            });
+        }
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java
@@ -58,6 +58,18 @@ public ReadFilterBinOp(final ReadFilter lhs, final ReadFilter rhs) {
             this.lhs = lhs;
             this.rhs = rhs;
         }
+
+        @Override
+        public void setHeader(SAMFileHeader samHeader) {
+            super.setHeader(samHeader);
+            if ( lhs != null ) {
+                lhs.setHeader(samHeader);
+            }
+            if ( rhs != null ) {
+                rhs.setHeader(samHeader);
+            }
+        }
+
     }
 
     @VisibleForTesting

diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilterLibrary.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilterLibrary.java
@@ -2,6 +2,10 @@
 
 import htsjdk.samtools.Cigar;
 import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.filters.flow.FlowBasedTPAttributeSymetricReadFilter;
+import org.broadinstitute.hellbender.engine.filters.flow.FlowBasedTPAttributeValidReadFilter;
+import org.broadinstitute.hellbender.engine.filters.flow.HmerQualitySymetricReadFilter;
+import org.broadinstitute.hellbender.engine.filters.flow.ReadGroupHasFlowOrderReadFilter;
 import org.broadinstitute.hellbender.tools.AddOriginalAlignmentTags;
 import org.broadinstitute.hellbender.utils.QualityUtils;
 import org.broadinstitute.hellbender.utils.help.HelpConstants;
@@ -328,4 +332,9 @@ public static class MateUnmappedAndUnmappedReadFilter extends ReadFilter {
     public static final ValidAlignmentEndReadFilter VALID_ALIGNMENT_END = new ValidAlignmentEndReadFilter();
     public static final NonChimericOriginalAlignmentReadFilter NON_CHIMERIC_ORIGINAL_ALIGNMENT_READ_FILTER = new NonChimericOriginalAlignmentReadFilter();
     public static final MateUnmappedAndUnmappedReadFilter MATE_UNMAPPED_AND_UNMAPPED_READ_FILTER = new MateUnmappedAndUnmappedReadFilter();
+
+    public static final ReadGroupHasFlowOrderReadFilter READ_GROUP_HAS_FLOW_ORDER_READ_FILTER = new ReadGroupHasFlowOrderReadFilter();
+    public static final HmerQualitySymetricReadFilter HMER_QUALITY_SYMETRIC_READ_FILTER = new HmerQualitySymetricReadFilter();
+    public static final FlowBasedTPAttributeValidReadFilter FLOW_BASED_TP_ATTRIBUTE_VALID_READ_FILTER = new FlowBasedTPAttributeValidReadFilter();
+    public static final FlowBasedTPAttributeSymetricReadFilter FLOW_BASED_TP_ATTRIBUTE_SYMETRIC_READ_FILTER = new FlowBasedTPAttributeSymetricReadFilter();
 }