diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala index ad741f927d..9de07907ab 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala @@ -93,6 +93,14 @@ object FragmentRDD { */ val WRITE_SUFFIXES = "org.bdgenomics.adam.rdd.fragment.FragmentRDD.writeSuffixes" + /** + * Hadoop configuration key for a boolean value. When true, suppresses the + * warnings logged when an interleaved fragment does not yield exactly two + * reads or its reads are not a proper first/second pair. Default is false + * (warnings are logged). + */ + val DISABLE_READ_COUNT_LOGGING = "org.bdgenomics.adam.rdd.fragment.FragmentRDD.disableReadCountLogging" + /** * Creates a FragmentRDD where no record groups or sequence info are attached. * diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala index 44a4deeb17..bcd065e530 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala @@ -48,6 +48,7 @@ class InterleavedFASTQInFormatter private ( private val converter = new AlignmentRecordConverter private val writeSuffixes = conf.getBoolean(FragmentRDD.WRITE_SUFFIXES, false) private val writeOriginalQualities = conf.getBoolean(FragmentRDD.WRITE_ORIGINAL_QUALITIES, false) + private val enableLogging = !conf.getBoolean(FragmentRDD.DISABLE_READ_COUNT_LOGGING, false) /** * Writes alignment records to an output stream in interleaved FASTQ format. 
@@ -59,11 +60,14 @@ class InterleavedFASTQInFormatter private (
     iter.flatMap(frag => {
       val reads = converter.convertFragment(frag).toSeq
 
+      // a fragment with fewer than two reads is always dropped; the logging
+      // flag only silences the warning and must not let reads(1) be evaluated
       if (reads.size < 2) {
-        log.warn("Fewer than two reads for %s. Dropping...".format(frag))
+        if (enableLogging) {
+          log.warn("Fewer than two reads for %s. Dropping...".format(frag))
+        }
         None
       } else {
-        if (reads.size > 2) {
+        if (enableLogging && reads.size > 2) {
           log.warn("More than two reads for %s. Taking first 2.".format(frag))
         }
         Some((reads(0), reads(1)))
@@ -80,8 +84,20 @@ class InterleavedFASTQInFormatter private (
         outputOriginalBaseQualities = writeOriginalQualities) + "\n"
 
       // write both to the output stream
-      os.write(fastq1.getBytes)
-      os.write(fastq2.getBytes)
+      // ensure that reads are ordered properly if ordering is known (see #1702)
+      if (read1.getReadInFragment == 0 &&
+        read2.getReadInFragment == 1) {
+        os.write(fastq1.getBytes)
+        os.write(fastq2.getBytes)
+      } else if (read1.getReadInFragment == 1 &&
+        read2.getReadInFragment == 0) {
+        os.write(fastq2.getBytes)
+        os.write(fastq1.getBytes)
+      } else {
+        if (enableLogging) {
+          log.warn("Improper pair of reads in fragment %s. Dropping...".format(p))
+        }
+      }
     })
   }
 }
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala
index 09b308a4d0..df47fa16a6 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala
@@ -29,7 +29,7 @@ import scala.collection.mutable.ListBuffer
  * An OutFormatter that automatically infers whether the piped input is SAM or
  * BAM. Autodetecting streamed CRAM is not currently supported.
  */
-class AnySAMOutFormatter extends OutFormatter[AlignmentRecord] {
+case class AnySAMOutFormatter(stringency: ValidationStringency = ValidationStringency.STRICT) extends OutFormatter[AlignmentRecord] {
 
   /**
    * Reads alignment records from an input stream. Autodetects SAM/BAM format.
@@ -41,6 +41,7 @@ class AnySAMOutFormatter extends OutFormatter[AlignmentRecord] { // make reader val reader = SamReaderFactory.makeDefault() + .validationStringency(stringency) .open(SamInputResource.of(is)) SAMIteratorConverter(reader)