From fcaca388fb7c1f93406fc391dc28c8e62fe658b1 Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Wed, 30 Aug 2017 10:32:45 -0700 Subject: [PATCH 1/3] [ADAM-1701] Allow InterleavedFASTQInFormatter logging to be disabled. Resolves #1701. --- .../org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala | 8 ++++++++ .../adam/rdd/fragment/InterleavedFASTQInFormatter.scala | 5 +++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala index ad741f927d..9de07907ab 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala @@ -93,6 +93,14 @@ object FragmentRDD { */ val WRITE_SUFFIXES = "org.bdgenomics.adam.rdd.fragment.FragmentRDD.writeSuffixes" + /** + * Hadoop configuration path to check for a boolean value indicating that + * the warning logging that is triggered if an interleaved fragment doesn't + * have exactly two reads should be disabled. Default is false (logging is + * enabled). + */ + val DISABLE_READ_COUNT_LOGGING = "org.bdgenomics.adam.rdd.fragment.FragmentRDD.disableReadCountLogging" + /** * Creates a FragmentRDD where no record groups or sequence info are attached. 
*
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala
index 44a4deeb17..2844cc3459 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala
@@ -48,6 +48,7 @@ class InterleavedFASTQInFormatter private (
   private val converter = new AlignmentRecordConverter
   private val writeSuffixes = conf.getBoolean(FragmentRDD.WRITE_SUFFIXES, false)
   private val writeOriginalQualities = conf.getBoolean(FragmentRDD.WRITE_ORIGINAL_QUALITIES, false)
+  private val enableLogging = !conf.getBoolean(FragmentRDD.DISABLE_READ_COUNT_LOGGING, false)
 
   /**
    * Writes alignment records to an output stream in interleaved FASTQ format.
@@ -59,11 +60,15 @@ class InterleavedFASTQInFormatter private (
     iter.flatMap(frag => {
       val reads = converter.convertFragment(frag).toSeq
 
       if (reads.size < 2) {
-        log.warn("Fewer than two reads for %s. Dropping...".format(frag))
+        // Always drop short fragments: the else branch indexes reads(1), so
+        // only the warning itself may be gated on enableLogging.
+        if (enableLogging) {
+          log.warn("Fewer than two reads for %s. Dropping...".format(frag))
+        }
         None
       } else {
-        if (reads.size > 2) {
+        if (enableLogging && reads.size > 2) {
           log.warn("More than two reads for %s. Taking first 2.".format(frag))
         }
         Some((reads(0), reads(1)))

From bac6c1d11493f3599208725aa48a2330bdcefe14 Mon Sep 17 00:00:00 2001
From: Frank Austin Nothaft
Date: Wed, 30 Aug 2017 11:07:48 -0700
Subject: [PATCH 2/3] [ADAM-1702] Validate read numbers in InterleavedFASTQInFormatter.

Resolves #1702.
--- .../fragment/InterleavedFASTQInFormatter.scala | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala index 2844cc3459..bcd065e530 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala @@ -81,8 +81,20 @@ class InterleavedFASTQInFormatter private ( outputOriginalBaseQualities = writeOriginalQualities) + "\n" // write both to the output stream - os.write(fastq1.getBytes) - os.write(fastq2.getBytes) + // ensure that reads are ordered properly if ordering is known (see #1702) + if (read1.getReadInFragment == 0 && + read2.getReadInFragment == 1) { + os.write(fastq1.getBytes) + os.write(fastq2.getBytes) + } else if (read1.getReadInFragment == 1 && + read2.getReadInFragment == 0) { + os.write(fastq2.getBytes) + os.write(fastq1.getBytes) + } else { + if (enableLogging) { + log.warn("Improper pair of reads in fragment %s. Dropping...".format(p)) + } + } }) } } From 69dcfa5ba293487ee35454a276d31f0fabfdc5fe Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Wed, 30 Aug 2017 11:44:32 -0700 Subject: [PATCH 3/3] [ADAM-1703] Expose ValidationStringency in AnySAMOutFormatter. Resolves #1703. 
--- .../org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala index 09b308a4d0..df47fa16a6 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala @@ -29,7 +29,7 @@ import scala.collection.mutable.ListBuffer * An OutFormatter that automatically infers whether the piped input is SAM or * BAM. Autodetecting streamed CRAM is not currently supported. */ -class AnySAMOutFormatter extends OutFormatter[AlignmentRecord] { +case class AnySAMOutFormatter(stringency: ValidationStringency = ValidationStringency.STRICT) extends OutFormatter[AlignmentRecord] { /** * Reads alignment records from an input stream. Autodetects SAM/BAM format. @@ -41,6 +41,7 @@ class AnySAMOutFormatter extends OutFormatter[AlignmentRecord] { // make reader val reader = SamReaderFactory.makeDefault() + .validationStringency(stringency) .open(SamInputResource.of(is)) SAMIteratorConverter(reader)