From 82838b7351dd15164fd8f8aaff13dbfb28dd54a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Garillot?= Date: Tue, 29 Dec 2015 14:31:33 +0100 Subject: [PATCH] Fix various minor code issues: * disuse of builtin functions for common reduce operations * unnecessary toString * risky/anti-idiomatic Option usage * Cleaning up TODOs in Vcfutils * retire VcfStringUtils * finer exception types in some places * more explicit Fastq error message in stringent mode * allow IUPAC codes as isRegularBase alternatives * Add debug output for Fastq record --- .../apis/java/JavaAlignmentRecordRDD.scala | 16 ++- .../org/bdgenomics/adam/cli/AlleleCount.scala | 6 +- .../bdgenomics/adam/cli/CalculateDepth.scala | 3 +- .../bdgenomics/adam/cli/CountReadKmers.scala | 3 +- .../org/bdgenomics/adam/cli/FlagStat.scala | 6 +- .../org/bdgenomics/adam/cli/Flatten.scala | 3 +- .../org/bdgenomics/adam/cli/PrintGenes.scala | 3 +- .../org/bdgenomics/adam/cli/Transform.scala | 21 +-- .../scala/org/bdgenomics/adam/cli/View.scala | 21 +-- .../adam/io/IndexedBamInputFormat.scala | 2 +- .../consensus/ConsensusGenerator.scala | 7 +- .../ConsensusGeneratorFromKnowns.scala | 11 +- .../ConsensusGeneratorFromReads.scala | 19 ++- .../ConsensusGeneratorFromSmithWaterman.scala | 28 ++-- .../smithwaterman/SmithWaterman.scala | 16 ++- .../SmithWatermanConstantGapScoring.scala | 13 +- .../SmithWatermanGapScoringFromFn.scala | 7 +- .../converters/AlignmentRecordConverter.scala | 24 ++-- .../adam/converters/FastaConverter.scala | 34 ++--- .../converters/FastqRecordConverter.scala | 88 ++++++++----- .../adam/converters/FragmentConverter.scala | 16 ++- .../GenotypesToVariantsConverter.scala | 9 +- .../adam/converters/SAMRecordConverter.scala | 17 ++- .../VariantAnnotationConverter.scala | 20 ++- .../converters/VariantContextConverter.scala | 44 ++++--- .../org/bdgenomics/adam/models/Alphabet.scala | 3 +- .../org/bdgenomics/adam/models/Gene.scala | 20 +-- .../bdgenomics/adam/models/IndelTable.scala | 10 +- .../adam/models/NonoverlappingRegions.scala | 12 +- .../adam/models/ProgramRecord.scala | 20 +-- .../bdgenomics/adam/models/ReadBucket.scala | 42 +++--- .../adam/models/RecordGroupDictionary.scala | 41 +++--- .../adam/models/ReferencePosition.scala | 9 +- .../adam/models/ReferencePositionPair.scala | 19 ++- .../adam/models/ReferenceRegion.scala | 19 +-- .../adam/models/SAMFileHeaderWritable.scala | 2 +- .../adam/models/SequenceDictionary.scala | 49 +++---- .../adam/models/SingleReadBucket.scala | 7 +- .../org/bdgenomics/adam/models/SnpTable.scala | 47 ++++--- .../adam/models/VariantContext.scala | 6 +- .../org/bdgenomics/adam/rdd/ADAMContext.scala | 78 ++++++----- .../adam/rdd/ADAMRDDFunctions.scala | 26 ++-- .../adam/rdd/BroadcastRegionJoin.scala | 14 +- .../org/bdgenomics/adam/rdd/Coverage.scala | 6 +- .../adam/rdd/GenomicPartitioners.scala | 27 ++-- .../org/bdgenomics/adam/rdd/RegionJoin.scala | 7 +- .../adam/rdd/ShuffleRegionJoin.scala | 16 ++- .../rdd/contig/FlankReferenceFragments.scala | 12 +- ...NucleotideContigFragmentRDDFunctions.scala | 27 ++-- .../adam/rdd/features/FeatureParser.scala | 24 ++-- .../rdd/features/FeatureRDDFunctions.scala | 45 ++++--- .../read/AlignmentRecordRDDFunctions.scala | 124 ++++++++++-------- .../bdgenomics/adam/rdd/read/FlagStat.scala | 27 ++-- .../bdgenomics/adam/rdd/read/MDTagging.scala | 22 ++-- .../adam/rdd/read/MarkDuplicates.scala | 4 +- .../realignment/IndelRealignmentTarget.scala | 26 ++-- .../rdd/read/realignment/RealignIndels.scala | 111 ++++++++--------
.../realignment/RealignmentTargetFinder.scala | 19 +-- .../BaseQualityRecalibration.scala | 9 +- .../read/recalibration/ObservationTable.scala | 3 +- .../rdd/read/recalibration/Recalibrator.scala | 15 ++- .../rdd/variation/ADAMVCFOutputFormat.scala | 3 +- .../rdd/variation/VariationRDDFunctions.scala | 18 +-- .../bdgenomics/adam/rich/DecadentRead.scala | 31 +++-- .../adam/rich/RichAlignmentRecord.scala | 2 +- .../org/bdgenomics/adam/rich/RichCigar.scala | 39 +++--- .../bdgenomics/adam/util/AttributeUtils.scala | 6 +- .../org/bdgenomics/adam/util/Flattener.scala | 3 +- .../adam/util/IntervalListReader.scala | 3 +- .../org/bdgenomics/adam/util/MapTools.scala | 5 +- .../org/bdgenomics/adam/util/MdTag.scala | 25 +--- .../adam/util/ParquetFileTraversable.scala | 1 - .../org/bdgenomics/adam/util/TwoBitFile.scala | 4 +- .../bdgenomics/adam/util/VcfHeaderUtils.scala | 30 +++-- .../bdgenomics/adam/util/VcfStringUtils.scala | 77 ----------- pom.xml | 6 +- 76 files changed, 892 insertions(+), 746 deletions(-) delete mode 100644 adam-core/src/main/scala/org/bdgenomics/adam/util/VcfStringUtils.scala diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/apis/java/JavaAlignmentRecordRDD.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/apis/java/JavaAlignmentRecordRDD.scala index e8f2c43541..f18191e575 100644 --- a/adam-apis/src/main/scala/org/bdgenomics/adam/apis/java/JavaAlignmentRecordRDD.scala +++ b/adam-apis/src/main/scala/org/bdgenomics/adam/apis/java/JavaAlignmentRecordRDD.scala @@ -33,11 +33,12 @@ class JavaAlignmentRecordRDD(val jrdd: JavaRDD[AlignmentRecord]) extends Seriali * @param compressCodec Name of the compression codec to use. * @param disableDictionaryEncoding Whether or not to disable bit-packing. */ - def adamSave(filePath: java.lang.String, - blockSize: java.lang.Integer, - pageSize: java.lang.Integer, - compressCodec: CompressionCodecName, - disableDictionaryEncoding: java.lang.Boolean) { + def adamSave( + filePath: java.lang.String, + blockSize: java.lang.Integer, + pageSize: java.lang.Integer, + compressCodec: CompressionCodecName, + disableDictionaryEncoding: java.lang.Boolean) { jrdd.rdd.adamParquetSave( filePath, blockSize, @@ -62,8 +63,9 @@ class JavaAlignmentRecordRDD(val jrdd: JavaRDD[AlignmentRecord]) extends Seriali * @param filePath Path to save the file at. * @param asSam If true, saves as SAM. If false, saves as BAM. 
*/ - def adamSAMSave(filePath: java.lang.String, - asSam: java.lang.Boolean) { + def adamSAMSave( + filePath: java.lang.String, + asSam: java.lang.Boolean) { jrdd.rdd.adamSAMSave(filePath, asSam) } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala index 4e2d333928..8f93b75479 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala @@ -53,12 +53,14 @@ object AlleleCountHelper extends Serializable { } def countAlleles(adamVariants: RDD[Genotype], args: AlleleCountArgs) { - val usefulData = adamVariants.map(p => (p.getVariant.getContig.getContigName, + val usefulData = adamVariants.map(p => ( + p.getVariant.getContig.getContigName, p.getVariant.getStart, p.getVariant.getReferenceAllele, p.getVariant.getAlternateAllele, p.getAlleles.get(0), - p.getAlleles.get(1))) + p.getAlleles.get(1) + )) val reduced_Variants = usefulData.flatMap(p => Seq((p._1, p._2, p._3, p._4, p._5), (p._1, p._2, p._3, p._4, p._6))) val alleles = reduced_Variants.flatMap(chooseAllele) alleles.groupBy(identity).map { case (a, b) => "%s\t%s\t%s\t%d".format(a._1, a._2, a._3, b.size) } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala index 0e46e461bf..69c8abadec 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala @@ -108,7 +108,8 @@ class CalculateDepth(protected val args: CalculateDepthArgs) extends BDGSparkCom println("%20s\t%15s\t% 5d".format( "%s:%d".format(region.referenceName, region.start), variantNames(region), - count)) + count + )) } } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala index 234f094b35..6e035abfcb 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala @@ -61,7 +61,8 @@ class CountReadKmers(protected val args: CountReadKmersArgs) extends BDGSparkCom // read from disk var adamRecords: RDD[AlignmentRecord] = sc.loadAlignments( args.inputPath, - projection = Some(Projection(AlignmentRecordField.sequence))) + projection = Some(Projection(AlignmentRecordField.sequence)) + ) if (args.repartition != -1) { log.info("Repartitioning reads to '%d' partitions".format(args.repartition)) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala index cebd7c5c3b..b74fa46b14 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala @@ -62,7 +62,8 @@ class FlagStat(protected val args: FlagStatArgs) extends BDGSparkCommand[FlagSta AlignmentRecordField.secondOfPair, AlignmentRecordField.properPair, AlignmentRecordField.mapq, - AlignmentRecordField.failedVendorQualityChecks) + AlignmentRecordField.failedVendorQualityChecks + ) val adamFile: RDD[AlignmentRecord] = sc.loadAlignments(args.inputPath, projection = Some(projection)) @@ -113,7 +114,8 @@ class FlagStat(protected val args: FlagStatArgs) extends BDGSparkCommand[FlagSta percent(passedVendorQuality.singleton, passedVendorQuality.total), percent(failedVendorQuality.singleton, failedVendorQuality.total), 
passedVendorQuality.withMateMappedToDiffChromosome, failedVendorQuality.withMateMappedToDiffChromosome, - passedVendorQuality.withMateMappedToDiffChromosomeMapQ5, failedVendorQuality.withMateMappedToDiffChromosomeMapQ5) + passedVendorQuality.withMateMappedToDiffChromosomeMapQ5, failedVendorQuality.withMateMappedToDiffChromosomeMapQ5 + ) Option(args.outputPath) match { case Some(outputPath) => diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Flatten.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Flatten.scala index 052f2be71a..0dece18f8f 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Flatten.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Flatten.scala @@ -75,6 +75,7 @@ class Flatten(val args: FlattenArgs) extends BDGSparkCommand[FlattenArgs] with L args.pageSize, args.compressionCodec, args.disableDictionaryEncoding, - Some(flatSchema)) + Some(flatSchema) + ) } } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/PrintGenes.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/PrintGenes.scala index 61c288ced9..2779aaa05d 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/PrintGenes.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/PrintGenes.scala @@ -64,6 +64,7 @@ class PrintGenes(protected val args: PrintGenesArgs) transcript.region.referenceName, transcript.region.start, transcript.region.end, if (transcript.strand) "+" else "-", - transcript.exons.size) + transcript.exons.size + ) } } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala index 1be8e96f38..32e93e5a63 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala @@ -145,7 +145,8 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans log.info("Locally realigning indels.") val consensusGenerator = Option(args.knownIndelsFile) .fold(new ConsensusGeneratorFromReads().asInstanceOf[ConsensusGenerator])( - new ConsensusGeneratorFromKnowns(_, sc).asInstanceOf[ConsensusGenerator]) + new ConsensusGeneratorFromKnowns(_, sc).asInstanceOf[ConsensusGenerator] + ) adamRecords = oldRdd.adamRealignIndels( consensusGenerator, @@ -227,7 +228,8 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans if ((args.useAlignedReadPredicate || args.limitProjection) && (args.forceLoadBam || args.forceLoadFastq || args.forceLoadIFastq)) { throw new IllegalArgumentException( - "-aligned_read_predicate and -limit_projection only apply to Parquet files, but a non-Parquet force load flag was passed.") + "-aligned_read_predicate and -limit_projection only apply to Parquet files, but a non-Parquet force load flag was passed." 
+ ) } val rdd = @@ -246,7 +248,8 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans None } val proj = if (args.limitProjection) { - Some(Projection(AlignmentRecordField.contig, + Some(Projection( + AlignmentRecordField.contig, AlignmentRecordField.start, AlignmentRecordField.end, AlignmentRecordField.mapq, @@ -265,13 +268,16 @@ AlignmentRecordField.duplicateRead, AlignmentRecordField.mismatchingPositions, AlignmentRecordField.secondaryAlignment, - AlignmentRecordField.supplementaryAlignment)) + AlignmentRecordField.supplementaryAlignment + )) } else { None } - sc.loadParquetAlignments(args.inputPath, + sc.loadParquetAlignments( + args.inputPath, predicate = pred, - projection = proj) + projection = proj + ) } else { sc.loadAlignments( args.inputPath, @@ -297,8 +303,7 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans concatFilename, recordGroupOpt = Option(args.fastqRecordGroup) ) - } - ) + }) this.apply(concatRddOpt match { case Some(concatRdd) => rdd ++ concatRdd diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala index 4b022ba26f..d7a7ff7d54 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala @@ -36,41 +36,47 @@ class ViewArgs extends Args4jBase with ParquetArgs with ADAMSaveAnyArgs { required = false, name = "-f", metaVar = "N", - usage = "Restrict to reads that match all of the bits in <N>") + usage = "Restrict to reads that match all of the bits in <N>" + ) var matchAllBits: Int = 0 @Args4jOption( required = false, name = "-F", metaVar = "N", - usage = "Restrict to reads that match none of the bits in <N>") + usage = "Restrict to reads that match none of the bits in <N>" + ) var mismatchAllBits: Int = 0 @Args4jOption( required = false, name = "-g", metaVar = "N", - usage = "Restrict to reads that match any of the bits in <N>") + usage = "Restrict to reads that match any of the bits in <N>" + ) var matchSomeBits: Int = 0 @Args4jOption( required = false, name = "-G", metaVar = "N", - usage = "Restrict to reads that mismatch at least one of the bits in <N>") + usage = "Restrict to reads that mismatch at least one of the bits in <N>" + ) var mismatchSomeBits: Int = 0 @Args4jOption( required = false, name = "-c", - usage = "Print count of matching records, instead of the records themselves") + usage = "Print count of matching records, instead of the records themselves" + ) var printCount: Boolean = false @Args4jOption( required = false, name = "-o", metaVar = "<FILE>", - usage = "Output to <FILE>; can also pass <FILE> as the second argument") + usage = "Output to <FILE>; can also pass <FILE> as the second argument" + ) var outputPathArg: String = null @Args4jOption(required = false, name = "-sort_fastq_output", usage = "Sets whether to sort the FASTQ output, if saving as FASTQ. False by default.
Ignored if not saving as FASTQ.") @@ -148,8 +154,7 @@ class View(val args: ViewArgs) extends BDGSparkCommand[ViewArgs] { reads.filter(read => allFilters.forall(_(read)) && (matchSomeFilters.isEmpty || matchSomeFilters.exists(_(read))) && - (mismatchSomeFilters.isEmpty || mismatchSomeFilters.exists(_(read))) - ) + (mismatchSomeFilters.isEmpty || mismatchSomeFilters.exists(_(read)))) } else reads } diff --git a/adam-core/src/main/java/org/bdgenomics/adam/io/IndexedBamInputFormat.scala b/adam-core/src/main/java/org/bdgenomics/adam/io/IndexedBamInputFormat.scala index 154efb7d35..db9d08bf6d 100644 --- a/adam-core/src/main/java/org/bdgenomics/adam/io/IndexedBamInputFormat.scala +++ b/adam-core/src/main/java/org/bdgenomics/adam/io/IndexedBamInputFormat.scala @@ -57,7 +57,7 @@ class IndexedBamInputFormat extends BAMInputFormat { override def createRecordReader(split: InputSplit, ctx: TaskAttemptContext): RecordReader[LongWritable, SAMRecordWritable] = { val rr: RecordReader[LongWritable, SAMRecordWritable] = new BAMFilteredRecordReader() assert(IndexedBamInputFormat.optViewRegion.isDefined) - BAMFilteredRecordReader.setRegion(IndexedBamInputFormat.optViewRegion.get) + IndexedBamInputFormat.optViewRegion.foreach { (refReg) => BAMFilteredRecordReader.setRegion(refReg) } rr.initialize(split, ctx) rr } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala index 7727c75eed..0f833da438 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala @@ -39,9 +39,10 @@ abstract class ConsensusGenerator extends Serializable { * @param reads Reads to preprocess. * @return Preprocessed reads. */ - def preprocessReadsForRealignment(reads: Iterable[RichAlignmentRecord], - reference: String, - region: ReferenceRegion): Iterable[RichAlignmentRecord] + def preprocessReadsForRealignment( + reads: Iterable[RichAlignmentRecord], + reference: String, + region: ReferenceRegion): Iterable[RichAlignmentRecord] /** * For all reads in this region, generates the list of consensus sequences for realignment. diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromKnowns.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromKnowns.scala index 49e83f6d2d..d7ba7392d4 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromKnowns.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromKnowns.scala @@ -51,9 +51,10 @@ class ConsensusGeneratorFromKnowns(file: String, @transient sc: SparkContext) ex * @param reads Reads to preprocess. * @return Preprocessed reads. 
*/ - def preprocessReadsForRealignment(reads: Iterable[RichAlignmentRecord], - reference: String, - region: ReferenceRegion): Iterable[RichAlignmentRecord] = { + def preprocessReadsForRealignment( + reads: Iterable[RichAlignmentRecord], + reference: String, + region: ReferenceRegion): Iterable[RichAlignmentRecord] = { reads } @@ -67,8 +68,8 @@ class ConsensusGeneratorFromKnowns(file: String, @transient sc: SparkContext) ex val table = indelTable.value // get region - val start = reads.map(_.record.getStart.toLong).reduce(_ min _) - val end = reads.map(_.getEnd.toLong).reduce(_ max _) + val start = reads.map(_.record.getStart).min + val end = reads.map(_.getEnd).max val refId = reads.head.record.getContig.getContigName val region = ReferenceRegion(refId, start, end + 1) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReads.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReads.scala index 936d203aa1..0fbf45ea50 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReads.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReads.scala @@ -44,9 +44,10 @@ class ConsensusGeneratorFromReads extends ConsensusGenerator { * @param reads Reads to process. * @return Reads with indels normalized if they contain a single indel. */ - def preprocessReadsForRealignment(reads: Iterable[RichAlignmentRecord], - reference: String, - region: ReferenceRegion): Iterable[RichAlignmentRecord] = { + def preprocessReadsForRealignment( + reads: Iterable[RichAlignmentRecord], + reference: String, + region: ReferenceRegion): Iterable[RichAlignmentRecord] = { reads.map(r => { // if there are two alignment blocks (sequence matches) then there is a single indel in the read if (r.samtoolsCigar.numAlignmentBlocks == 2) { @@ -74,10 +75,14 @@ class ConsensusGeneratorFromReads extends ConsensusGenerator { .flatMap(r => { // try to generate a consensus alignment - if a consensus exists, add it to our // list of consensuses to test - Consensus.generateAlternateConsensus(r.getSequence, - ReferencePosition(r.getContig.getContigName, - r.getStart), - r.samtoolsCigar) + Consensus.generateAlternateConsensus( + r.getSequence, + ReferencePosition( + r.getContig.getContigName, + r.getStart + ), + r.samtoolsCigar + ) }) .toSeq .distinct diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromSmithWaterman.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromSmithWaterman.scala index f9f08ee33d..dc3e8c6ff2 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromSmithWaterman.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromSmithWaterman.scala @@ -25,10 +25,11 @@ import org.bdgenomics.adam.rich.RichCigar._ import org.bdgenomics.adam.util.MdTag import org.bdgenomics.formats.avro.AlignmentRecord -class ConsensusGeneratorFromSmithWaterman(wMatch: Double, - wMismatch: Double, - wInsert: Double, - wDelete: Double) extends ConsensusGeneratorFromReads { +class ConsensusGeneratorFromSmithWaterman( + wMatch: Double, + wMismatch: Double, + wInsert: Double, + wDelete: Double) extends ConsensusGeneratorFromReads { /** * Attempts realignment of all reads using Smith-Waterman. 
Accepts all realignments that have one @@ -37,25 +38,30 @@ class ConsensusGeneratorFromSmithWaterman(wMatch: Double, * @param reads Reads to process. * @return Reads with indels normalized if they contain a single indel. */ - override def preprocessReadsForRealignment(reads: Iterable[RichAlignmentRecord], - reference: String, - region: ReferenceRegion): Iterable[RichAlignmentRecord] = { + override def preprocessReadsForRealignment( + reads: Iterable[RichAlignmentRecord], + reference: String, + region: ReferenceRegion): Iterable[RichAlignmentRecord] = { val rds: Iterable[RichAlignmentRecord] = reads.map(r => { - val sw = new SmithWatermanConstantGapScoring(r.record.getSequence.toString, + val sw = new SmithWatermanConstantGapScoring( + r.record.getSequence, reference, wMatch, wMismatch, wInsert, - wDelete) + wDelete + ) println("for " + r.record.getReadName + " sw to " + sw.xStart + " with " + sw.cigarX) // if we realign with fewer than three alignment blocks, then take the new alignment if (sw.cigarX.numAlignmentBlocks <= 2) { - val mdTag = MdTag(r.record.getSequence.toString, + val mdTag = MdTag( + r.record.getSequence, reference.drop(sw.xStart), sw.cigarX, - region.start) + region.start + ) val newRead: RichAlignmentRecord = AlignmentRecord.newBuilder(r) .setStart(sw.xStart + region.start) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWaterman.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWaterman.scala index 4e2ab4af9b..65e231f56a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWaterman.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWaterman.scala @@ -130,11 +130,12 @@ abstract class SmithWaterman(xSequence: String, ySequence: String) extends Seria * * @see buildScoringMatrix */ - @tailrec private[smithwaterman] final def move(matrix: Array[Array[Char]], - i: Int, - j: Int, - cX: String, - cY: String): (String, String, Int, Int) = { + @tailrec private[smithwaterman] final def move( + matrix: Array[Array[Char]], + i: Int, + j: Int, + cX: String, + cY: String): (String, String, Int, Int) = { if (matrix(i)(j) == 'T') { // return if told to terminate (cigarFromRNNCigar(cX), cigarFromRNNCigar(cY), i, j) @@ -160,8 +161,9 @@ abstract class SmithWaterman(xSequence: String, ySequence: String) extends Seria * @param moveMatrix Move matrix to track back on. * @return Tuple of Cigar for X, Y. 
*/ - private[smithwaterman] def trackback(scoreMatrix: Array[Array[Double]], - moveMatrix: Array[Array[Char]]): (Cigar, Cigar, Int, Int) = { + private[smithwaterman] def trackback( + scoreMatrix: Array[Array[Double]], + moveMatrix: Array[Array[Char]]): (Cigar, Cigar, Int, Int) = { assert(scoreMatrix.length == xSequence.length + 1) assert(scoreMatrix.forall(_.length == ySequence.length + 1)) assert(moveMatrix.length == xSequence.length + 1) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanConstantGapScoring.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanConstantGapScoring.scala index 1b8d2c7302..bbe209c64b 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanConstantGapScoring.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanConstantGapScoring.scala @@ -33,11 +33,12 @@ object SmithWatermanConstantGapScoring { } -class SmithWatermanConstantGapScoring(xSequence: String, - ySequence: String, - wMatch: Double, - wMismatch: Double, - wInsert: Double, - wDelete: Double) +class SmithWatermanConstantGapScoring( + xSequence: String, + ySequence: String, + wMatch: Double, + wMismatch: Double, + wInsert: Double, + wDelete: Double) extends SmithWatermanGapScoringFromFn(xSequence, ySequence, SmithWatermanConstantGapScoring.constantGapFn(wMatch, wInsert, wDelete, wMismatch)) { } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanGapScoringFromFn.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanGapScoringFromFn.scala index 946acd3dae..3df9e2fb86 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanGapScoringFromFn.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/smithwaterman/SmithWatermanGapScoringFromFn.scala @@ -17,9 +17,10 @@ */ package org.bdgenomics.adam.algorithms.smithwaterman -abstract class SmithWatermanGapScoringFromFn(xSequence: String, - ySequence: String, - scoreFn: (Int, Int, Char, Char) => Double) +abstract class SmithWatermanGapScoringFromFn( + xSequence: String, + ySequence: String, + scoreFn: (Int, Int, Char, Char) => Double) extends SmithWaterman(xSequence, ySequence) { def buildScoringMatrix(): (Array[Array[Double]], Array[Array[Char]]) = { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala index 653b07f373..118284326e 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala @@ -39,9 +39,10 @@ class AlignmentRecordConverter extends Serializable { * @param adamRecord Read to convert to FASTQ. * @return Returns this read in string form. 
*/ - def convertToFastq(adamRecord: AlignmentRecord, - maybeAddSuffix: Boolean = false, - outputOriginalBaseQualities: Boolean = false): String = { + def convertToFastq( + adamRecord: AlignmentRecord, + maybeAddSuffix: Boolean = false, + outputOriginalBaseQualities: Boolean = false): String = { val readNameSuffix = if (maybeAddSuffix && !AlignmentRecordConverter.readNameHasPairedSuffix(adamRecord) && @@ -101,7 +102,7 @@ class AlignmentRecordConverter extends Serializable { val builder: SAMRecord = new SAMRecord(header.header) // set canonically necessary fields - builder.setReadName(adamRecord.getReadName.toString) + builder.setReadName(adamRecord.getReadName) builder.setReadString(adamRecord.getSequence) adamRecord.getQual match { case null => builder.setBaseQualityString("*") @@ -110,18 +111,16 @@ class AlignmentRecordConverter extends Serializable { // set read group flags Option(adamRecord.getRecordGroupName) - .map(_.toString) .map(rgDict.getSequenceIndex) - .foreach(v => builder.setAttribute("RG", v.toString)) + .foreach(v => builder.setAttribute("RG", v)) Option(adamRecord.getRecordGroupLibrary) - .foreach(v => builder.setAttribute("LB", v.toString)) + .foreach(v => builder.setAttribute("LB", v)) Option(adamRecord.getRecordGroupPlatformUnit) - .foreach(v => builder.setAttribute("PU", v.toString)) + .foreach(v => builder.setAttribute("PU", v)) // set the reference name, and alignment position, for mate Option(adamRecord.getMateContig) .map(_.getContigName) - .map(_.toString) .foreach(builder.setMateReferenceName) Option(adamRecord.getMateAlignmentStart) .foreach(s => builder.setMateAlignmentStart(s.toInt + 1)) @@ -161,9 +160,9 @@ class AlignmentRecordConverter extends Serializable { builder.setReferenceName(adamRecord.getContig.getContigName) // set the cigar, if provided - Option(adamRecord.getCigar).map(_.toString).foreach(builder.setCigarString) + Option(adamRecord.getCigar).foreach(builder.setCigarString) // set the old cigar, if provided - Option(adamRecord.getOldCigar).map(_.toString).foreach(v => builder.setAttribute("OC", v)) + Option(adamRecord.getOldCigar).foreach(v => builder.setAttribute("OC", v)) // set mapping flags Option(adamRecord.getReadNegativeStrand) .foreach(v => builder.setReadNegativeStrandFlag(v.booleanValue)) @@ -184,7 +183,6 @@ class AlignmentRecordConverter extends Serializable { Option(adamRecord.getFailedVendorQualityChecks) .foreach(v => builder.setReadFailsVendorQualityCheckFlag(v.booleanValue)) Option(adamRecord.getMismatchingPositions) - .map(_.toString) .foreach(builder.setAttribute("MD", _)) // add all other tags @@ -195,7 +193,7 @@ class AlignmentRecordConverter extends Serializable { }) } - // return sam record + // return sam record builder } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala index 010c3195e7..14a63b6c21 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala @@ -34,13 +34,13 @@ private[adam] object FastaConverter { val (contigName, contigDescription) = parseDescriptionLine(descriptionLine, fileIndex) private def parseDescriptionLine(descriptionLine: Option[String], id: Long): (Option[String], Option[String]) = { - if (descriptionLine.isEmpty) { + descriptionLine.fold { assert(id == -1L, "Cannot have a headerless line in a file with more than one fragment.") - (None, None) - } else { - val 
splitIndex = descriptionLine.get.indexOf(' ') + (None: Option[String], None: Option[String]) + } { (dL) => + val splitIndex = dL.indexOf(' ') if (splitIndex >= 0) { - val split = descriptionLine.get.splitAt(splitIndex) + val split = dL.splitAt(splitIndex) val contigName: String = split._1.stripPrefix(">").trim val contigDescription: String = split._2.trim @@ -48,7 +48,7 @@ private[adam] object FastaConverter { (Some(contigName), Some(contigDescription)) } else { - (Some(descriptionLine.get.stripPrefix(">").trim), None) + (Some(dL.stripPrefix(">").trim), None) } } } @@ -70,8 +70,9 @@ private[adam] object FastaConverter { * @param maxFragmentLength The maximum length of fragments in the contig. * @return An RDD of ADAM FASTA data. */ - def apply(rdd: RDD[(Long, String)], - maxFragmentLength: Long = 10000L): RDD[NucleotideContigFragment] = { + def apply( + rdd: RDD[(Long, String)], + maxFragmentLength: Long = 10000L): RDD[NucleotideContigFragment] = { val filtered = rdd.map(kv => (kv._1, kv._2.trim())) .filter((kv: (Long, String)) => !kv._2.startsWith(";")) @@ -96,10 +97,12 @@ private[adam] object FastaConverter { assert(lines.size != 0, "Sequence " + descriptionLine.seqId + " has no sequence data.") val sequence: Seq[String] = lines.toSeq.sortBy(_._1).map(kv => cleanSequence(kv._2)) - converter.convert(descriptionLine.contigName, + converter.convert( + descriptionLine.contigName, descriptionLine.seqId, sequence, - descriptionLine.contigDescription) + descriptionLine.contigDescription + ) } } @@ -182,13 +185,14 @@ private[converters] class FastaConverter(fragmentLength: Long) extends Serializa * @param description Optional description of the sequence. * @return The converted ADAM FASTA contig. */ - def convert(name: Option[String], - id: Int, - sequence: Seq[String], - description: Option[String]): Seq[NucleotideContigFragment] = { + def convert( + name: Option[String], + id: Int, + sequence: Seq[String], + description: Option[String]): Seq[NucleotideContigFragment] = { // get sequence length - val sequenceLength = sequence.map(_.length).reduce(_ + _) + val sequenceLength = sequence.map(_.length).sum // map sequences into fragments val sequencesAsFragments = mapFragments(sequence) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastqRecordConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastqRecordConverter.scala index d2379f548c..158edb5ef9 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastqRecordConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastqRecordConverter.scala @@ -38,31 +38,36 @@ class FastqRecordConverter extends Serializable with Logging { val firstReadSequence = lines(1) val firstReadQualities = lines(3) - require(firstReadSequence.length == firstReadQualities.length, - "Read " + firstReadName + " has different sequence and qual length.") + require( + firstReadSequence.length == firstReadQualities.length, + "Read " + firstReadName + " has different sequence and qual length." + ) // get fields for second read in pair val secondReadName = lines(4).drop(1) val secondReadSequence = lines(5) val secondReadQualities = lines(7) - require(secondReadSequence.length == secondReadQualities.length, - "Read " + secondReadName + " has different sequence and qual length.") + require( + secondReadSequence.length == secondReadQualities.length, + "Read " + secondReadName + " has different sequence and qual length." 
+ ) // build and return iterators - Iterable(AlignmentRecord.newBuilder() - .setReadName(firstReadName) - .setSequence(firstReadSequence) - .setQual(firstReadQualities) - .setReadPaired(true) - .setProperPair(true) - .setReadNum(0) - .setReadNegativeStrand(null) - .setMateNegativeStrand(null) - .setPrimaryAlignment(null) - .setSecondaryAlignment(null) - .setSupplementaryAlignment(null) - .build(), + Iterable( + AlignmentRecord.newBuilder() + .setReadName(firstReadName) + .setSequence(firstReadSequence) + .setQual(firstReadQualities) + .setReadPaired(true) + .setProperPair(true) + .setReadNum(0) + .setReadNegativeStrand(null) + .setMateNegativeStrand(null) + .setPrimaryAlignment(null) + .setSecondaryAlignment(null) + .setSupplementaryAlignment(null) + .build(), AlignmentRecord.newBuilder() .setReadName(secondReadName) .setSequence(secondReadSequence) @@ -75,7 +80,8 @@ class FastqRecordConverter extends Serializable with Logging { .setPrimaryAlignment(null) .setSecondaryAlignment(null) .setSupplementaryAlignment(null) - .build()) + .build() + ) } def convertFragment(element: (Void, Text)): Fragment = { @@ -87,19 +93,27 @@ class FastqRecordConverter extends Serializable with Logging { val firstReadSequence = lines(1) val firstReadQualities = lines(3) - require(firstReadSequence.length == firstReadQualities.length, - "Read " + firstReadName + " has different sequence and qual length.") + require( + firstReadSequence.length == firstReadQualities.length, + "Read " + firstReadName + " has different sequence and qual length." + ) // get fields for second read in pair val secondReadName = lines(4).drop(1) val secondReadSequence = lines(5) val secondReadQualities = lines(7) - require(secondReadSequence.length == secondReadQualities.length, - "Read " + secondReadName + " has different sequence and qual length.") - require(firstReadName == secondReadName, - "Reads %s and %s in Fragment have different names.".format(firstReadName, - secondReadName)) + require( + secondReadSequence.length == secondReadQualities.length, + "Read " + secondReadName + " has different sequence and qual length." + ) + require( + firstReadName == secondReadName, + "Reads %s and %s in Fragment have different names.".format( + firstReadName, + secondReadName + ) + ) // build and return record Fragment.newBuilder() @@ -114,11 +128,12 @@ class FastqRecordConverter extends Serializable with Logging { .build() } - def convertRead(element: (Void, Text), - recordGroupOpt: Option[String] = None, - setFirstOfPair: Boolean = false, - setSecondOfPair: Boolean = false, - stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecord = { + def convertRead( + element: (Void, Text), + recordGroupOpt: Option[String] = None, + setFirstOfPair: Boolean = false, + setSecondOfPair: Boolean = false, + stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecord = { val lines = element._2.toString.split('\n') require(lines.length == 4, "Record has wrong format:\n" + element._2.toString) @@ -146,10 +161,11 @@ class FastqRecordConverter extends Serializable with Logging { val readName = trimTrailingReadNumber(lines(0).drop(1)) val readSequence = lines(1) + lazy val suffix = s"\n=== printing received Fastq record for debugging ===\n${lines.mkString("\n")}\n=== End of debug output for Fastq record ===" if (stringency == ValidationStringency.STRICT && lines(3) == "*" && readSequence.length > 1) - throw new Exception(s"Fastq quality must be defined") + throw new IllegalArgumentException(s"Fastq quality must be defined. 
$suffix") else if (stringency == ValidationStringency.STRICT && lines(3).length != readSequence.length) - throw new Exception(s"Fastq sequence and quality strings must have the same length") + throw new IllegalArgumentException(s"Fastq sequence and quality strings must have the same length.\n Fastq quality string of length ${lines(3).length}, expected ${readSequence.length} from the sequence length. $suffix") val readQualities = if (lines(3) == "*") @@ -157,13 +173,15 @@ class FastqRecordConverter extends Serializable with Logging { else if (lines(3).length < lines(1).length) lines(3) + ("B" * (lines(1).length - lines(3).length)) else if (lines(3).length > lines(1).length) - throw new Exception(s"Not implemented") + throw new NotImplementedError("Not implemented") else lines(3) - require(readSequence.length == readQualities.length, + require( + readSequence.length == readQualities.length, "Read " + readName + " has different sequence and qual length: " + - "\n\tsequence=" + readSequence + "\n\tqual=" + readQualities) + "\n\tsequence=" + readSequence + "\n\tqual=" + readQualities + ) val builder = AlignmentRecord.newBuilder() .setReadName(readName) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala index 8ee7d80e16..b0d8b1d8e2 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala @@ -25,8 +25,10 @@ import scala.annotation.tailrec private[converters] object FragmentCollector extends Serializable { def apply(fragment: NucleotideContigFragment): (Contig, FragmentCollector) = { - (fragment.getContig, - FragmentCollector(Seq((ReferenceRegion(fragment).get, fragment.getFragmentSequence)))) + ( + fragment.getContig, + FragmentCollector(Seq((ReferenceRegion(fragment).get, fragment.getFragmentSequence))) + ) } } @@ -35,8 +37,9 @@ private[converters] case class FragmentCollector(fragments: Seq[(ReferenceRegion object FragmentConverter extends Serializable { - private def mergeFragments(f1: FragmentCollector, - f2: FragmentCollector): FragmentCollector = { + private def mergeFragments( + f1: FragmentCollector, + f2: FragmentCollector): FragmentCollector = { assert(!(f1.fragments.isEmpty || f2.fragments.isEmpty)) // join fragments from each and sort @@ -44,8 +47,9 @@ object FragmentConverter extends Serializable { var fragmentList = List[(ReferenceRegion, String)]() - @tailrec def fragmentCombiner(lastFragment: (ReferenceRegion, String), - iter: Iterator[(ReferenceRegion, String)]) { + @tailrec def fragmentCombiner( + lastFragment: (ReferenceRegion, String), + iter: Iterator[(ReferenceRegion, String)]) { if (!iter.hasNext) { // prepend fragment to list fragmentList = lastFragment :: fragmentList diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/GenotypesToVariantsConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/GenotypesToVariantsConverter.scala index 8629f794ef..b2aaf3df32 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/GenotypesToVariantsConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/GenotypesToVariantsConverter.scala @@ -20,8 +20,9 @@ package org.bdgenomics.adam.converters import org.bdgenomics.adam.util._ import scala.math.{ pow, sqrt } -private[adam] class GenotypesToVariantsConverter(validateSamples: Boolean = false, - failOnValidationError: Boolean = false) 
extends Serializable { +private[adam] class GenotypesToVariantsConverter( + validateSamples: Boolean = false, + failOnValidationError: Boolean = false) extends Serializable { /** * Computes root mean squared (RMS) values for a series of doubles. @@ -31,7 +32,7 @@ private[adam] class GenotypesToVariantsConverter(validateSamples: Boolean = fals */ def rms(values: Seq[Double]): Double = { if (values.length > 0) { - sqrt(values.map(pow(_, 2.0)).reduce(_ + _) / values.length.toDouble) + sqrt(values.map(pow(_, 2.0)).sum / values.length.toDouble) } else { 0.0 } @@ -67,5 +68,5 @@ private[adam] class GenotypesToVariantsConverter(validateSamples: Boolean = fals * @param values An array of non-phred scaled genotype quality scores. * @return A non-phred scaled variant likelihood. */ - def variantQualityFromGenotypes(values: Seq[Double]): Double = 1.0 - values.reduce(_ * _) + def variantQualityFromGenotypes(values: Seq[Double]): Double = 1.0 - values.product } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala index 80a3177381..6149373f3a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/SAMRecordConverter.scala @@ -36,9 +36,10 @@ import org.bdgenomics.formats.avro.AlignmentRecord import scala.collection.JavaConverters._ class SAMRecordConverter extends Serializable with Logging { - def convert(samRecord: SAMRecord, - dict: SequenceDictionary, - readGroups: RecordGroupDictionary): AlignmentRecord = { + def convert( + samRecord: SAMRecord, + dict: SequenceDictionary, + readGroups: RecordGroupDictionary): AlignmentRecord = { try { val cigar: String = samRecord.getCigarString val startTrim = if (cigar == "*") { @@ -79,12 +80,14 @@ class SAMRecordConverter extends Serializable with Logging { // This prevents looking up a -1 in the sequence dictionary val readReference: Int = samRecord.getReferenceIndex if (readReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - builder.setContig(SequenceRecord.toADAMContig(dict(samRecord.getReferenceName).get)) + dict(samRecord.getReferenceName).foreach { (rec) => + builder.setContig(SequenceRecord.toADAMContig(rec)) + } // set read alignment flag val start: Int = samRecord.getAlignmentStart assert(start != 0, "Start cannot equal 0 if contig is set.") - builder.setStart((start - 1).asInstanceOf[Long]) + builder.setStart((start - 1)) // set OP and OC flags, if applicable if (samRecord.getAttribute("OP") != null) { @@ -127,7 +130,9 @@ class SAMRecordConverter extends Serializable with Logging { val mateReference: Int = samRecord.getMateReferenceIndex if (mateReference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - builder.setMateContig(SequenceRecord.toADAMContig(dict(samRecord.getMateReferenceName).get)) + dict(samRecord.getMateReferenceName).foreach { (rec) => + builder.setMateContig(SequenceRecord.toADAMContig(rec)) + } val mateStart = samRecord.getMateAlignmentStart if (mateStart > 0) { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantAnnotationConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantAnnotationConverter.scala index 1b9fe87a6e..861bf2c7c6 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantAnnotationConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantAnnotationConverter.scala @@ -63,20 +63,24 @@ object 
VariantAnnotationConverter extends Serializable { AttrKey("geneSymbol", attrAsString _, new VCFInfoHeaderLine("GENE,", 1, VCFHeaderLineType.String, "Gene name")), AttrKey("strand", attrAsString _, new VCFInfoHeaderLine("STRAND,", 1, VCFHeaderLineType.String, "Gene strand")), AttrKey("cds", attrAsString _, new VCFInfoHeaderLine("CDS,", 1, VCFHeaderLineType.String, "CDS annotation")), - AttrKey("cnt", attrAsString _, new VCFInfoHeaderLine("CNT,", 1, VCFHeaderLineType.Integer, "How many samples have this mutation"))) + AttrKey("cnt", attrAsString _, new VCFInfoHeaderLine("CNT,", 1, VCFHeaderLineType.Integer, "How many samples have this mutation")) + ) val DBNSFP_KEYS: List[AttrKey] = List( AttrKey("phylop", attrAsFloat _, new VCFInfoHeaderLine("PHYLOP", 1, VCFHeaderLineType.Float, "PhyloP score. The larger the score, the more conserved the site.")), AttrKey("siftPred", attrAsString _, new VCFInfoHeaderLine("SIFT_PRED", 1, VCFHeaderLineType.Character, "SIFT Prediction: D (damaging), T (tolerated)")), AttrKey("siftScore", attrAsFloat _, new VCFInfoHeaderLine("SIFT_SCORE", 1, VCFHeaderLineType.Float, "SIFT Score")), - AttrKey("ancestralAllele", attrAsString _, new VCFInfoHeaderLine("AA", 1, VCFHeaderLineType.String, "Ancestral allele"))) + AttrKey("ancestralAllele", attrAsString _, new VCFInfoHeaderLine("AA", 1, VCFHeaderLineType.String, "Ancestral allele")) + ) val CLINVAR_KEYS: List[AttrKey] = List( AttrKey("dbSnpId", attrAsInt _, new VCFInfoHeaderLine("dbSNP ID", 1, VCFHeaderLineType.Integer, "dbSNP ID")), - AttrKey("geneSymbol", attrAsString _, new VCFInfoHeaderLine("GENEINFO", 1, VCFHeaderLineType.String, "Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:) and each pair is delimited by a vertical bar"))) + AttrKey("geneSymbol", attrAsString _, new VCFInfoHeaderLine("GENEINFO", 1, VCFHeaderLineType.String, "Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:) and each pair is delimited by a vertical bar")) + ) val OMIM_KEYS: List[AttrKey] = List( - AttrKey("omimId", attrAsString _, new VCFInfoHeaderLine("VAR", 1, VCFHeaderLineType.String, "MIM entry with variant mapped to rsID"))) + AttrKey("omimId", attrAsString _, new VCFInfoHeaderLine("VAR", 1, VCFHeaderLineType.String, "MIM entry with variant mapped to rsID")) + ) val INFO_KEYS: Seq[AttrKey] = Seq( AttrKey("fisherStrandBiasPValue", attrAsFloat _, VCFStandardHeaderLines.getInfoLine(VCFConstants.STRAND_BIAS_KEY)), @@ -85,7 +89,8 @@ object VariantAnnotationConverter extends Serializable { AttrKey("mqRankSum", attrAsFloat _, new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")), AttrKey("readPositionRankSum", attrAsFloat _, new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. 
Ref read position bias")), AttrKey("vqslod", attrAsFloat _, new VCFInfoHeaderLine("VQSLOD", 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")), - AttrKey("culprit", attrAsString _, new VCFInfoHeaderLine("culprit", 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"))) + AttrKey("culprit", attrAsString _, new VCFInfoHeaderLine("culprit", 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")) + ) val FORMAT_KEYS: Seq[AttrKey] = Seq( AttrKey("alleles", VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY)), @@ -97,7 +102,8 @@ object VariantAnnotationConverter extends Serializable { AttrKey("phaseQuality", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")), AttrKey("phaseSetId", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.Integer, "Phase set")), AttrKey("minReadDepth", attrAsInt _, new VCFFormatHeaderLine("MIN_DP", 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")), - AttrKey("strandBiasComponents", attrAsInt _, new VCFFormatHeaderLine("SB", 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."))) + AttrKey("strandBiasComponents", attrAsInt _, new VCFFormatHeaderLine("SB", 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")) + ) lazy val infoHeaderLines: Seq[VCFCompoundHeaderLine] = INFO_KEYS.map(_.hdrLine) lazy val formatHeaderLines: Seq[VCFCompoundHeaderLine] = FORMAT_KEYS.map(_.hdrLine) @@ -113,7 +119,7 @@ object VariantAnnotationConverter extends Serializable { private def createFieldMap(keys: Seq[AttrKey], schema: Schema): Map[String, (Int, Object => Object)] = { keys.filter(_.attrConverter != null).map(field => { val avroField = schema.getField(field.adamKey) - field.vcfKey -> (avroField.pos, field.attrConverter) + field.vcfKey -> ((avroField.pos, field.attrConverter)) })(collection.breakOut) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala index 26563a6571..da28350a94 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala @@ -50,7 +50,7 @@ object VariantContextConverter { if (allele == null) Seq() else - Seq(Allele.create(allele.toString, isRef)) + Seq(Allele.create(allele, isRef)) } private def convertAlleles(v: Variant): java.util.Collection[Allele] = { @@ -62,8 +62,8 @@ object VariantContextConverter { if (alleles == null) return Collections.emptyList[Allele] else g.getAlleles.map { case GenotypeAllele.NoCall => Allele.NO_CALL - case GenotypeAllele.Ref | GenotypeAllele.OtherAlt => Allele.create(g.getVariant.getReferenceAllele.toString, true) - case GenotypeAllele.Alt => Allele.create(g.getVariant.getAlternateAllele.toString) + case GenotypeAllele.Ref | GenotypeAllele.OtherAlt => Allele.create(g.getVariant.getReferenceAllele, true) + case GenotypeAllele.Alt => Allele.create(g.getVariant.getAlternateAllele) } } @@ -110,15 +110,19 
@@ class VariantContextConverter(dict: Option[SequenceDictionary] = None) extends S return Seq(ADAMVariantContext(variant, genotypes, None)) } case List(allele) => { - assert(allele.isNonReference, - "Assertion failed when converting: " + vc.toString) + assert( + allele.isNonReference, + "Assertion failed when converting: " + vc.toString + ) val variant = createADAMVariant(vc, Some(allele.getDisplayString)) val genotypes = extractReferenceModelGenotypes(vc, variant, calling_annotations) return Seq(ADAMVariantContext(variant, genotypes, None)) } case List(allele, NON_REF_ALLELE) => { - assert(allele.isNonReference, - "Assertion failed when converting: " + vc.toString) + assert( + allele.isNonReference, + "Assertion failed when converting: " + vc.toString + ) val variant = createADAMVariant(vc, Some(allele.getDisplayString)) val genotypes = extractReferenceModelGenotypes(vc, variant, calling_annotations) return Seq(ADAMVariantContext(variant, genotypes, None)) @@ -189,13 +193,17 @@ class VariantContextConverter(dict: Option[SequenceDictionary] = None) extends S createADAMVariant(vc, None /* No alternate allele */ ) } case List(allele) => { - assert(allele.isNonReference, - "Assertion failed when converting: " + vc.toString) + assert( + allele.isNonReference, + "Assertion failed when converting: " + vc.toString + ) createADAMVariant(vc, Some(allele.getDisplayString)) } case List(allele, NON_REF_ALLELE) => { - assert(allele.isNonReference, - "Assertion failed when converting: " + vc.toString) + assert( + allele.isNonReference, + "Assertion failed when converting: " + vc.toString + ) createADAMVariant(vc, Some(allele.getDisplayString)) } case alleles :+ NON_REF_ALLELE => { @@ -266,7 +274,8 @@ class VariantContextConverter(dict: Option[SequenceDictionary] = None) extends S setPL(g, genotype) VariantAnnotationConverter.convert(g, genotype.build) - }).toSeq + } + ).toSeq genotypes } @@ -322,8 +331,10 @@ class VariantContextConverter(dict: Option[SequenceDictionary] = None) extends S def convert(vc: ADAMVariantContext): BroadVariantContext = { val variant: Variant = vc.variant val vcb = new VariantContextBuilder() - .chr(refSeqToContig.getOrElse(variant.getContig.getContigName.toString, - variant.getContig.getContigName.toString)) + .chr(refSeqToContig.getOrElse( + variant.getContig.getContigName, + variant.getContig.getContigName + )) .start(variant.getStart + 1 /* Recall ADAM is 0-indexed */ ) .stop(variant.getStart + variant.getReferenceAllele.length) .alleles(VariantContextConverter.convertAlleles(variant)) @@ -334,7 +345,8 @@ class VariantContextConverter(dict: Option[SequenceDictionary] = None) extends S try { vcb.genotypes(vc.genotypes.map(g => { val gb = new htsjdk.variant.variantcontext.GenotypeBuilder( - g.getSampleId.toString, VariantContextConverter.convertAlleles(g)) + g.getSampleId, VariantContextConverter.convertAlleles(g) + ) Option(g.getIsPhased).foreach(gb.phased(_)) Option(g.getGenotypeQuality).foreach(gb.GQ(_)) @@ -346,7 +358,7 @@ class VariantContextConverter(dict: Option[SequenceDictionary] = None) extends S if (g.getVariantCallingAnnotations != null) { val callAnnotations = g.getVariantCallingAnnotations() if (callAnnotations.getVariantFilters != null) - gb.filters(callAnnotations.getVariantFilters.map(_.toString)) + gb.filters(callAnnotations.getVariantFilters) } if (g.getGenotypeLikelihoods != null && !g.getGenotypeLikelihoods.isEmpty) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala 
b/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala index 391809da75..63d80619a0 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala @@ -52,7 +52,8 @@ trait Alphabet { * @throws IllegalArgumentException if the string contains a symbol which is not in the alphabet */ def reverseComplementExact(s: String): String = { - reverseComplement(s, + reverseComplement( + s, (symbol: Char) => throw new IllegalArgumentException("Character %s not found in alphabet.".format(symbol)) ) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala index 7ae90dc3ca..8e461f3d28 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala @@ -73,13 +73,14 @@ case class Gene(id: String, names: Seq[String], strand: Boolean, transcripts: It * transcript * @param utrs */ -case class Transcript(id: String, - names: Seq[String], - geneId: String, - strand: Boolean, - exons: Iterable[Exon], - cds: Iterable[CDS], - utrs: Iterable[UTR]) { +case class Transcript( + id: String, + names: Seq[String], + geneId: String, + strand: Boolean, + exons: Iterable[Exon], + cds: Iterable[CDS], + utrs: Iterable[UTR]) { lazy val region = exons.map(_.region).reduceLeft[ReferenceRegion]((acc, ex) => acc.hull(ex)) @@ -92,10 +93,10 @@ case class Transcript(id: String, * @return the String representation of this Transcript's spliced mRNA sequence */ def extractTranscribedRNASequence(referenceSequence: String): String = { - val minStart = exons.map(_.region.start).toSeq.sorted.head.toInt + val minStart = exons.map(_.region.start).min.toInt // takes the max... - val maxEnd = -exons.map(-_.region.end).toSeq.sorted.head.toInt + val maxEnd = exons.map(_.region.end).max.toInt if (strand) referenceSequence.substring(minStart, maxEnd) else @@ -231,4 +232,3 @@ object ReferenceUtils { refs.toSeq.sorted.foldLeft(Seq[ReferenceRegion]())(folder) } } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala index 6aa54ace00..8b5dade283 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala @@ -24,8 +24,10 @@ import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.formats.avro.Variant class IndelTable(private val table: Map[String, Iterable[Consensus]]) extends Serializable with Logging { - log.info("Indel table has %s contigs and %s entries".format(table.size, - table.values.map(_.size).sum)) + log.info("Indel table has %s contigs and %s entries".format( + table.size, + table.values.map(_.size).sum + )) /** * Returns all known indels within the given reference region. If none are known, returns an empty Seq. 
@@ -67,7 +69,7 @@ object IndelTable { def apply(variants: RDD[Variant]): IndelTable = { val consensus: Map[String, Iterable[Consensus]] = variants.filter(v => v.getReferenceAllele.length != v.getAlternateAllele.length) .map(v => { - val referenceName = v.getContig.getContigName.toString + val referenceName = v.getContig.getContigName val consensus = if (v.getReferenceAllele.length > v.getAlternateAllele.length) { // deletion val deletionLength = v.getReferenceAllele.length - v.getAlternateAllele.length @@ -77,7 +79,7 @@ object IndelTable { } else { val start = v.getStart + v.getReferenceAllele.length - Consensus(v.getAlternateAllele.toString.drop(v.getReferenceAllele.length), ReferenceRegion(referenceName, start, start + 1)) + Consensus(v.getAlternateAllele.drop(v.getReferenceAllele.length), ReferenceRegion(referenceName, start, start + 1)) } (referenceName, consensus) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala index ef0c06ad2e..68822e22e5 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala @@ -74,7 +74,8 @@ class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) extends Serializ def mergeRegions(regs: Seq[(ReferenceRegion)]): List[ReferenceRegion] = regs.aggregate(List[ReferenceRegion]())( (lst: List[ReferenceRegion], p: (ReferenceRegion)) => updateListWithRegion(lst, p), - (a, b) => a ++ b) + (a, b) => a ++ b + ) def binaryPointSearch(pos: Long, lessThan: Boolean): Int = { var i = 0 @@ -186,8 +187,10 @@ object NonoverlappingRegions { * dictionary. */ class MultiContigNonoverlappingRegions(regions: Seq[(String, Iterable[ReferenceRegion])]) extends Serializable { - assert(regions != null, - "Regions was set to null") + assert( + regions != null, + "Regions was set to null" + ) val regionMap: Map[String, NonoverlappingRegions] = Map(regions.map(r => (r._1, new NonoverlappingRegions(r._2))): _*) @@ -205,7 +208,8 @@ object MultiContigNonoverlappingRegions { values.map(kv => (kv._1.referenceName, kv._1)) .groupBy(t => t._1) .map(t => (t._1, t._2.map(k => k._2))) - .toSeq) + .toSeq + ) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala index 9db3bdddd2..858518fe67 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala @@ -27,20 +27,21 @@ object ProgramRecord { val id: String = pr.getId // these fields are optional and can be left null, so must check for null... 
- val commandLine: Option[String] = Option(pr.getCommandLine).map(_.toString) - val name: Option[String] = Option(pr.getProgramName).map(_.toString) - val version: Option[String] = Option(pr.getProgramVersion).map(_.toString) - val previousID: Option[String] = Option(pr.getPreviousProgramGroupId).map(_.toString) + val commandLine: Option[String] = Option(pr.getCommandLine) + val name: Option[String] = Option(pr.getProgramName) + val version: Option[String] = Option(pr.getProgramVersion) + val previousID: Option[String] = Option(pr.getPreviousProgramGroupId) new ProgramRecord(id, commandLine, name, version, previousID) } } -case class ProgramRecord(id: String, - commandLine: Option[String], - name: Option[String], - version: Option[String], - previousID: Option[String]) { +case class ProgramRecord( + id: String, + commandLine: Option[String], + name: Option[String], + version: Option[String], + previousID: Option[String]) { def toSAMProgramRecord(): SAMProgramRecord = { val pr = new SAMProgramRecord(id) @@ -54,4 +55,3 @@ case class ProgramRecord(id: String, pr } } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadBucket.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadBucket.scala index 9f60044d83..a915117f31 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadBucket.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadBucket.scala @@ -30,13 +30,14 @@ import org.bdgenomics.formats.avro.AlignmentRecord * * This is useful as this will usually map a single read in any of the sequences. */ -case class ReadBucket(unpairedPrimaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - pairedFirstPrimaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - pairedSecondPrimaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - unpairedSecondaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - pairedFirstSecondaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - pairedSecondSecondaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - unmappedReads: Iterable[AlignmentRecord] = Seq.empty) { +case class ReadBucket( + unpairedPrimaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, + pairedFirstPrimaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, + pairedSecondPrimaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, + unpairedSecondaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, + pairedFirstSecondaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, + pairedSecondSecondaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, + unmappedReads: Iterable[AlignmentRecord] = Seq.empty) { def allReads(): Iterable[AlignmentRecord] = unpairedPrimaryMappedReads ++ pairedFirstPrimaryMappedReads ++ @@ -89,31 +90,40 @@ class ReadBucketSerializer extends Serializer[ReadBucket] { unpairedSecondaryReads, pairedFirstSecondaryMappedReads, pairedSecondSecondaryMappedReads, - unmappedReads) + unmappedReads + ) } } object ReadBucket { implicit def singleReadBucketToReadBucket(bucket: SingleReadBucket): ReadBucket = { // check that reads are either first or second read from fragment - bucket.primaryMapped.foreach(r => require(r.getReadNum >= 0 && r.getReadNum <= 1, - "Read %s is not first or second read from pair (num = %d).".format(r, r.getReadNum))) - bucket.secondaryMapped.foreach(r => require(r.getReadNum >= 0 && r.getReadNum <= 1, - "Read %s is not first or second read from pair (num = %d).".format(r, r.getReadNum))) - bucket.unmapped.foreach(r => require(r.getReadNum >= 0 && r.getReadNum <= 1, - "Read %s is not first or second 
read from pair (num = %d).".format(r, r.getReadNum))) + bucket.primaryMapped.foreach(r => require( + r.getReadNum >= 0 && r.getReadNum <= 1, + "Read %s is not first or second read from pair (num = %d).".format(r, r.getReadNum) + )) + bucket.secondaryMapped.foreach(r => require( + r.getReadNum >= 0 && r.getReadNum <= 1, + "Read %s is not first or second read from pair (num = %d).".format(r, r.getReadNum) + )) + bucket.unmapped.foreach(r => require( + r.getReadNum >= 0 && r.getReadNum <= 1, + "Read %s is not first or second read from pair (num = %d).".format(r, r.getReadNum) + )) val (pairedPrimary, unpairedPrimary) = bucket.primaryMapped.partition(_.getReadPaired) val (pairedFirstPrimary, pairedSecondPrimary) = pairedPrimary.partition(_.getReadNum == 0) val (pairedSecondary, unpairedSecondary) = bucket.secondaryMapped.partition(_.getReadPaired) val (pairedFirstSecondary, pairedSecondSecondary) = pairedSecondary.partition(_.getReadNum == 0) - new ReadBucket(unpairedPrimary, + new ReadBucket( + unpairedPrimary, pairedFirstPrimary, pairedSecondPrimary, unpairedSecondary, pairedFirstSecondary, pairedSecondSecondary, - bucket.unmapped) + bucket.unmapped + ) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/RecordGroupDictionary.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/RecordGroupDictionary.scala index df25dbd969..658c6c00b0 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/RecordGroupDictionary.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/RecordGroupDictionary.scala @@ -68,8 +68,10 @@ class RecordGroupDictionary(val recordGroups: Seq[RecordGroup]) extends Serializ (name, (group, index)) }).toMap - assert(recordGroupMap.size == recordGroups.length, - "Read group dictionary contains multiple samples with identical read group names.") + assert( + recordGroupMap.size == recordGroups.length, + "Read group dictionary contains multiple samples with identical read group names." + ) def ++(that: RecordGroupDictionary): RecordGroupDictionary = { new RecordGroupDictionary(recordGroups ++ that.recordGroups) @@ -133,9 +135,12 @@ object RecordGroup { * @return Returns an equivalent ADAM format record group. 
*/ def apply(samRGR: SAMReadGroupRecord): RecordGroup = { - assert(samRGR.getSample != null, - "Sample ID is not set for read group " + samRGR.getReadGroupId) - new RecordGroup(samRGR.getSample, + assert( + samRGR.getSample != null, + "Sample ID is not set for read group " + samRGR.getReadGroupId + ) + new RecordGroup( + samRGR.getSample, samRGR.getReadGroupId, Option(samRGR.getSequencingCenter).map(_.toString), Option(samRGR.getDescription).map(_.toString), @@ -149,21 +154,23 @@ object RecordGroup { i }).map(_.toInt), Option(samRGR.getPlatform).map(_.toString), - Option(samRGR.getPlatformUnit).map(_.toString)) + Option(samRGR.getPlatformUnit).map(_.toString) + ) } } -class RecordGroup(val sample: String, - val recordGroupName: String, - val sequencingCenter: Option[String] = None, - val description: Option[String] = None, - val runDateEpoch: Option[Long] = None, - val flowOrder: Option[String] = None, - val keySequence: Option[String] = None, - val library: Option[String] = None, - val predictedMedianInsertSize: Option[Int] = None, - val platform: Option[String] = None, - val platformUnit: Option[String] = None) extends Serializable { +class RecordGroup( + val sample: String, + val recordGroupName: String, + val sequencingCenter: Option[String] = None, + val description: Option[String] = None, + val runDateEpoch: Option[Long] = None, + val flowOrder: Option[String] = None, + val keySequence: Option[String] = None, + val library: Option[String] = None, + val predictedMedianInsertSize: Option[Int] = None, + val platform: Option[String] = None, + val platformUnit: Option[String] = None) extends Serializable { /** * Compares equality to another object. Only checks equality via the sample and diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala index f14355c4d5..db2598df6d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala @@ -49,7 +49,7 @@ object ReferencePosition extends Serializable { * @see fivePrime */ def apply(record: AlignmentRecord): ReferencePosition = { - new ReferencePosition(record.getContig.getContigName.toString, record.getStart) + new ReferencePosition(record.getContig.getContigName, record.getStart) } /** @@ -83,9 +83,10 @@ object ReferencePosition extends Serializable { } } -class ReferencePosition(override val referenceName: String, - val pos: Long, - override val orientation: Strand = Strand.Independent) +class ReferencePosition( + override val referenceName: String, + val pos: Long, + override val orientation: Strand = Strand.Independent) extends ReferenceRegion(referenceName, pos, pos + 1, orientation) class ReferencePositionSerializer extends Serializer[ReferencePosition] { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala index 64e47bdd74..f940099fb6 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala @@ -42,18 +42,23 @@ object ReferencePositionPair extends Logging { } if (firstOfPair.size + secondOfPair.size > 0) { - new ReferencePositionPair(firstOfPair.lift(0).map(getPos), - secondOfPair.lift(0).map(getPos)) + new ReferencePositionPair( + firstOfPair.lift(0).map(getPos), + 
secondOfPair.lift(0).map(getPos) + ) } else { - new ReferencePositionPair((singleReadBucket.primaryMapped ++ - singleReadBucket.unmapped).toSeq.lift(0).map(getPos), - None) + new ReferencePositionPair( + (singleReadBucket.primaryMapped ++ + singleReadBucket.unmapped).toSeq.lift(0).map(getPos), + None + ) } } } -case class ReferencePositionPair(read1refPos: Option[ReferencePosition], - read2refPos: Option[ReferencePosition]) +case class ReferencePositionPair( + read1refPos: Option[ReferencePosition], + read2refPos: Option[ReferencePosition]) class ReferencePositionPairSerializer extends Serializer[ReferencePositionPair] { val rps = new ReferencePositionSerializer() diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala index 959769b02a..73ca35779f 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala @@ -24,8 +24,9 @@ import org.bdgenomics.formats.avro._ import scala.math.{ max, min } trait ReferenceOrdering[T <: ReferenceRegion] extends Ordering[T] { - private def regionCompare(a: T, - b: T): Int = { + private def regionCompare( + a: T, + b: T): Int = { if (a.referenceName != b.referenceName) { a.referenceName.compareTo(b.referenceName) } else if (a.start != b.start) { @@ -35,8 +36,9 @@ trait ReferenceOrdering[T <: ReferenceRegion] extends Ordering[T] { } } - def compare(a: T, - b: T): Int = { + def compare( + a: T, + b: T): Int = { val rc = regionCompare(a, b) if (rc == 0) { a.orientation.ordinal compare b.orientation.ordinal @@ -137,10 +139,11 @@ object ReferenceRegion { * which is not in the region -- i.e. [start, end) define a 0-based * half-open interval. 
*/ -case class ReferenceRegion(referenceName: String, - start: Long, - end: Long, - orientation: Strand = Strand.Independent) +case class ReferenceRegion( + referenceName: String, + start: Long, + end: Long, + orientation: Strand = Strand.Independent) extends Comparable[ReferenceRegion] with Interval { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala index 54a9b1a5a9..49e5cf3c77 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala @@ -39,7 +39,7 @@ class SAMFileHeaderWritable(@transient hdr: SAMFileHeader) extends Serializable } protected val comments = { val cmts: List[java.lang.String] = hdr.getComments - cmts.flatMap(Option(_)).map(_.toString) // don't trust samtools to return non-nulls + cmts.flatMap(Option(_)) // don't trust samtools to return non-nulls } protected val rgs = RecordGroupDictionary.fromSAMHeader(hdr) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala index 1dc4529020..32bf3d630c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala @@ -85,7 +85,7 @@ class SequenceDictionary(val records: Vector[SequenceRecord]) extends Serializab def isCompatibleWith(that: SequenceDictionary): Boolean = { for (record <- that.records) { val myRecord = byName.get(record.name) - if (myRecord.isDefined && myRecord.get != record) + if (myRecord.exists(_ != record)) return false } true @@ -125,15 +125,17 @@ class SequenceDictionary(val records: Vector[SequenceRecord]) extends Serializab } object SequenceOrderingByName extends Ordering[SequenceRecord] { - def compare(a: SequenceRecord, - b: SequenceRecord): Int = { + def compare( + a: SequenceRecord, + b: SequenceRecord): Int = { a.name.compareTo(b.name) } } object SequenceOrderingByRefIdx extends Ordering[SequenceRecord] { - def compare(a: SequenceRecord, - b: SequenceRecord): Int = { + def compare( + a: SequenceRecord, + b: SequenceRecord): Int = { (for { aRefIdx <- a.referenceIndex bRefIdx <- b.referenceIndex @@ -171,7 +173,7 @@ case class SequenceRecord( * @return A SAM formatted sequence record. 
*/ def toSAMSequenceRecord: SAMSequenceRecord = { - val rec = new SAMSequenceRecord(name.toString, length.toInt) + val rec = new SAMSequenceRecord(name, length.toInt) // set md5 if available md5.foreach(s => rec.setAttribute(SAMSequenceRecord.MD5_TAG, s.toUpperCase)) @@ -214,24 +216,25 @@ object SequenceRecord { val REFSEQ_TAG = "REFSEQ" val GENBANK_TAG = "GENBANK" - def apply(name: String, - length: Long, - md5: String = null, - url: String = null, - refseq: String = null, - genbank: String = null, - assembly: String = null, - species: String = null, - referenceIndex: Option[Int] = None): SequenceRecord = { + def apply( + name: String, + length: Long, + md5: String = null, + url: String = null, + refseq: String = null, + genbank: String = null, + assembly: String = null, + species: String = null, + referenceIndex: Option[Int] = None): SequenceRecord = { new SequenceRecord( name, length, - Option(url).map(_.toString), - Option(md5).map(_.toString), - Option(refseq).map(_.toString), - Option(genbank).map(_.toString), - Option(assembly).map(_.toString), - Option(species).map(_.toString), + Option(url), + Option(md5), + Option(refseq), + Option(genbank), + Option(assembly), + Option(species), referenceIndex ) } @@ -258,8 +261,8 @@ object SequenceRecord { } def toSAMSequenceRecord(record: SequenceRecord): SAMSequenceRecord = { val sam = new SAMSequenceRecord(record.name, record.length.toInt) - record.md5.foreach(v => sam.setAttribute(SAMSequenceRecord.MD5_TAG, v.toString)) - record.url.foreach(v => sam.setAttribute(SAMSequenceRecord.URI_TAG, v.toString)) + record.md5.foreach(v => sam.setAttribute(SAMSequenceRecord.MD5_TAG, v)) + record.url.foreach(v => sam.setAttribute(SAMSequenceRecord.URI_TAG, v)) sam } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SingleReadBucket.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SingleReadBucket.scala index 0b07b6e9f6..d1c3bba3c2 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SingleReadBucket.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SingleReadBucket.scala @@ -46,9 +46,10 @@ object SingleReadBucket extends Logging { } } -case class SingleReadBucket(primaryMapped: Iterable[AlignmentRecord] = Seq.empty, - secondaryMapped: Iterable[AlignmentRecord] = Seq.empty, - unmapped: Iterable[AlignmentRecord] = Seq.empty) { +case class SingleReadBucket( + primaryMapped: Iterable[AlignmentRecord] = Seq.empty, + secondaryMapped: Iterable[AlignmentRecord] = Seq.empty, + unmapped: Iterable[AlignmentRecord] = Seq.empty) { // Note: not a val in order to save serialization/memory cost def allReads = { primaryMapped ++ secondaryMapped ++ unmapped diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala index d27f36b6a8..8a1a37ec2d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala @@ -66,30 +66,35 @@ object SnpTable { // `knownSnpsFile` is expected to be a sites-only VCF def apply(knownSnpsFile: File): SnpTable = { // parse into tuples of (contig, position) - val lines = scala.io.Source.fromFile(knownSnpsFile).getLines() - val tuples = lines.filter(line => !line.startsWith("#")).flatMap(line => { - val split = line.split("\t") - val contig = split(0) - val pos = split(1).toLong - 1 - val ref = split(3) - assert(pos >= 0) - assert(!ref.isEmpty) - ref.zipWithIndex.map { - case (base, idx) => - assert(Seq('A', 'C', 'T', 
'G', 'N').contains(base)) - (contig, pos + idx) - } - }) - // construct map from contig to set of positions - // this is done in-place to reduce overhead - val table = new mutable.HashMap[String, mutable.HashSet[Long]] - tuples.foreach(tup => table.getOrElseUpdate(tup._1, { new mutable.HashSet[Long] }) += tup._2) - // construct SnpTable from immutable copy of `table` - new SnpTable(table.mapValues(_.toSet).toMap) + val snpsSource = scala.io.Source.fromFile(knownSnpsFile) + try { + val lines = snpsSource.getLines() + val tuples = lines.filter(line => !line.startsWith("#")).flatMap(line => { + val split = line.split("\t") + val contig = split(0) + val pos = split(1).toLong - 1 + val ref = split(3) + assert(pos >= 0) + assert(!ref.isEmpty) + ref.zipWithIndex.map { + case (base, idx) => + assert(Seq('A', 'C', 'T', 'G', 'N').contains(base)) + (contig, pos + idx) + } + }) + // construct map from contig to set of positions + // this is done in-place to reduce overhead + val table = new mutable.HashMap[String, mutable.HashSet[Long]] + tuples.foreach(tup => table.getOrElseUpdate(tup._1, { new mutable.HashSet[Long] }) += tup._2) + // construct SnpTable from immutable copy of `table` + new SnpTable(table.mapValues(_.toSet).toMap) + } finally { + snpsSource.close() + } } def apply(variants: RDD[RichVariant]): SnpTable = { - val positions = variants.map(variant => (variant.getContig.getContigName.toString, variant.getStart)).collect() + val positions = variants.map(variant => (variant.getContig.getContigName, variant.getStart)).collect() val table = new mutable.HashMap[String, mutable.HashSet[Long]] positions.foreach(tup => table.getOrElseUpdate(tup._1, { new mutable.HashSet[Long] }) += tup._2) new SnpTable(table.mapValues(_.toSet).toMap) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/VariantContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/VariantContext.scala index ffb4c7192e..9b967d49df 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/VariantContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/VariantContext.scala @@ -72,8 +72,10 @@ object VariantContext { */ def buildFromGenotypes(genotypes: Seq[Genotype]): VariantContext = { val position = ReferencePosition(genotypes.head) - assert(genotypes.map(ReferencePosition(_)).forall(_ == position), - "Genotypes do not all have the same position.") + assert( + genotypes.map(ReferencePosition(_)).forall(_ == position), + "Genotypes do not all have the same position." 
+ ) val variant = genotypes.head.getVariant diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala index e0aa16cddb..2632076c43 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala @@ -132,9 +132,10 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @tparam T The type of records to return * @return An RDD with records of the specified type */ - def loadParquet[T](filePath: String, - predicate: Option[FilterPredicate] = None, - projection: Option[Schema] = None)(implicit ev1: T => SpecificRecord, ev2: Manifest[T]): RDD[T] = { + def loadParquet[T]( + filePath: String, + predicate: Option[FilterPredicate] = None, + projection: Option[Schema] = None)(implicit ev1: T => SpecificRecord, ev2: Manifest[T]): RDD[T] = { //make sure a type was specified //not using require as to make the message clearer if (manifest[T] == manifest[scala.Nothing]) @@ -145,9 +146,9 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log val job = HadoopUtil.newJob(sc) ParquetInputFormat.setReadSupportClass(job, classOf[AvroReadSupport[T]]) - if (predicate.isDefined) { + predicate.foreach { (pred) => log.info("Using the specified push-down predicate") - ParquetInputFormat.setFilterPredicate(job.getConfiguration, predicate.get) + ParquetInputFormat.setFilterPredicate(job.getConfiguration, pred) } if (projection.isDefined) { @@ -202,7 +203,8 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log AlignmentRecordField.readPaired, AlignmentRecordField.firstOfPair, AlignmentRecordField.readMapped, - AlignmentRecordField.mateMapped) + AlignmentRecordField.mateMapped + ) } else if (isADAMContig) { Projection(NucleotideContigFragmentField.contig) } else { @@ -229,7 +231,8 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log val dict = recs.aggregate(SequenceDictionary())( (dict: SequenceDictionary, rec: SequenceRecord) => dict + rec, - (dict1: SequenceDictionary, dict2: SequenceDictionary) => dict1 ++ dict2) + (dict1: SequenceDictionary, dict2: SequenceDictionary) => dict1 ++ dict2 + ) dict } @@ -327,10 +330,12 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log }) val samDict = SAMHeaderReader.readSAMHeaderFrom(path, sc.hadoopConfiguration).getSequenceDictionary - IndexedBamInputFormat.setVars(new Path(filePath), + IndexedBamInputFormat.setVars( + new Path(filePath), new Path(filePath + ".bai"), viewRegion, - samDict) + samDict + ) val job = HadoopUtil.newJob(sc) @@ -366,20 +371,22 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log records.flatMap(fastqRecordConverter.convertPair) } - def loadFastq(filePath1: String, - filePath2Opt: Option[String], - recordGroupOpt: Option[String] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): RDD[AlignmentRecord] = { + def loadFastq( + filePath1: String, + filePath2Opt: Option[String], + recordGroupOpt: Option[String] = None, + stringency: ValidationStringency = ValidationStringency.STRICT): RDD[AlignmentRecord] = { filePath2Opt match { case Some(filePath2) => loadPairedFastq(filePath1, filePath2, recordGroupOpt, stringency) case None => loadUnpairedFastq(filePath1, stringency = stringency) } } - def loadPairedFastq(filePath1: String, - filePath2: String, - recordGroupOpt: Option[String], - 
stringency: ValidationStringency): RDD[AlignmentRecord] = { + def loadPairedFastq( + filePath1: String, + filePath2: String, + recordGroupOpt: Option[String], + stringency: ValidationStringency): RDD[AlignmentRecord] = { val reads1 = loadUnpairedFastq(filePath1, setFirstOfPair = true, stringency = stringency) val reads2 = loadUnpairedFastq(filePath2, setSecondOfPair = true, stringency = stringency) @@ -403,11 +410,12 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log reads1 ++ reads2 } - def loadUnpairedFastq(filePath: String, - recordGroupOpt: Option[String] = None, - setFirstOfPair: Boolean = false, - setSecondOfPair: Boolean = false, - stringency: ValidationStringency = ValidationStringency.STRICT): RDD[AlignmentRecord] = { + def loadUnpairedFastq( + filePath: String, + recordGroupOpt: Option[String] = None, + setFirstOfPair: Boolean = false, + setSecondOfPair: Boolean = false, + stringency: ValidationStringency = ValidationStringency.STRICT): RDD[AlignmentRecord] = { val job = HadoopUtil.newJob(sc) val records = sc.newAPIHadoopFile( @@ -428,8 +436,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log if (recordGroup.isEmpty) filePath.substring(filePath.lastIndexOf("/") + 1) else - recordGroup - ), + recordGroup), setFirstOfPair, setSecondOfPair, stringency @@ -443,7 +450,8 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log val records = sc.newAPIHadoopFile( filePath, classOf[VCFInputFormat], classOf[LongWritable], classOf[VariantContextWritable], - ContextUtil.getConfiguration(job)) + ContextUtil.getConfiguration(job) + ) if (Metrics.isRecording) records.instrument() else records records.flatMap(p => vcc.convert(p._2.get)) @@ -466,10 +474,12 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log def loadFasta( filePath: String, fragmentLength: Long): RDD[NucleotideContigFragment] = { - val fastaData: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile(filePath, + val fastaData: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile( + filePath, classOf[TextInputFormat], classOf[LongWritable], - classOf[Text]) + classOf[Text] + ) if (Metrics.isRecording) fastaData.instrument() else fastaData val remapData = fastaData.map(kv => (kv._1.get, kv._2.toString)) @@ -560,7 +570,8 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log val records = sc.newAPIHadoopFile( filePath, classOf[VCFInputFormat], classOf[LongWritable], classOf[VariantContextWritable], - ContextUtil.getConfiguration(job)) + ContextUtil.getConfiguration(job) + ) if (Metrics.isRecording) records.instrument() else records records.map(p => vcc.convertToAnnotation(p._2.get)) @@ -611,8 +622,9 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } } - def loadGenes(filePath: String, - projection: Option[Schema] = None): RDD[Gene] = { + def loadGenes( + filePath: String, + projection: Option[Schema] = None): RDD[Gene] = { import ADAMContext._ loadFeatures(filePath, projection).asGenes() } @@ -633,8 +645,10 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log if (filePath.endsWith(".fa") || filePath.endsWith(".fasta")) { log.info("Loading " + filePath + " as FASTA and converting to NucleotideContigFragment. 
Projection is ignored.") - loadFasta(filePath, - fragmentLength) + loadFasta( + filePath, + fragmentLength + ) } else { log.info("Loading " + filePath + " as Parquet containing NucleotideContigFragments.") loadParquetContigFragments(filePath, None, projection) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala index f1b3675399..ba7a5b43ee 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala @@ -52,12 +52,13 @@ class ADAMRDDFunctions[T <% IndexedRecord: Manifest](rdd: RDD[T]) extends Serial ) } - def adamParquetSave(filePath: String, - blockSize: Int = 128 * 1024 * 1024, - pageSize: Int = 1 * 1024 * 1024, - compressCodec: CompressionCodecName = CompressionCodecName.GZIP, - disableDictionaryEncoding: Boolean = false, - schema: Option[Schema] = None): Unit = SaveAsADAM.time { + def adamParquetSave( + filePath: String, + blockSize: Int = 128 * 1024 * 1024, + pageSize: Int = 1 * 1024 * 1024, + compressCodec: CompressionCodecName = CompressionCodecName.GZIP, + disableDictionaryEncoding: Boolean = false, + schema: Option[Schema] = None): Unit = SaveAsADAM.time { log.info("Saving data in ADAM format") val job = HadoopUtil.newJob(rdd.context) @@ -66,15 +67,18 @@ class ADAMRDDFunctions[T <% IndexedRecord: Manifest](rdd: RDD[T]) extends Serial ParquetOutputFormat.setEnableDictionary(job, !disableDictionaryEncoding) ParquetOutputFormat.setBlockSize(job, blockSize) ParquetOutputFormat.setPageSize(job, pageSize) - AvroParquetOutputFormat.setSchema(job, - if (schema.isDefined) schema.get - else manifest[T].runtimeClass.asInstanceOf[Class[T]].newInstance().getSchema) + AvroParquetOutputFormat.setSchema( + job, + schema.getOrElse(manifest[T].runtimeClass.asInstanceOf[Class[T]].newInstance().getSchema) + ) // Add the Void Key val recordToSave = rdd.map(p => (null, p)) // Save the values to the ADAM/Parquet file - recordToSave.saveAsNewAPIHadoopFile(filePath, + recordToSave.saveAsNewAPIHadoopFile( + filePath, classOf[java.lang.Void], manifest[T].runtimeClass.asInstanceOf[Class[T]], classOf[InstrumentedADAMAvroParquetOutputFormat], - ContextUtil.getConfiguration(job)) + ContextUtil.getConfiguration(job) + ) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/BroadcastRegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/BroadcastRegionJoin.scala index 5daa9fe487..a689339d06 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/BroadcastRegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/BroadcastRegionJoin.scala @@ -60,9 +60,10 @@ object BroadcastRegionJoin extends RegionJoin { * @return An RDD of pairs (x, y), where x is from baseRDD, y is from joinedRDD, and the region * corresponding to x overlaps the region corresponding to y. */ - def partitionAndJoin[T, U](baseRDD: RDD[(ReferenceRegion, T)], - joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], - uManifest: ClassTag[U]): RDD[(T, U)] = { + def partitionAndJoin[T, U]( + baseRDD: RDD[(ReferenceRegion, T)], + joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], + uManifest: ClassTag[U]): RDD[(T, U)] = { val sc = baseRDD.context @@ -139,9 +140,10 @@ object BroadcastRegionJoin extends RegionJoin { * realistic sized sets. 
* */ - def cartesianFilter[T, U](baseRDD: RDD[(ReferenceRegion, T)], - joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], - uManifest: ClassTag[U]): RDD[(T, U)] = { + def cartesianFilter[T, U]( + baseRDD: RDD[(ReferenceRegion, T)], + joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], + uManifest: ClassTag[U]): RDD[(T, U)] = { baseRDD.cartesian(joinedRDD).filter({ case (t: (ReferenceRegion, T), u: (ReferenceRegion, U)) => t._1.overlaps(u._1) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala index f90e0adde5..91cbbb51e1 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala @@ -145,7 +145,8 @@ class Coverage(val window: Long) extends Serializable { } else { Seq( OrientedPoint(r1.referenceName, r1.end, false), - OrientedPoint(r2.referenceName, r2.start, true)) + OrientedPoint(r2.referenceName, r2.start, true) + ) } case _ => Seq() } @@ -164,7 +165,8 @@ class Coverage(val window: Long) extends Serializable { dict.records.toSeq.map { case seqRecord => ReferenceRegion(seqRecord.name, 0, seqRecord.length) - }) + } + ) val windowRegions: RDD[ReferenceRegion] = chromRegions.flatMap { case chromRegion => diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala index 2e087243a9..21b53b44f4 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala @@ -45,11 +45,12 @@ case class GenomicPositionPartitioner(numParts: Int, seqLengths: Map[String, Lon private val cumuls: Seq[Long] = lengths.scan(0L)(_ + _) // total # of bases in the sequence dictionary - val totalLength: Long = lengths.reduce(_ + _) + val totalLength: Long = lengths.sum // referenceName -> cumulative length before this sequence (using seqDict.records as the implicit ordering) val cumulativeLengths: Map[String, Long] = Map( - names.zip(cumuls): _*) + names.zip(cumuls): _* + ) /** * 'parts' is the total number of partitions for non-UNMAPPED ReferencePositions -- @@ -78,9 +79,13 @@ case class GenomicPositionPartitioner(numParts: Int, seqLengths: Map[String, Lon // everything else gets assigned normally. case refpos: ReferencePosition => { - require(seqLengths.contains(refpos.referenceName), - "Received key (%s) that did not map to a known contig. Contigs are:\n%s".format(refpos, - seqLengths.keys.mkString("\n"))) + require( + seqLengths.contains(refpos.referenceName), + "Received key (%s) that did not map to a known contig. 
Contigs are:\n%s".format( + refpos, + seqLengths.keys.mkString("\n") + ) + ) getPart(refpos.referenceName, refpos.pos) } @@ -101,7 +106,7 @@ object GenomicPositionPartitioner { GenomicPositionPartitioner(numParts, extractLengthMap(seqDict)) def extractLengthMap(seqDict: SequenceDictionary): Map[String, Long] = - Map(seqDict.records.toSeq.map(rec => (rec.name.toString, rec.length)): _*) + Map(seqDict.records.toSeq.map(rec => (rec.name, rec.length)): _*) } case class GenomicRegionPartitioner(partitionSize: Long, seqLengths: Map[String, Long], start: Boolean = true) extends Partitioner with Logging { @@ -120,9 +125,13 @@ case class GenomicRegionPartitioner(partitionSize: Long, seqLengths: Map[String, override def getPartition(key: Any): Int = { key match { case region: ReferenceRegion => { - require(seqLengths.contains(region.referenceName), - "Received key (%s) that did not map to a known contig. Contigs are:\n%s".format(region, - seqLengths.keys.mkString("\n"))) + require( + seqLengths.contains(region.referenceName), + "Received key (%s) that did not map to a known contig. Contigs are:\n%s".format( + region, + seqLengths.keys.mkString("\n") + ) + ) computePartition(region) } case _ => throw new IllegalArgumentException("Only ReferenceMappable values can be partitioned by GenomicRegionPartitioner") diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala index 27e1aa3d2e..99e7421950 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala @@ -37,7 +37,8 @@ trait RegionJoin { * @return An RDD of pairs (x, y), where x is from baseRDD, y is from joinedRDD, and the region * corresponding to x overlaps the region corresponding to y. */ - def partitionAndJoin[T, U](baseRDD: RDD[(ReferenceRegion, T)], - joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], - uManifest: ClassTag[U]): RDD[(T, U)] + def partitionAndJoin[T, U]( + baseRDD: RDD[(ReferenceRegion, T)], + joinedRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], + uManifest: ClassTag[U]): RDD[(T, U)] } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala index 5472d7db05..efaabe89f3 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala @@ -47,13 +47,14 @@ case class ShuffleRegionJoin(sd: SequenceDictionary, partitionSize: Long) extend * @return An RDD of pairs (x, y), where x is from leftRDD, y is from rightRDD, and the region * corresponding to x overlaps the region corresponding to y. 
*/ - def partitionAndJoin[T, U](leftRDD: RDD[(ReferenceRegion, T)], - rightRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], - uManifest: ClassTag[U]): RDD[(T, U)] = { + def partitionAndJoin[T, U]( + leftRDD: RDD[(ReferenceRegion, T)], + rightRDD: RDD[(ReferenceRegion, U)])(implicit tManifest: ClassTag[T], + uManifest: ClassTag[U]): RDD[(T, U)] = { val sc = leftRDD.context // Create the set of bins across the genome for parallel processing - val seqLengths = Map(sd.records.toSeq.map(rec => (rec.name.toString, rec.length)): _*) + val seqLengths = Map(sd.records.toSeq.map(rec => (rec.name, rec.length)): _*) val bins = sc.broadcast(GenomeBins(partitionSize, seqLengths)) // Key each RDD element to its corresponding bin @@ -196,9 +197,10 @@ private case class ManualRegionPartitioner(partitions: Int) extends Partitioner * @tparam T type of leftIter * @tparam U type of rightIter */ -private case class SortedIntervalPartitionJoin[T, U](binRegion: ReferenceRegion, - leftIter: Iterator[((ReferenceRegion, Int), T)], - rightIter: Iterator[((ReferenceRegion, Int), U)]) +private case class SortedIntervalPartitionJoin[T, U]( + binRegion: ReferenceRegion, + leftIter: Iterator[((ReferenceRegion, Int), T)], + rightIter: Iterator[((ReferenceRegion, Int), U)]) extends Iterator[(T, U)] with Serializable { // inspired by bedtools2 chromsweep private val left: BufferedIterator[((ReferenceRegion, Int), T)] = leftIter.buffered diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala index 2c4f5287a5..8a3a101ec7 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala @@ -25,16 +25,18 @@ import org.bdgenomics.formats.avro.NucleotideContigFragment private[contig] object FlankReferenceFragments extends Serializable { - def apply(rdd: RDD[NucleotideContigFragment], - sd: SequenceDictionary, - flankSize: Int): RDD[NucleotideContigFragment] = { + def apply( + rdd: RDD[NucleotideContigFragment], + sd: SequenceDictionary, + flankSize: Int): RDD[NucleotideContigFragment] = { rdd.keyBy(ctg => ReferenceRegion(ctg).get) .repartitionAndSortWithinPartitions(ReferencePartitioner(sd)) .mapPartitions(flank(_, flankSize)) } - def flank(iter: Iterator[(ReferenceRegion, NucleotideContigFragment)], - flankSize: Int): Iterator[NucleotideContigFragment] = { + def flank( + iter: Iterator[(ReferenceRegion, NucleotideContigFragment)], + flankSize: Int): Iterator[NucleotideContigFragment] = { // we need to have at least one element in the iterator if (iter.hasNext) { // now, we apply a window and flank adjacent segments diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala index 8f32070287..eefdfadd03 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala @@ -120,14 +120,17 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e val str = fragmentSequence.drop(trimStart) .dropRight(trimEnd) - val reg = new ReferenceRegion(fragment._1.referenceName, + val reg = new ReferenceRegion( + fragment._1.referenceName, 
fragment._1.start + trimStart, - fragment._1.end - trimEnd) + fragment._1.end - trimEnd + ) (reg, str) } - def reducePairs(kv1: (ReferenceRegion, String), - kv2: (ReferenceRegion, String)): (ReferenceRegion, String) = { + def reducePairs( + kv1: (ReferenceRegion, String), + kv2: (ReferenceRegion, String)): (ReferenceRegion, String) = { assert(kv1._1.isAdjacent(kv2._1), "Regions being joined must be adjacent. For: " + kv1 + ", " + kv2) @@ -147,8 +150,10 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e .map(kv => getString(kv)) .reduce(reducePairs) - assert(pair._1.compareTo(region) == 0, - "Merging fragments returned a different region than requested.") + assert( + pair._1.compareTo(region) == 0, + "Merging fragments returned a different region than requested." + ) pair._2 } catch { @@ -171,8 +176,9 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e * sequence dictionary on the fly. Default is None. * @return Returns the RDD, with all adjacent fragments extended with flanking sequence. */ - def flankAdjacentFragments(flankLength: Int, - optSd: Option[SequenceDictionary] = None): RDD[NucleotideContigFragment] = { + def flankAdjacentFragments( + flankLength: Int, + optSd: Option[SequenceDictionary] = None): RDD[NucleotideContigFragment] = { FlankReferenceFragments(rdd, optSd.getOrElse(adamGetSequenceDictionary(performLexSort = false)), flankLength) } @@ -184,8 +190,9 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e * sequence dictionary on the fly. Default is None. * @return Returns an RDD containing k-mer/count pairs. */ - def countKmers(kmerLength: Int, - optSd: Option[SequenceDictionary] = None): RDD[(String, Long)] = { + def countKmers( + kmerLength: Int, + optSd: Option[SequenceDictionary] = None): RDD[(String, Long)] = { flankAdjacentFragments(kmerLength, optSd).flatMap(r => { // cut each read into k-mers, and attach a count of 1L r.getFragmentSequence diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/features/FeatureParser.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/features/FeatureParser.scala index cfeebdb9a6..6c00dbc604 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/features/FeatureParser.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/features/FeatureParser.scala @@ -31,10 +31,16 @@ trait FeatureParser extends Serializable { } class FeatureFile(parser: FeatureParser) extends Serializable { - def parse(file: File): Iterator[Feature] = - Source.fromFile(file).getLines().flatMap { line => - parser.parse(line) + def parse(file: File): Iterator[Feature] = { + val src = Source.fromFile(file) + try { + src.getLines().flatMap { line => + parser.parse(line) + }.toList.iterator + } finally { + src.close() } + } } object GTFParser { @@ -103,12 +109,11 @@ class GTFParser extends FeatureParser { val (_id, _parentId) = feature match { - case "gene" => (attrs.get("gene_id"), None) - case "transcript" => (attrs.get("transcript_id"), attrs.get("gene_id")) - case "exon" => (exonId, attrs.get("transcript_id")) - case "CDS" => (attrs.get("id"), attrs.get("transcript_id")) - case "UTR" => (attrs.get("id"), attrs.get("transcript_id")) - case _ => (attrs.get("id"), None) + case "gene" => (attrs.get("gene_id"), None) + case "transcript" => (attrs.get("transcript_id"), attrs.get("gene_id")) + case "exon" => (exonId, attrs.get("transcript_id")) + case "CDS" | "UTR" => (attrs.get("id"), attrs.get("transcript_id")) + case _ => (attrs.get("id"), None) }
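Aside: one subtlety in the FeatureFile.parse rewrite above is that Source.getLines() and Iterator.flatMap are both lazy, so the parsed features must be materialized (here via .toList.iterator) before the finally clause closes the source; otherwise the caller would iterate over an already-closed stream. A standalone sketch of the read-parse-close pattern, where parseLine is a hypothetical stand-in for FeatureParser.parse:

    import java.io.File
    import scala.io.Source

    // Materialize eagerly inside try, so that finally can close the source
    // without invalidating the iterator handed back to the caller.
    def parseEagerly[A](file: File, parseLine: String => Seq[A]): Iterator[A] = {
      val src = Source.fromFile(file)
      try {
        // toList forces the lazy getLines()/flatMap pipeline while src is open.
        src.getLines().flatMap(parseLine).toList.iterator
      } finally {
        src.close()
      }
    }

The trade-off is memory: the whole file is parsed up front, which is reasonable for feature files but would not suit a streaming consumer.
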
_id.foreach(f.setFeatureId) _parentId.foreach(parentId => f.setParentIds(List[String](parentId))) @@ -303,4 +308,3 @@ class NarrowPeakParser extends FeatureParser { Seq(fb.build()) } } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/features/FeatureRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/features/FeatureRDDFunctions.scala index 891cceafa0..4a7ea7fc7c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/features/FeatureRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/features/FeatureRDDFunctions.scala @@ -49,7 +49,9 @@ class FeatureRDDFunctions(featureRDD: RDD[Feature]) extends Serializable with Lo enough information to create each Transcript, which we do, key each Transcript by its geneId, and group the transcripts which share a common gene together. - 4. Finally, find each 'gene'-typed GTFFeature, key it by its geneId, and join with + 4. Finally, find each 'gene'-typed GTFFeature, key it by its geneId, + and join with + * the transcripts in #3. Use these joined values to create the final set of Gene values. @@ -60,7 +62,7 @@ class FeatureRDDFunctions(featureRDD: RDD[Feature]) extends Serializable with Lo // Step #1 val typePartitioned: RDD[(String, Feature)] = - featureRDD.keyBy(_.getFeatureType.toString).cache() + featureRDD.keyBy(_.getFeatureType).cache() // Step #2 val exonsByTranscript: RDD[(String, Iterable[Exon])] = @@ -68,31 +70,37 @@ class FeatureRDDFunctions(featureRDD: RDD[Feature]) extends Serializable with Lo // There really only should be _one_ parent listed in this flatMap, but since // getParentIds is modeled as returning a List[], we'll write it this way. case ("exon", ftr: Feature) => - val ids: Seq[String] = ftr.getParentIds.map(_.toString) - ids.map(transcriptId => (transcriptId, - Exon(ftr.getFeatureId.toString, transcriptId, strand(ftr.getStrand), ReferenceRegion(ftr)))) + val ids: Seq[String] = ftr.getParentIds + ids.map(transcriptId => ( + transcriptId, + Exon(ftr.getFeatureId, transcriptId, strand(ftr.getStrand), ReferenceRegion(ftr)) + )) }.groupByKey() val cdsByTranscript: RDD[(String, Iterable[CDS])] = typePartitioned.filter(_._1 == "CDS").flatMap { case ("CDS", ftr: Feature) => - val ids: Seq[String] = ftr.getParentIds.map(_.toString) - ids.map(transcriptId => (transcriptId, - CDS(transcriptId, strand(ftr.getStrand), ReferenceRegion(ftr)))) + val ids: Seq[String] = ftr.getParentIds + ids.map(transcriptId => ( + transcriptId, + CDS(transcriptId, strand(ftr.getStrand), ReferenceRegion(ftr)) + )) }.groupByKey() val utrsByTranscript: RDD[(String, Iterable[UTR])] = typePartitioned.filter(_._1 == "UTR").flatMap { case ("UTR", ftr: Feature) => - val ids: Seq[String] = ftr.getParentIds.map(_.toString) - ids.map(transcriptId => (transcriptId, - UTR(transcriptId, strand(ftr.getStrand), ReferenceRegion(ftr)))) + val ids: Seq[String] = ftr.getParentIds + ids.map(transcriptId => ( + transcriptId, + UTR(transcriptId, strand(ftr.getStrand), ReferenceRegion(ftr)) + )) }.groupByKey() // Step #3 val transcriptsByGene: RDD[(String, Iterable[Transcript])] = typePartitioned.filter(_._1 == "transcript").map { - case ("transcript", ftr: Feature) => (ftr.getFeatureId.toString, ftr) + case ("transcript", ftr: Feature) => (ftr.getFeatureId, ftr) }.join(exonsByTranscript) .leftOuterJoin(utrsByTranscript) .leftOuterJoin(cdsByTranscript) @@ -103,16 +111,18 @@ class FeatureRDDFunctions(featureRDD: RDD[Feature]) extends Serializable with Lo case (transcriptId: String, (((tgtf: Feature, exons: Iterable[Exon]), 
utrs: Option[Iterable[UTR]]), cds: Option[Iterable[CDS]])) => - val geneIds: Seq[String] = tgtf.getParentIds.map(_.toString) // should be length 1 - geneIds.map(geneId => (geneId, + val geneIds: Seq[String] = tgtf.getParentIds // should be length 1 + geneIds.map(geneId => ( + geneId, Transcript(transcriptId, Seq(transcriptId), geneId, strand(tgtf.getStrand), - exons, cds.getOrElse(Seq()), utrs.getOrElse(Seq())))) + exons, cds.getOrElse(Seq()), utrs.getOrElse(Seq())) + )) }.groupByKey() // Step #4 val genes = typePartitioned.filter(_._1 == "gene").map { - case ("gene", ftr: Feature) => (ftr.getFeatureId.toString, ftr) + case ("gene", ftr: Feature) => (ftr.getFeatureId, ftr) }.leftOuterJoin(transcriptsByGene).map { case (geneId: String, (ggtf: Feature, transcripts: Option[Iterable[Transcript]])) => Gene(geneId, Seq(geneId), @@ -125,10 +135,9 @@ class FeatureRDDFunctions(featureRDD: RDD[Feature]) extends Serializable with Lo def filterByOverlappingRegion(query: ReferenceRegion): RDD[Feature] = { def overlapsQuery(rec: Feature): Boolean = - rec.getContig.getContigName.toString == query.referenceName && + rec.getContig.getContigName == query.referenceName && rec.getStart < query.end && rec.getEnd > query.start featureRDD.filter(overlapsQuery) } } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala index 6aad158d61..c9c2ab25f1 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala @@ -59,14 +59,15 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) def filterByOverlappingRegion(query: ReferenceRegion): RDD[AlignmentRecord] = { def overlapsQuery(rec: AlignmentRecord): Boolean = rec.getReadMapped && - rec.getContig.getContigName.toString == query.referenceName && + rec.getContig.getContigName == query.referenceName && rec.getStart < query.end && rec.getEnd > query.start rdd.filter(overlapsQuery) } - def maybeSaveBam(args: ADAMSaveAnyArgs, - isSorted: Boolean = false): Boolean = { + def maybeSaveBam( + args: ADAMSaveAnyArgs, + isSorted: Boolean = false): Boolean = { if (args.outputPath.endsWith(".sam")) { log.info("Saving data in SAM format") rdd.adamSAMSave(args.outputPath, asSingleFile = args.asSingleFile, isSorted = isSorted) @@ -92,8 +93,9 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) maybeSaveBam(args) || { rdd.adamParquetSave(args); true } } - def adamSave(args: ADAMSaveAnyArgs, - isSorted: Boolean = false) = { + def adamSave( + args: ADAMSaveAnyArgs, + isSorted: Boolean = false) = { maybeSaveBam(args, isSorted) || maybeSaveFastq(args) || { rdd.adamParquetSave(args); true } } @@ -124,10 +126,11 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) * @param asSam Selects whether to save as SAM or BAM. The default value is true (save in SAM format). * @param isSorted If the output is sorted, this will modify the header. 
*/ - def adamSAMSave(filePath: String, - asSam: Boolean = true, - asSingleFile: Boolean = false, - isSorted: Boolean = false) = SAMSave.time { + def adamSAMSave( + filePath: String, + asSam: Boolean = true, + asSingleFile: Boolean = false, + isSorted: Boolean = false) = SAMSave.time { // convert the records val (convertRecords: RDD[SAMRecordWritable], header: SAMFileHeader) = rdd.adamConvertToSAM(isSorted) @@ -169,11 +172,11 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) case true => ADAMSAMOutputFormat.clearHeader() ADAMSAMOutputFormat.addHeader(header) - log.info(s"Set SAM header on driver") + log.info("Set SAM header on driver") case false => ADAMBAMOutputFormat.clearHeader() ADAMBAMOutputFormat.addHeader(header) - log.info(s"Set BAM header on driver") + log.info("Set BAM header on driver") } // write file to disk @@ -203,12 +206,16 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) log.info(s"Writing single ${if (asSam) "SAM" else "BAM"} file (not Hadoop-style directory)") val (outputFormat, headerLessOutputFormat) = asSam match { case true => - (classOf[InstrumentedADAMSAMOutputFormat[LongWritable]], - classOf[InstrumentedADAMSAMOutputFormatHeaderLess[LongWritable]]) + ( + classOf[InstrumentedADAMSAMOutputFormat[LongWritable]], + classOf[InstrumentedADAMSAMOutputFormatHeaderLess[LongWritable]] + ) case false => - (classOf[InstrumentedADAMBAMOutputFormat[LongWritable]], - classOf[InstrumentedADAMBAMOutputFormatHeaderLess[LongWritable]]) + ( + classOf[InstrumentedADAMBAMOutputFormat[LongWritable]], + classOf[InstrumentedADAMBAMOutputFormatHeaderLess[LongWritable]] + ) } val headPath = filePath + "_head" @@ -309,7 +316,6 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) rdd.flatMap(r => { // cut each read into k-mers, and attach a count of 1L r.getSequence - .toString .sliding(kmerLength) .map(k => (k, 1L)) }).reduceByKey((k1: Long, k2: Long) => k1 + k2) @@ -344,9 +350,10 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) * observations to. * @return Returns an RDD of recalibrated reads. */ - def adamBQSR(knownSnps: Broadcast[SnpTable], - observationDumpFile: Option[String] = None, - validationStringency: ValidationStringency = ValidationStringency.LENIENT): RDD[AlignmentRecord] = BQSRInDriver.time { + def adamBQSR( + knownSnps: Broadcast[SnpTable], + observationDumpFile: Option[String] = None, + validationStringency: ValidationStringency = ValidationStringency.LENIENT): RDD[AlignmentRecord] = BQSRInDriver.time { BaseQualityRecalibration(rdd, knownSnps, observationDumpFile, validationStringency) } @@ -365,12 +372,13 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) * * @return Returns an RDD of mapped reads which have been realigned. 
*/ - def adamRealignIndels(consensusModel: ConsensusGenerator = new ConsensusGeneratorFromReads, - isSorted: Boolean = false, - maxIndelSize: Int = 500, - maxConsensusNumber: Int = 30, - lodThreshold: Double = 5.0, - maxTargetSize: Int = 3000): RDD[AlignmentRecord] = RealignIndelsInDriver.time { + def adamRealignIndels( + consensusModel: ConsensusGenerator = new ConsensusGeneratorFromReads, + isSorted: Boolean = false, + maxIndelSize: Int = 500, + maxConsensusNumber: Int = 30, + lodThreshold: Double = 5.0, + maxTargetSize: Int = 3000): RDD[AlignmentRecord] = RealignIndelsInDriver.time { RealignIndels(rdd, consensusModel, isSorted, maxIndelSize, maxConsensusNumber, lodThreshold) } @@ -407,7 +415,8 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) */ def adamCharacterizeTagValues(tag: String): Map[Any, Long] = { adamFilterRecordsWithTag(tag).flatMap(RichAlignmentRecord(_).tags.find(_.tag == tag)).map( - attr => Map(attr.value -> 1L)).reduce { + attr => Map(attr.value -> 1L) + ).reduce { (map1: Map[Any, Long], map2: Map[Any, Long]) => MapTools.add(map1, map2) } @@ -419,8 +428,10 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) * @return An RDD[Read] containing the subset of records with a tag that matches the given name. */ def adamFilterRecordsWithTag(tagName: String): RDD[AlignmentRecord] = { - assert(tagName.length == 2, - "withAttribute takes a tagName argument of length 2; tagName=\"%s\"".format(tagName)) + assert( + tagName.length == 2, + "withAttribute takes a tagName argument of length 2; tagName=\"%s\"".format(tagName) + ) rdd.filter(RichAlignmentRecord(_).tags.exists(_.tag == tagName)) } @@ -431,11 +442,12 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) * @param fileName2 Path at which to save a FASTQ file containing the second mate of each pair. * @param validationStringency Iff strict, throw an exception if any read in this RDD is not accompanied by its mate. 
*/ - def adamSaveAsPairedFastq(fileName1: String, - fileName2: String, - outputOriginalBaseQualities: Boolean = false, - validationStringency: ValidationStringency = ValidationStringency.LENIENT, - persistLevel: Option[StorageLevel] = None): Unit = { + def adamSaveAsPairedFastq( + fileName1: String, + fileName2: String, + outputOriginalBaseQualities: Boolean = false, + validationStringency: ValidationStringency = ValidationStringency.LENIENT, + persistLevel: Option[StorageLevel] = None): Unit = { def maybePersist[T](r: RDD[T]): Unit = { persistLevel.foreach(r.persist(_)) } @@ -449,9 +461,9 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) val readsByID: RDD[(String, Iterable[AlignmentRecord])] = rdd.groupBy(record => { if (!AlignmentRecordConverter.readNameHasPairedSuffix(record)) - record.getReadName.toString + record.getReadName else - record.getReadName.toString.dropRight(2) + record.getReadName.dropRight(2) }) validationStringency match { @@ -468,9 +480,9 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) List( s"Found $numUnpairedReadIDsWithCounts read names that don't occur exactly twice:", - readNameOccurrencesMap.map({ + readNameOccurrencesMap.take(100).map({ case (numOccurrences, numReadNames) => s"${numOccurrences}x:\t$numReadNames" - }).take(100).mkString("\t", "\n\t", if (readNameOccurrencesMap.size > 100) "\n\t…" else ""), + }).mkString("\t", "\n\t", if (readNameOccurrencesMap.size > 100) "\n\t…" else ""), "", "Samples:", @@ -516,12 +528,10 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) if (validationStringency == ValidationStringency.STRICT) { firstInPairRecords.foreach(read => if (read.getReadNum == 1) - throw new Exception("Read %s found with first- and second-of-pair set".format(read.getReadName)) - ) + throw new Exception("Read %s found with first- and second-of-pair set".format(read.getReadName))) secondInPairRecords.foreach(read => if (read.getReadNum == 0) - throw new Exception("Read %s found with first- and second-of-pair set".format(read.getReadName)) - ) + throw new Exception("Read %s found with first- and second-of-pair set".format(read.getReadName))) } assert( @@ -532,12 +542,12 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) val arc = new AlignmentRecordConverter firstInPairRecords - .sortBy(_.getReadName.toString) + .sortBy(_.getReadName) .map(record => arc.convertToFastq(record, maybeAddSuffix = true, outputOriginalBaseQualities = outputOriginalBaseQualities)) .saveAsTextFile(fileName1) secondInPairRecords - .sortBy(_.getReadName.toString) + .sortBy(_.getReadName) .map(record => arc.convertToFastq(record, maybeAddSuffix = true, outputOriginalBaseQualities = outputOriginalBaseQualities)) .saveAsTextFile(fileName2) @@ -553,12 +563,13 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) * @param sort Whether to sort the FASTQ files by read name or not. Defaults * to false. Sorting the output will recover pair order, if desired. 
*/ - def adamSaveAsFastq(fileName: String, - fileName2Opt: Option[String] = None, - outputOriginalBaseQualities: Boolean = false, - sort: Boolean = false, - validationStringency: ValidationStringency = ValidationStringency.LENIENT, - persistLevel: Option[StorageLevel] = None) { + def adamSaveAsFastq( + fileName: String, + fileName2Opt: Option[String] = None, + outputOriginalBaseQualities: Boolean = false, + sort: Boolean = false, + validationStringency: ValidationStringency = ValidationStringency.LENIENT, + persistLevel: Option[StorageLevel] = None) { log.info("Saving data in FASTQ format.") fileName2Opt match { case Some(fileName2) => @@ -574,7 +585,7 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) // sort the rdd if desired val outputRdd = if (sort || fileName2Opt.isDefined) { - rdd.sortBy(_.getReadName.toString) + rdd.sortBy(_.getReadName) } else { rdd } @@ -596,14 +607,15 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) * @param validationStringency How stringently to validate the reads. * @return Returns an RDD with the pair information recomputed. */ - def adamRePairReads(secondPairRdd: RDD[AlignmentRecord], - validationStringency: ValidationStringency = ValidationStringency.LENIENT): RDD[AlignmentRecord] = { + def adamRePairReads( + secondPairRdd: RDD[AlignmentRecord], + validationStringency: ValidationStringency = ValidationStringency.LENIENT): RDD[AlignmentRecord] = { // cache rdds val firstPairRdd = rdd.cache() secondPairRdd.cache() - val firstRDDKeyedByReadName = firstPairRdd.keyBy(_.getReadName.toString.dropRight(2)) - val secondRDDKeyedByReadName = secondPairRdd.keyBy(_.getReadName.toString.dropRight(2)) + val firstRDDKeyedByReadName = firstPairRdd.keyBy(_.getReadName.dropRight(2)) + val secondRDDKeyedByReadName = secondPairRdd.keyBy(_.getReadName.dropRight(2)) // all paired end reads should have the same name, except for the last two // characters, which will be _1/_2 @@ -620,8 +632,8 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) firstReads.size, secondReads.size, readName, - firstReads.map(_.getReadName.toString).mkString("\t", "\n\t", ""), - secondReads.map(_.getReadName.toString).mkString("\t", "\n\t", "") + firstReads.map(_.getReadName).mkString("\t", "\n\t", ""), + secondReads.map(_.getReadName).mkString("\t", "\n\t", "") ) ) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala index f271a40445..082cacb008 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala @@ -40,10 +40,12 @@ object DuplicateMetrics { } def duplicateMetrics(f: (AlignmentRecord) => Boolean) = { - new DuplicateMetrics(b2i(f(record)), + new DuplicateMetrics( + b2i(f(record)), b2i(f(record) && record.getReadMapped && record.getMateMapped), b2i(f(record) && record.getReadMapped && !record.getMateMapped), - b2i(f(record) && (!isSameContig(record.getContig, record.getMateContig)))) + b2i(f(record) && (!isSameContig(record.getContig, record.getMateContig))) + ) } (duplicateMetrics(isPrimary), duplicateMetrics(isSecondary)) } @@ -51,10 +53,12 @@ object DuplicateMetrics { case class DuplicateMetrics(total: Long, bothMapped: Long, onlyReadMapped: Long, crossChromosome: Long) { def +(that: DuplicateMetrics): DuplicateMetrics = { - new DuplicateMetrics(total + that.total, + new DuplicateMetrics( + total + that.total, bothMapped + that.bothMapped, onlyReadMapped + 
that.onlyReadMapped, - crossChromosome + that.crossChromosome) + crossChromosome + that.crossChromosome + ) } } @@ -65,7 +69,8 @@ case class FlagStatMetrics(total: Long, duplicatesPrimary: DuplicateMetrics, dup withMateMappedToDiffChromosomeMapQ5: Long, failedQuality: Boolean) { def +(that: FlagStatMetrics): FlagStatMetrics = { assert(failedQuality == that.failedQuality, "Can't reduce passedVendorQuality with different failedQuality values") - new FlagStatMetrics(total + that.total, + new FlagStatMetrics( + total + that.total, duplicatesPrimary + that.duplicatesPrimary, duplicatesSecondary + that.duplicatesSecondary, mapped + that.mapped, @@ -77,7 +82,8 @@ case class FlagStatMetrics(total: Long, duplicatesPrimary: DuplicateMetrics, dup singleton + that.singleton, withMateMappedToDiffChromosome + that.withMateMappedToDiffChromosome, withMateMappedToDiffChromosomeMapQ5 + that.withMateMappedToDiffChromosomeMapQ5, - failedQuality) + failedQuality + ) } } @@ -93,7 +99,8 @@ object FlagStat { val mateMappedToDiffChromosome = p.getReadPaired && p.getReadMapped && p.getMateMapped && !isSameContig(p.getContig, p.getMateContig) val (primaryDuplicates, secondaryDuplicates) = DuplicateMetrics(p) - new FlagStatMetrics(1, + new FlagStatMetrics( + 1, primaryDuplicates, secondaryDuplicates, b2i(b(p.getReadMapped)), b2i(b(p.getReadPaired)), @@ -104,7 +111,8 @@ object FlagStat { b2i(b(p.getReadPaired) && b(p.getReadMapped) && b(!p.getMateMapped)), b2i(b(mateMappedToDiffChromosome)), b2i(b(mateMappedToDiffChromosome && i(p.getMapq) >= 5)), - p.getFailedVendorQualityChecks) + p.getFailedVendorQualityChecks + ) }.aggregate((FlagStatMetrics.emptyFailedQuality, FlagStatMetrics.emptyPassedQuality))( seqOp = { (a, b) => @@ -117,6 +125,7 @@ object FlagStat { combOp = { (a, b) => (a._1 + b._1, a._2 + b._2) - }) + } + ) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala index d80b8d2645..0d86459650 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala @@ -27,11 +27,12 @@ import org.bdgenomics.adam.models.ReferenceRegion import org.bdgenomics.adam.util.{ ReferenceFile, MdTag } import org.bdgenomics.formats.avro.AlignmentRecord -case class MDTagging(reads: RDD[AlignmentRecord], - @transient referenceFile: ReferenceFile, - partitionSize: Long = 1000000, - overwriteExistingTags: Boolean = false, - validationStringency: ValidationStringency = ValidationStringency.STRICT) extends Logging { +case class MDTagging( + reads: RDD[AlignmentRecord], + @transient referenceFile: ReferenceFile, + partitionSize: Long = 1000000, + overwriteExistingTags: Boolean = false, + validationStringency: ValidationStringency = ValidationStringency.STRICT) extends Logging { @transient val sc = reads.sparkContext val mdTagsAdded = sc.accumulator(0L, "MDTags Added") @@ -85,11 +86,12 @@ case class MDTagging(reads: RDD[AlignmentRecord], } object MDTagging { - def apply(reads: RDD[AlignmentRecord], - referenceFile: String, - fragmentLength: Long, - overwriteExistingTags: Boolean, - validationStringency: ValidationStringency): RDD[AlignmentRecord] = { + def apply( + reads: RDD[AlignmentRecord], + referenceFile: String, + fragmentLength: Long, + overwriteExistingTags: Boolean, + validationStringency: ValidationStringency): RDD[AlignmentRecord] = { val sc = reads.sparkContext new MDTagging( reads, diff --git 
a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala index 64aa79e3dc..93af1d4734 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala @@ -57,9 +57,8 @@ private[rdd] object MarkDuplicates extends Serializable { private def markReads(reads: Iterable[(ReferencePositionPair, SingleReadBucket)], primaryAreDups: Boolean, secondaryAreDups: Boolean, ignore: Option[(ReferencePositionPair, SingleReadBucket)] = None) = MarkReads.time { reads.foreach(read => { - if (ignore.isEmpty || read != ignore.get) { + if (ignore.forall(_ != read)) markReadsInBucket(read._2, primaryAreDups, secondaryAreDups) - } }) } @@ -135,4 +134,3 @@ private[rdd] object MarkDuplicates extends Serializable { } } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala index b91f53365f..c20f6ed386 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala @@ -97,8 +97,9 @@ object IndelRealignmentTarget { * @param maxIndelSize Maximum allowable size of an indel. * @return Set of generated realignment targets. */ - def apply(read: RichAlignmentRecord, - maxIndelSize: Int): Seq[IndelRealignmentTarget] = CreateIndelRealignmentTargets.time { + def apply( + read: RichAlignmentRecord, + maxIndelSize: Int): Seq[IndelRealignmentTarget] = CreateIndelRealignmentTargets.time { val region = ReferenceRegion(read.record) val refId = read.record.getContig.getContigName @@ -126,7 +127,7 @@ object IndelRealignmentTarget { }) // if we have indels, emit those targets, else emit a target for this read - if (pos.length == 0) { + if (pos.isEmpty) { Seq(new IndelRealignmentTarget(None, region)) } else { pos.map(ir => new IndelRealignmentTarget(Some(ir), region)) @@ -135,8 +136,9 @@ object IndelRealignmentTarget { } } -class IndelRealignmentTarget(val variation: Option[ReferenceRegion], - val readRange: ReferenceRegion) extends Logging { +class IndelRealignmentTarget( + val variation: Option[ReferenceRegion], + val readRange: ReferenceRegion) extends Logging { override def toString(): String = { variation + " over " + readRange @@ -152,14 +154,11 @@ class IndelRealignmentTarget(val variation: Option[ReferenceRegion], assert(readRange.isAdjacent(target.readRange) || readRange.overlaps(target.readRange), "Targets do not overlap, and therefore cannot be merged.") - val newVar = if (variation.isDefined && target.variation.isDefined) { - Some(variation.get.hull(target.variation.get)) - } else if (variation.isDefined) { - variation - } else if (target.variation.isDefined) { - target.variation - } else { - None + val newVar = (variation, target.variation) match { + case (Some(v), Some(tv)) => Some(v.hull(tv)) + case (Some(v), _) => Some(v) + case (_, Some(tv)) => Some(tv) + case _ => None } new IndelRealignmentTarget(newVar, readRange.merge(target.readRange)) @@ -206,4 +205,3 @@ case class TargetSet(set: TreeSet[IndelRealignmentTarget]) extends Serializable case class ZippedTargetSet(set: TreeSet[(IndelRealignmentTarget, Int)]) extends Serializable { } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala 
b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala index d9d50a5b0b..d0995d8fd1 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala @@ -43,19 +43,22 @@ private[rdd] object RealignIndels extends Serializable with Logging { * @param rdd RDD of reads to realign. * @return RDD of realigned reads. */ - def apply(rdd: RDD[AlignmentRecord], - consensusModel: ConsensusGenerator = new ConsensusGeneratorFromReads, - dataIsSorted: Boolean = false, - maxIndelSize: Int = 500, - maxConsensusNumber: Int = 30, - lodThreshold: Double = 5.0, - maxTargetSize: Int = 3000): RDD[AlignmentRecord] = { - new RealignIndels(consensusModel, + def apply( + rdd: RDD[AlignmentRecord], + consensusModel: ConsensusGenerator = new ConsensusGeneratorFromReads, + dataIsSorted: Boolean = false, + maxIndelSize: Int = 500, + maxConsensusNumber: Int = 30, + lodThreshold: Double = 5.0, + maxTargetSize: Int = 3000): RDD[AlignmentRecord] = { + new RealignIndels( + consensusModel, dataIsSorted, maxIndelSize, maxConsensusNumber, lodThreshold, - maxTargetSize).realignIndels(rdd) + maxTargetSize + ).realignIndels(rdd) } /** @@ -69,8 +72,9 @@ private[rdd] object RealignIndels extends Serializable with Logging { * * @see mapTargets */ - @tailrec final def mapToTarget(read: RichAlignmentRecord, - targets: TreeSet[(IndelRealignmentTarget, Int)]): Int = { + @tailrec final def mapToTarget( + read: RichAlignmentRecord, + targets: TreeSet[(IndelRealignmentTarget, Int)]): Int = { // Perform tail call recursive binary search if (targets.size == 1) { if (TargetOrdering.contains(targets.head._1, read)) { @@ -103,8 +107,9 @@ private[rdd] object RealignIndels extends Serializable with Logging { * * @see mapTargets */ - def mapToTarget(read: RichAlignmentRecord, - targets: ZippedTargetSet): Int = { + def mapToTarget( + read: RichAlignmentRecord, + targets: ZippedTargetSet): Int = { mapToTarget(read, targets.set) } @@ -121,8 +126,9 @@ private[rdd] object RealignIndels extends Serializable with Logging { * * @see mapTargets */ - def mapToTargetUnpacked(targetIndex: Int, - targets: TreeSet[(IndelRealignmentTarget, Int)]): Option[IndelRealignmentTarget] = { + def mapToTargetUnpacked( + targetIndex: Int, + targets: TreeSet[(IndelRealignmentTarget, Int)]): Option[IndelRealignmentTarget] = { if (targetIndex < 0) { None } else { @@ -187,12 +193,12 @@ private[rdd] object RealignIndels extends Serializable with Logging { // get reference and range from a single read val readRefs = reads.flatMap((r: RichAlignmentRecord) => { - if (r.mdTag.isDefined) { - Some((r.mdTag.get.getReference(r), r.getStart.toLong to r.getEnd)) - } else { + r.mdTag.fold { log.warn("Discarding read " + r.record.getReadName + " during reference re-creation.") tossedReads += 1 - None + (None: Option[(String, NumericRange[Long])]) + } { (tag) => + Some((tag.getReference(r), (r.getStart: Long) to r.getEnd)) } }) .toSeq @@ -217,12 +223,13 @@ private[rdd] object RealignIndels extends Serializable with Logging { import org.bdgenomics.adam.rdd.read.realignment.RealignIndels._ -private[rdd] class RealignIndels(val consensusModel: ConsensusGenerator = new ConsensusGeneratorFromReads, - val dataIsSorted: Boolean = false, - val maxIndelSize: Int = 500, - val maxConsensusNumber: Int = 30, - val lodThreshold: Double = 5.0, - val maxTargetSize: Int = 3000) extends Serializable with Logging { +private[rdd] class RealignIndels( + 
val consensusModel: ConsensusGenerator = new ConsensusGeneratorFromReads, + val dataIsSorted: Boolean = false, + val maxIndelSize: Int = 500, + val maxConsensusNumber: Int = 30, + val lodThreshold: Double = 5.0, + val maxTargetSize: Int = 3000) extends Serializable with Logging { /** * Given a target group with an indel realignment target and a group of reads to realign, this method @@ -240,16 +247,18 @@ private[rdd] class RealignIndels(val consensusModel: ConsensusGenerator = new Co reads } else { // bootstrap realigned read set with the reads that need to be realigned - var realignedReads = reads.filter(r => r.mdTag.isDefined && !r.mdTag.get.hasMismatches) + var realignedReads = reads.filter(r => r.mdTag.exists(!_.hasMismatches)) // get reference from reads val (reference, refStart, refEnd) = getReferenceFromReads(reads.map(r => new RichAlignmentRecord(r))) val refRegion = ReferenceRegion(reads.head.record.getContig.getContigName, refStart, refEnd) // preprocess reads and get consensus - val readsToClean = consensusModel.preprocessReadsForRealignment(reads.filter(r => !r.mdTag.isDefined || r.mdTag.get.hasMismatches), + val readsToClean = consensusModel.preprocessReadsForRealignment( + reads.filter(r => r.mdTag.forall(_.hasMismatches)), reference, - refRegion) + refRegion + ) var consensus = consensusModel.findConsensus(readsToClean) // reduce count of consensus sequences @@ -261,9 +270,9 @@ private[rdd] class RealignIndels(val consensusModel: ConsensusGenerator = new Co if (readsToClean.size > 0 && consensus.size > 0) { // do not check realigned reads - they must match - val totalMismatchSumPreCleaning = readsToClean.map(sumMismatchQuality(_)).reduce(_ + _) + val totalMismatchSumPreCleaning = readsToClean.map(sumMismatchQuality(_)).sum - /* list to log the outcome of all consensus trials. stores: + /* list to log the outcome of all consensus trials. 
stores: * - mismatch quality of reads against new consensus sequence * - the consensus sequence itself * - a map containing each realigned read and its offset into the new sequence */ @@ -280,8 +289,8 @@ val (qual, pos) = sweepReadOverReferenceForQuality(r.getSequence, consensusSequence, r.qualityScores) val originalQual = sumMismatchQuality(r) - // if the read's mismatch quality improves over the original alignment, save - // its alignment in the consensus sequence, else store -1 + // if the read's mismatch quality improves over the original alignment, save + // its alignment in the consensus sequence, else store -1 if (qual < originalQual) { (r, (qual, pos)) } else { @@ -290,7 +299,7 @@ }) // sum all mismatch qualities to get the total mismatch quality for this alignment - val totalQuality = sweptValues.map(_._2._1).reduce(_ + _) + val totalQuality = sweptValues.map(_._2._1).sum // package data var readMappings = mutable.Map[RichAlignmentRecord, Int]() @@ -303,13 +312,7 @@ }) // perform reduction to pick the consensus with the lowest aggregated mismatch score - val bestConsensusTuple = consensusOutcomes.reduce((c1: (Int, Consensus, mutable.Map[RichAlignmentRecord, Int]), c2: (Int, Consensus, mutable.Map[RichAlignmentRecord, Int])) => { - if (c1._1 <= c2._1) { - c1 - } else { - c2 - } - }) + val bestConsensusTuple = consensusOutcomes.minBy(_._1) val (bestConsensusMismatchSum, bestConsensus, bestMappings) = bestConsensusTuple @@ -353,9 +356,11 @@ // compensate the end builder.setEnd(refStart + remapping + r.getSequence.length + endPenalty) - val cigarElements = List[CigarElement](new CigarElement((bestConsensus.index.start - (refStart + remapping)).toInt, CigarOperator.M), + val cigarElements = List[CigarElement]( + new CigarElement((bestConsensus.index.start - (refStart + remapping)).toInt, CigarOperator.M), idElement, - new CigarElement(endLength.toInt, CigarOperator.M)) + new CigarElement(endLength.toInt, CigarOperator.M) + ) new Cigar(cigarElements) } @@ -407,13 +412,7 @@ } // perform reduction to get best quality offset - qualityScores.reduce((p1: (Int, Int), p2: (Int, Int)) => { - if (p1._1 < p2._1) { - p1 - } else { - p2 - } - }) + qualityScores.minBy(_._1) } /** @@ -433,7 +432,7 @@ .map(_._2) if (mismatchQualities.length > 0) { - mismatchQualities.reduce(_ + _) + mismatchQualities.sum } else { 0 } @@ -447,9 +446,11 @@ * @return Mismatch quality of read for current alignment.
*/ def sumMismatchQuality(read: AlignmentRecord): Int = { - sumMismatchQualityIgnoreCigar(read.getSequence, + sumMismatchQualityIgnoreCigar( + read.getSequence, read.mdTag.get.getReference(read), - read.qualityScores) + read.qualityScores + ) } /** @@ -475,9 +476,11 @@ private[rdd] class RealignIndels(val consensusModel: ConsensusGenerator = new Co // find realignment targets log.info("Generating realignment targets...") - val targets: TreeSet[IndelRealignmentTarget] = RealignmentTargetFinder(richRdd, + val targets: TreeSet[IndelRealignmentTarget] = RealignmentTargetFinder( + richRdd, maxIndelSize, - maxTargetSize) + maxTargetSize + ) // we should only attempt realignment if the target set isn't empty if (targets.isEmpty) { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala index 7b8fdc83fb..dfcdf32636 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala @@ -32,9 +32,10 @@ object RealignmentTargetFinder { * @param rdd RDD of reads to use in generating realignment targets. * @return Sorted set of realignment targets. */ - def apply(rdd: RDD[RichAlignmentRecord], - maxIndelSize: Int = 500, - maxTargetSize: Int = 3000): TreeSet[IndelRealignmentTarget] = { + def apply( + rdd: RDD[RichAlignmentRecord], + maxIndelSize: Int = 500, + maxTargetSize: Int = 3000): TreeSet[IndelRealignmentTarget] = { new RealignmentTargetFinder().findTargets(rdd, maxIndelSize, maxTargetSize).set } } @@ -83,8 +84,9 @@ class RealignmentTargetFinder extends Serializable with Logging { * @param second A sorted set of realignment targets. * @return A merged set of targets. */ - def joinTargets(first: TargetSet, - second: TargetSet): TargetSet = JoinTargets.time { + def joinTargets( + first: TargetSet, + second: TargetSet): TargetSet = JoinTargets.time { new TargetSet(joinTargets(first.set, second.set)) } @@ -94,9 +96,10 @@ class RealignmentTargetFinder extends Serializable with Logging { * @param reads An RDD containing reads to generate indel realignment targets from. * @return An ordered set of indel realignment targets. 
*/ - def findTargets(reads: RDD[RichAlignmentRecord], - maxIndelSize: Int = 500, - maxTargetSize: Int = 3000): TargetSet = FindTargets.time { + def findTargets( + reads: RDD[RichAlignmentRecord], + maxIndelSize: Int = 500, + maxTargetSize: Int = 3000): TargetSet = FindTargets.time { def createTargetSet(target: IndelRealignmentTarget): TargetSet = { val tmp = new TreeSet()(TargetOrdering) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala index 3b38256fe5..9150ae1d9c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala @@ -124,9 +124,10 @@ class BaseQualityRecalibration( } object BaseQualityRecalibration { - def apply(rdd: RDD[AlignmentRecord], - knownSnps: Broadcast[SnpTable], - observationDumpFile: Option[String] = None, - validationStringency: ValidationStringency = ValidationStringency.STRICT): RDD[AlignmentRecord] = + def apply( + rdd: RDD[AlignmentRecord], + knownSnps: Broadcast[SnpTable], + observationDumpFile: Option[String] = None, + validationStringency: ValidationStringency = ValidationStringency.STRICT): RDD[AlignmentRecord] = new BaseQualityRecalibration(cloy(rdd, validationStringency), knownSnps).result } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala index 15fadd4bff..b1b031fa68 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala @@ -97,7 +97,8 @@ class Aggregate private ( new Aggregate( this.total + that.total, this.mismatches + that.mismatches, - this.expectedMismatches + that.expectedMismatches) + this.expectedMismatches + that.expectedMismatches + ) } object Aggregate { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala index 52fe525702..e658645c9e 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala @@ -99,8 +99,9 @@ class RecalibrationTable( getOrElse(0.0) } - def getQualityEntry(quality: QualityScore, - globalEntry: Option[(Aggregate, QualityTable)]): Option[(Aggregate, ExtrasTables)] = { + def getQualityEntry( + quality: QualityScore, + globalEntry: Option[(Aggregate, QualityTable)]): Option[(Aggregate, ExtrasTables)] = { globalEntry.flatMap(_._2.table.get(quality)) } @@ -144,15 +145,17 @@ object RecalibrationTable { new RecalibrationTable(observed.space, globalTable) } - def computeQualityTable(globalEntry: (String, Map[CovariateKey, Observation]), - space: CovariateSpace): Map[QualityScore, (Aggregate, ExtrasTables)] = { + def computeQualityTable( + globalEntry: (String, Map[CovariateKey, Observation]), + space: CovariateSpace): Map[QualityScore, (Aggregate, ExtrasTables)] = { globalEntry._2.groupBy(_._1.quality).map(qualityEntry => { (qualityEntry._1, (aggregateObservations(qualityEntry._2), new ExtrasTables(computeExtrasTables(qualityEntry._2, space)))) }).map(identity) } - def 
computeExtrasTables(table: Map[CovariateKey, Observation], - space: CovariateSpace): IndexedSeq[Map[Option[Covariate#Value], Aggregate]] = { + def computeExtrasTables( + table: Map[CovariateKey, Observation], + space: CovariateSpace): IndexedSeq[Map[Option[Covariate#Value], Aggregate]] = { Range(0, space.extras.length).map(index => { table.groupBy(_._1.extras(index)).map(extraEntry => { (extraEntry._1, aggregateObservations(extraEntry._2)) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala index 49ac62112b..00d59c81c7 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala @@ -37,7 +37,8 @@ object ADAMVCFOutputFormat extends Serializable { def setHeader(samples: Seq[String]): VCFHeader = { header = Some(new VCFHeader( (VariantAnnotationConverter.infoHeaderLines ++ VariantAnnotationConverter.formatHeaderLines).toSet: Set[VCFHeaderLine], - samples)) + samples + )) header.get } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VariationRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VariationRDDFunctions.scala index 7a62669178..49a4f84246 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VariationRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VariationRDDFunctions.scala @@ -63,7 +63,6 @@ class VariantContextRDDFunctions(rdd: RDD[VariantContext]) extends ADAMSequenceD def getCallsetSamples(): List[String] = { rdd.flatMap(c => c.genotypes.map(_.getSampleId).toSeq.distinct) .distinct - .map(_.toString) .collect() .toList } @@ -78,10 +77,11 @@ class VariantContextRDDFunctions(rdd: RDD[VariantContext]) extends ADAMSequenceD * Default is false (no sort). * @param coalesceTo Optionally coalesces the RDD down to _n_ partitions. Default is none. 
*/ - def saveAsVcf(filePath: String, - dict: Option[SequenceDictionary] = None, - sortOnSave: Boolean = false, - coalesceTo: Option[Int] = None) = { + def saveAsVcf( + filePath: String, + dict: Option[SequenceDictionary] = None, + sortOnSave: Boolean = false, + coalesceTo: Option[Int] = None) = { val vcfFormat = VCFFormat.inferFromFilePath(filePath) assert(vcfFormat == VCFFormat.VCF, "BCF not yet supported") // TODO: Add BCF support @@ -137,9 +137,11 @@ class VariantContextRDDFunctions(rdd: RDD[VariantContext]) extends ADAMSequenceD // save to disk val conf = rdd.context.hadoopConfiguration conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, vcfFormat.toString) - withKey.saveAsNewAPIHadoopFile(filePath, + withKey.saveAsNewAPIHadoopFile( + filePath, classOf[LongWritable], classOf[VariantContextWritable], classOf[ADAMVCFOutputFormat[LongWritable]], - conf) + conf + ) log.info("Write %d records".format(gatkVCs.count())) rdd.unpersist() @@ -155,7 +157,7 @@ class GenotypeRDDFunctions(rdd: RDD[Genotype]) extends Serializable with Logging def filterByOverlappingRegion(query: ReferenceRegion): RDD[Genotype] = { def overlapsQuery(rec: Genotype): Boolean = - rec.getVariant.getContig.getContigName.toString == query.referenceName && + rec.getVariant.getContig.getContigName == query.referenceName && rec.getVariant.getStart < query.end && rec.getVariant.getEnd > query.start rdd.filter(overlapsQuery) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala index e85a5498d2..a8178070d0 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rich/DecadentRead.scala @@ -49,8 +49,9 @@ private[adam] object DecadentRead extends Logging with Serializable { * 2. To clog, to glut, or satisfy, as the appetite; to satiate. * 3. To fill up or choke up; to stop up. */ - def cloy(rdd: RDD[AlignmentRecord], - strictness: ValidationStringency = ValidationStringency.STRICT): RDD[(Option[DecadentRead], Option[AlignmentRecord])] = { + def cloy( + rdd: RDD[AlignmentRecord], + strictness: ValidationStringency = ValidationStringency.STRICT): RDD[(Option[DecadentRead], Option[AlignmentRecord])] = { rdd.map(r => { try { val dr = DecadentRead.apply(r) @@ -61,7 +62,8 @@ private[adam] object DecadentRead extends Logging with Serializable { throw e } else { log.warn("Converting read %s to decadent read failed with %s. 
Skipping...".format( - r, e)) + r, e + )) (None, Some(r)) } } @@ -108,12 +110,13 @@ private[adam] class DecadentRead(val record: RichAlignmentRecord) extends Loggin def quality = QualityScore(record.qualityScores(offset)) def isRegularBase: Boolean = base match { - case 'A' => true - case 'C' => true - case 'T' => true - case 'G' => true - case 'N' => false - case unk => throw new IllegalArgumentException("Encountered unexpected base '%s'".format(unk)) + case 'A' | 'C' | 'T' | 'G' | 'U' => true + // 2-base alternatives in http://www.bioinformatics.org/sms/iupac.html + case 'R' | 'Y' | 'S' | 'W' | 'K' | 'M' => true + // 3-base alternatives in http://www.bioinformatics.org/sms/iupac.html + case 'B' | 'D' | 'H' | 'V' => true + case 'N' => false + case unk => throw new IllegalArgumentException("Encountered unexpected base '%s'".format(unk)) } def isMismatch(includeInsertions: Boolean = true): Boolean = @@ -126,19 +129,21 @@ private[adam] class DecadentRead(val record: RichAlignmentRecord) extends Loggin def referencePositionOption: Option[ReferencePosition] = assumingAligned( - record.readOffsetToReferencePosition(offset)) + record.readOffsetToReferencePosition(offset) + ) def referenceSequenceContext: Option[ReferenceSequenceContext] = assumingAligned(record.readOffsetToReferenceSequenceContext(offset)) def referencePosition: ReferencePosition = referencePositionOption.getOrElse( - throw new IllegalArgumentException("Residue has no reference location (may be an insertion)")) + throw new IllegalArgumentException("Residue has no reference location (may be an insertion)") + ) } - lazy val readGroup: String = record.getRecordGroupName.toString + lazy val readGroup: String = record.getRecordGroupName - private lazy val baseSequence: String = record.getSequence.toString + private lazy val baseSequence: String = record.getSequence lazy val residues: IndexedSeq[Residue] = Range(0, baseSequence.length).map(new Residue(_)) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala index 44bbe2dafd..e6bcf46eac 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala @@ -72,7 +72,7 @@ class RichAlignmentRecord(val record: AlignmentRecord) { // Returns the quality scores as a list of bytes lazy val qualityScores: Array[Int] = { - record.getQual.toString.toCharArray.map(q => q - 33) + record.getQual.toCharArray.map(q => q - 33) } // Parse the tags ("key:type:value" triples) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichCigar.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichCigar.scala index 8bda8fa5b5..93cb16f0c9 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichCigar.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichCigar.scala @@ -42,7 +42,7 @@ class RichCigar(cigar: Cigar) { case CigarOperator.M => 1 case _ => 0 } - }).reduce(_ + _) + }).sum } /** @@ -63,28 +63,27 @@ class RichCigar(cigar: Cigar) { * @param cigarElements List of cigar elements to move. * @return List of cigar elements with single element moved. 
*/ - @tailrec def moveCigarLeft(head: List[CigarElement], - index: Int, - cigarElements: List[CigarElement]): List[CigarElement] = { + @tailrec def moveCigarLeft( + head: List[CigarElement], + index: Int, + cigarElements: List[CigarElement]): List[CigarElement] = { if (index == 1) { - val elementToTrim = cigarElements.head - val elementToMove: Option[CigarElement] = Some(cigarElements(1)) - val elementToPad: Option[CigarElement] = if (cigarElements.length > 2) { - Some(cigarElements(2)) - } else { - None + val elementToTrim = cigarElements.headOption + val elementToMove: Option[CigarElement] = PartialFunction.condOpt(cigarElements) { + case _ :: x :: _ => x } - val elementsAfterPad = if (cigarElements.length > 4) { - cigarElements.drop(3) - } else { - List[CigarElement]() + val elementToPad: Option[CigarElement] = PartialFunction.condOpt(cigarElements) { + case _ :: _ :: x :: _ => x } + val elementsAfterPad = cigarElements.drop(3) // if we are at the position to move, then we take one from it and add to the next element - val elementMovedLeft: Option[CigarElement] = if (elementToTrim.getLength > 1) { - Some(new CigarElement(elementToTrim.getLength - 1, elementToTrim.getOperator)) - } else { - None + val elementMovedLeft: Option[CigarElement] = elementToTrim.flatMap { (ett) => + if (ett.getLength > 1) { + Some(new CigarElement(ett.getLength - 1, ett.getOperator)) + } else { + None + } } // if there are no elements afterwards to pad, add a match operator with length 1 to the end @@ -102,7 +101,7 @@ class RichCigar(cigar: Cigar) { } else if (index == 0 || cigarElements.length < 2) { head ::: cigarElements } else { - moveCigarLeft(head ::: List(cigarElements.head), index - 1, cigarElements.tail) + moveCigarLeft(head :+ cigarElements.head, index - 1, cigarElements.tail) } } @@ -111,7 +110,7 @@ class RichCigar(cigar: Cigar) { } def getLength(): Int = { - cigar.getCigarElements.map(_.getLength).reduce(_ + _) + cigar.getCigarElements.map(_.getLength).sum } /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/AttributeUtils.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/AttributeUtils.scala index 3d1f1f6ffe..af2812e4f4 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/AttributeUtils.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/AttributeUtils.scala @@ -71,10 +71,11 @@ object AttributeUtils { */ def parseAttribute(encoded: String): Attribute = { attrRegex.matches(encoded) match { - case Some(m) => createAttribute(m.group(1), m.group(2), m.group(3), m.group(4)) + case Some(m) => createAttribute((m.group(1), m.group(2), m.group(3), m.group(4))) case None => throw new IllegalArgumentException( - "attribute string \"%s\" doesn't match format attrTuple:type:value".format(encoded)) + "attribute string \"%s\" doesn't match format attrTuple:type:value".format(encoded) + ) } } @@ -123,4 +124,3 @@ object AttributeUtils { } } } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/Flattener.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/Flattener.scala index 720baeb3f9..8b5ed43a61 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/Flattener.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/Flattener.scala @@ -87,7 +87,8 @@ object Flattener { if (schema.getType ne Schema.Type.UNION) { return Schema.createUnion( - ListBuffer[Schema](Schema.create(Schema.Type.NULL), schema).asJava) + ListBuffer[Schema](Schema.create(Schema.Type.NULL), schema).asJava + ) } schema // TODO: what about unions that don't contain null? 
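The RichCigar.moveCigarLeft rewrite above swaps explicit length checks and positional indexing for PartialFunction.condOpt, which yields Some only when the list has the required shape. A minimal self-contained sketch of that idiom follows; the object name and values are hypothetical and not part of this patch:

    object CondOptSketch {
      def main(args: Array[String]): Unit = {
        val elements = List("3M", "1D", "4M")

        // condOpt returns Some(x) when the partial function is defined at its
        // argument and None otherwise, so no bounds checks are needed.
        val second: Option[String] = PartialFunction.condOpt(elements) {
          case _ :: x :: _ => x
        }
        val third: Option[String] = PartialFunction.condOpt(elements) {
          case _ :: _ :: x :: _ => x
        }

        println(second) // Some(1D)
        println(third)  // Some(4M)
        // A list that is too short simply fails to match:
        println(PartialFunction.condOpt(List("3M")) { case _ :: x :: _ => x }) // None
      }
    }

Compared with `if (xs.length > 2) Some(xs(2)) else None`, the pattern keeps the shape requirement and the extraction in one place, which is why it reads more safely in the cigar-shifting code.
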
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/IntervalListReader.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/IntervalListReader.scala index 4007ec6c78..c87f9ab55c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/IntervalListReader.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/IntervalListReader.scala @@ -48,6 +48,7 @@ class IntervalListReader(file: File) extends Traversable[(ReferenceRegion, Strin def foreach[U](f: ((ReferenceRegion, String)) => U) { IntervalList.fromFile(file).asScala.foreach( - i => f((ReferenceRegion(i.getSequence, i.getStart, i.getEnd), i.getName))) + i => f((ReferenceRegion(i.getSequence, i.getStart, i.getEnd), i.getName)) + ) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/MapTools.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/MapTools.scala index 31a4b70667..c9ee7e8899 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/MapTools.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/MapTools.scala @@ -40,8 +40,9 @@ object MapTools { * @tparam NumberType * @return */ - def add[KeyType, NumberType](map1: Map[KeyType, NumberType], - map2: Map[KeyType, NumberType])(implicit ops: Numeric[NumberType]): Map[KeyType, NumberType] = { + def add[KeyType, NumberType]( + map1: Map[KeyType, NumberType], + map2: Map[KeyType, NumberType])(implicit ops: Numeric[NumberType]): Map[KeyType, NumberType] = { (map1.keys ++ map2.keys.filter(!map1.contains(_))).map { (key: KeyType) => diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/MdTag.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/MdTag.scala index 4b6383fe31..b998db178f 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/MdTag.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/MdTag.scala @@ -45,9 +45,10 @@ object MdTag { * @param cigar Cigar operators for the read * @return Returns a populated MD tag. 
*/ - def apply(mdTagInput: String, - referenceStart: Long, - cigar: Cigar): MdTag = { + def apply( + mdTagInput: String, + referenceStart: Long, + cigar: Cigar): MdTag = { var matches = List[NumericRange[Long]]() var mismatches = Map[Long, Char]() @@ -546,11 +547,7 @@ class MdTag( } else if (deletions.contains(i)) { if (!lastWasDeletion) { // write match count before deletion - if (lastWasMatch) { - mdString += matchRun.toString - } else { - mdString += "0" - } + mdString += (if (lastWasMatch) matchRun.toString else "0") // add deletion caret mdString += "^" @@ -563,11 +560,7 @@ class MdTag( mdString += deletions(i) } else if (mismatches.contains(i)) { // write match count before mismatch - if (lastWasMatch) { - mdString += matchRun.toString - } else { - mdString += "0" - } + mdString += (if (lastWasMatch) matchRun.toString else "0") mdString += mismatches(i) @@ -578,11 +571,7 @@ class MdTag( }) // if we have more matches, write count - if (lastWasMatch) { - mdString += matchRun.toString - } else { - mdString += "0" - } + mdString += (if (lastWasMatch) matchRun.toString else "0") mdString } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/ParquetFileTraversable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/ParquetFileTraversable.scala index e68bcf405c..0014f433fb 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/ParquetFileTraversable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/ParquetFileTraversable.scala @@ -65,4 +65,3 @@ class ParquetFileTraversable[T <: IndexedRecord](sc: SparkContext, file: Path) e } } - diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala index 3e253eaebf..218c443577 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala @@ -114,7 +114,7 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile { val sb = StringBuilder.newBuilder // define predicate for N blocks - val isNBlock = if (record.nBlocks.isEmpty || !record.nBlocks.get.hasRegionsFor(region -> None)) { + val isNBlock = if (record.nBlocks.forall(!_.hasRegionsFor(region -> None))) { // our region has no overlap with an N block, so the predicate is trivial pos: Long => false } else { @@ -123,7 +123,7 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile { } // define predicate for mask blocks - val isMaskBlock = if (record.maskBlocks.isEmpty || !record.maskBlocks.get.hasRegionsFor(region -> None)) { + val isMaskBlock = if (record.maskBlocks.forall(!_.hasRegionsFor(region -> None))) { // our region has no overlap with a mask block, so the predicate is trivial pos: Long => false } else { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfHeaderUtils.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfHeaderUtils.scala index 7598704d58..214f5467dd 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfHeaderUtils.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfHeaderUtils.scala @@ -72,14 +72,20 @@ private[util] class VcfHeaderBuilder(samples: List[String]) { val formatLines: java.util.Set[VCFHeaderLine] = new java.util.HashSet[VCFHeaderLine]() val infoLines: java.util.Set[VCFHeaderLine] = new java.util.HashSet[VCFHeaderLine]() - val otherLines: Set[VCFHeaderLine] = Set(new VCFInfoHeaderLine(VCFConstants.RMS_BASE_QUALITY_KEY, - 1, - VCFHeaderLineType.Float, - "RMS Base Quality"), - new 
VCFInfoHeaderLine(VCFConstants.SAMPLE_NUMBER_KEY, + val otherLines: Set[VCFHeaderLine] = Set( + new VCFInfoHeaderLine( + VCFConstants.RMS_BASE_QUALITY_KEY, + 1, + VCFHeaderLineType.Float, + "RMS Base Quality" + ), + new VCFInfoHeaderLine( + VCFConstants.SAMPLE_NUMBER_KEY, VCFHeaderLineCount.INTEGER, VCFHeaderLineType.Integer, - "RMS Mapping Quality")) + "RMS Mapping Quality" + ) + ) /** * Creates VCF contig lines from a sequence dictionary. @@ -97,16 +103,20 @@ private[util] class VcfHeaderBuilder(samples: List[String]) { * Adds standard VCF header lines to header. */ private def addStandardLines() { - val formatKeys = List(VCFConstants.GENOTYPE_KEY, + val formatKeys = List( + VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_QUALITY_KEY, - VCFConstants.GENOTYPE_PL_KEY) - val infoKeys = List(VCFConstants.ALLELE_FREQUENCY_KEY, + VCFConstants.GENOTYPE_PL_KEY + ) + val infoKeys = List( + VCFConstants.ALLELE_FREQUENCY_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.STRAND_BIAS_KEY, VCFConstants.RMS_MAPPING_QUALITY_KEY, VCFConstants.MAPPING_QUALITY_ZERO_KEY, - VCFConstants.DEPTH_KEY) + VCFConstants.DEPTH_KEY + ) VCFStandardHeaderLines.addStandardFormatLines(formatLines, false, formatKeys) VCFStandardHeaderLines.addStandardInfoLines(infoLines, false, infoKeys) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfStringUtils.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfStringUtils.scala deleted file mode 100644 index d5a85cfcc6..0000000000 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/VcfStringUtils.scala +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.bdgenomics.adam.util - -object VcfStringUtils { - - def clean(s: String): String = { - val s0 = if (s.startsWith("[")) { - s.drop(1) - } else { - s - } - if (s0.endsWith("]")) { - s0.dropRight(1) - } else { - s0 - } - } - - def vcfListToInts(l: String): List[Int] = { - val valueList = l.split(",").toList - - // TODO: @tailrec - def convertListToInts(l: List[String]): List[Int] = { - if (l.length == 0) { - List[Int]() - } else { - clean(l.head).toInt :: convertListToInts(l.tail) - } - } - - convertListToInts(valueList) - } - - def vcfListToDoubles(l: String): List[Double] = { - val valueList = l.split(",").toList - - // TODO: @tailrec - def convertListToDoubles(l: List[String]): List[Double] = { - if (l.length == 0) { - List[Double]() - } else { - clean(l.head).toDouble :: convertListToDoubles(l.tail) - } - } - - convertListToDoubles(valueList) - } - - def listToString(l: List[Any]): String = listToString(l.map(_.toString)) - - // TODO: @tailrec final - private def stringListToString(l: List[String]): String = { - if (l.length == 0) { - "" - } else { - l.head + "," + listToString(l.tail) - } - } - - def stringToList(s: String): List[String] = s.split(",").toList -} diff --git a/pom.xml b/pom.xml index 3a0c20d615..69bc471cd1 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 - + 4.0.0 org.bdgenomics.adam adam-parent_2.10 @@ -29,13 +29,13 @@ 0.2.3 1.139 - + adam-core adam-apis adam-cli - + Apache License
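
Two idioms recur throughout this patch: Option combinators (forall, exists, fold) replacing isDefined/get chains, and the collection built-ins sum and minBy replacing hand-rolled reduce calls. The sketch below illustrates their semantics with hypothetical values; it mirrors the rewrites in RealignIndels, MarkDuplicates, and TwoBitFile above but is not code from the patch itself:

    object OptionAndReduceSketch {
      def main(args: Array[String]): Unit = {
        val mdTag: Option[String] = Some("10A5^AC6")
        val missing: Option[String] = None

        // forall is vacuously true for None, mirroring
        // `!opt.isDefined || p(opt.get)` (as in TwoBitFile and RealignIndels).
        println(mdTag.forall(_.nonEmpty))   // true
        println(missing.forall(_.nonEmpty)) // true

        // exists is false for None, mirroring `opt.isDefined && p(opt.get)`.
        println(mdTag.exists(_.contains("^")))   // true
        println(missing.exists(_.contains("^"))) // false

        // fold handles the empty and defined cases without calling .get,
        // in the style of the readRefs rewrite in RealignIndels.
        val tagLength: Int = mdTag.fold(0)(_.length)
        println(tagLength) // 8

        // Built-in reductions replace reduce(_ + _) and comparison reduces.
        val qualityOffsets = Seq((12, 0), (7, 3), (9, 1))
        println(qualityOffsets.map(_._1).sum) // 28
        println(qualityOffsets.minBy(_._1))   // (7,3)
      }
    }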