diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala index d77cb75c3c..60a15cb4ce 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala @@ -109,7 +109,8 @@ class SequenceDictionary(val records: Vector[SequenceRecord]) extends Serializab * @return Returns a SAM formatted sequence dictionary. */ def toSAMSequenceDictionary: SAMSequenceDictionary = { - new SAMSequenceDictionary(records.map(_ toSAMSequenceRecord).toList) + import SequenceRecord._ + new SAMSequenceDictionary(records.sorted.map(_ toSAMSequenceRecord).toList) } override def toString: String = { @@ -117,6 +118,13 @@ class SequenceDictionary(val records: Vector[SequenceRecord]) extends Serializab } } +object SequenceOrdering extends Ordering[SequenceRecord] { + def compare(a: SequenceRecord, + b: SequenceRecord): Int = { + a.name.compareTo(b.name) + } +} + /** * Utility class within the SequenceDictionary; represents unique reference name-to-id correspondence * @@ -183,6 +191,8 @@ object SequenceRecord { val REFSEQ_TAG = "REFSEQ" val GENBANK_TAG = "GENBANK" + implicit def ordering = SequenceOrdering + def apply(name: String, length: Long, md5: String = null, diff --git a/adam-core/src/test/resources/sorted.sam b/adam-core/src/test/resources/sorted.sam new file mode 100644 index 0000000000..099cb35865 --- /dev/null +++ b/adam-core/src/test/resources/sorted.sam @@ -0,0 +1,10 @@ +@HD VN:1.4 SO:unsorted +@SQ SN:1 LN:1000 +@SQ SN:2 LN:1000 +@SQ SN:3 LN:1000 +@SQ SN:4 LN:2000 +A 0 1 1 50 10M * 0 0 ACACACACAC ********** +E 0 2 101 45 10M * 0 0 ACACACACAC ********** +D 0 2 501 55 10M2S * 0 0 ACACACACACAC ************ +B 0 3 11 40 4M2I4M * 0 0 ACACACACAC ********** +C 0 4 1001 25 8M * 0 0 ACACACAC ******** diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala index a56a9231d8..cf7e0a7400 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala @@ -192,4 +192,20 @@ class SequenceDictionarySuite extends FunSuite { toSSD.assertSameDictionary(ssd) } + test("conversion to sam sequence dictionary has correct sort order") { + val sd = new SequenceDictionary(Vector(SequenceRecord("MT", 1000L), + SequenceRecord("4", 1000L), + SequenceRecord("1", 1000L), + SequenceRecord("3", 1000L), + SequenceRecord("2", 1000L), + SequenceRecord("X", 1000L))) + val ssd = sd.toSAMSequenceDictionary + val seq = ssd.getSequences + assert(seq.get(0).getSequenceName === "1") + assert(seq.get(1).getSequenceName === "2") + assert(seq.get(2).getSequenceName === "3") + assert(seq.get(3).getSequenceName === "4") + assert(seq.get(4).getSequenceName === "MT") + assert(seq.get(5).getSequenceName === "X") + } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctionsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctionsSuite.scala index 37856ae9a8..6bba2594c2 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctionsSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctionsSuite.scala @@ -24,6 +24,7 @@ import org.bdgenomics.adam.models._ import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ +import scala.io.Source import scala.util.Random class AlignmentRecordRDDFunctionsSuite extends ADAMFunSuite { @@ -213,4 +214,89 @@ class AlignmentRecordRDDFunctionsSuite extends ADAMFunSuite { assert(readA.getReadName === readB.getReadName) } } + + sparkTest("writing a small sorted file as SAM should produce the expected result") { + val reads = sc.parallelize(Seq(AlignmentRecord.newBuilder() + .setContig(Contig.newBuilder() + .setContigName("1") + .setContigLength(1000L) + .build()) + .setStart(0L) + .setEnd(10L) + .setSequence("ACACACACAC") + .setQual("**********") + .setMapq(50) + .setCigar("10M") + .setReadName("A") + .build(), + AlignmentRecord.newBuilder() + .setContig(Contig.newBuilder() + .setContigName("3") + .setContigLength(1000L) + .build()) + .setStart(10L) + .setEnd(20L) + .setSequence("ACACACACAC") + .setQual("**********") + .setMapq(40) + .setCigar("4M2I4M") + .setReadName("B") + .build(), + AlignmentRecord.newBuilder() + .setContig(Contig.newBuilder() + .setContigName("4") + .setContigLength(2000L) + .build()) + .setStart(1000L) + .setEnd(1008L) + .setSequence("ACACACAC") + .setQual("********") + .setMapq(25) + .setCigar("8M") + .setReadName("C") + .build(), + AlignmentRecord.newBuilder() + .setContig(Contig.newBuilder() + .setContigName("2") + .setContigLength(1000L) + .build()) + .setStart(500L) + .setEnd(510L) + .setSequence("ACACACACACAC") + .setQual("************") + .setMapq(55) + .setCigar("10M2S") + .setReadName("D") + .build(), + AlignmentRecord.newBuilder() + .setContig(Contig.newBuilder() + .setContigName("2") + .setContigLength(1000L) + .build()) + .setStart(100L) + .setEnd(110L) + .setSequence("ACACACACAC") + .setQual("**********") + .setMapq(45) + .setCigar("10M") + .setReadName("E") + .build()), 1) + + val tempFile = Files.createTempDirectory("sam") + val tempPath1 = tempFile.toAbsolutePath.toString + "/reads.sam" + + reads.adamSortReadsByReferencePosition() + .adamSAMSave(tempPath1) + + val file1 = Source.fromFile(tempPath1 + "/part-r-00000") + val file2 = Source.fromFile(ClassLoader.getSystemClassLoader + .getResource("sorted.sam").getFile) + + val f1lines = file1.getLines + val f2lines = file2.getLines + + assert(f1lines.size === f2lines.size) + f1lines.zip(f2lines) + .foreach(p => assert(p._1 === p._2)) + } }