Adding ability to convert reference FASTA files for nucleotide sequences #79

Merged: 1 commit, Feb 12, 2014
4 changes: 4 additions & 0 deletions CHANGES.txt
@@ -12,6 +12,10 @@ Trunk (not yet released)
pipelines based on their read-by-read concordance across a number of fields: position, alignment,
mapping and base quality scores, and can be extended to support new metrics or aggregations.

* Added FASTA import, and RDD convenience functions for remapping contig IDs. This allows for reference
sequences to be imported into an efficient record where bases are stored as a list of enums. Additionally,
convenience values are calculated. This feature was introduced in PR #79.

Member Author:

Sorry, I deleted my comment and somehow it deleted yours as well. I first thought we didn't, then I thought we did, then I double checked and we didn't. Sorry for the confusion.

Contributor:

NP :-)

Can you add the issue number to this line? And then I'll merge!

Member Author:

There is no issue number.

Contributor:

Can you create one? Retrospectively, I mean? It'd be a good place to write a simple description of the problem this particular PR is solving, and it'd set a good example for future committers (including, ahem, myself, as I often forget to write these issues up ahead of time, and I know Matt and Jeff are eager for us to be a bit more disciplined).

If you don't want to, um, that's fine too :-)

Contributor:

I approve of retrospective issue creation so that we'll have an anchor in CHANGES.txt for this particular change.

Contributor:

Alternatively, is pointing to a PR in CHANGES.txt better? I'm used to code review comments getting copied into the body of an issue tracker so that all of the discussion is in one place, so I defer to others who have used GitHub in anger with a large group in the past.

Contributor:

Jeff, the GitHub UI makes me think that each PR is an issue too: https://github.com/bigdatagenomics/adam/issues?state=open

It'd just be nice to have a high-level "why this PR" somewhere, as I confess it took me a couple of minutes to figure out that this was importing FASTA files into an ADAM format, and not exporting FASTA files from reads in ADAMRecords. I admit I haven't been getting much sleep though.

Contributor:

@tdanford you're right. I can navigate here with #79.

@fnothaft, I think the right thing to do here would be to expand your initial comment to provide a bit more information about the change, and then to add a link to this issue in CHANGES.txt.

Cool? Sorry for the friction, we'll get the process ironed out after a few turns of the crank.

Member Author:

No problem—I'm about to push an update that links to the PR. Going forward, I'll create an issue before starting work, as we've discussed.

OPTIMIZATIONS

IMPROVEMENTS
@@ -32,7 +32,8 @@ object AdamMain extends Logging {
Bam2Adam,
Adam2Vcf,
Vcf2Adam,
FindReads)
FindReads,
Fasta2Adam)

private def printCommands() {
println("\n")
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2014. Regents of the University of California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.berkeley.cs.amplab.adam.cli

import edu.berkeley.cs.amplab.adam.avro.{ADAMNucleotideContig, ADAMRecord}
import edu.berkeley.cs.amplab.adam.converters.FastaConverter
import edu.berkeley.cs.amplab.adam.rdd.AdamContext._
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkContext, Logging}
import org.apache.spark.rdd.RDD
import org.kohsuke.args4j.{Option => Args4jOption, Argument}

object Fasta2Adam extends AdamCommandCompanion {
val commandName: String = "fasta2adam"
val commandDescription: String = "Converts a text FASTA sequence file into an ADAMNucleotideContig Parquet file which represents assembled sequences."

def apply(cmdLine: Array[String]): AdamCommand = {
new Fasta2Adam(Args4j[Fasta2AdamArgs](cmdLine))
}
}

class Fasta2AdamArgs extends Args4jBase with ParquetArgs with SparkArgs {
@Argument(required = true, metaVar = "FASTA", usage = "The FASTA file to convert", index = 0)
var fastaFile: String = null
@Argument(required = true, metaVar = "ADAM", usage = "Location to write ADAM data", index = 1)
var outputPath: String = null
@Args4jOption(required = false, name = "-verbose", usage = "Prints enhanced debugging info, including contents of seq dict.")
var verbose: Boolean = false
@Args4jOption(required = false, name = "-reads", usage = "Maps contig IDs to match contig IDs of reads.")
var reads: String = ""
}

class Fasta2Adam(protected val args: Fasta2AdamArgs) extends AdamSparkCommand[Fasta2AdamArgs] with Logging {
val companion = Fasta2Adam

def run(sc: SparkContext, job: Job) {
log.info("Loading FASTA data from disk.")
val fastaData: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile(args.fastaFile,
classOf[TextInputFormat],
classOf[LongWritable],
classOf[Text])

Contributor:

Did you compare this against fi.tkk.ics.hadoop.bam.FastaInputFormat from Hadoop-BAM? Any reason not to use that here?

Member Author:

I had actually been unaware of the FastaInputFormat or the ReferenceFragment from Hadoop-BAM—thank you for pointing me towards it!

FWIW, the one significant difference that I can see is that the Hadoop-BAM format doesn't seem to handle any sequence meta-information; e.g., fragment name, fragment description.
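[Editor's note] The meta-information handling mentioned here amounts to splitting the header line at its first space into a fragment name and an optional description. A standalone sketch with a hypothetical header (the actual split is implemented in FastaConverter, later in this diff):

```scala
// Split a FASTA description line into a contig name and an optional comment.
val description = ">chr1 Homo sapiens chromosome 1"
val body = description.stripPrefix(">")
val splitIndex = body.indexOf(' ')
val (name, comment) =
  if (splitIndex == -1) (body.trim, None)
  else (body.take(splitIndex).trim, Some(body.drop(splitIndex + 1).trim))
// name == "chr1", comment == Some("Homo sapiens chromosome 1")
```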

val remapData = fastaData.map(kv => (kv._1.get.toInt, kv._2.toString))

log.info("Converting FASTA to ADAM.")
val adamFasta = FastaConverter(remapData)

if (args.verbose) {
println("FASTA contains:")
println(adamFasta.adamGetSequenceDictionary())
}

val remapped = if (args.reads != "") {
val readDict = sc.adamDictionaryLoad[ADAMRecord](args.reads)

if (args.verbose) {
println("Remapping with:")
println(readDict)
}

val remapFasta = adamFasta.adamRewriteContigIds(readDict)

if (args.verbose) {
println("After remapping, have:")
println(remapFasta.adamGetSequenceDictionary())
}

remapFasta
} else {
adamFasta
}

log.info("Writing records to disk.")
remapped.adamSave(args.outputPath, blockSize = args.blockSize, pageSize = args.pageSize,
compressCodec = args.compressionCodec, disableDictionaryEncoding = args.disableDictionary)
}
}

@@ -0,0 +1,166 @@
/*
* Copyright (c) 2014. Regents of the University of California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.berkeley.cs.amplab.adam.converters

import edu.berkeley.cs.amplab.adam.avro.{ADAMNucleotideContig, Base}
import edu.berkeley.cs.amplab.adam.rdd.AdamContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import scala.math.Ordering._

/**
* Object for converting an RDD containing FASTA sequence data into ADAM FASTA data.
*/
private[adam] object FastaConverter {

/**
* Converts an RDD containing ints and strings into an RDD containing ADAM nucleotide contig assemblies.
*
* @note Input dataset is assumed to have come in from a Hadoop TextInputFormat reader. This sets
* a specific format for the RDD's Key-Value pairs.
*
* @throws AssertionError Thrown if there appear to be multiple sequences in a single
* file that do not have descriptions, or if a sequence does not have sequence data.
*
* @param rdd RDD containing Int,String tuples, where the Int corresponds to the number
* of the file line, and the String is the line of the file.
* @return An RDD of ADAM FASTA data.
*/
def apply (rdd: RDD[(Int, String)]): RDD[ADAMNucleotideContig] = {
val filtered = rdd.map(kv => (kv._1, kv._2.trim()))
.filter((kv: (Int, String)) => !kv._2.startsWith(";"))

val indices: Map[Int, Int] = rdd.filter(kv => kv._2.startsWith(">"))
.map(kv => kv._1)
.collect()
.sortWith(_ < _)
.zipWithIndex
.reverse
.toMap

val groupedContigs: RDD[(Int, Seq[(Int, String)])] = if (indices.size == 0) {
filtered.keyBy(kv => -1)
.groupByKey()
} else {
filtered.keyBy((kv: (Int, String)) => indices.find(p => p._1 <= kv._1))
.filter((kv: (Option[(Int, Int)], (Int, String))) => kv._1.isDefined)
.map(kv => (kv._1.get._2, kv._2))
.groupByKey()
}

val converter = new FastaConverter

val convertedData = groupedContigs.map(kv => {
val (id, lines) = kv

val descriptionLine = lines.filter(kv => kv._2.startsWith(">"))

val (name, comment): (Option[String], Option[String]) = if (descriptionLine.size == 0) {
assert(id == -1, "Cannot have a headerless line in a file with more than one fragment.")
(None, None)
} else if (descriptionLine.forall(kv => kv._2.contains(' '))) {
val description: String = descriptionLine.head._2
val splitIndex = description.indexOf(' ')
val split = description.splitAt(splitIndex)

val contigName: String = split._1.stripPrefix(">").trim
val contigDescription: String = split._2.trim

(Some(contigName), Some(contigDescription))
} else {
(Some(descriptionLine.head._2.stripPrefix(">").trim), None)
}

val seqId = if (id == -1) {
0
} else {
id
}

val sequenceLines = lines.filter(kv => !kv._2.startsWith(">"))
assert(sequenceLines.length != 0, "Sequence " + seqId + " has no sequence data.")

val sequence = sequenceLines.sortBy(kv => kv._1)
.map(kv => kv._2.stripSuffix("*"))
.reduce(_ + _)

converter.convert(name, seqId, sequence, comment)
})

convertedData
}
}
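[Editor's note] The header-index grouping in `apply` can be illustrated on plain Scala collections. This is a simplified, hypothetical two-contig input; the real code does the same over a Spark RDD with `keyBy` and `groupByKey`:

```scala
// Simplified two-contig input: (line number, line text) pairs.
val lines = Seq(
  (0, ">chr1 test contig"),
  (1, "ACGT"),
  (2, ">chr2"),
  (3, "TTAA"))

// Line numbers of '>' headers, zipped with a running contig index.
val indices: Map[Int, Int] = lines.filter(_._2.startsWith(">"))
  .map(_._1)
  .sorted
  .zipWithIndex
  .toMap

// Assign every line to the contig index of the nearest preceding header.
val grouped: Map[Int, Seq[(Int, String)]] = lines.groupBy { case (ln, _) =>
  indices.filter(_._1 <= ln).maxBy(_._1)._2
}
// grouped(0) holds chr1's lines; grouped(1) holds chr2's lines.
```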

/**
* Conversion methods for single FASTA sequences into ADAM FASTA data.
*/
private[converters] class FastaConverter extends Serializable {

/**
* Converts a string of nucleotides into an array of Base.
*
* @see Base
* @throws IllegalArgumentException Thrown if sequence contains an illegal character.
*
* @param sequence Nucleotide string.
* @return An array of Base enums.
*/
def convertSequence (sequence: String): List[Base] = {
sequence.toList.map((c: Char) => {
try {
Base.valueOf(c.toString.toUpperCase)
} catch {
case iae: java.lang.IllegalArgumentException => {
throw new IllegalArgumentException(c + " in FASTA is an invalid base.")
}
}
})
}

/**
* Converts a single FASTA sequence into an ADAM FASTA contig.
*
* @throws IllegalArgumentException Thrown if sequence contains an illegal character.
*
* @param name String option for the sequence name.
* @param id Numerical identifier for the sequence.
* @param sequence Nucleotide sequence.
* @param description Optional description of the sequence.
* @return The converted ADAM FASTA contig.
*/
def convert (name: Option[String],
id: Int,
sequence: String,
description: Option[String]): ADAMNucleotideContig = {

// convert sequence to sequence array
val bases = convertSequence(sequence)

// make new builder and set up non-optional fields
val builder = ADAMNucleotideContig.newBuilder()
.setContigId(id)
.setSequence(bases)
.setSequenceLength(sequence.length)

// map over optional fields
name.foreach((n: String) => builder.setContigName(n))
description.foreach(builder.setDescription(_))

// build and return
builder.build()
}
}
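[Editor's note] The `valueOf`-based validation in `convertSequence` can be sketched with a stand-in enumeration. The real `Base` type is Avro-generated, and the real schema carries more symbols than shown here; this sketch only demonstrates the lookup-or-throw behavior:

```scala
// Stand-in for the Avro-generated Base enum (simplified symbol set).
object Base extends Enumeration {
  val A, C, G, T, U, N = Value
}

def convertSequence(sequence: String): List[Base.Value] =
  sequence.toList.map { c =>
    try {
      // Enumeration.withName plays the role of Avro's Base.valueOf here.
      Base.withName(c.toString.toUpperCase)
    } catch {
      case _: NoSuchElementException =>
        throw new IllegalArgumentException(c + " in FASTA is an invalid base.")
    }
  }

convertSequence("acgt")  // List(A, C, G, T)
```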
@@ -19,7 +19,7 @@ import scala.collection.JavaConversions._
import scala.math.Ordering.Implicits._

import scala.collection._
import edu.berkeley.cs.amplab.adam.avro.ADAMRecord
import edu.berkeley.cs.amplab.adam.avro.{ADAMRecord, ADAMNucleotideContig}
import net.sf.samtools.{SAMFileHeader, SAMFileReader, SAMSequenceRecord}
import org.apache.avro.specific.SpecificRecord

@@ -68,9 +68,27 @@ class SequenceDictionary(recordsIn: Iterable[SequenceRecord]) extends Serializab

def apply(id: Int): SequenceRecord = recordIndices(id)

def apply(name: CharSequence): SequenceRecord = recordNames(name)
/**
* Returns the sequence record associated with a specific contig name.
*
* @param name Name to search for.
* @return The SequenceRecord associated with this name.
*/
def apply(name: CharSequence): SequenceRecord = {
// must explicitly call toString - see note at recordNames creation RE: Avro & Utf8
recordNames(name.toString)
}

def containsRefName(name : CharSequence) : Boolean = recordNames.contains(name)
/**
* Returns true if this sequence dictionary contains a reference with a specific name.
*
* @param name Reference name to look for.
* @return True if reference is in this dictionary.
*/
def containsRefName(name : CharSequence) : Boolean = {
// must explicitly call toString - see note at recordNames creation RE: Avro & Utf8
recordNames.contains(name.toString)
}
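[Editor's note] The `toString` normalization matters because Avro hands back `Utf8` objects, which implement `CharSequence` but do not compare equal to `String`s. A minimal sketch of the pitfall, using `java.lang.StringBuilder` as a stand-in `CharSequence` so the example needs no Avro dependency:

```scala
// A StringBuilder and a String with the same characters are distinct
// CharSequence implementations with different equals/hashCode, so a map
// keyed by one cannot be probed with the other.
val key: CharSequence = new java.lang.StringBuilder("chr1")
val dict: Map[CharSequence, Int] = Map(key -> 0)
assert(!dict.contains("chr1"))  // probing with a String misses

// Normalizing both sides to String makes lookups reliable.
val byName: Map[String, Int] = dict.map { case (k, v) => (k.toString, v) }
assert(byName.contains("chr1"))
```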

/**
* Produces a Map of Int -> Int which maps the referenceIds from this SequenceDictionary
@@ -304,6 +322,16 @@ object SequenceRecord {
def apply(id: Int, name: CharSequence, length: Long, url: CharSequence = null): SequenceRecord =
new SequenceRecord(id, name, length, url)

/**
* Converts an ADAM contig into a sequence record.
*
* @param ctg Contig to convert.
* @return Contig expressed as a sequence record.
*/
def fromADAMContig (ctg: ADAMNucleotideContig): SequenceRecord = {
apply(ctg.getContigId, ctg.getContigName, ctg.getSequenceLength, ctg.getUrl)
}

/**
* Convert an ADAMRecord into one or more SequenceRecords.
* The reason that we can't simply use the "fromSpecificRecord" method, below, is that each ADAMRecord
@@ -0,0 +1,36 @@
/*
* Copyright (c) 2013. Regents of the University of California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package edu.berkeley.cs.amplab.adam.projections

import edu.berkeley.cs.amplab.adam.avro.ADAMNucleotideContig

/**
* This enumeration exists in order to reduce typo errors in the code. It needs to be kept
* in sync with any changes to ADAMNucleotideContig.
*
* This enumeration is necessary because Parquet needs the field string names
* for predicates and projections.
*/
object ADAMNucleotideContigField extends FieldEnumeration(ADAMNucleotideContig.SCHEMA$) {

val contigName,
contigId,
description,
sequence,
sequenceLength,
url = SchemaValue
}
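[Editor's note] The value of a string-backed field enumeration can be sketched with a simplified stand-in (the real FieldEnumeration derives and checks its names against the Avro schema):

```scala
// Stand-in: a plain Enumeration whose value names mirror Avro field names,
// so projections are built from checked identifiers rather than raw string
// literals that the compiler cannot spell-check.
object ContigField extends Enumeration {
  val contigName, contigId, sequence = Value
}

// A Parquet projection is ultimately a list of field-name strings.
val projection: Seq[String] =
  Seq(ContigField.contigName, ContigField.sequence).map(_.toString)
// projection == Seq("contigName", "sequence")
```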