From 8fef9a783405294995199a57c01d8ecda5d7f698 Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Tue, 15 Nov 2016 13:39:01 -0800 Subject: [PATCH] Clean up CLI operation categories and names, and add documentation for CLI. * Removed CalculateDepth command. * Clean up argument doc in TransformFeatures. * Move OUTPUT to an option for Flagstat. * Remove some output saving arguments in the View command that don't exactly make sense. * Remove unused Vcf2ADAM/ADAM2Vcf -dict option and `org.bdgenomics.adam.cli.DictionaryCommand` trait. * Remove WigFix2ADAM command. * Move away from ParquetLoadSaveArgs. * Move reads2coverage to actions. * Reads2Coverage does not only write to Parquet, so clean up docs. * Clean up fragments<->reads argument names. --- .../org/bdgenomics/adam/cli/ADAM2Fasta.scala | 6 +- .../org/bdgenomics/adam/cli/ADAM2Fastq.scala | 6 +- .../org/bdgenomics/adam/cli/ADAM2Vcf.scala | 8 +- .../org/bdgenomics/adam/cli/ADAMMain.scala | 14 +- .../org/bdgenomics/adam/cli/AlleleCount.scala | 81 --- .../bdgenomics/adam/cli/CalculateDepth.scala | 94 ---- .../adam/cli/CountContigKmers.scala | 2 +- .../bdgenomics/adam/cli/CountReadKmers.scala | 2 +- .../adam/cli/DictionaryCommand.scala | 49 -- .../org/bdgenomics/adam/cli/FlagStat.scala | 4 +- .../bdgenomics/adam/cli/Fragments2Reads.scala | 4 +- .../org/bdgenomics/adam/cli/ListDict.scala | 50 -- .../bdgenomics/adam/cli/Reads2Coverage.scala | 2 +- .../bdgenomics/adam/cli/Reads2Fragments.scala | 4 +- .../adam/cli/TransformFeatures.scala | 2 +- .../org/bdgenomics/adam/cli/Vcf2ADAM.scala | 4 +- .../adam/cli/VcfAnnotation2ADAM.scala | 2 +- .../scala/org/bdgenomics/adam/cli/View.scala | 7 +- .../org/bdgenomics/adam/cli/Wiggle2Bed.scala | 83 --- .../adam/cli/TransformFeaturesSuite.scala | 41 -- docs/source/30_running_example.md | 5 +- docs/source/40_deploying_ADAM.md | 10 +- docs/source/50_cli.md | 492 ++++++++++++++++++ docs/source/60_building_apps.md | 2 +- docs/source/70_algorithms.md | 2 +- 25 files changed, 532 insertions(+), 444 deletions(-) delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/DictionaryCommand.scala delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/Wiggle2Bed.scala create mode 100644 docs/source/50_cli.md diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala index 183afe5272..b6146861c8 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala @@ -27,7 +27,11 @@ import org.bdgenomics.utils.cli._ import org.bdgenomics.utils.misc.Logging import org.kohsuke.args4j.{ Argument, Option => Args4JOption } -class ADAM2FastaArgs extends ParquetLoadSaveArgs { +class ADAM2FastaArgs extends Args4jBase { + @Argument(required = true, metaVar = "ADAM", usage = "The Parquet file to convert", index = 0) + var inputPath: String = null + @Argument(required = true, metaVar = "FASTA", usage = "Location to write the FASTA to", index = 1) + var outputPath: String = null @Args4JOption(required = false, name = "-coalesce", usage = "Choose the number of partitions to coalesce down to.") var coalesce: Int = -1 @Args4JOption(required = false, name = "-force_shuffle_coalesce", usage = "Force 
shuffle while partitioning, default false.") diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fastq.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fastq.scala index 192cbeecfb..e3e392fff5 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fastq.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fastq.scala @@ -27,7 +27,11 @@ import org.bdgenomics.formats.avro.AlignmentRecord import org.bdgenomics.utils.cli._ import org.kohsuke.args4j.{ Argument, Option => Args4JOption } -class ADAM2FastqArgs extends ParquetLoadSaveArgs { +class ADAM2FastqArgs extends Args4jBase { + @Argument(required = true, metaVar = "INPUT", usage = "The read file to convert", index = 0) + var inputPath: String = null + @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the FASTQ to", index = 1) + var outputPath: String = null @Argument(required = false, metaVar = "SECOND_OUTPUT", usage = "When writing FASTQ data, all second-in-pair reads will go here, if this argument is provided", index = 2) var outputPath2: String = null diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala index 1226e2ab81..30802e8f91 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala @@ -40,8 +40,6 @@ object ADAM2Vcf extends BDGCommandCompanion { } class ADAM2VcfArgs extends Args4jBase with ParquetArgs { - @Args4jOption(required = false, name = "-dict", usage = "Reference dictionary") - var dictionaryFile: File = _ @Argument(required = true, metaVar = "ADAM", usage = "The ADAM variant files to convert", index = 0) var adamFile: String = _ @@ -64,17 +62,13 @@ class ADAM2VcfArgs extends Args4jBase with ParquetArgs { var single: Boolean = false } -class ADAM2Vcf(val args: ADAM2VcfArgs) extends BDGSparkCommand[ADAM2VcfArgs] with DictionaryCommand with Logging { +class ADAM2Vcf(val args: ADAM2VcfArgs) extends BDGSparkCommand[ADAM2VcfArgs] with Logging { val companion = ADAM2Vcf def run(sc: SparkContext) { require(!(args.sort && args.sortLexicographically), "Cannot set both -sort_on_save and -sort_lexicographically_on_save.") - var dictionary: Option[SequenceDictionary] = loadSequenceDictionary(args.dictionaryFile) - if (dictionary.isDefined) - log.info("Using contig translation") - val adamGTs = sc.loadParquetGenotypes(args.adamFile) val coalesce = if (args.coalesce > 0) { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala index 71b15b795e..b1d376a3c0 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala @@ -32,13 +32,13 @@ object ADAMMain { CommandGroup( "ADAM ACTIONS", List( - CalculateDepth, CountReadKmers, CountContigKmers, Transform, - ADAM2Fastq, + TransformFeatures, Flatten, - MergeShards + MergeShards, + Reads2Coverage ) ), CommandGroup( @@ -49,11 +49,9 @@ object ADAMMain { VcfAnnotation2ADAM, Fasta2ADAM, ADAM2Fasta, - TransformFeatures, - WigFix2Bed, + ADAM2Fastq, Fragments2Reads, - Reads2Fragments, - Reads2Coverage + Reads2Fragments ) ), CommandGroup( @@ -61,8 +59,6 @@ object ADAMMain { List( PrintADAM, FlagStat, - ListDict, - AlleleCount, View ) ) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala deleted file 
mode 100644 index bd26911614..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.bdgenomics.adam.cli - -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.variant.GenotypeRDD -import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele } -import org.bdgenomics.utils.cli._ -import org.bdgenomics.utils.misc.Logging -import org.kohsuke.args4j.Argument - -object AlleleCount extends BDGCommandCompanion { - val commandName = "allelecount" - val commandDescription = "Calculate Allele frequencies" - - def apply(cmdLine: Array[String]) = { - new AlleleCount(Args4j[AlleleCountArgs](cmdLine)) - } -} - -class AlleleCountArgs extends Args4jBase with ParquetArgs { - @Argument(required = true, metaVar = "ADAM", - usage = "The ADAM Variant file", index = 0) - var adamFile: String = _ - @Argument(required = true, metaVar = "Output", - usage = "Location to write allele frequency data", index = 1) - var outputPath: String = null -} - -object AlleleCountHelper extends Serializable { - def chooseAllele(x: (String, java.lang.Long, String, String, GenotypeAllele)) = - x match { - case (chr, position, refAllele, varAllele, GenotypeAllele.REF) => Some(chr, position, refAllele) - case (chr, position, refAllele, varAllele, GenotypeAllele.ALT) => Some(chr, position, varAllele) - case _ => None - } - - def countAlleles(adamVariants: GenotypeRDD, args: AlleleCountArgs) { - val usefulData = adamVariants.rdd.map(p => ( - p.getVariant.getContigName, - p.getVariant.getStart, - p.getVariant.getReferenceAllele, - p.getVariant.getAlternateAllele, - p.getAlleles.get(0), - p.getAlleles.get(1) - )) - val reduced_Variants = usefulData.flatMap(p => Seq((p._1, p._2, p._3, p._4, p._5), (p._1, p._2, p._3, p._4, p._6))) - val alleles = reduced_Variants.flatMap(chooseAllele) - alleles.groupBy(identity).map { case (a, b) => "%s\t%s\t%s\t%d".format(a._1, a._2, a._3, b.size) } - .saveAsTextFile(args.outputPath) - } -} - -class AlleleCount(val args: AlleleCountArgs) extends BDGSparkCommand[AlleleCountArgs] with Logging { - val companion = AlleleCount - - def run(sc: SparkContext) { - - val adamVariants = sc.loadGenotypes(args.adamFile) - AlleleCountHelper.countAlleles(adamVariants, args) - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala deleted file mode 100644 index cda3dcb62b..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Licensed to Big Data 
Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.cli - -import org.kohsuke.args4j.{ Argument, Option => Args4jOption } -import org.kohsuke.args4j.spi.BooleanOptionHandler -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion } -import org.bdgenomics.adam.projections.Projection -import org.bdgenomics.adam.projections.AlignmentRecordField._ -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.formats.avro.{ AlignmentRecord, Variant } -import org.bdgenomics.utils.cli._ -import scala.io._ - -/** - * CalculateDepth (accessible as the command 'depth' through the CLI) takes two arguments, - * an Read file and a VCF (or equivalent) file, and calculates the number of reads - * (the 'depth') from the Read file which overlap each of the variants given by the VCF. - * It then reports, on standard out, the location and name of each variant along with the - * calculated depth. 
- */ -object CalculateDepth extends BDGCommandCompanion { - val commandName: String = "depth" - val commandDescription: String = "Calculate the depth from a given ADAM file, " + - "at each variant in a VCF" - - def apply(cmdLine: Array[String]): BDGCommand = { - new CalculateDepth(Args4j[CalculateDepthArgs](cmdLine)) - } -} - -class CalculateDepthArgs extends Args4jBase with ParquetArgs { - @Argument(required = true, metaVar = "ADAM", usage = "The Read file to use to calculate depths", index = 0) - val adamInputPath: String = null - - @Argument(required = true, metaVar = "VCF", usage = "The VCF containing the sites at which to calculate depths", index = 1) - val vcfInputPath: String = null -} - -class CalculateDepth(protected val args: CalculateDepthArgs) extends BDGSparkCommand[CalculateDepthArgs] { - val companion: BDGCommandCompanion = CalculateDepth - - def run(sc: SparkContext): Unit = { - - val proj = Projection(contigName, start, cigar, readMapped) - - // load reads and variants - val readRdd = sc.loadAlignments(args.adamInputPath, projection = Some(proj)) - val variants = sc.loadVariants(args.vcfInputPath) - - // perform join - val joinedRdd = readRdd.broadcastRegionJoin(variants) - - // map variant to region and swap tuple field order - val finalRdd = joinedRdd.rdd.map(kv => (ReferenceRegion(kv._2), kv._1)) - - // count at sites - val depths: RDD[(ReferenceRegion, Int)] = - finalRdd.map { case (region, record) => (region, 1) }.reduceByKey(_ + _).sortByKey() - - /* - * tab-delimited output, containing the following columns: - * 0: the location of the variant - * 1: the depth of overlapping reads at the variant - */ - println("location\tname\tdepth") - depths.collect().foreach { - case (region, count) => - println("%20s\t% 5d".format( - "%s:%d".format(region.referenceName, region.start), - count - )) - } - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala index 16fab5a031..f05c567130 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala @@ -28,7 +28,7 @@ import org.bdgenomics.utils.misc.Logging import org.kohsuke.args4j.{ Argument, Option => Args4jOption } object CountContigKmers extends BDGCommandCompanion { - val commandName = "count_contig_kmers" + val commandName = "countContigKmers" val commandDescription = "Counts the k-mers/q-mers from a read dataset." def apply(cmdLine: Array[String]) = { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala index 4330c4b869..cdfdfd2ea6 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala @@ -29,7 +29,7 @@ import org.bdgenomics.utils.misc.Logging import org.kohsuke.args4j.{ Argument, Option => Args4jOption } object CountReadKmers extends BDGCommandCompanion { - val commandName = "count_kmers" + val commandName = "countKmers" val commandDescription = "Counts the k-mers/q-mers from a read dataset." 
def apply(cmdLine: Array[String]) = { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/DictionaryCommand.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/DictionaryCommand.scala deleted file mode 100644 index c692bbe107..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/DictionaryCommand.scala +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.cli - -import htsjdk.variant.utils.SAMSequenceDictionaryExtractor -import java.io.{ File, FileOutputStream } -import org.apache.commons.io.IOUtils -import org.bdgenomics.adam.models.SequenceDictionary - -trait DictionaryCommand { - private def getDictionaryFile(name: String): Option[File] = { - val stream = ClassLoader.getSystemClassLoader.getResourceAsStream("dictionaries/" + name) - if (stream == null) - return None - val file = File.createTempFile(name, ".dict") - file.deleteOnExit() - IOUtils.copy(stream, new FileOutputStream(file)) - Some(file) - } - - private def getDictionary(file: File) = Some(SequenceDictionary( - SAMSequenceDictionaryExtractor.extractDictionary(file))) - - def loadSequenceDictionary(file: File): Option[SequenceDictionary] = { - if (file != null) { - if (file.exists) - getDictionary(file) - else getDictionaryFile(file.getName) match { - case Some(file) => getDictionary(file) - case _ => None - } - } else None - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala index 387d999e5e..9d4e739152 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala @@ -36,10 +36,10 @@ object FlagStat extends BDGCommandCompanion { } } -class FlagStatArgs extends Args4jBase with ParquetArgs { +class FlagStatArgs extends Args4jBase { @Argument(required = true, metaVar = "INPUT", usage = "The ADAM data to return stats for", index = 0) val inputPath: String = null - @Argument(required = false, metaVar = "OUTPUT", usage = "Optionally write the stats to this file.", index = 1) + @Args4jOption(required = false, name = "-o", usage = "Optionally write the stats to this file.") val outputPath: String = null @Args4jOption(required = false, name = "-stringency", usage = "Set the parsing stringency: SILENT, LENIENT, STRICT.") val stringency: String = "SILENT" diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fragments2Reads.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fragments2Reads.scala index f8513776bd..c587430b2e 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fragments2Reads.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fragments2Reads.scala @@ -35,9 +35,9 @@ object Fragments2Reads extends 
BDGCommandCompanion { } class Fragments2ReadsArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs { - @Argument(required = true, metaVar = "INPUT", usage = "The Fragment file to apply the transforms to", index = 0) + @Argument(required = true, metaVar = "FRAGMENTS", usage = "The Fragment file to apply the transforms to", index = 0) var inputPath: String = null - @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the transformed data in ADAM/Parquet format", index = 1) + @Argument(required = true, metaVar = "READS", usage = "Location to write the transformed data as reads", index = 1) var outputPath: String = null @Args4jOption(required = false, name = "-single", usage = "Saves OUTPUT as single file") var asSingleFile: Boolean = false diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala deleted file mode 100644 index ad03455c4e..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.bdgenomics.adam.cli - -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.SparkContext -import org.bdgenomics.adam.models.SequenceRecord -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.formats.avro.AlignmentRecord -import org.bdgenomics.utils.cli._ -import org.kohsuke.args4j.Argument - -object ListDict extends BDGCommandCompanion { - val commandName: String = "listdict" - val commandDescription: String = "Print the contents of an ADAM sequence dictionary" - - def apply(cmdLine: Array[String]): BDGCommand = { - new ListDict(Args4j[ListDictArgs](cmdLine)) - } -} - -class ListDictArgs extends Args4jBase with ParquetArgs { - @Argument(required = true, metaVar = "INPUT", usage = "The ADAM sequence dictionary to print", index = 0) - val inputPath: String = null -} - -class ListDict(protected val args: ListDictArgs) extends BDGSparkCommand[ListDictArgs] { - val companion: BDGCommandCompanion = ListDict - - def run(sc: SparkContext): Unit = { - val gRdd = sc.loadAlignments(args.inputPath) - - println(gRdd.sequences) - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala index 0edb1ae931..031ffd6031 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala @@ -46,7 +46,7 @@ object Reads2Coverage extends BDGCommandCompanion { class Reads2CoverageArgs extends Args4jBase with ParquetArgs { @Argument(required = true, metaVar = "INPUT", usage = "The reads file to use to calculate depths", index = 0) var inputPath: String = null - @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the coverage data in ADAM/Parquet format", index = 1) + @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the coverage data to", index = 1) var outputPath: String = null @Args4jOption(required = false, name = "-collapse", usage = "Collapses neighboring coverage records " + "of equal counts into the same record") diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Fragments.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Fragments.scala index 039a5cfb2b..0aa01c0115 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Fragments.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Fragments.scala @@ -34,9 +34,9 @@ object Reads2Fragments extends BDGCommandCompanion { } class Reads2FragmentsArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs { - @Argument(required = true, metaVar = "INPUT", usage = "The ADAM, interleaved FASTQ, BAM, or SAM file to apply the transforms to", index = 0) + @Argument(required = true, metaVar = "READS", usage = "The ADAM, interleaved FASTQ, BAM, or SAM file to apply the transforms to", index = 0) var inputPath: String = null - @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the transformed data in ADAM/Parquet format", index = 1) + @Argument(required = true, metaVar = "FRAGMENTS", usage = "Location to write the transformed data in ADAM/Parquet format", index = 1) var outputPath: String = null // these are required because of the ADAMSaveAnyArgs trait... fix this trait??? 
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala index 7658673d9c..898772b3ca 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala @@ -41,7 +41,7 @@ class TransformFeaturesArgs extends Args4jBase with ParquetSaveArgs { var outputPath: String = null @Args4jOption(required = false, name = "-num_partitions", - usage = "Number of partitions to load an interval file using.") + usage = "Number of partitions to load a text file using.") var numPartitions: Int = _ @Args4jOption(required = false, name = "-single", diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala index 537ce9134a..62593b3b18 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala @@ -37,8 +37,6 @@ object Vcf2ADAM extends BDGCommandCompanion { } class Vcf2ADAMArgs extends Args4jBase with ParquetSaveArgs { - @Args4jOption(required = false, name = "-dict", usage = "Reference dictionary") - var dictionaryFile: File = _ @Argument(required = true, metaVar = "VCF", usage = "The VCF file to convert", index = 0) var vcfPath: String = _ @@ -56,7 +54,7 @@ class Vcf2ADAMArgs extends Args4jBase with ParquetSaveArgs { var onlyVariants: Boolean = false } -class Vcf2ADAM(val args: Vcf2ADAMArgs) extends BDGSparkCommand[Vcf2ADAMArgs] with DictionaryCommand with Logging { +class Vcf2ADAM(val args: Vcf2ADAMArgs) extends BDGSparkCommand[Vcf2ADAMArgs] with Logging { val companion = Vcf2ADAM def run(sc: SparkContext) { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/VcfAnnotation2ADAM.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/VcfAnnotation2ADAM.scala index 2b90edcc85..f51a410235 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/VcfAnnotation2ADAM.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/VcfAnnotation2ADAM.scala @@ -44,7 +44,7 @@ class VcfAnnotation2ADAMArgs extends Args4jBase with ParquetSaveArgs { var vcfFile: String = _ @Argument(required = true, metaVar = "ADAM", usage = "Location to write ADAM Variant annotations data", index = 1) var outputPath: String = null - @Args4jOption(required = false, name = "-current_db", usage = "Location of existing ADAM Variant annotations data") + @Args4jOption(required = false, name = "-annotations_to_join", usage = "Location of existing ADAM Variant annotations data") var currentAnnotations: String = null } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala index 5867560c77..676a658d94 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala @@ -30,7 +30,7 @@ class ViewArgs extends Args4jBase with ParquetArgs with ADAMSaveAnyArgs { @Argument(required = true, metaVar = "INPUT", usage = "The ADAM, BAM or SAM file to view", index = 0) var inputPath: String = null - @Argument(required = false, metaVar = "OUTPUT", usage = "Location to write output data", index = 1) + // left null until constructor var outputPath: String = null @Args4jOption( @@ -80,12 +80,9 @@ class ViewArgs extends Args4jBase with ParquetArgs with ADAMSaveAnyArgs { ) var outputPathArg: String = null - @Args4jOption(required = false, name = 
"-sort_fastq_output", usage = "Sets whether to sort the FASTQ output, if saving as FASTQ. False by default. Ignored if not saving as FASTQ.") + // required by ADAMAnySaveArgs var sortFastqOutput: Boolean = false - - @Args4jOption(required = false, name = "-single", usage = "Saves OUTPUT as single file") var asSingleFile: Boolean = false - @Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output") var deferMerging: Boolean = false } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Wiggle2Bed.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Wiggle2Bed.scala deleted file mode 100644 index 3c5255fde8..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Wiggle2Bed.scala +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.bdgenomics.adam.cli - -import java.io.PrintWriter -import org.bdgenomics.utils.cli._ -import org.kohsuke.args4j.Option -import scala.io.Source - -class Wig2BedArgs extends Args4jBase { - @Option(name = "-wig", usage = "The wig file to convert (leave out for stdin)") - var wigPath: String = "" - - @Option(name = "-bed", usage = "Location to write BED data (leave out for stdout)") - var bedPath: String = "" -} - -/** - * WigFix2Bed (accessible as the command "wigfix2bed" through the CLI) takes - * fixed wiggle file and converts it to a BED formatted file. The wiggle file - * is a text based format that implements run-length encoding, without any - * guarantees where the sync markers are. This makes it difficult to use as a - * "splittable" format, and necessitates processing the file locally. 
- */ -object WigFix2Bed extends BDGCommandCompanion { - val commandName = "wigfix2bed" - val commandDescription = "Locally convert a wigFix file to BED format" - - // matches a "sync" line that resets the position - val declPattern = "^fixedStep[\\s]+chrom=(.+)[\\s]+start=([0-9]+)[\\s]+step=([0-9]+)[\\s]*(?:$|span=([0-9]+).*$)".r - // a single datum in the run-length encoded file - val featPattern = "^\\s*([-]?[0-9]*\\.?[0-9]*)\\s*$".r - - def apply(cmdLine: Array[String]) = { - new WigFix2Bed(Args4j[Wig2BedArgs](cmdLine)) - } -} - -class WigFix2Bed(val args: Wig2BedArgs) extends BDGCommand { - val companion = WigFix2Bed - - def run() { - // state from the declaration lines - var contig: String = "" - var current: Long = 0 - var step: Long = 0 - var span: Long = 1 - - val in = if (args.wigPath == "") Source.stdin else Source.fromFile(args.wigPath) - val out = if (args.bedPath == "") new PrintWriter(System.out) else new PrintWriter(args.bedPath) - in.getLines().foreach { - case WigFix2Bed.declPattern(c, st, sp, sn) => { - contig = c - current = st.toLong - 1 // convert to BED coords - step = sp.toLong - span = if (sn == null) span else sn.toLong - } - case WigFix2Bed.featPattern(value) => { - out.println(Array(contig, current.toString, (current + span).toString, "", value).mkString("\t")) - current += step - } - case _ => None - } - in.close() - out.close() - } -} diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformFeaturesSuite.scala b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformFeaturesSuite.scala index 5f98dd720d..8b9cbf6324 100644 --- a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformFeaturesSuite.scala +++ b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformFeaturesSuite.scala @@ -52,45 +52,4 @@ class TransformFeaturesSuite extends ADAMFunSuite { assert(converted.size === 10) assert(converted.find(_.getContigName != "chr1").isEmpty) } - - sparkTest("can convert a simple wigfix file") { - val loader = Thread.currentThread().getContextClassLoader - val inputPath = loader.getResource("chr5.phyloP46way.trunc.wigFix").getPath - val bedFile = File.createTempFile("adam-cli.TransformFeaturesSuite", ".bed") - val bedPath = bedFile.getAbsolutePath - val outputFile = File.createTempFile("adam-cli.TransformFeaturesSuite", ".adam") - val outputPath = outputFile.getAbsolutePath - - // We have to do this, since the features2adam won't work if the file already exists, - // but the "createTempFile" method actually creates the file (on some systems?) 
- assert(bedFile.delete(), "Couldn't delete (empty) temp file") - assert(outputFile.delete(), "Couldn't delete (empty) temp file") - - // convert to BED - val bedArgLine = "-wig %s -bed %s".format(inputPath, bedPath).split("\\s+") - val bedArgs: Wig2BedArgs = Args4j.apply[Wig2BedArgs](bedArgLine) - val wigFix2Bed = new WigFix2Bed(bedArgs) - wigFix2Bed.run() - - // convert to ADAM Features - val adamArgLine = "%s %s".format(bedPath, outputPath).split("\\s+") - val adamArgs: TransformFeaturesArgs = Args4j.apply[TransformFeaturesArgs](adamArgLine) - val features2Adam = new TransformFeatures(adamArgs) - features2Adam.run(sc) - - val schema = Projection(featureId, contigName, start, end, score) - val rdd = sc.loadFeatures(outputPath, projection = Some(schema)) - val converted = rdd.rdd.collect.toSeq.sortBy(f => f.getStart) - - assert(converted.size === 10) - assert(converted(0).getContigName == "chr5") - assert(converted(0).getStart == 13939) - assert(converted(0).getEnd == 13940) - assert(converted(0).getScore == 0.067) - assert(converted(6).getContigName == "chr5") - assert(converted(6).getStart == 15295) - assert(converted(9).getStart == 15298) - assert(converted(9).getEnd == 15299) - assert(converted(9).getScore == 0.139) - } } diff --git a/docs/source/30_running_example.md b/docs/source/30_running_example.md index 3f590cf20f..365f5363be 100644 --- a/docs/source/30_running_example.md +++ b/docs/source/30_running_example.md @@ -1,7 +1,8 @@ ## flagstat -Once you have data converted to ADAM, you can gather statistics from the ADAM file using `flagstat`. -This command will output stats identically to the samtools `flagstat` command, e.g. +Once you have data converted to ADAM, you can gather statistics from the ADAM +file using [`flagstat`](#flagstat). This command will output stats identically +to the samtools `flagstat` command, e.g. ```bash $ ./bin/adam-submit flagstat NA12878_chr20.adam diff --git a/docs/source/40_deploying_ADAM.md b/docs/source/40_deploying_ADAM.md index c58855c66b..a5cb3c9d05 100644 --- a/docs/source/40_deploying_ADAM.md +++ b/docs/source/40_deploying_ADAM.md @@ -89,11 +89,11 @@ include: * [adam-kmers](https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/adam_kmers): this workflow was demonstrated in [@vivian16] and sets up a Spark cluster - which then runs ADAM's `count_kmers` CLI. + which then runs ADAM's [`countKmers` CLI](#countKmers). * [adam-pipeline](https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/adam_pipeline): - this workflow runs several stages in the ADAM `transform` CLI. This pipeline - is the ADAM equivalent to the GATK's "Best Practice" read preprocessing - pipeline. We then stitch together this pipeline with + this workflow runs several stages in the ADAM [`transform` CLI](#transform). + This pipeline is the ADAM equivalent to the GATK's "Best Practice" read + preprocessing pipeline. We then stitch together this pipeline with [BWA-MEM](https://github.com/lh3/bwa) and the GATK in the [adam-gatk-pipeline]( https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/adam_gatk_pipeline). 
@@ -291,7 +291,7 @@ does the following work: _log.info('Counting %d-mers in %s, and saving to %s.', kmer_length, hdfs_input_file, hdfs_output_file) call_adam(master_ip, - ['count_kmers', + ['countKmers', hdfs_input_file, hdfs_output_file, str(kmer_length)], memory=memory, override_parameters=spark_conf) diff --git a/docs/source/50_cli.md b/docs/source/50_cli.md new file mode 100644 index 0000000000..aee1e508ae --- /dev/null +++ b/docs/source/50_cli.md @@ -0,0 +1,492 @@ +# Running ADAM's command line tools + +In addition to being used as an API for [building applications](#apps), ADAM +provides a command line interface (CLI) for extracting, transforming, and +loading (ETL-ing) genomics data. Our CLI is roughly divided into three sections: + +* [Actions](#actions) that manipulate data using the ADAM schemas +* [Conversions](#conversions) that convert data from legacy formats into Parquet +* [Printers](#printers) that provide detailed or summarized views of genomic + data + +ADAM's various CLI actions can be run from the command line using the +`scripts/adam-submit` script. This script uses the `spark-submit` script to run +an ADAM application on a Spark cluster. To use this script, either +`spark-submit` must be on the `$PATH`, or the `$SPARK_HOME` environment variable +must be set. + +#### Default arguments {#default-args} + +There are several command line options that are present across most commands. +These include: + +* `-h`, `-help`, `--help`, `-?`: prints the usage for this command +* `-parquet_block_size N`: sets the block size for Parquet in bytes, if writing + a Parquet output file. Defaults to 128 MB (128 * 1024 * 1024). +* `-parquet_compression_codec`: The codec to use for compressing a Parquet page. + Choices are: + * `UNCOMPRESSED`: No compression. + * `SNAPPY`: Use the [Snappy](https://github.com/google/snappy) compression + codec. + * `GZIP`: Use a [Gzip](https://www.gnu.org/software/gzip/) based compression + codec. + * `LZO`: Use a + [LZO](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer) + based compression codec. To use LZO, the [LZO libraries must be + installed](http://hbase.apache.org/book.html#trouble.rs.startup.compression). +* `-parquet_disable_dictionary`: Disables dictionary encoding in Parquet, and + enables delta encoding. +* `-parquet_logging_level VAL`: The [Log4j](http://logging.apache.org/log4j/) + logging level to set for Parquet's loggers. Defaults to `severe`. +* `-parquet_page_size N`: The page size in bytes to use when writing Parquet + files. Defaults to 1MB (1024 * 1024). +* `-print_metrics`: If provided, prints the + [instrumentation](https://github.com/bigdatagenomics/utils#instrumentation) + metrics to the log when the CLI operation terminates. + +#### Legacy output options {#legacy-output} + +Several tools in ADAM support saving back to legacy genomics output formats. Any +tool saving to one of these formats supports the following options: + +* `-single`: Merge sharded output files. If this is not provided, the output + will be written as sharded files where each shard is a valid file. If this + _is_ provided, the shards will be written without headers as a + `${OUTPUTNAME}_tail` directory, and a single header will be written to + `${OUTPUTNAME}_head`. If `-single` is provided and `-defer_merging` is _not_ + provided, the header file and the shard directory will be merged into a single + file at `${OUTPUTPATH}`. 
+* `-defer_merging`: If both `-defer_merging` and `-single` are provided, the + output will be saved as if it were a single file, but the output files will not be + merged. + +#### Validation stringency {#validation} + +Various components in ADAM support passing a validation stringency level. This +is a three-level scale: + +* `STRICT`: If validation fails, throw an exception. +* `LENIENT`: If validation fails, ignore the data and write a warning to the + log. +* `SILENT`: If validation fails, ignore the data silently. + +## Action tools {#actions} + +Roughly speaking, "action" tools apply some form of non-trivial transformation +to data using the ADAM APIs. + +### countKmers and countContigKmers {#countKmers} + +Counts the $k$-length substrings in either a set of reads or reference +fragments. Takes three required arguments: + +1. `INPUT`: The input path. A set of reads for `countKmers` or a set of + reference contigs for `countContigKmers`. +2. `OUTPUT`: The path to save the output to. Saves the output as + [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) containing + the $k$-mer sequence and count. +3. `KMER_LENGTH`: The length $k$ of substrings to count. + +Beyond the [default options](#default-args), both `countKmers` and +`countContigKmers` take one option: + +* `-print_histogram`: If provided, prints a histogram of the $k$-mer count + distribution to standard out. + +### transform {#transform} + +The `transform` CLI is the entrypoint to ADAM's read preprocessing tools. This +command provides drop-in replacement commands for several commands in the +[Genome Analysis Toolkit](https://software.broadinstitute.org/gatk/) "Best +Practices" read preprocessing pipeline and more [@depristo11]. This CLI tool +takes two required arguments: + +1. `INPUT`: The input path. A file containing reads in any of the supported + ADAM read input formats. +2. `OUTPUT`: The path to save the transformed reads to. Supports any of ADAM's + read output formats. + +Beyond the [default options](#default-args) and the [legacy output +options](#legacy-output), `transform` supports a vast range of options. These +options fall into several general categories: + +* General options: + * `-cache`: If provided, the results of intermediate stages will be cached. + This is necessary to avoid recomputation if running multiple + transformations (e.g., Indel realignment, BQSR, etc.) back to back. + * `-storage_level`: Along with `-cache`, this can be used to set the Spark + [persistence level](http://spark.apache.org/docs/latest/programming-guide.html#which-storage-level-to-choose) + for cached data. If not provided, this defaults to `MEM_ONLY`. + * `-stringency`: Sets the validation stringency for various operations. + Defaults to `LENIENT`. See [validation stringency](#validation) for more + details. +* Loading options: + * `-repartition`: Forces a repartition on load. Useful to increase the + available parallelism on a small dataset. Forces a shuffle. Takes the + number of partitions to repartition to. + * `-force_load_bam`: Forces ADAM to try to load the input as SAM/BAM/CRAM. + * `-force_load_fastq`: Forces ADAM to try to load the input as FASTQ. + * `-paired_fastq`: Forces `-force_load_fastq`, and passes the path of a + second-of-pair FASTQ file to load. + * `-record_group`: If loading FASTQ, sets the record group name on each + read to this value. + * `-force_load_ifastq`: Forces ADAM to try to load the input as interleaved + FASTQ.
+ * `-force_load_parquet`: Forces ADAM to try to load the input as Parquet + encoded using the ADAM `AlignmentRecord` schema. + * `-limit_projection`: If loading as Parquet, sets a projection that does + not load the `attributes` or `origQual` fields of the `AlignmentRecord`. + * `-aligned_read_predicate`: If loading as Parquet, only loads aligned + reads. + * `-concat`: Provides a path to an optional second file to load, which is + then concatenated to the file given as the `INPUT` path. +* Duplicate marking options: Duplicate marking is run with the + `-mark_duplicate_reads` option. It takes no optional parameters. +* BQSR options: BQSR is run with the `-recalibrate_base_qualities` flag. + Additionally, the BQSR engine takes the following parameters: + * `-known_snps`: Path to a VCF file/Parquet variant file containing known + point variants. These point variants are used to mask read errors during + recalibration. Specifically, putative read errors that are at variant + sites are treated as correct observations. If BQSR is run, this option + should be passed, along with a path to a known variation database (e.g., + [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/)). {#known-snps} + * `-dump_observations`: If provided, a path to dump the recalibration table + in CSV format. +* Indel realignment options: Indel realignment is run with the `-realign_indels` + flag. Additionally, the Indel realignment engine takes the following options: + * `-known_indels`: Path to a VCF file/Parquet variant file containing known + Indel variants to realign against. If provided, forces the `KNOWNS_ONLY` + consensus model. If not provided, forces the `CONSENSUS_FROM_READS` model. + See [candidate generation and realignment](#consensus-model). {#known-indels} + * `-max_consensus_number`: The maximum number of consensus sequences to + realign a single target against. If more consensus sequences are seen at + a single target, we randomly downsample. Defaults to 30. + * `-max_indel_size`: The maximum length of an Indel to realign against. + Indels longer than this size are dropped before generating consensus + sequences. Defaults to 500bp. + * `-max_target_size`: The maximum length of a target to realign. Targets + longer than this size are dropped before trying to realign. Defaults to + 3,000bp. + * `-log_odds_threshold`: The log odds threshold to use for picking a + consensus sequence to finalize realignments against. A consensus will not + be realigned against unless the Phred weighted edit distance against the + given consensus/reference pair is a sufficient improvement over the + original reference realignments. This option sets that improvement weight. + Defaults to 5.0. +* `mismatchingPositions` tagging options: We can recompute the + `mismatchingPositions` field of an AlignmentRecord (SAM "MD" tag) with the + `-add_md_tags` flag. This flag takes a path to a reference file in either + FASTA or Parquet `NucleotideContigFragment` format. Additionally, this engine + takes the following options: + * `-md_tag_fragment_size`: If loading from FASTA, sets the size of each + fragment to load. Defaults to 10,000bp. + * `-md_tag_overwrite`: If provided, recomputes and overwrites the + `mismatchingPositions` field for records where this field was provided. +* Output options: `transform` supports the [legacy output](#legacy-output) + options. Additionally, there are the following options: + * `-coalesce`: Sets the number of partitions to coalesce the output to. 
+ If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore + the coalesce directive. + * `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being + saved with the number of partitions requested by `-coalesce`. This is + necessary if the `-coalesce` would increase the number of partitions, or + if it would reduce the number of partitions to fewer than the number of + Spark executors. This may have a substantial performance cost, and will + invalidate any sort order. + * `-sort_reads`: Sorts reads by alignment position. Unmapped reads are + placed at the end of all reads. Contigs are ordered by sequence record + index. + * `-sort_lexicographically`: Sorts reads by alignment position. Unmapped + reads are placed at the end of all reads. Contigs are ordered + lexicographically. + * `-sort_fastq_output`: Ignored if not saving to FASTQ. If saving to FASTQ, + sorts the output reads by read name. + +### transformFeatures + +Loads a feature file into the ADAM `Feature` schema, and saves it back. The +input and output formats are autodetected. Takes two required arguments: + +1. `INPUT`: The input path. A file containing features in any of the supported + ADAM feature input formats. +2. `OUTPUT`: The path to save the transformed features to. Supports any of ADAM's + feature output formats. + +Beyond the [default options](#default-args) and the [legacy output +options](#legacy-output), `transformFeatures` has one optional argument: + +* `-num_partitions`: If loading from a textual feature format (i.e., not + Parquet), sets the number of partitions to load. If not provided, this is + chosen by Spark. + +### flatten + +Loads a Parquet file and rewrites the file as a new Parquet file with a flat +schema. This is useful if loading the data into a database that supports Parquet +but that does not support nested schemas. Takes two required arguments: + +1. `INPUT`: The input path to a Parquet file. +2. `OUTPUT`: The path to save a Parquet file containing the input data, but + written using a flattened schema. + +### mergeShards + +A CLI tool for merging a [sharded legacy file](#legacy-output) that was written +with the `-single` and `-defer_merging` flags. Runs the file merging process. +Takes two required arguments: + +1. `INPUT`: The input directory of sharded files to merge. +2. `OUTPUT`: The path to save the merged file at. + +This command takes several optional arguments: + +* `-buffer_size`: The buffer size in bytes to use for copying data on the + driver. Defaults to 4MB (4 * 1024 * 1024). +* `-header_path`: The path to a header file that should be written to the start + of the merged output. +* `-write_cram_eof`: Writes an empty CRAM container at the end of the merged + output file. This should not be provided unless merging a sharded CRAM file. +* `-write_empty_GZIP_at_eof`: Writes an empty GZIP block at the end of the + merged output file. This should be provided if merging a sharded BAM file or + any other BGZIPed format. + +This command does not support Parquet output, so the only [default +option](#default-args) that this command supports is `-print_metrics`. + +### reads2coverage + +The `reads2coverage` command computes per-locus coverage from reads and saves +the coverage counts as features. Takes two required arguments: + +1. `INPUT`: The input path. A file containing reads in any of the supported + ADAM read input formats. +2. `OUTPUT`: The path to save the coverage counts to. Saves in any of the ADAM + supported feature file formats.
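+
+As a sketch, an invocation might look like the following; the file names here
+are purely illustrative, and the optional `-collapse` flag (which merges
+adjacent sites with equal coverage) is described below:
+
+```bash
+# Compute per-locus coverage from aligned reads (paths are illustrative).
+./bin/adam-submit reads2coverage alignments.adam coverage.adam -collapse
+```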
+ +In addition to the [default options](#default-args), `reads2coverage` takes the +following options: + +* `-collapse`: If two (or more) neighboring sites have the same coverage, we + collapse them down into a single genomic feature. +* `-only_negative_strands`: Only computes coverage for reads aligned on the + negative strand. Conflicts with `-only_positive_strands`. +* `-only_positive_strands`: Only computes coverage for reads aligned on the + positive strand. Conflicts with `-only_negative_strands`. + +## Conversion tools {#conversions} + +These tools convert data between legacy genomic file formats and Parquet files +that use ADAM's schemas. + +### vcf2adam, anno2adam, and adam2vcf + +These commands convert between VCF and Parquet using the Genotype and Variant +schemas. + +`vcf2adam` takes two required arguments: + +1. `VCF`: The VCF file to convert to Parquet. +2. `ADAM`: The path to save the converted Parquet data at. + +`vcf2adam` supports the full set of [default options](#default-args). +Additionally, `vcf2adam` takes the following options: + +* `-only_variants`: Instead of saving the VCF file as Genotypes, only save the + Variants from the VCF. This is useful if loading a sites-only VCF, e.g., for + [BQSR](#known-snps) or [Indel realignment](#known-indels). +* `-coalesce`: Sets the number of partitions to coalesce the output to. + If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore + the coalesce directive. +* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being + saved with the number of partitions requested by `-coalesce`. This is + necessary if the `-coalesce` would increase the number of partitions, or + if it would reduce the number of partitions to fewer than the number of + Spark executors. This may have a substantial performance cost, and will + invalidate any sort order. + +`anno2adam` converts VCFs with annotated variants (i.e., the VCF INFO fields) +into Parquet using the VariantAnnotation schema. `anno2adam` takes the same two +required arguments as `vcf2adam`. `anno2adam` takes the [default +options](#default-args), and one additional option: + +* `-annotations_to_join`: A path to an existing Parquet file of + VariantAnnotations. These two files are joined together, and the annotations + are merged. + +`adam2vcf` takes two required arguments: + +1. `ADAM`: The Parquet file of Genotypes to convert to VCF. +2. `VCF`: The path to save the VCF file to. + +`adam2vcf` only supports the `-print_metrics` option from the [default +options](#default-args). Additionally, `adam2vcf` takes the following options: + +* `-coalesce`: Sets the number of partitions to coalesce the output to. + The Spark engine may ignore the coalesce directive. +* `-sort_on_save`: Sorts the variants when saving, where contigs are ordered + by sequence index. Conflicts with `-sort_lexicographically_on_save`. +* `-sort_lexicographically_on_save`: Sorts the variants when saving, where + contigs are ordered lexicographically. Conflicts with `-sort_on_save`. +* `-single`: Saves the VCF file as headerless shards, and then merges the + sharded files into a single VCF. + +### fasta2adam and adam2fasta + +These commands convert between FASTA and Parquet files storing assemblies using +the NucleotideContigFragment schema. + +`fasta2adam` takes two required arguments: + +1. `FASTA`: The input FASTA file to convert. +2. `ADAM`: The path to save the Parquet-formatted NucleotideContigFragments to.
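+
+As a sketch, a conversion might be invoked as follows; the file names are
+purely illustrative, and the `-fragment_length` option is described below:
+
+```bash
+# Convert a reference FASTA into Parquet NucleotideContigFragments
+# (paths are illustrative).
+./bin/adam-submit fasta2adam GRCh38.fa GRCh38.contigs.adam -fragment_length 10000
+```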
+ +`fasta2adam` supports the full set of [default options](#default-args), as well +as the following options: + +* `-fragment_length`: The fragment length to shard a given contig into. Defaults + to 10,000bp. +* `-reads`: Path to a set of reads that includes sequence info. This read path + is used to obtain the sequence indices for ordering the contigs from the + FASTA file. +* `-repartition`: The number of partitions to save the data to. If provided, + forces a shuffle. +* `-verbose`: If given, enables additional logging where the sequence dictionary + is printed. + +`adam2fasta` takes two required arguments: + +1. `ADAM`: The path to a Parquet file containing NucleotideContigFragments. +2. `FASTA`: The path to save the FASTA file to. + +`adam2fasta` only supports the `-print_metrics` option from the [default +options](#default-args). Additionally, `adam2fasta` takes the following options: + +* `-line_width`: The line width in characters to use for breaking FASTA lines. + Defaults to 60 characters. +* `-coalesce`: Sets the number of partitions to coalesce the output to. + If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore + the coalesce directive. +* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being + saved with the number of partitions requested by `-coalesce`. This is + necessary if the `-coalesce` would increase the number of partitions, or + if it would reduce the number of partitions to fewer than the number of + Spark executors. This may have a substantial performance cost, and will + invalidate any sort order. + +### adam2fastq + +While the [`transform`](#transform) command can export to FASTQ, the +`adam2fastq` command provides a simpler CLI with more output options. `adam2fastq` +takes two required arguments and an optional third argument: + +1. `INPUT`: The input read file, in any ADAM-supported read format. +2. `OUTPUT`: The path to save an unpaired or interleaved FASTQ file to, or the + path to save the first-of-pair reads to, for paired FASTQ. +3. Optional `SECOND_OUTPUT`: If saving paired FASTQ, the path to save the + second-of-pair reads to. + +`adam2fastq` only supports the `-print_metrics` option from the [default +options](#default-args). Additionally, `adam2fastq` takes the following options: + +* `-no_projection`: By default, `adam2fastq` only projects the fields necessary + for saving to FASTQ. This option disables that projection and projects all + fields. +* `-output_oq`: Outputs the original read qualities, if available. +* `-persist_level`: Sets the Spark + [persistence level](http://spark.apache.org/docs/latest/programming-guide.html#which-storage-level-to-choose) + for cached data during the conversion back to FASTQ. If not provided, the + intermediate RDDs are not cached. +* `-repartition`: The number of partitions to save the data to. If provided, + forces a shuffle. +* `-validation`: Sets the validation stringency for checking whether reads are + paired when saving paired reads. Defaults to `LENIENT`. See [validation + stringency](#validation) for more details. + +### reads2fragments and fragments2reads + +These two commands translate read data between the single read alignment and +fragment representations. + +`reads2fragments` takes two required arguments: + +1. `READS`: The input read file, in any ADAM-supported read format. +2. `FRAGMENTS`: The path to save Parquet data with the Fragment schema. + +`reads2fragments` takes the [default options](#default-args). + +`fragments2reads` takes two required arguments: + +1. 
`FRAGMENTS`: The input fragment file, in any ADAM-supported fragment format. +2. `READS`: The path to save reads at, in any ADAM-supported read format. + +`fragments2reads` takes the [default options](#default-args). Additionally, +`fragments2reads` takes the following options: + +* `-sort_reads`: Sorts reads by alignment position. Unmapped reads are + placed at the end of all reads. Contigs are ordered by sequence record + index. +* `-sort_lexicographically`: Sorts reads by alignment position. Unmapped + reads are placed at the end of all reads. Contigs are ordered + lexicographically. + +## Printing tools {#printers} + +The printing tools provide some form of user-readable view of an ADAM file. +These commands are useful for both quality control and debugging. + +### print + +Dumps a Parquet file to either the console or a text file as +[JSON](http://www.json.org). Takes one required argument: + +1. `FILE(S)`: The file paths to load. These must be Parquet-formatted files. + +This command has several options: + +* `-pretty`: Pretty prints the JSON output. +* `-o`: Provides a path to save the output dump to, instead of writing the + output to the console. + +This command does not support Parquet output, so the only [default +option](#default-args) that this command supports is `-print_metrics`. + +### flagstat {#flagstat} + +Runs the ADAM equivalent to the +[SAMTools](http://www.htslib.org/doc/samtools.html) `flagstat` command. Takes +one required argument: + +1. `INPUT`: The input path. A file containing reads in any of the supported + ADAM read input formats. + +This command has several options: + +* `-stringency`: Sets the validation stringency for various operations. + Defaults to `SILENT`. See [validation stringency](#validation) for more + details. +* `-o`: Provides a path to save the output dump to, instead of writing the + output to the console. + +This command does not support Parquet output, so the only [default +option](#default-args) that this command supports is `-print_metrics`. + +### view + +Runs the ADAM equivalent to the +[SAMTools](http://www.htslib.org/doc/samtools.html) `view` command. Takes +one required argument: + +1. `INPUT`: The input path. A file containing reads in any of the supported + ADAM read input formats. + +In addition to the [default options](#default-args), this command supports the +following options: + +* `-o`: Provides a path to save the output dump to, instead of writing the + output to the console. Format is autodetected as any of the ADAM read outputs. +* `-F`/`-f`: Filters reads that either match all (`-f`) or none (`-F`) of the + flag bits. +* `-G`/`-g`: Filters reads that either mismatch all (`-g`) or none (`-G`) of the + flag bits. +* `-c`: Prints the number of reads that (mis)matched the filters, instead of the + reads themselves. Conflicts with `-o`. diff --git a/docs/source/60_building_apps.md b/docs/source/60_building_apps.md index b185e5bf97..e7af579ee8 100644 --- a/docs/source/60_building_apps.md +++ b/docs/source/60_building_apps.md @@ -1,3 +1,3 @@ -# Building Downstream Applications +# Building Downstream Applications {#apps} ADAM is packaged so that it can be used via the base CLI, to run plugins via the CLI, or as a library which can be used when building downstream applications. This document covers some of the important things to note when building applications downstream of ADAM.
diff --git a/docs/source/70_algorithms.md b/docs/source/70_algorithms.md index d81ba5d71d..4222b36dad 100644 --- a/docs/source/70_algorithms.md +++ b/docs/source/70_algorithms.md @@ -194,7 +194,7 @@ Algorithm \ref{alg:join-targets}. The set returned by this function is used as an index for mapping reads directly to realignment targets. -#### Candidate Generation and Realignment +#### Candidate Generation and Realignment {#consensus-model} Once we have generated the target set, we map across all the reads and check to see if the read overlaps a realignment target. We then group together all reads that map to a given realignment target; reads that