From 8fef9a783405294995199a57c01d8ecda5d7f698 Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Tue, 15 Nov 2016 13:39:01 -0800 Subject: [PATCH] Clean up CLI operation categories and names, and add documentation for CLI. * Removed CalculateDepth command. * Clean up argument doc in TransformFeatures. * Move OUTPUT to an option for Flagstat. * Remove some output saving arguments in the View command that don't exactly make sense. * Remove unused Vcf2ADAM/ADAM2Vcf -dict option and `org.bdgenomics.adam.cli.DictionaryCommand` trait. * Remove WigFix2ADAM command. * Move away from ParquetLoadSaveArgs. * Move reads2coverage to actions. * Reads2Coverage does not only write to Parquet, so clean up docs. * Clean up fragments<->reads argument names. --- .../org/bdgenomics/adam/cli/ADAM2Fasta.scala | 6 +- .../org/bdgenomics/adam/cli/ADAM2Fastq.scala | 6 +- .../org/bdgenomics/adam/cli/ADAM2Vcf.scala | 8 +- .../org/bdgenomics/adam/cli/ADAMMain.scala | 14 +- .../org/bdgenomics/adam/cli/AlleleCount.scala | 81 --- .../bdgenomics/adam/cli/CalculateDepth.scala | 94 ---- .../adam/cli/CountContigKmers.scala | 2 +- .../bdgenomics/adam/cli/CountReadKmers.scala | 2 +- .../adam/cli/DictionaryCommand.scala | 49 -- .../org/bdgenomics/adam/cli/FlagStat.scala | 4 +- .../bdgenomics/adam/cli/Fragments2Reads.scala | 4 +- .../org/bdgenomics/adam/cli/ListDict.scala | 50 -- .../bdgenomics/adam/cli/Reads2Coverage.scala | 2 +- .../bdgenomics/adam/cli/Reads2Fragments.scala | 4 +- .../adam/cli/TransformFeatures.scala | 2 +- .../org/bdgenomics/adam/cli/Vcf2ADAM.scala | 4 +- .../adam/cli/VcfAnnotation2ADAM.scala | 2 +- .../scala/org/bdgenomics/adam/cli/View.scala | 7 +- .../org/bdgenomics/adam/cli/Wiggle2Bed.scala | 83 --- .../adam/cli/TransformFeaturesSuite.scala | 41 -- docs/source/30_running_example.md | 5 +- docs/source/40_deploying_ADAM.md | 10 +- docs/source/50_cli.md | 492 ++++++++++++++++++ docs/source/60_building_apps.md | 2 +- docs/source/70_algorithms.md | 2 +- 25 files changed, 532 insertions(+), 444 deletions(-) delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/DictionaryCommand.scala delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala delete mode 100644 adam-cli/src/main/scala/org/bdgenomics/adam/cli/Wiggle2Bed.scala create mode 100644 docs/source/50_cli.md diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala index 183afe5272..b6146861c8 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala @@ -27,7 +27,11 @@ import org.bdgenomics.utils.cli._ import org.bdgenomics.utils.misc.Logging import org.kohsuke.args4j.{ Argument, Option => Args4JOption } -class ADAM2FastaArgs extends ParquetLoadSaveArgs { +class ADAM2FastaArgs extends Args4jBase { + @Argument(required = true, metaVar = "ADAM", usage = "The Parquet file to convert", index = 0) + var inputPath: String = null + @Argument(required = true, metaVar = "FASTA", usage = "Location to write the FASTA to", index = 1) + var outputPath: String = null @Args4JOption(required = false, name = "-coalesce", usage = "Choose the number of partitions to coalesce down to.") var coalesce: Int = -1 @Args4JOption(required = false, name = "-force_shuffle_coalesce", usage = "Force 
shuffle while partitioning, default false.") diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fastq.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fastq.scala index 192cbeecfb..e3e392fff5 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fastq.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fastq.scala @@ -27,7 +27,11 @@ import org.bdgenomics.formats.avro.AlignmentRecord import org.bdgenomics.utils.cli._ import org.kohsuke.args4j.{ Argument, Option => Args4JOption } -class ADAM2FastqArgs extends ParquetLoadSaveArgs { +class ADAM2FastqArgs extends Args4jBase { + @Argument(required = true, metaVar = "INPUT", usage = "The read file to convert", index = 0) + var inputPath: String = null + @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the FASTQ to", index = 1) + var outputPath: String = null @Argument(required = false, metaVar = "SECOND_OUTPUT", usage = "When writing FASTQ data, all second-in-pair reads will go here, if this argument is provided", index = 2) var outputPath2: String = null diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala index 1226e2ab81..30802e8f91 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala @@ -40,8 +40,6 @@ object ADAM2Vcf extends BDGCommandCompanion { } class ADAM2VcfArgs extends Args4jBase with ParquetArgs { - @Args4jOption(required = false, name = "-dict", usage = "Reference dictionary") - var dictionaryFile: File = _ @Argument(required = true, metaVar = "ADAM", usage = "The ADAM variant files to convert", index = 0) var adamFile: String = _ @@ -64,17 +62,13 @@ class ADAM2VcfArgs extends Args4jBase with ParquetArgs { var single: Boolean = false } -class ADAM2Vcf(val args: ADAM2VcfArgs) extends BDGSparkCommand[ADAM2VcfArgs] with DictionaryCommand with Logging { +class ADAM2Vcf(val args: ADAM2VcfArgs) extends BDGSparkCommand[ADAM2VcfArgs] with Logging { val companion = ADAM2Vcf def run(sc: SparkContext) { require(!(args.sort && args.sortLexicographically), "Cannot set both -sort_on_save and -sort_lexicographically_on_save.") - var dictionary: Option[SequenceDictionary] = loadSequenceDictionary(args.dictionaryFile) - if (dictionary.isDefined) - log.info("Using contig translation") - val adamGTs = sc.loadParquetGenotypes(args.adamFile) val coalesce = if (args.coalesce > 0) { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala index 71b15b795e..b1d376a3c0 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala @@ -32,13 +32,13 @@ object ADAMMain { CommandGroup( "ADAM ACTIONS", List( - CalculateDepth, CountReadKmers, CountContigKmers, Transform, - ADAM2Fastq, + TransformFeatures, Flatten, - MergeShards + MergeShards, + Reads2Coverage ) ), CommandGroup( @@ -49,11 +49,9 @@ object ADAMMain { VcfAnnotation2ADAM, Fasta2ADAM, ADAM2Fasta, - TransformFeatures, - WigFix2Bed, + ADAM2Fastq, Fragments2Reads, - Reads2Fragments, - Reads2Coverage + Reads2Fragments ) ), CommandGroup( @@ -61,8 +59,6 @@ object ADAMMain { List( PrintADAM, FlagStat, - ListDict, - AlleleCount, View ) ) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala deleted file 
mode 100644 index bd26911614..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/AlleleCount.scala +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.bdgenomics.adam.cli - -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.variant.GenotypeRDD -import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele } -import org.bdgenomics.utils.cli._ -import org.bdgenomics.utils.misc.Logging -import org.kohsuke.args4j.Argument - -object AlleleCount extends BDGCommandCompanion { - val commandName = "allelecount" - val commandDescription = "Calculate Allele frequencies" - - def apply(cmdLine: Array[String]) = { - new AlleleCount(Args4j[AlleleCountArgs](cmdLine)) - } -} - -class AlleleCountArgs extends Args4jBase with ParquetArgs { - @Argument(required = true, metaVar = "ADAM", - usage = "The ADAM Variant file", index = 0) - var adamFile: String = _ - @Argument(required = true, metaVar = "Output", - usage = "Location to write allele frequency data", index = 1) - var outputPath: String = null -} - -object AlleleCountHelper extends Serializable { - def chooseAllele(x: (String, java.lang.Long, String, String, GenotypeAllele)) = - x match { - case (chr, position, refAllele, varAllele, GenotypeAllele.REF) => Some(chr, position, refAllele) - case (chr, position, refAllele, varAllele, GenotypeAllele.ALT) => Some(chr, position, varAllele) - case _ => None - } - - def countAlleles(adamVariants: GenotypeRDD, args: AlleleCountArgs) { - val usefulData = adamVariants.rdd.map(p => ( - p.getVariant.getContigName, - p.getVariant.getStart, - p.getVariant.getReferenceAllele, - p.getVariant.getAlternateAllele, - p.getAlleles.get(0), - p.getAlleles.get(1) - )) - val reduced_Variants = usefulData.flatMap(p => Seq((p._1, p._2, p._3, p._4, p._5), (p._1, p._2, p._3, p._4, p._6))) - val alleles = reduced_Variants.flatMap(chooseAllele) - alleles.groupBy(identity).map { case (a, b) => "%s\t%s\t%s\t%d".format(a._1, a._2, a._3, b.size) } - .saveAsTextFile(args.outputPath) - } -} - -class AlleleCount(val args: AlleleCountArgs) extends BDGSparkCommand[AlleleCountArgs] with Logging { - val companion = AlleleCount - - def run(sc: SparkContext) { - - val adamVariants = sc.loadGenotypes(args.adamFile) - AlleleCountHelper.countAlleles(adamVariants, args) - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala deleted file mode 100644 index cda3dcb62b..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Licensed to Big Data 
Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.cli - -import org.kohsuke.args4j.{ Argument, Option => Args4jOption } -import org.kohsuke.args4j.spi.BooleanOptionHandler -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion } -import org.bdgenomics.adam.projections.Projection -import org.bdgenomics.adam.projections.AlignmentRecordField._ -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.formats.avro.{ AlignmentRecord, Variant } -import org.bdgenomics.utils.cli._ -import scala.io._ - -/** - * CalculateDepth (accessible as the command 'depth' through the CLI) takes two arguments, - * an Read file and a VCF (or equivalent) file, and calculates the number of reads - * (the 'depth') from the Read file which overlap each of the variants given by the VCF. - * It then reports, on standard out, the location and name of each variant along with the - * calculated depth. 
- */ -object CalculateDepth extends BDGCommandCompanion { - val commandName: String = "depth" - val commandDescription: String = "Calculate the depth from a given ADAM file, " + - "at each variant in a VCF" - - def apply(cmdLine: Array[String]): BDGCommand = { - new CalculateDepth(Args4j[CalculateDepthArgs](cmdLine)) - } -} - -class CalculateDepthArgs extends Args4jBase with ParquetArgs { - @Argument(required = true, metaVar = "ADAM", usage = "The Read file to use to calculate depths", index = 0) - val adamInputPath: String = null - - @Argument(required = true, metaVar = "VCF", usage = "The VCF containing the sites at which to calculate depths", index = 1) - val vcfInputPath: String = null -} - -class CalculateDepth(protected val args: CalculateDepthArgs) extends BDGSparkCommand[CalculateDepthArgs] { - val companion: BDGCommandCompanion = CalculateDepth - - def run(sc: SparkContext): Unit = { - - val proj = Projection(contigName, start, cigar, readMapped) - - // load reads and variants - val readRdd = sc.loadAlignments(args.adamInputPath, projection = Some(proj)) - val variants = sc.loadVariants(args.vcfInputPath) - - // perform join - val joinedRdd = readRdd.broadcastRegionJoin(variants) - - // map variant to region and swap tuple field order - val finalRdd = joinedRdd.rdd.map(kv => (ReferenceRegion(kv._2), kv._1)) - - // count at sites - val depths: RDD[(ReferenceRegion, Int)] = - finalRdd.map { case (region, record) => (region, 1) }.reduceByKey(_ + _).sortByKey() - - /* - * tab-delimited output, containing the following columns: - * 0: the location of the variant - * 1: the depth of overlapping reads at the variant - */ - println("location\tname\tdepth") - depths.collect().foreach { - case (region, count) => - println("%20s\t% 5d".format( - "%s:%d".format(region.referenceName, region.start), - count - )) - } - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala index 16fab5a031..f05c567130 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala @@ -28,7 +28,7 @@ import org.bdgenomics.utils.misc.Logging import org.kohsuke.args4j.{ Argument, Option => Args4jOption } object CountContigKmers extends BDGCommandCompanion { - val commandName = "count_contig_kmers" + val commandName = "countContigKmers" val commandDescription = "Counts the k-mers/q-mers from a read dataset." def apply(cmdLine: Array[String]) = { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala index 4330c4b869..cdfdfd2ea6 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala @@ -29,7 +29,7 @@ import org.bdgenomics.utils.misc.Logging import org.kohsuke.args4j.{ Argument, Option => Args4jOption } object CountReadKmers extends BDGCommandCompanion { - val commandName = "count_kmers" + val commandName = "countKmers" val commandDescription = "Counts the k-mers/q-mers from a read dataset." 
def apply(cmdLine: Array[String]) = { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/DictionaryCommand.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/DictionaryCommand.scala deleted file mode 100644 index c692bbe107..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/DictionaryCommand.scala +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.cli - -import htsjdk.variant.utils.SAMSequenceDictionaryExtractor -import java.io.{ File, FileOutputStream } -import org.apache.commons.io.IOUtils -import org.bdgenomics.adam.models.SequenceDictionary - -trait DictionaryCommand { - private def getDictionaryFile(name: String): Option[File] = { - val stream = ClassLoader.getSystemClassLoader.getResourceAsStream("dictionaries/" + name) - if (stream == null) - return None - val file = File.createTempFile(name, ".dict") - file.deleteOnExit() - IOUtils.copy(stream, new FileOutputStream(file)) - Some(file) - } - - private def getDictionary(file: File) = Some(SequenceDictionary( - SAMSequenceDictionaryExtractor.extractDictionary(file))) - - def loadSequenceDictionary(file: File): Option[SequenceDictionary] = { - if (file != null) { - if (file.exists) - getDictionary(file) - else getDictionaryFile(file.getName) match { - case Some(file) => getDictionary(file) - case _ => None - } - } else None - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala index 387d999e5e..9d4e739152 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala @@ -36,10 +36,10 @@ object FlagStat extends BDGCommandCompanion { } } -class FlagStatArgs extends Args4jBase with ParquetArgs { +class FlagStatArgs extends Args4jBase { @Argument(required = true, metaVar = "INPUT", usage = "The ADAM data to return stats for", index = 0) val inputPath: String = null - @Argument(required = false, metaVar = "OUTPUT", usage = "Optionally write the stats to this file.", index = 1) + @Args4jOption(required = false, name = "-o", usage = "Optionally write the stats to this file.") val outputPath: String = null @Args4jOption(required = false, name = "-stringency", usage = "Set the parsing stringency: SILENT, LENIENT, STRICT.") val stringency: String = "SILENT" diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fragments2Reads.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fragments2Reads.scala index f8513776bd..c587430b2e 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fragments2Reads.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fragments2Reads.scala @@ -35,9 +35,9 @@ object Fragments2Reads extends 
BDGCommandCompanion { } class Fragments2ReadsArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs { - @Argument(required = true, metaVar = "INPUT", usage = "The Fragment file to apply the transforms to", index = 0) + @Argument(required = true, metaVar = "FRAGMENTS", usage = "The Fragment file to apply the transforms to", index = 0) var inputPath: String = null - @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the transformed data in ADAM/Parquet format", index = 1) + @Argument(required = true, metaVar = "READS", usage = "Location to write the transformed data as reads", index = 1) var outputPath: String = null @Args4jOption(required = false, name = "-single", usage = "Saves OUTPUT as single file") var asSingleFile: Boolean = false diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala deleted file mode 100644 index ad03455c4e..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ListDict.scala +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.bdgenomics.adam.cli - -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.SparkContext -import org.bdgenomics.adam.models.SequenceRecord -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.formats.avro.AlignmentRecord -import org.bdgenomics.utils.cli._ -import org.kohsuke.args4j.Argument - -object ListDict extends BDGCommandCompanion { - val commandName: String = "listdict" - val commandDescription: String = "Print the contents of an ADAM sequence dictionary" - - def apply(cmdLine: Array[String]): BDGCommand = { - new ListDict(Args4j[ListDictArgs](cmdLine)) - } -} - -class ListDictArgs extends Args4jBase with ParquetArgs { - @Argument(required = true, metaVar = "INPUT", usage = "The ADAM sequence dictionary to print", index = 0) - val inputPath: String = null -} - -class ListDict(protected val args: ListDictArgs) extends BDGSparkCommand[ListDictArgs] { - val companion: BDGCommandCompanion = ListDict - - def run(sc: SparkContext): Unit = { - val gRdd = sc.loadAlignments(args.inputPath) - - println(gRdd.sequences) - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala index 0edb1ae931..031ffd6031 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala @@ -46,7 +46,7 @@ object Reads2Coverage extends BDGCommandCompanion { class Reads2CoverageArgs extends Args4jBase with ParquetArgs { @Argument(required = true, metaVar = "INPUT", usage = "The reads file to use to calculate depths", index = 0) var inputPath: String = null - @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the coverage data in ADAM/Parquet format", index = 1) + @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the coverage data to", index = 1) var outputPath: String = null @Args4jOption(required = false, name = "-collapse", usage = "Collapses neighboring coverage records " + "of equal counts into the same record") diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Fragments.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Fragments.scala index 039a5cfb2b..0aa01c0115 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Fragments.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Fragments.scala @@ -34,9 +34,9 @@ object Reads2Fragments extends BDGCommandCompanion { } class Reads2FragmentsArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs { - @Argument(required = true, metaVar = "INPUT", usage = "The ADAM, interleaved FASTQ, BAM, or SAM file to apply the transforms to", index = 0) + @Argument(required = true, metaVar = "READS", usage = "The ADAM, interleaved FASTQ, BAM, or SAM file to apply the transforms to", index = 0) var inputPath: String = null - @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the transformed data in ADAM/Parquet format", index = 1) + @Argument(required = true, metaVar = "FRAGMENTS", usage = "Location to write the transformed data in ADAM/Parquet format", index = 1) var outputPath: String = null // these are required because of the ADAMSaveAnyArgs trait... fix this trait??? 
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala index 7658673d9c..898772b3ca 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala @@ -41,7 +41,7 @@ class TransformFeaturesArgs extends Args4jBase with ParquetSaveArgs { var outputPath: String = null @Args4jOption(required = false, name = "-num_partitions", - usage = "Number of partitions to load an interval file using.") + usage = "Number of partitions to load a text file using.") var numPartitions: Int = _ @Args4jOption(required = false, name = "-single", diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala index 537ce9134a..62593b3b18 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala @@ -37,8 +37,6 @@ object Vcf2ADAM extends BDGCommandCompanion { } class Vcf2ADAMArgs extends Args4jBase with ParquetSaveArgs { - @Args4jOption(required = false, name = "-dict", usage = "Reference dictionary") - var dictionaryFile: File = _ @Argument(required = true, metaVar = "VCF", usage = "The VCF file to convert", index = 0) var vcfPath: String = _ @@ -56,7 +54,7 @@ class Vcf2ADAMArgs extends Args4jBase with ParquetSaveArgs { var onlyVariants: Boolean = false } -class Vcf2ADAM(val args: Vcf2ADAMArgs) extends BDGSparkCommand[Vcf2ADAMArgs] with DictionaryCommand with Logging { +class Vcf2ADAM(val args: Vcf2ADAMArgs) extends BDGSparkCommand[Vcf2ADAMArgs] with Logging { val companion = Vcf2ADAM def run(sc: SparkContext) { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/VcfAnnotation2ADAM.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/VcfAnnotation2ADAM.scala index 2b90edcc85..f51a410235 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/VcfAnnotation2ADAM.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/VcfAnnotation2ADAM.scala @@ -44,7 +44,7 @@ class VcfAnnotation2ADAMArgs extends Args4jBase with ParquetSaveArgs { var vcfFile: String = _ @Argument(required = true, metaVar = "ADAM", usage = "Location to write ADAM Variant annotations data", index = 1) var outputPath: String = null - @Args4jOption(required = false, name = "-current_db", usage = "Location of existing ADAM Variant annotations data") + @Args4jOption(required = false, name = "-annotations_to_join", usage = "Location of existing ADAM Variant annotations data") var currentAnnotations: String = null } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala index 5867560c77..676a658d94 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/View.scala @@ -30,7 +30,7 @@ class ViewArgs extends Args4jBase with ParquetArgs with ADAMSaveAnyArgs { @Argument(required = true, metaVar = "INPUT", usage = "The ADAM, BAM or SAM file to view", index = 0) var inputPath: String = null - @Argument(required = false, metaVar = "OUTPUT", usage = "Location to write output data", index = 1) + // left null until constructor var outputPath: String = null @Args4jOption( @@ -80,12 +80,9 @@ class ViewArgs extends Args4jBase with ParquetArgs with ADAMSaveAnyArgs { ) var outputPathArg: String = null - @Args4jOption(required = false, name = 
"-sort_fastq_output", usage = "Sets whether to sort the FASTQ output, if saving as FASTQ. False by default. Ignored if not saving as FASTQ.") + // required by ADAMAnySaveArgs var sortFastqOutput: Boolean = false - - @Args4jOption(required = false, name = "-single", usage = "Saves OUTPUT as single file") var asSingleFile: Boolean = false - @Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output") var deferMerging: Boolean = false } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Wiggle2Bed.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Wiggle2Bed.scala deleted file mode 100644 index 3c5255fde8..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Wiggle2Bed.scala +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.bdgenomics.adam.cli - -import java.io.PrintWriter -import org.bdgenomics.utils.cli._ -import org.kohsuke.args4j.Option -import scala.io.Source - -class Wig2BedArgs extends Args4jBase { - @Option(name = "-wig", usage = "The wig file to convert (leave out for stdin)") - var wigPath: String = "" - - @Option(name = "-bed", usage = "Location to write BED data (leave out for stdout)") - var bedPath: String = "" -} - -/** - * WigFix2Bed (accessible as the command "wigfix2bed" through the CLI) takes - * fixed wiggle file and converts it to a BED formatted file. The wiggle file - * is a text based format that implements run-length encoding, without any - * guarantees where the sync markers are. This makes it difficult to use as a - * "splittable" format, and necessitates processing the file locally. 
- */ -object WigFix2Bed extends BDGCommandCompanion { - val commandName = "wigfix2bed" - val commandDescription = "Locally convert a wigFix file to BED format" - - // matches a "sync" line that resets the position - val declPattern = "^fixedStep[\\s]+chrom=(.+)[\\s]+start=([0-9]+)[\\s]+step=([0-9]+)[\\s]*(?:$|span=([0-9]+).*$)".r - // a single datum in the run-length encoded file - val featPattern = "^\\s*([-]?[0-9]*\\.?[0-9]*)\\s*$".r - - def apply(cmdLine: Array[String]) = { - new WigFix2Bed(Args4j[Wig2BedArgs](cmdLine)) - } -} - -class WigFix2Bed(val args: Wig2BedArgs) extends BDGCommand { - val companion = WigFix2Bed - - def run() { - // state from the declaration lines - var contig: String = "" - var current: Long = 0 - var step: Long = 0 - var span: Long = 1 - - val in = if (args.wigPath == "") Source.stdin else Source.fromFile(args.wigPath) - val out = if (args.bedPath == "") new PrintWriter(System.out) else new PrintWriter(args.bedPath) - in.getLines().foreach { - case WigFix2Bed.declPattern(c, st, sp, sn) => { - contig = c - current = st.toLong - 1 // convert to BED coords - step = sp.toLong - span = if (sn == null) span else sn.toLong - } - case WigFix2Bed.featPattern(value) => { - out.println(Array(contig, current.toString, (current + span).toString, "", value).mkString("\t")) - current += step - } - case _ => None - } - in.close() - out.close() - } -} diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformFeaturesSuite.scala b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformFeaturesSuite.scala index 5f98dd720d..8b9cbf6324 100644 --- a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformFeaturesSuite.scala +++ b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformFeaturesSuite.scala @@ -52,45 +52,4 @@ class TransformFeaturesSuite extends ADAMFunSuite { assert(converted.size === 10) assert(converted.find(_.getContigName != "chr1").isEmpty) } - - sparkTest("can convert a simple wigfix file") { - val loader = Thread.currentThread().getContextClassLoader - val inputPath = loader.getResource("chr5.phyloP46way.trunc.wigFix").getPath - val bedFile = File.createTempFile("adam-cli.TransformFeaturesSuite", ".bed") - val bedPath = bedFile.getAbsolutePath - val outputFile = File.createTempFile("adam-cli.TransformFeaturesSuite", ".adam") - val outputPath = outputFile.getAbsolutePath - - // We have to do this, since the features2adam won't work if the file already exists, - // but the "createTempFile" method actually creates the file (on some systems?) 
- assert(bedFile.delete(), "Couldn't delete (empty) temp file") - assert(outputFile.delete(), "Couldn't delete (empty) temp file") - - // convert to BED - val bedArgLine = "-wig %s -bed %s".format(inputPath, bedPath).split("\\s+") - val bedArgs: Wig2BedArgs = Args4j.apply[Wig2BedArgs](bedArgLine) - val wigFix2Bed = new WigFix2Bed(bedArgs) - wigFix2Bed.run() - - // convert to ADAM Features - val adamArgLine = "%s %s".format(bedPath, outputPath).split("\\s+") - val adamArgs: TransformFeaturesArgs = Args4j.apply[TransformFeaturesArgs](adamArgLine) - val features2Adam = new TransformFeatures(adamArgs) - features2Adam.run(sc) - - val schema = Projection(featureId, contigName, start, end, score) - val rdd = sc.loadFeatures(outputPath, projection = Some(schema)) - val converted = rdd.rdd.collect.toSeq.sortBy(f => f.getStart) - - assert(converted.size === 10) - assert(converted(0).getContigName == "chr5") - assert(converted(0).getStart == 13939) - assert(converted(0).getEnd == 13940) - assert(converted(0).getScore == 0.067) - assert(converted(6).getContigName == "chr5") - assert(converted(6).getStart == 15295) - assert(converted(9).getStart == 15298) - assert(converted(9).getEnd == 15299) - assert(converted(9).getScore == 0.139) - } } diff --git a/docs/source/30_running_example.md b/docs/source/30_running_example.md index 3f590cf20f..365f5363be 100644 --- a/docs/source/30_running_example.md +++ b/docs/source/30_running_example.md @@ -1,7 +1,8 @@ ## flagstat -Once you have data converted to ADAM, you can gather statistics from the ADAM file using `flagstat`. -This command will output stats identically to the samtools `flagstat` command, e.g. +Once you have data converted to ADAM, you can gather statistics from the ADAM +file using [`flagstat`](#flagstat). This command will output stats identically +to the samtools `flagstat` command, e.g. ```bash $ ./bin/adam-submit flagstat NA12878_chr20.adam diff --git a/docs/source/40_deploying_ADAM.md b/docs/source/40_deploying_ADAM.md index c58855c66b..a5cb3c9d05 100644 --- a/docs/source/40_deploying_ADAM.md +++ b/docs/source/40_deploying_ADAM.md @@ -89,11 +89,11 @@ include: * [adam-kmers](https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/adam_kmers): this workflow was demonstrated in [@vivian16] and sets up a Spark cluster - which then runs ADAM's `count_kmers` CLI. + which then runs ADAM's [`countKmers` CLI](#countKmers). * [adam-pipeline](https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/adam_pipeline): - this workflow runs several stages in the ADAM `transform` CLI. This pipeline - is the ADAM equivalent to the GATK's "Best Practice" read preprocessing - pipeline. We then stitch together this pipeline with + this workflow runs several stages in the ADAM [`transform` CLI](#transform). + This pipeline is the ADAM equivalent to the GATK's "Best Practice" read + preprocessing pipeline. We then stitch together this pipeline with [BWA-MEM](https://github.com/lh3/bwa) and the GATK in the [adam-gatk-pipeline]( https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/adam_gatk_pipeline). 
@@ -291,7 +291,7 @@ does the following work: _log.info('Counting %d-mers in %s, and saving to %s.', kmer_length, hdfs_input_file, hdfs_output_file) call_adam(master_ip, - ['count_kmers', + ['countKmers', hdfs_input_file, hdfs_output_file, str(kmer_length)], memory=memory, override_parameters=spark_conf) diff --git a/docs/source/50_cli.md b/docs/source/50_cli.md new file mode 100644 index 0000000000..aee1e508ae --- /dev/null +++ b/docs/source/50_cli.md @@ -0,0 +1,492 @@ +# Running ADAM's command line tools + +In addition to being used as an API for [building applications](#apps), ADAM +provides a command line interface (CLI) for extracting, transforming, and +loading (ETL-ing) genomics data. Our CLI is roughly divided into three sections: + +* [Actions](#actions) that manipulate data using the ADAM schemas +* [Conversions](#conversions) that convert data from legacy formats into Parquet +* [Printers](#printers) that provide detailed or summarized views of genomic + data + +ADAM's various CLI actions can be run from the command line using the +`scripts/adam-submit` script. This script uses the `spark-submit` script to run +an ADAM application on a Spark cluster. To use this script, either +`spark-submit` must be on the `$PATH`, or the `$SPARK_HOME` environment variable +must be set. + +#### Default arguments {#default-args} + +There are several command line options that are present across most commands. +These include: + +* `-h`, `-help`, `--help`, `-?`: prints the usage for this command +* `-parquet_block_size N`: sets the block size for Parquet in bytes, if writing + a Parquet output file. Defaults to 128 MB (128 * 1024 * 1024). +* `-parquet_compression_codec`: The codec to use for compressing a Parquet page. + Choices are: + * `UNCOMPRESSED`: No compression. + * `SNAPPY`: Use the [Snappy](https://github.com/google/snappy) compression + codec. + * `GZIP`: Use a [Gzip](https://www.gnu.org/software/gzip/) based compression + codec. + * `LZO`: Use a + [LZO](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer) + based compression codec. To use LZO, the [LZO libraries must be + installed](http://hbase.apache.org/book.html#trouble.rs.startup.compression). +* `-parquet_disable_dictionary`: Disables dictionary encoding in Parquet, and + enables delta encoding. +* `-parquet_logging_level VAL`: The [Log4j](http://logging.apache.org/log4j/) + logging level to set for Parquet's loggers. Defaults to `severe`. +* `-parquet_page_size N`: The page size in bytes to use when writing Parquet + files. Defaults to 1MB (1024 * 1024). +* `-print_metrics`: If provided, prints the + [instrumentation](https://github.com/bigdatagenomics/utils#instrumentation) + metrics to the log when the CLI operation terminates. + +#### Legacy output options {#legacy-output} + +Several tools in ADAM support saving back to legacy genomics output formats. Any +tool saving to one of these formats supports the following options: + +* `-single`: Merge sharded output files. If this is not provided, the output + will be written as sharded files where each shard is a valid file. If this + _is_ provided, the shards will be written without headers as a + `${OUTPUTNAME}_tail` directory, and a single header will be written to + `${OUTPUTNAME}_head`. If `-single` is provided and `-defer_merging` is _not_ + provided, the header file and the shard directory will be merged into a single + file at `${OUTPUTPATH}`. 
+* `-defer_merging`: If both `-defer_merging` and `-single` are provided, the + output will be saved as if it were a single file, but the output files will not be + merged. + +#### Validation stringency {#validation} + +Various components in ADAM support passing a validation stringency level. This +is a three-level scale: + +* `STRICT`: If validation fails, throw an exception. +* `LENIENT`: If validation fails, ignore the data and write a warning to the + log. +* `SILENT`: If validation fails, ignore the data silently. + +## Action tools {#actions} + +Roughly speaking, "action" tools apply some form of non-trivial transformation +to data using the ADAM APIs. + +### countKmers and countContigKmers {#countKmers} + +Counts the $k$-length substrings in either a set of reads or reference +fragments. Takes three required arguments: + +1. `INPUT`: The input path. A set of reads for `countKmers` or a set of + reference contigs for `countContigKmers`. +2. `OUTPUT`: The path to save the output to. Saves the output as + [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) containing + the $k$-mer sequence and count. +3. `KMER_LENGTH`: The length $k$ of substrings to count. + +Beyond the [default options](#default-args), both `countKmers` and +`countContigKmers` take one option: + +* `-print_histogram`: If provided, prints a histogram of the $k$-mer count + distribution to standard out. + +### transform {#transform} + +The `transform` CLI is the entrypoint to ADAM's read preprocessing tools. This +command provides drop-in replacement commands for several commands in the +[Genome Analysis Toolkit](https://software.broadinstitute.org/gatk/) "Best +Practices" read preprocessing pipeline and more [@depristo11]. This CLI tool +takes two required arguments: + +1. `INPUT`: The input path. A file containing reads in any of the supported + ADAM read input formats. +2. `OUTPUT`: The path to save the transformed reads to. Supports any of ADAM's + read output formats. + +Beyond the [default options](#default-args) and the [legacy output +options](#legacy-output), `transform` supports a vast range of options. These +options fall into several general categories: + +* General options: + * `-cache`: If provided, the results of intermediate stages will be cached. + This is necessary to avoid recomputation if running multiple + transformations (e.g., Indel realignment, BQSR, etc.) back to back. + * `-storage_level`: Along with `-cache`, this can be used to set the Spark + [persistence level](http://spark.apache.org/docs/latest/programming-guide.html#which-storage-level-to-choose) + for cached data. If not provided, this defaults to `MEM_ONLY`. + * `-stringency`: Sets the validation stringency for various operations. + Defaults to `LENIENT`. See [validation stringency](#validation) for more + details. +* Loading options: + * `-repartition`: Forces a repartition on load. Useful to increase the + available parallelism on a small dataset. Forces a shuffle. Takes the + number of partitions to repartition to. + * `-force_load_bam`: Forces ADAM to try to load the input as SAM/BAM/CRAM. + * `-force_load_fastq`: Forces ADAM to try to load the input as FASTQ. + * `-paired_fastq`: Forces `-force_load_fastq`, and passes the path of a + second-of-pair FASTQ file to load. + * `-record_group`: If loading FASTQ, sets the record group name on each + read to this value. + * `-force_load_ifastq`: Forces ADAM to try to load the input as interleaved + FASTQ.
+ * `-force_load_parquet`: Forces ADAM to try to load the input as Parquet + encoded using the ADAM `AlignmentRecord` schema. + * `-limit_projection`: If loading as Parquet, sets a projection that does + not load the `attributes` or `origQual` fields of the `AlignmentRecord`. + * `-aligned_read_predicate`: If loading as Parquet, only loads aligned + reads. + * `-concat`: Provides a path to an optional second file to load, which is + then concatenated to the file given as the `INPUT` path. +* Duplicate marking options: Duplicate marking is run with the + `-mark_duplicate_reads` option. It takes no optional parameters. +* BQSR options: BQSR is run with the `-recalibrate_base_qualities` flag. + Additionally, the BQSR engine takes the following parameters: + * `-known_snps`: Path to a VCF file/Parquet variant file containing known + point variants. These point variants are used to mask read errors during + recalibration. Specifically, putative read errors that are at variant + sites are treated as correct observations. If BQSR is run, this option + should be passed, along with a path to a known variation database (e.g., + [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/)). {#known-snps} + * `-dump_observations`: If provided, a path to dump the recalibration table + in CSV format. +* Indel realignment options: Indel realignment is run with the `-realign_indels` + flag. Additionally, the Indel realignment engine takes the following options: + * `-known_indels`: Path to a VCF file/Parquet variant file containing known + Indel variants to realign against. If provided, forces the `KNOWNS_ONLY` + consensus model. If not provided, forces the `CONSENSUS_FROM_READS` model. + See [candidate generation and realignment](#consensus-model). {#known-indels} + * `-max_consensus_number`: The maximum number of consensus sequences to + realign a single target against. If more consensus sequences are seen at + a single target, we randomly downsample. Defaults to 30. + * `-max_indel_size`: The maximum length of an Indel to realign against. + Indels longer than this size are dropped before generating consensus + sequences. Defaults to 500bp. + * `-max_target_size`: The maximum length of a target to realign. Targets + longer than this size are dropped before trying to realign. Defaults to + 3,000bp. + * `-log_odds_threshold`: The log odds threshold to use for picking a + consensus sequence to finalize realignments against. A consensus will not + be realigned against unless the Phred weighted edit distance against the + given consensus/reference pair is a sufficient improvement over the + original reference realignments. This option sets that improvement weight. + Defaults to 5.0. +* `mismatchingPositions` tagging options: We can recompute the + `mismatchingPositions` field of an AlignmentRecord (SAM "MD" tag) with the + `-add_md_tags` flag. This flag takes a path to a reference file in either + FASTA or Parquet `NucleotideContigFragment` format. Additionally, this engine + takes the following options: + * `-md_tag_fragment_size`: If loading from FASTA, sets the size of each + fragment to load. Defaults to 10,000bp. + * `-md_tag_overwrite`: If provided, recomputes and overwrites the + `mismatchingPositions` field for records where this field was provided. +* Output options: `transform` supports the [legacy output](#legacy-output) + options. Additionally, there are the following options: + * `-coalesce`: Sets the number of partitions to coalesce the output to. 
+ If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore + the coalesce directive. + * `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being + saved with the number of partitions requested by `-coalesce`. This is + necessary if the `-coalesce` would increase the number of partitions, or + if it would reduce the number of partitions to fewer than the number of + Spark executors. This may have a substantial performance cost, and will + invalidate any sort order. + * `-sort_reads`: Sorts reads by alignment position. Unmapped reads are + placed at the end of all reads. Contigs are ordered by sequence record + index. + * `-sort_lexicographically`: Sorts reads by alignment position. Unmapped + reads are placed at the end of all reads. Contigs are ordered + lexicographically. + * `-sort_fastq_output`: Ignored if not saving to FASTQ. If saving to FASTQ, + sorts the output reads by read name. + +### transformFeatures + +Loads a feature file into the ADAM `Feature` schema, and saves it back. The +input and output formats are autodetected. Takes two required arguments: + +1. `INPUT`: The input path. A file containing features in any of the supported + ADAM feature input formats. +2. `OUTPUT`: The path to save the transformed features to. Supports any of ADAM's + feature output formats. + +Beyond the [default options](#default-args) and the [legacy output +options](#legacy-output), `transformFeatures` has one optional argument: + +* `-num_partitions`: If loading from a textual feature format (i.e., not + Parquet), sets the number of partitions to load. If not provided, this is + chosen by Spark. + +### flatten + +Loads a Parquet file and rewrites the file as a new Parquet file with a flat +schema. This is useful if loading the data into a database that supports Parquet +but that does not support nested schemas. Takes two required arguments: + +1. `INPUT`: The input path to a Parquet file. +2. `OUTPUT`: The path to save a Parquet file containing the input data, but + written using a flattened schema. + +### mergeShards + +A CLI tool for merging a [sharded legacy file](#legacy-output) that was written +with the `-single` and `-defer_merging` flags. Runs the file merging process. +Takes two required arguments: + +1. `INPUT`: The input directory of sharded files to merge. +2. `OUTPUT`: The path to save the merged file at. + +This command takes several optional arguments: + +* `-buffer_size`: The buffer size in bytes to use for copying data on the + driver. Defaults to 4MB (4 * 1024 * 1024). +* `-header_path`: The path to a header file that should be written to the start + of the merged output. +* `-write_cram_eof`: Writes an empty CRAM container at the end of the merged + output file. This should not be provided unless merging a sharded CRAM file. +* `-write_empty_GZIP_at_eof`: Writes an empty GZIP block at the end of the + merged output file. This should be provided if merging a sharded BAM file or + any other BGZIPed format. + +This command does not support Parquet output, so the only [default +option](#default-args) that this command supports is `-print_metrics`. + +### reads2coverage + +The `reads2coverage` command computes per-locus coverage from reads and saves +the coverage counts as features. Takes two required arguments: + +1. `INPUT`: The input path. A file containing reads in any of the supported + ADAM read input formats. +2. `OUTPUT`: The path to save the coverage counts to. Saves in any of the ADAM + supported feature file formats.
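+
+As a sketch, an invocation might look like the following; the file names here
+are purely illustrative, and the optional `-collapse` flag (which merges
+adjacent sites with equal coverage) is described below:
+
+```bash
+# Compute per-locus coverage from aligned reads (paths are illustrative).
+./bin/adam-submit reads2coverage alignments.adam coverage.adam -collapse
+```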
+ +In addition to the [default options](#default-args), `reads2coverage` takes the +following options: + +* `-collapse`: If two (or more) neighboring sites have the same coverage, we + collapse them down into a single genomic feature. +* `-only_negative_strands`: Only computes coverage for reads aligned on the + negative strand. Conflicts with `-only_positive_strands`. +* `-only_positive_strands`: Only computes coverage for reads aligned on the + positive strand. Conflicts with `-only_negative_strands`. + +## Conversion tools {#conversions} + +These tools convert data between legacy genomic file formats and Parquet files +that use ADAM's schemas. + +### vcf2adam, anno2adam, and adam2vcf + +These commands convert between VCF and Parquet using the Genotype and Variant +schemas. + +`vcf2adam` takes two required arguments: + +1. `VCF`: The VCF file to convert to Parquet. +2. `ADAM`: The path to save the converted Parquet data at. + +`vcf2adam` supports the full set of [default options](#default-args). +Additionally, `vcf2adam` takes the following options: + +* `-only_variants`: Instead of saving the VCF file as Genotypes, only save the + Variants from the VCF. This is useful if loading a sites-only VCF, e.g., for + [BQSR](#known-snps) or [Indel realignment](#known-indels). +* `-coalesce`: Sets the number of partitions to coalesce the output to. + If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore + the coalesce directive. +* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being + saved with the number of partitions requested by `-coalesce`. This is + necessary if the `-coalesce` would increase the number of partitions, or + if it would reduce the number of partitions to fewer than the number of + Spark executors. This may have a substantial performance cost, and will + invalidate any sort order. + +`anno2adam` converts VCFs with annotated variants (i.e., the VCF INFO fields) +into Parquet using the VariantAnnotation schema. `anno2adam` takes the same two +required arguments as `vcf2adam`. `anno2adam` takes the [default +options](#default-args), and one additional option: + +* `-annotations_to_join`: A path to an existing Parquet file of + VariantAnnotations. These two files are joined together, and the annotations + are merged. + +`adam2vcf` takes two required arguments: + +1. `ADAM`: The Parquet file of Genotypes to convert to VCF. +2. `VCF`: The path to save the VCF file to. + +`adam2vcf` only supports the `-print_metrics` option from the [default +options](#default-args). Additionally, `adam2vcf` takes the following options: + +* `-coalesce`: Sets the number of partitions to coalesce the output to. + The Spark engine may ignore the coalesce directive. +* `-sort_on_save`: Sorts the variants when saving, where contigs are ordered + by sequence index. Conflicts with `-sort_lexicographically_on_save`. +* `-sort_lexicographically_on_save`: Sorts the variants when saving, where + contigs are ordered lexicographically. Conflicts with `-sort_on_save`. +* `-single`: Saves the VCF file as headerless shards, and then merges the + sharded files into a single VCF. + +### fasta2adam and adam2fasta + +These commands convert between FASTA and Parquet files storing assemblies using +the NucleotideContigFragment schema. + +`fasta2adam` takes two required arguments: + +1. `FASTA`: The input FASTA file to convert. +2. `ADAM`: The path to save the Parquet-formatted NucleotideContigFragments to.
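+
+As a sketch, a conversion might be invoked as follows; the file names are
+purely illustrative, and the `-fragment_length` option is described below:
+
+```bash
+# Convert a reference FASTA into Parquet NucleotideContigFragments
+# (paths are illustrative).
+./bin/adam-submit fasta2adam GRCh38.fa GRCh38.contigs.adam -fragment_length 10000
+```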
+ +`fasta2adam` supports the full set of [default options](#default-args), as well +as the following options: + +* `-fragment_length`: The fragment length to shard a given contig into. Defaults + to 10,000bp. +* `-reads`: Path to a set of reads that includes sequence info. This read path + is used to obtain the sequence indices for ordering the contigs from the + FASTA file. +* `-repartition`: The number of partitions to save the data to. If provided, + forces a shuffle. +* `-verbose`: If given, enables additional logging where the sequence dictionary + is printed. + +`adam2fasta` takes two required arguments: + +1. `ADAM`: The path to a Parquet file containing NucleotideContigFragments. +2. `FASTA`: The path to save the FASTA file to. + +`adam2fasta` only supports the `-print_metrics` option from the [default +options](#default-args). Additionally, `adam2fasta` takes the following options: + +* `-line_width`: The line width in characters to use for breaking FASTA lines. + Defaults to 60 characters. +* `-coalesce`: Sets the number of partitions to coalesce the output to. + If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore + the coalesce directive. +* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being + saved with the number of partitions requested by `-coalesce`. This is + necessary if the `-coalesce` would increase the number of partitions, or + if it would reduce the number of partitions to fewer than the number of + Spark executors. This may have a substantial performance cost, and will + invalidate any sort order. + +### adam2fastq + +While the [`transform`](#transform) command can export to FASTQ, the +`adam2fastq` command provides a simpler CLI with more output options. `adam2fastq` +takes two required arguments and an optional third argument: + +1. `INPUT`: The input read file, in any ADAM-supported read format. +2. `OUTPUT`: The path to save an unpaired or interleaved FASTQ file to, or the + path to save the first-of-pair reads to, for paired FASTQ. +3. Optional `SECOND_OUTPUT`: If saving paired FASTQ, the path to save the + second-of-pair reads to. + +`adam2fastq` only supports the `-print_metrics` option from the [default +options](#default-args). Additionally, `adam2fastq` takes the following options: + +* `-no_projection`: By default, `adam2fastq` only projects the fields necessary + for saving to FASTQ. This option disables that projection and projects all + fields. +* `-output_oq`: Outputs the original read qualities, if available. +* `-persist_level`: Sets the Spark + [persistence level](http://spark.apache.org/docs/latest/programming-guide.html#which-storage-level-to-choose) + for cached data during the conversion back to FASTQ. If not provided, the + intermediate RDDs are not cached. +* `-repartition`: The number of partitions to save the data to. If provided, + forces a shuffle. +* `-validation`: Sets the validation stringency for checking whether reads are + paired when saving paired reads. Defaults to `LENIENT`. See [validation + stringency](#validation) for more details. + +### reads2fragments and fragments2reads + +These two commands translate read data between the single read alignment and +fragment representations. + +`reads2fragments` takes two required arguments: + +1. `READS`: The input read file, in any ADAM-supported read format. +2. `FRAGMENTS`: The path to save Parquet data with the Fragment schema. + +`reads2fragments` takes the [default options](#default-args). + +`fragments2reads` takes two required arguments: + +1. 
`FRAGMENTS`: The input fragment file, in any ADAM-supported fragment format. +2. `READS`: The path to save reads at, in any ADAM-supported read format. + +`fragments2reads` takes the [default options](#default-args). Additionally, +`fragments2reads` takes the following options: + +* `-sort_reads`: Sorts reads by alignment position. Unmapped reads are + placed at the end of all reads. Contigs are ordered by sequence record + index. +* `-sort_lexicographically`: Sorts reads by alignment position. Unmapped + reads are placed at the end of all reads. Contigs are ordered + lexicographically. + +## Printing tools {#printers} + +The printing tools provide some form of user-readable view of an ADAM file. +These commands are useful for both quality control and debugging. + +### print + +Dumps a Parquet file to either the console or a text file as +[JSON](http://www.json.org). Takes one required argument: + +1. `FILE(S)`: The file paths to load. These must be Parquet-formatted files. + +This command has several options: + +* `-pretty`: Pretty prints the JSON output. +* `-o`: Provides a path to save the output dump to, instead of writing the + output to the console. + +This command does not support Parquet output, so the only [default +option](#default-args) that this command supports is `-print_metrics`. + +### flagstat {#flagstat} + +Runs the ADAM equivalent to the +[SAMTools](http://www.htslib.org/doc/samtools.html) `flagstat` command. Takes +one required argument: + +1. `INPUT`: The input path. A file containing reads in any of the supported + ADAM read input formats. + +This command has several options: + +* `-stringency`: Sets the validation stringency for various operations. + Defaults to `SILENT`. See [validation stringency](#validation) for more + details. +* `-o`: Provides a path to save the output dump to, instead of writing the + output to the console. + +This command does not support Parquet output, so the only [default +option](#default-args) that this command supports is `-print_metrics`. + +### view + +Runs the ADAM equivalent to the +[SAMTools](http://www.htslib.org/doc/samtools.html) `view` command. Takes +one required argument: + +1. `INPUT`: The input path. A file containing reads in any of the supported + ADAM read input formats. + +In addition to the [default options](#default-args), this command supports the +following options: + +* `-o`: Provides a path to save the output dump to, instead of writing the + output to the console. Format is autodetected as any of the ADAM read outputs. +* `-F`/`-f`: Filters reads that either match all (`-f`) or none (`-F`) of the + flag bits. +* `-G`/`-g`: Filters reads that either mismatch all (`-g`) or none (`-G`) of the + flag bits. +* `-c`: Prints the number of reads that (mis)matched the filters, instead of the + reads themselves. Conflicts with `-o`. diff --git a/docs/source/60_building_apps.md b/docs/source/60_building_apps.md index b185e5bf97..e7af579ee8 100644 --- a/docs/source/60_building_apps.md +++ b/docs/source/60_building_apps.md @@ -1,3 +1,3 @@ -# Building Downstream Applications +# Building Downstream Applications {#apps} ADAM is packaged so that it can be used via the base CLI, to run plugins via the CLI, or as a library which can be used when building downstream applications. This document covers some of the important things to note when building applications downstream of ADAM.
diff --git a/docs/source/70_algorithms.md b/docs/source/70_algorithms.md index d81ba5d71d..4222b36dad 100644 --- a/docs/source/70_algorithms.md +++ b/docs/source/70_algorithms.md @@ -194,7 +194,7 @@ Algorithm \ref{alg:join-targets}. The set returned by this function is used as an index for mapping reads directly to realignment targets. -#### Candidate Generation and Realignment +#### Candidate Generation and Realignment {#consensus-model} Once we have generated the target set, we map across all the reads and check to see if the read overlaps a realignment target. We then group together all reads that map to a given realignment target; reads that