diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dd4ecfec1..6ce4d2098 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -232,16 +232,19 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile test_cache,docker --aligner hisat2 ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ - salmon: - name: Test Salmon with workflow parameters + pseudo: + name: Test Pseudoaligners with workflow parameters if: ${{ (github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/rnaseq')) && !contains(github.event.head_commit.message, '[ci fast]') }} runs-on: ubuntu-latest strategy: matrix: parameters: - - "--skip_qc" - - "--skip_alignment --skip_pseudo_alignment" - - "--salmon_index false --transcript_fasta false" + - "--pseudo_aligner salmon --skip_qc" + - "--pseudo_aligner salmon --skip_alignment --skip_pseudo_alignment" + - "--pseudo_aligner salmon --salmon_index false --transcript_fasta false" + - "--pseudo_aligner kallisto --skip_qc" + - "--pseudo_aligner kallisto --skip_alignment --skip_pseudo_alignment" + - "--pseudo_aligner kallisto --kallisto_index false --transcript_fasta false" steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -280,6 +283,6 @@ jobs: wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - name: Run pipeline with Salmon and various parameters + - name: Run pipeline with Salmon or Kallisto and various parameters run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_cache,docker --pseudo_aligner salmon ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ + nextflow run ${GITHUB_WORKSPACE} -profile test_cache,docker ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ diff --git a/CHANGELOG.md b/CHANGELOG.md index a235dd00f..5d13bc92c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Special thanks to the following for their contributions to the release: - [Adam Talbot](https://github.com/adamrtalbot) +- [Jonathan Manning](https://github.com/pinin4fjords) - [JĂșlia Mir Pedrol](https://github.com/mirpedrol) - [Matthias Zepper](https://github.com/MatthiasZepper) - [Maxime Garcia](https://github.com/maxulysse) @@ -28,13 +29,16 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #1083](https://github.com/nf-core/rnaseq/pull/1083) - Move local modules and subworkflows to subfolders - [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10 - [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability +- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - Kallisto quantification +- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - MultiQC [version bump](https://github.com/nf-core/rnaseq/pull/1106/commits/aebad067a10a45510a2b421da852cb436ae65fd8) +- [#1050](https://github.com/nf-core/rnaseq/issues/1050) - Provide custom prefix/suffix for summary files to avoid overwriting ### Software dependencies | Dependency | Old version | New version | | ----------------------- | ----------- | ----------- | | `fastqc` | 0.11.9 | 0.12.1 | -| `multiqc` | 1.14 | 1.15 | +| `multiqc` | 1.14 | 1.17 | | `ucsc-bedgraphtobigwig` | 377 | 445 | > **NB:** Dependency has been **updated** if both 
old and new version information is present. @@ -61,7 +65,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements ### Enhancements & fixes - [[#1011](https://github.com/nf-core/rnaseq/issues/1011)] - FastQ files from UMI-tools not being passed to fastp -- [[#1018](https://github.com/nf-core/rnaseq/issues/1018)] - Ability to skip both alignment and pseudo-alignment to only run pre-processing QC steps. +- [[#1018](https://github.com/nf-core/rnaseq/issues/1018)] - Ability to skip both alignment and pseudoalignment to only run pre-processing QC steps. - [PR #1016](https://github.com/nf-core/rnaseq/pull/1016) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) - [PR #1025](https://github.com/nf-core/fetchngs/pull/1025) - Add `public_aws_ecr.config` to source mulled containers when using `public.ecr.aws` Docker Biocontainer registry - [PR #1038](https://github.com/nf-core/rnaseq/pull/1038) - Updated error log for count values when supplying `--additional_fasta` @@ -809,7 +813,7 @@ Major novel changes include: - Added options to skip several steps - Skip trimming using `--skipTrimming` - Skip BiotypeQC using `--skipBiotypeQC` - - Skip Alignment using `--skipAlignment` to only use pseudo-alignment using Salmon + - Skip Alignment using `--skipAlignment` to only use pseudoalignment using Salmon ### Documentation updates diff --git a/README.md b/README.md index 9fe5c0ab8..b45db8c7a 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ 3. [`dupRadar`](https://bioconductor.org/packages/release/bioc/html/dupRadar.html) 4. [`Preseq`](http://smithlabresearch.org/software/preseq/) 5. [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) -15. Pseudo-alignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/); _optional_) +15. Pseudoalignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/) or ['Kallisto'](https://pachterlab.github.io/kallisto/); _optional_) 16. 
Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/)) > **Note** diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 4af748689..100536f4e 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -23,6 +23,7 @@ run_modules: - hisat2 - rsem - salmon + - kallisto - samtools - picard - preseq @@ -66,6 +67,7 @@ extra_fn_clean_exts: - ".umi_dedup" - "_val" - ".markdup" + - "_primary" # Customise the module search patterns to speed up execution time # - Skip module sub-tools that we are not interested in diff --git a/bin/salmon_tx2gene.py b/bin/salmon_tx2gene.py deleted file mode 100755 index 6c2f2bef6..000000000 --- a/bin/salmon_tx2gene.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -from collections import OrderedDict, defaultdict, Counter -import logging -import argparse -import glob -import os - -# Create a logger -logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def read_top_transcript(salmon): - txs = set() - fn = glob.glob(os.path.join(salmon, "*", "quant.sf"))[0] - with open(fn) as inh: - for line in inh: - if line.startswith("Name"): - continue - txs.add(line.split()[0]) - if len(txs) > 100: - break - logger.info("Transcripts found in FASTA: %s" % txs) - return txs - - -def tx2gene(gtf, salmon, gene_id, extra, out): - txs = read_top_transcript(salmon) - votes = Counter() - gene_dict = defaultdict(list) - with open(gtf) as inh: - for line in inh: - if line.startswith("#"): - continue - cols = line.split("\t") - attr_dict = OrderedDict() - for gff_item in cols[8].split(";"): - item_pair = gff_item.strip().split(" ") - if len(item_pair) > 1: - value = item_pair[1].strip().replace('"', "") - if value in txs: - votes[item_pair[0].strip()] += 1 - - attr_dict[item_pair[0].strip()] = value - try: - gene_dict[attr_dict[gene_id]].append(attr_dict) - except KeyError: - continue - - if not votes: - logger.warning("No attribute in GTF matching transcripts") - return None - - txid = votes.most_common(1)[0][0] - logger.info("Attributed found to be transcript: %s" % txid) - seen = set() - with open(out, "w") as outh: - for gene in gene_dict: - for row in gene_dict[gene]: - if txid not in row: - continue - if (gene, row[txid]) not in seen: - seen.add((gene, row[txid])) - if not extra in row: - extra_id = gene - else: - extra_id = row[extra] - print("%s\t%s\t%s" % (row[txid], gene, extra_id), file=outh) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="""Get tx to gene names for tximport""") - parser.add_argument("--gtf", type=str, help="GTF file") - parser.add_argument("--salmon", type=str, help="output of salmon") - parser.add_argument("--id", type=str, help="gene id in the gtf file") - parser.add_argument("--extra", type=str, help="extra id in the gtf file") - parser.add_argument( - "-o", - "--output", - dest="output", - default="tx2gene.tsv", - type=str, - help="file with output", - ) - - args = parser.parse_args() - tx2gene(args.gtf, args.salmon, args.id, args.extra, args.output) diff --git a/bin/salmon_summarizedexperiment.r b/bin/summarizedexperiment.r similarity index 90% rename from bin/salmon_summarizedexperiment.r rename to bin/summarizedexperiment.r index 5ebdd317b..d55101ecf 100755 --- a/bin/salmon_summarizedexperiment.r +++ b/bin/summarizedexperiment.r @@ 
-2,18 +2,18 @@ library(SummarizedExperiment) -## Create SummarizedExperiment (se) object from Salmon counts +## Create SummarizedExperiment (se) object from counts args <- commandArgs(trailingOnly = TRUE) -if (length(args) < 2) { - stop("Usage: salmon_se.r ", call. = FALSE) +if (length(args) < 3) { + stop("Usage: summarizedexperiment.r ", call. = FALSE) } coldata <- args[1] counts_fn <- args[2] tpm_fn <- args[3] +tx2gene <- args[4] -tx2gene <- "salmon_tx2gene.tsv" info <- file.info(tx2gene) if (info$size == 0) { tx2gene <- NULL diff --git a/bin/tx2gene.py b/bin/tx2gene.py new file mode 100755 index 000000000..0a418920e --- /dev/null +++ b/bin/tx2gene.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python +import logging +import argparse +import glob +import os +from collections import Counter, defaultdict, OrderedDict +from collections.abc import Set +from typing import Dict + +# Configure logging +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]: + """ + Read the top 100 transcripts from the quantification file. + + Parameters: + quant_dir (str): Directory where quantification files are located. + file_pattern (str): Pattern to match quantification files. + + Returns: + set: A set containing the top 100 transcripts. + """ + try: + # Find the quantification file within the directory + quant_file_path = glob.glob(os.path.join(quant_dir, "*", file_pattern))[0] + with open(quant_file_path, "r") as file_handle: + # Read the file and extract the top 100 transcripts + return {line.split()[0] for i, line in enumerate(file_handle) if i > 0 and i <= 100} + except IndexError: + # Log an error and raise a FileNotFoundError if the quant file does not exist + logger.error("No quantification files found.") + raise FileNotFoundError("Quantification file not found.") + + +def discover_transcript_attribute(gtf_file: str, transcripts: Set[str]) -> str: + """ + Discover the attribute in the GTF that corresponds to transcripts, prioritizing 'transcript_id'. + + Parameters: + gtf_file (str): Path to the GTF file. + transcripts (Set[str]): A set of transcripts to match in the GTF file. + + Returns: + str: The attribute name that corresponds to transcripts in the GTF file. + """ + votes = Counter() + with open(gtf_file) as inh: + # Read GTF file, skipping header lines + for line in filter(lambda x: not x.startswith("#"), inh): + cols = line.split("\t") + # Parse attribute column and update votes for each attribute found + attributes = dict(item.strip().split(" ", 1) for item in cols[8].split(";") if item.strip()) + votes.update(key for key, value in attributes.items() if value.strip('"') in transcripts) + + if not votes: + # Log a warning if no matching attribute is found + logger.warning("No attribute in GTF matching transcripts") + return "" + + # Check if 'transcript_id' is among the attributes with the highest votes + if "transcript_id" in votes and votes["transcript_id"] == max(votes.values()): + logger.info("Attribute 'transcript_id' corresponds to transcripts.") + return "transcript_id" + + # If 'transcript_id' isn't the highest, determine the most common attribute that matches the transcripts + attribute, _ = votes.most_common(1)[0] + logger.info(f"Attribute '{attribute}' corresponds to transcripts.") + return attribute + + +def parse_attributes(attributes_text: str) -> Dict[str, str]: + """ + Parse the attributes column of a GTF file. 
+ + :param attributes_text: The attributes column as a string. + :return: A dictionary of the attributes. + """ + # Split the attributes string by semicolon and strip whitespace + attributes = attributes_text.strip().split(";") + attr_dict = OrderedDict() + + # Iterate over each attribute pair + for attribute in attributes: + # Split the attribute into key and value, ensuring there are two parts + parts = attribute.strip().split(" ", 1) + if len(parts) == 2: + key, value = parts + # Remove any double quotes from the value + value = value.replace('"', "") + attr_dict[key] = value + + return attr_dict + + +def map_transcripts_to_gene( + quant_type: str, gtf_file: str, quant_dir: str, gene_id: str, extra_id_field: str, output_file: str +) -> bool: + """ + Map transcripts to gene names and write the output to a file. + + Parameters: + quant_type (str): The quantification method used (e.g., 'salmon'). + gtf_file (str): Path to the GTF file. + quant_dir (str): Directory where quantification files are located. + gene_id (str): The gene ID attribute in the GTF file. + extra_id_field (str): Additional ID field in the GTF file. + output_file (str): The output file path. + + Returns: + bool: True if the operation was successful, False otherwise. + """ + # Read the top transcripts based on quantification type + transcripts = read_top_transcripts(quant_dir, "quant.sf" if quant_type == "salmon" else "abundance.tsv") + # Discover the attribute that corresponds to transcripts in the GTF + transcript_attribute = discover_transcript_attribute(gtf_file, transcripts) + + if not transcript_attribute: + # If no attribute is found, return False + return False + + # Open GTF and output file to write the mappings + # Initialize the set to track seen combinations + seen = set() + + with open(gtf_file) as inh, open(output_file, "w") as output_handle: + # Parse each line of the GTF, mapping transcripts to genes + for line in filter(lambda x: not x.startswith("#"), inh): + cols = line.split("\t") + attr_dict = parse_attributes(cols[8]) + if gene_id in attr_dict and transcript_attribute in attr_dict: + # Create a unique identifier for the transcript-gene combination + transcript_gene_pair = (attr_dict[transcript_attribute], attr_dict[gene_id]) + + # Check if the combination has already been seen + if transcript_gene_pair not in seen: + # If it's a new combination, write it to the output and add to the seen set + extra_id = attr_dict.get(extra_id_field, attr_dict[gene_id]) + output_handle.write(f"{attr_dict[transcript_attribute]}\t{attr_dict[gene_id]}\t{extra_id}\n") + seen.add(transcript_gene_pair) + + return True + + +# Main function to parse arguments and call the mapping function +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Map transcripts to gene names for tximport.") + parser.add_argument("--quant_type", type=str, help="Quantification type", default="salmon") + parser.add_argument("--gtf", type=str, help="GTF file", required=True) + parser.add_argument("--quants", type=str, help="Output of quantification", required=True) + parser.add_argument("--id", type=str, help="Gene ID in the GTF file", required=True) + parser.add_argument("--extra", type=str, help="Extra ID in the GTF file") + parser.add_argument("-o", "--output", dest="output", default="tx2gene.tsv", type=str, help="File with output") + + args = parser.parse_args() + if not map_transcripts_to_gene(args.quant_type, args.gtf, args.quants, args.id, args.extra, args.output): + logger.error("Failed to map transcripts to genes.") 
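
The core of the new `bin/tx2gene.py` above is a vote over GTF attribute keys that match the quantified transcript IDs. A minimal sketch of that voting step on made-up data (illustrative only, not pipeline code; the GTF attribute strings and transcript IDs below are invented):

```python
# Illustration of the attribute-voting idea in bin/tx2gene.py, on toy data.
from collections import Counter

gtf_attribute_columns = [
    'gene_id "ENSG01"; transcript_id "ENST01"; gene_name "ABC1";',
    'gene_id "ENSG02"; transcript_id "ENST02"; gene_name "ABC2";',
]
# Pretend these IDs were read from a quant.sf / abundance.tsv file
quantified_transcripts = {"ENST01", "ENST02"}

votes = Counter()
for attr_col in gtf_attribute_columns:
    # Same parsing as the script: split on ';', then 'key value' pairs, strip quotes
    attributes = dict(
        item.strip().split(" ", 1) for item in attr_col.split(";") if item.strip()
    )
    votes.update(k for k, v in attributes.items() if v.strip('"') in quantified_transcripts)

print(votes.most_common(1)[0][0])  # -> transcript_id
```

With real inputs the winning key is then used, together with `--id` and `--extra`, to write the three-column `tx2gene.tsv`.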
diff --git a/bin/salmon_tximport.r b/bin/tximport.r similarity index 78% rename from bin/salmon_tximport.r rename to bin/tximport.r index e4416080d..e1bad7511 100755 --- a/bin/salmon_tximport.r +++ b/bin/tximport.r @@ -4,26 +4,29 @@ library(SummarizedExperiment) library(tximport) args = commandArgs(trailingOnly=TRUE) -if (length(args) < 2) { - stop("Usage: salmon_tximport.r ", call.=FALSE) +if (length(args) < 4) { + stop("Usage: tximport.r ", call.=FALSE) } coldata = args[1] path = args[2] sample_name = args[3] +quant_type = args[4] +tx2gene_path = args[5] prefix = sample_name -tx2gene = "salmon_tx2gene.tsv" -info = file.info(tx2gene) + +info = file.info(tx2gene_path) if (info$size == 0) { tx2gene = NULL } else { - rowdata = read.csv(tx2gene, sep="\t", header = FALSE) + rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE) colnames(rowdata) = c("tx", "gene_id", "gene_name") tx2gene = rowdata[,1:2] } -fns = list.files(path, pattern = "quant.sf", recursive = T, full.names = T) +pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") +fns = list.files(path, pattern = pattern, recursive = T, full.names = T) names = basename(dirname(fns)) names(fns) = names @@ -32,11 +35,13 @@ if (file.exists(coldata)) { coldata = coldata[match(names, coldata[,1]),] coldata = cbind(files = fns, coldata) } else { - message("ColData not avaliable ", coldata) + message("ColData not available: ", coldata) coldata = data.frame(files = fns, names = names) } -txi = tximport(fns, type = "salmon", txOut = TRUE) +dropInfReps = quant_type == "kallisto" + +txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) rownames(coldata) = coldata[["names"]] extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]])) if (length(extra) > 0) { @@ -49,8 +54,8 @@ se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = tx rowData = rowdata) if (!is.null(tx2gene)) { gi = summarizeToGene(txi, tx2gene = tx2gene) - gi.ls = summarizeToGene(txi, tx2gene = tx2gene,countsFromAbundance="lengthScaledTPM") - gi.s = summarizeToGene(txi, tx2gene = tx2gene,countsFromAbundance="scaledTPM") + gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") + gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") growdata = unique(rowdata[,2:3]) growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),] rownames(growdata) = growdata[["tx"]] @@ -78,9 +83,10 @@ if(exists("gse")){ write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) } -write.table(build_table(se,"abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) +write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) # Print sessioninfo to standard out citation("tximeta") sessionInfo() + diff --git a/conf/modules.config b/conf/modules.config index d3c22f00c..5b46659cf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -83,6 +83,15 @@ process { ] } + withName: 'KALLISTO_INDEX' { + ext.args = { params.gencode ? '--gencode' : '' } + publishDir = [ + path: { "${params.outdir}/genome/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : params.save_reference ? filename : null } + ] + } + withName: 'RSEM_PREPAREREFERENCE_GENOME' { ext.args = '--star' publishDir = [ @@ -566,7 +575,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ] } - withName: '.*:QUANTIFY_STAR_SALMON:SALMON_TX2GENE' { + withName: '.*:QUANTIFY_STAR_SALMON:TX2GENE' { publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, @@ -574,7 +583,8 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ] } - withName: '.*:QUANTIFY_STAR_SALMON:SALMON_TXIMPORT' { + withName: '.*:QUANTIFY_STAR_SALMON:TXIMPORT' { + ext.prefix = { "${quant_type}.merged" } publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, @@ -582,7 +592,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ] } - withName: '.*:QUANTIFY_STAR_SALMON:SALMON_SE_.*' { + withName: '.*:QUANTIFY_STAR_SALMON:SE_.*' { publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, @@ -698,7 +708,6 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ext.args = { [ "--id_col 1", "--sample_suffix ''", - "--outprefix deseq2", "--count_col 3", params.deseq2_vst ? '--vst TRUE' : '' ].join(' ').trim() } @@ -762,7 +771,6 @@ if (!params.skip_alignment && params.aligner == 'star_rsem') { ext.args = { [ "--id_col 1", "--sample_suffix ''", - "--outprefix deseq2", "--count_col 3", params.deseq2_vst ? '--vst TRUE' : '' ].join(' ').trim() } @@ -1059,6 +1067,7 @@ if (!params.skip_multiqc) { process { withName: 'MULTIQC' { ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + ext.prefix = "multiqc_report" publishDir = [ path: { [ "${params.outdir}/multiqc", @@ -1072,12 +1081,13 @@ if (!params.skip_multiqc) { } // -// Salmon pseudo-alignment options +// Salmon/ Kallisto pseudoalignment options // if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { process { - withName: '.*:QUANTIFY_SALMON:SALMON_QUANT' { + + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SALMON_QUANT' { ext.args = { params.extra_salmon_quant_args ?: '' } publishDir = [ path: { "${params.outdir}/${params.pseudo_aligner}" }, @@ -1085,8 +1095,25 @@ if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { saveAs: { filename -> filename.equals('versions.yml') || filename.endsWith('_meta_info.json') ? null : filename } ] } + } +} + +if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'kallisto') { + process { + withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:KALLISTO_QUANT' { + ext.args = params.extra_kallisto_quant_args ?: '' + publishDir = [ + path: { "${params.outdir}/${params.pseudo_aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') || filename.endsWith('.run_info.json') || filename.endsWith('.log') ? 
null : filename }
+            ]
+        }
+    }
+}

-        withName: '.*:QUANTIFY_SALMON:SALMON_TX2GENE' {
+if (!params.skip_pseudo_alignment) {
+    process {
+        withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TX2GENE' {
             publishDir = [
                 path: { "${params.outdir}/${params.pseudo_aligner}" },
                 mode: params.publish_dir_mode,
@@ -1094,7 +1121,8 @@
             ]
         }

-        withName: '.*:QUANTIFY_SALMON:SALMON_TXIMPORT' {
+        withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TXIMPORT' {
+            ext.prefix = { "${quant_type}.merged" }
             publishDir = [
                 path: { "${params.outdir}/${params.pseudo_aligner}" },
                 mode: params.publish_dir_mode,
@@ -1102,7 +1130,7 @@
             ]
         }

-        withName: '.*:QUANTIFY_SALMON:SALMON_SE_.*' {
+        withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SE_.*' {
             publishDir = [
                 path: { "${params.outdir}/${params.pseudo_aligner}" },
                 mode: params.publish_dir_mode,
@@ -1113,15 +1141,14 @@
     if (!params.skip_qc & !params.skip_deseq2_qc) {
         process {
-            withName: 'DESEQ2_QC_SALMON' {
+            withName: 'DESEQ2_QC_PSEUDO' {
                 ext.args = { [
                     "--id_col 1",
                     "--sample_suffix ''",
-                    "--outprefix deseq2",
                     "--count_col 3",
                     params.deseq2_vst ? '--vst TRUE' : ''
                 ].join(' ').trim() }
-                ext.args2 = 'salmon'
+                ext.args2 = { params.pseudo_aligner }
                 publishDir = [
                     path: { "${params.outdir}/${params.pseudo_aligner}/deseq2_qc" },
                     mode: params.publish_dir_mode,
diff --git a/docs/images/nf-core-rnaseq_metro_map_grey.png b/docs/images/nf-core-rnaseq_metro_map_grey.png
index 0a3645f81..f2b5e4e87 100644
Binary files a/docs/images/nf-core-rnaseq_metro_map_grey.png and b/docs/images/nf-core-rnaseq_metro_map_grey.png differ
diff --git a/docs/images/nf-core-rnaseq_metro_map_grey.svg b/docs/images/nf-core-rnaseq_metro_map_grey.svg
index 2e2a07374..d140e28dd 100644
--- a/docs/images/nf-core-rnaseq_metro_map_grey.svg
+++ b/docs/images/nf-core-rnaseq_metro_map_grey.svg
@@ -7,7 +7,7 @@
   viewBox="0 0 646.4851 269.92565"
   version="1.1"
   id="svg8"
-  inkscape:version="1.2.2 (1:1.2.2+202212051552+b0a8486541)"
+  inkscape:version="1.3 (0e150ed, 2023-07-21)"
   sodipodi:docname="nf-core-rnaseq_metro_map_grey.svg"
   inkscape:export-filename="nf-core-rnaseq_metro_map_grey.png"
   inkscape:export-xdpi="89"
@@ -655,17 +655,17 @@
     borderopacity="1.0"
     inkscape:pageopacity="0.0"
     inkscape:pageshadow="2"
-    inkscape:zoom="0.40305088"
-    inkscape:cx="916.75771"
-    inkscape:cy="282.8427"
+    inkscape:zoom="0.80610176"
+    inkscape:cx="2141.1689"
+    inkscape:cy="630.81366"
     inkscape:document-units="mm"
     inkscape:current-layer="layer1"
     showgrid="false"
-    inkscape:window-width="1920"
-    inkscape:window-height="1052"
-    inkscape:window-x="0"
-    inkscape:window-y="0"
-    inkscape:window-maximized="1"
+    inkscape:window-width="1448"
+    inkscape:window-height="794"
+    inkscape:window-x="40"
+    inkscape:window-y="48"
+    inkscape:window-maximized="0"
     inkscape:snap-others="true"
     inkscape:snap-nodes="true"
     showguides="false"
@@ -684,7 +684,11 @@
     type="xygrid"
     id="grid66457"
     originx="64.401789"
    originy="-15.824357" />
[Metro map SVG label text: Salmon; Kallisto; MultiQC; file badges FASTQ, FASTA, GTF, TSV, BAM, BAI, BIGWIG, HTML; routes "Pseudo-aligner: Salmon, Quantification: Salmon", "Aligner: HISAT2, Quantification: None", "Aligner: STAR, Quantification: Salmon (default)", "Pseudo-aligner: Kallisto, Quantification: Kallisto", "Aligner: STAR, Quantification: RSEM"; sections "1. Pre-processing", "4. Post-processing"; stops TrimGalore!, FastQC, BBSplit, fastp — remaining SVG markup changes not recoverable.]
diff --git a/docs/output.md b/docs/output.md
index af9721f00..a4c559b3b 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -41,8 +41,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [featureCounts](#featurecounts) - Read counting relative to gene biotype
 - [DESeq2](#deseq2) - PCA plot and sample pairwise distance heatmap and dendrogram
 - [MultiQC](#multiqc) - Present QC for raw reads, alignment, read counting and sample similiarity
-- [Pseudo-alignment and quantification](#pseudo-alignment-and-quantification)
-  - [Salmon](#salmon) - Wicked fast gene and isoform quantification relative to the transcriptome
+- [Pseudoalignment and quantification](#pseudoalignment-and-quantification)
+  - [Salmon](#pseudoalignment) - Wicked fast gene and isoform quantification relative to the transcriptome
+  - [Kallisto](#pseudoalignment) - Near-optimal probabilistic RNA-seq quantification
 - [Workflow reporting and genomes](#workflow-reporting-and-genomes)
   - [Reference genome files](#reference-genome-files) - Saving reference genome indices/files
   - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -180,7 +181,7 @@ When `--remove_ribo_rna` is specified, the pipeline uses [SortMeRNA](https://git
 ## Alignment and quantification

-### STAR and Salmon
+### STAR, Salmon and Kallisto
Output files @@ -199,12 +200,12 @@ When `--remove_ribo_rna` is specified, the pipeline uses [SortMeRNA](https://git [STAR](https://github.com/alexdobin/STAR) is a read aligner designed for splice aware mapping typical of RNA sequencing data. STAR stands for *S*pliced *T*ranscripts *A*lignment to a *R*eference, and has been shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. Using `--aligner star_salmon` is the default alignment and quantification option. -[Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) from [Ocean Genomics](https://oceangenomics.com/) is a tool for wicked-fast transcript quantification from RNA-seq data. It requires a set of target transcripts (either from a reference or de-novo assembly) in order to perform quantification. All you need to run Salmon is a FASTA file containing your reference transcripts and a set of FASTA/FASTQ/BAM file(s) containing your reads. The transcriptome-level BAM files generated by STAR are provided to Salmon for downstream quantification. You can of course also provide FASTQ files directly as input to Salmon in order to pseudo-align and quantify your data by providing the `--pseudo_aligner salmon` parameter. The results generated by the pipeline are exactly the same whether you provide BAM or FASTQ input so please see the [Salmon](#salmon) results section for more details. - The STAR section of the MultiQC report shows a bar plot with alignment rates: good samples should have most reads as _Uniquely mapped_ and few _Unmapped_ reads. ![MultiQC - STAR alignment scores plot](images/mqc_star.png) +[Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) from [Ocean Genomics](https://oceangenomics.com/) and [Kallisto](https://pachterlab.github.io/kallisto/), from the Pachter Lab, are provided as options for pseudoalignment. Both allow quantification of reads against an index generated from a reference set of target transcripts. By default, the transcriptome-level BAM files generated by STAR are provided to Salmon for downstream quantification, and Kallisto is not an option here (it does not allow BAM file input). But you can provide FASTQ files directly as input to either Salmon or Kallisto in order to pseudoalign and quantify your data by providing the `--pseudo_aligner salmon` or `--pseudo_aligner kallisto` parameter. See the [Salmon](#pseudoalignment) and [Kallisto](#pseudoalignment) results sections for more details. + ### STAR via RSEM
@@ -668,25 +669,34 @@ The plot on the left hand side shows the standard PC plot - notice the variable
 Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQC, Cutadapt, SortMeRNA, STAR, RSEM, HISAT2, Salmon, SAMtools, Picard, RSeQC, Qualimap, Preseq and featureCounts. Additionally, various custom content has been added to the report to assess the output of dupRadar, DESeq2 and featureCounts biotypes, and to highlight samples failing a mimimum mapping threshold or those that failed to match the strand-specificity provided in the input samplesheet. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.

-## Pseudo-alignment and quantification
+## Pseudoalignment and quantification

-### Salmon
+### Pseudoalignment
+
+The principal output files are the same between Salmon and Kallisto:
+
+Output files + +- `/` + - `.merged.gene_counts.tsv`: Matrix of gene-level raw counts across all samples. + - `.gene_tpm.tsv`: Matrix of gene-level TPM values across all samples. + - `.gene_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated counts (`counts`) and transcript length (`length`) in the assays slot for genes. + - `.merged.gene_counts_scaled.tsv`: Matrix of gene-level library size-scaled counts across all samples. + - `.merged.gene_counts_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated library size-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes. + - `.merged.gene_counts_length_scaled.tsv`: Matrix of gene-level length-scaled counts across all samples. + - `.merged.gene_counts_length_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated length-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes. + - `.merged.transcript_counts.tsv`: Matrix of isoform-level raw counts across all samples. + - `.merged.transcript_tpm.tsv`: Matrix of isoform-level TPM values across all samples. + - `.merged.transcript_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated isoform-level raw counts (`counts`) and transcript length (`length`) in the assays slot for transcripts. + - `tx2gene.tsv`: Tab-delimited file containing gene to transcripts ids mappings. +
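
For orientation (an aside, not part of the docs diff): these merged matrices are plain tab-separated text and can be inspected outside the pipeline. A sketch, assuming pandas and a Kallisto run whose output landed under `results/kallisto/`; with Salmon the prefix is `salmon.` and the folder may be `star_salmon/` when run alongside STAR:

```python
# Sketch only: peek at a merged gene-level counts matrix produced by tximport.
# Path and "kallisto" prefix are assumptions; adjust to your --outdir and pseudoaligner.
import pandas as pd

counts = pd.read_csv("results/kallisto/kallisto.merged.gene_counts.tsv", sep="\t")
print(counts.head())                 # typically gene_id, gene_name, then one column per sample
sample_columns = counts.columns[2:]  # assumes the first two columns are gene identifiers
print(counts[sample_columns].sum())  # per-sample total counts
```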
+ +An additional subset of files are distinct to each tool, for Salmon:
Output files -- `salmon/` - - `salmon.merged.gene_counts.tsv`: Matrix of gene-level raw counts across all samples. - - `salmon.merged.gene_tpm.tsv`: Matrix of gene-level TPM values across all samples. - - `salmon.merged.gene_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated counts (`counts`) and transcript length (`length`) in the assays slot for genes. - - `salmon.merged.gene_counts_scaled.tsv`: Matrix of gene-level library size-scaled counts across all samples. - - `salmon.merged.gene_counts_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated library size-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes. - - `salmon.merged.gene_counts_length_scaled.tsv`: Matrix of gene-level length-scaled counts across all samples. - - `salmon.merged.gene_counts_length_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated length-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes. - - `salmon.merged.transcript_counts.tsv`: Matrix of isoform-level raw counts across all samples. - - `salmon.merged.transcript_tpm.tsv`: Matrix of isoform-level TPM values across all samples. - - `salmon.merged.transcript_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated isoform-level raw counts (`counts`) and transcript length (`length`) in the assays slot for transcripts. - - `salmon_tx2gene.tsv`: Tab-delimited file containing gene to transcripts ids mappings. - `salmon//` - `aux_info/`: Auxiliary info e.g. versions and number of mapped reads. - `cmd_info.json`: Information about the Salmon quantification command, version and options. @@ -695,14 +705,25 @@ Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQ - `logs/`: Contains the file `salmon_quant.log` giving a record of Salmon's quantification. - `quant.genes.sf`: Salmon _gene_-level quantification of the sample, including feature length, effective length, TPM, and number of reads. - `quant.sf`: Salmon _transcript_-level quantification of the sample, including feature length, effective length, TPM, and number of reads. +
-
+... and Kallisto: + +
+Output files
+
+- `kallisto//`
+  - `abundance.h5`: an HDF5 binary file containing run info, abundance estimates, bootstrap estimates, and transcript length information. This file can be read in by [sleuth](https://pachterlab.github.io/sleuth/about).
+  - `abundance.tsv`: a plaintext file of the abundance estimates. It does not contain bootstrap estimates.
+  - `run_info.json`: a JSON file containing information about the run.
+  - `kallisto_quant.log`: standard output from the Kallisto process per sample.
+
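
A quick, illustrative way to peek at one sample's `abundance.tsv` (the path below is made up; the column names follow Kallisto's documented output):

```python
# Sketch only: read one sample's Kallisto abundance table.
# abundance.tsv columns: target_id, length, eff_length, est_counts, tpm.
import csv

with open("results/kallisto/SAMPLE_1/abundance.tsv") as handle:  # illustrative path
    rows = list(csv.DictReader(handle, delimiter="\t"))

total_est_counts = sum(float(row["est_counts"]) for row in rows)
print(f"{len(rows)} transcripts, {total_est_counts:.0f} estimated counts")
```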
-As described in the [STAR and Salmon](#star-and-salmon) section, you can choose to pseudo-align and quantify your data with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) by providing the `--pseudo_aligner salmon` parameter. By default, Salmon is run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon in isolation. If Salmon is run in isolation, the outputs mentioned above will be found in a folder named `salmon`. If Salmon is run alongside STAR, the folder will be named `star_salmon`. +As described in the [STAR and Salmon](#star-and-salmon) section, you can choose to pseudoalign and quantify your data with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by providing the `--pseudo_aligner` parameter. By default, Salmon is run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. If Salmon or Kallisto are run in isolation, the outputs mentioned above will be found in a folder named `salmon` or `kallisto`. If Salmon is run alongside STAR, the folder will be named `star_salmon`. Transcripts with large inferential uncertainty won't be assigned the exact number of reads reproducibly, every time Salmon is run. Read more about this on the [nf-core/rnaseq](https://github.com/nf-core/rnaseq/issues/585) and [salmon](https://github.com/COMBINE-lab/salmon/issues/613) Github repos. -The [tximport](https://bioconductor.org/packages/release/bioc/html/tximport.html) package is used in this pipeline to summarise the results generated by Salmon into matrices for use with downstream differential analysis packages. We use tximport with different options to summarize count and TPM quantifications at the gene- and transcript-level. Please see [#499](https://github.com/nf-core/rnaseq/issues/499) for discussion and links regarding which counts are suitable for different types of analysis. +The [tximport](https://bioconductor.org/packages/release/bioc/html/tximport.html) package is used in this pipeline to summarise the results generated by Salmon or Kallisto into matrices for use with downstream differential analysis packages. We use tximport with different options to summarize count and TPM quantifications at the gene- and transcript-level. Please see [#499](https://github.com/nf-core/rnaseq/issues/499) for discussion and links regarding which counts are suitable for different types of analysis. According to the `txtimport` documentation you can do one of the following: @@ -728,6 +749,7 @@ According to the `txtimport` documentation you can do one of the following: - `hisat2/`: Directory containing HISAT2 indices. - `rsem/`: Directory containing STAR and RSEM indices. - `salmon/`: Directory containing Salmon indices. + - `kallisto/`: Directory containing Kallisto indices.
diff --git a/docs/usage.md b/docs/usage.md index 9748aa5de..0f3050907 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -65,17 +65,19 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) (i.e. `--aligner star_salmon`) to map the raw FastQ reads to the reference genome, project the alignments onto the transcriptome and to perform the downstream BAM-level quantification with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html). STAR is fast but requires a lot of memory to run, typically around 38GB for the Human GRCh37 reference genome. Since the [RSEM](https://github.com/deweylab/RSEM) (i.e. `--aligner star_rsem`) workflow in the pipeline also uses STAR you should use the [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) aligner (i.e. `--aligner hisat2`) if you have memory limitations. -You also have the option to pseudo-align and quantify your data with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) by providing the `--pseudo_aligner salmon` parameter. Salmon will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. The library preparation protocol (library type) used by Salmon quantification is inferred by the pipeline based on the information provided in the samplesheet, however, you can override it using the `--salmon_quant_libtype` parameter. You can find the available options in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/library_type.html). +You also have the option to pseudoalign and quantify your data directly with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) or [Kallisto](https://pachterlab.github.io/kallisto/) by specifying `salmon` or `kallisto` to the `--pseudo_aligner` parameter. The selected pseudoaligner will then be run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon or Kallisto in isolation. By default, the pipeline will use the genome fasta and gtf file to generate the transcripts fasta file, and then to build the Salmon index. You can override these parameters using the `--transcript_fasta` and `--salmon_index` parameters, respectively. + +The library preparation protocol (library type) used by Salmon quantification is inferred by the pipeline based on the information provided in the samplesheet, however, you can override it using the `--salmon_quant_libtype` parameter. You can find the available options in the [Salmon documentation](https://salmon.readthedocs.io/en/latest/library_type.html). 
Similarly, strandedness is taken from the sample sheet or calculated automatically, and passed to Kallisto on a per-library basis, but you can apply a global override by setting the Kallisto strandedness parameters in `--extra_kallisto_quant_args` like `--extra_kallisto_quant_args '--fr-stranded'`; see the [Kallisto documentation](https://pachterlab.github.io/kallisto/manual).

 When running Salmon in mapping-based mode via `--pseudo_aligner salmon` the entire genome of the organism is used by default for the decoy-aware transcriptome when creating the indices (see second bulleted option in [Salmon documentation](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode)).

-Two additional parameters `--extra_star_align_args` and `--extra_salmon_quant_args` were added in v3.10 of the pipeline that allow you to append any custom parameters to the STAR align and Salmon quant commands, respectively. Note, the `--seqBias` and `--gcBias` are not provided to Salmon quant by default so you can provide these via `--extra_salmon_quant_args '--seqBias --gcBias'` if required.
+Two additional parameters `--extra_star_align_args` and `--extra_salmon_quant_args` were added in v3.10 of the pipeline that allow you to append any custom parameters to the STAR align and Salmon quant commands, respectively. Note, the `--seqBias` and `--gcBias` are not provided to Salmon quant by default so you can provide these via `--extra_salmon_quant_args '--seqBias --gcBias'` if required. You can now also supply additional arguments to Kallisto via `--extra_kallisto_quant_args`.

-> **NB:** You can use `--skip_alignment --skip_pseudo_alignment` if you only want to run the pre-processing QC steps in the pipeline like FastQ, trimming etc. This will skip alignment, pseudo-alignment and any post-alignment processing steps.
+> **NB:** You can use `--skip_alignment --skip_pseudo_alignment` if you only want to run the pre-processing QC steps in the pipeline like FastQ, trimming etc. This will skip alignment, pseudoalignment and any post-alignment processing steps.

 ## Quantification options

-The current options align with STAR and quantify using either Salmon (`--aligner star_salmon`) / RSEM (`--aligner star_rsem`). You also have the option to pseudo-align and quantify your data with Salmon by providing the `--pseudo_aligner salmon` parameter.
+The current options align with STAR and quantify using either Salmon (`--aligner star_salmon`) / RSEM (`--aligner star_rsem`). You also have the option to pseudoalign and quantify your data with Salmon or Kallisto by providing the `--pseudo_aligner salmon` or `--pseudo_aligner kallisto` parameter, respectively.

 Since v3.0 of the pipeline, featureCounts is no longer used to perform gene/transcript quantification, however it is still used to generate QC metrics based on [biotype](http://www.ensembl.org/info/genome/genebuild/biotypes.html) information available within GFF/GTF genome annotation files. This decision was made primarily because of the limitations of featureCounts to appropriately quantify gene expression data. Please see [Zhao et al., 2015](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0141910#pone-0141910-t001) and [Soneson et al., 2015](https://f1000research.com/articles/4-1521/v1).
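
For context on the per-library strandedness handling described above: the Kallisto quant module added later in this diff (`modules/nf-core/kallisto/quant/main.nf`) only injects a strand flag when none was supplied globally. Re-expressed in Python purely as an illustration, not pipeline code:

```python
# Illustration of how the per-library Kallisto strandedness flag is chosen:
# an explicit --fr-stranded/--rf-stranded in --extra_kallisto_quant_args wins,
# otherwise the samplesheet / auto-detected strandedness decides.
def kallisto_strand_flag(strandedness: str, extra_args: str = "") -> str:
    if "--fr-stranded" in extra_args or "--rf-stranded" in extra_args:
        return ""  # global user override supplied, add nothing per library
    return {"forward": "--fr-stranded", "reverse": "--rf-stranded"}.get(strandedness, "")

assert kallisto_strand_flag("forward") == "--fr-stranded"
assert kallisto_strand_flag("reverse") == "--rf-stranded"
assert kallisto_strand_flag("unstranded") == ""
assert kallisto_strand_flag("forward", extra_args="--rf-stranded") == ""
```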
diff --git a/main.nf b/main.nf index c47980c73..69ae6c3b3 100755 --- a/main.nf +++ b/main.nf @@ -28,6 +28,7 @@ params.star_index = WorkflowMain.getGenomeAttribute(params, 'star') params.hisat2_index = WorkflowMain.getGenomeAttribute(params, 'hisat2') params.rsem_index = WorkflowMain.getGenomeAttribute(params, 'rsem') params.salmon_index = WorkflowMain.getGenomeAttribute(params, 'salmon') +params.kallisto_index = WorkflowMain.getGenomeAttribute(params, 'kallisto') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules.json b/modules.json index 3ae59acc3..d06dfe193 100644 --- a/modules.json +++ b/modules.json @@ -65,6 +65,16 @@ "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", "installed_by": ["modules"] }, + "kallisto/index": { + "branch": "master", + "git_sha": "bdbbd9a849f9dd2b6078b303d36bf26e6adb8a9e", + "installed_by": ["modules"] + }, + "kallisto/quant": { + "branch": "kallisto_updates", + "git_sha": "bc4719dcd079fcdb650ddeac05739c2f7dd58c84", + "installed_by": ["modules"] + }, "picard/markduplicates": { "branch": "master", "git_sha": "2ee934606f1fdf7fc1cb05d6e8abc13bec8ab448", diff --git a/modules/local/deseq2_qc/main.nf b/modules/local/deseq2_qc/main.nf index 7b3fac57b..466194934 100644 --- a/modules/local/deseq2_qc/main.nf +++ b/modules/local/deseq2_qc/main.nf @@ -32,11 +32,13 @@ process DESEQ2_QC { def args2 = task.ext.args2 ?: '' def label_lower = args2.toLowerCase() def label_upper = args2.toUpperCase() + prefix = task.ext.prefix ?: "deseq2" """ deseq2_qc.r \\ --count_file $counts \\ --outdir ./ \\ --cores $task.cpus \\ + --outprefix $prefix \\ $args if [ -f "R_sessionInfo.log" ]; then diff --git a/modules/local/multiqc/main.nf b/modules/local/multiqc/main.nf index 44565a9f7..7ecc28f87 100644 --- a/modules/local/multiqc/main.nf +++ b/modules/local/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda "bioconda::multiqc=1.15" + conda "bioconda::multiqc=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : - 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path multiqc_config @@ -23,7 +23,7 @@ process MULTIQC { path ('star/*') path ('hisat2/*') path ('rsem/*') - path ('salmon/*') + path ('pseudoalignment/*') path ('samtools/stats/*') path ('samtools/flagstat/*') path ('samtools/idxstats/*') @@ -57,8 +57,10 @@ process MULTIQC { script: def args = task.ext.args ?: '' def custom_config = params.multiqc_config ? 
"--config $multiqc_custom_config" : '' + prefix = task.ext.prefix ?: "multiqc_report" """ multiqc \\ + -n ${prefix}.html \\ -f \\ $args \\ $custom_config \\ diff --git a/modules/local/salmon_summarizedexperiment/main.nf b/modules/local/summarizedexperiment/main.nf similarity index 91% rename from modules/local/salmon_summarizedexperiment/main.nf rename to modules/local/summarizedexperiment/main.nf index 2278ea14e..f278e2301 100644 --- a/modules/local/salmon_summarizedexperiment/main.nf +++ b/modules/local/summarizedexperiment/main.nf @@ -1,4 +1,4 @@ -process SALMON_SUMMARIZEDEXPERIMENT { +process SUMMARIZEDEXPERIMENT { tag "$tx2gene" label "process_medium" @@ -21,10 +21,11 @@ process SALMON_SUMMARIZEDEXPERIMENT { script: // This script is bundled with the pipeline, in nf-core/rnaseq/bin/ """ - salmon_summarizedexperiment.r \\ + summarizedexperiment.r \\ NULL \\ $counts \\ - $tpm + $tpm \\ + $tx2gene cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/salmon_tx2gene/main.nf b/modules/local/tx2gene/main.nf similarity index 83% rename from modules/local/salmon_tx2gene/main.nf rename to modules/local/tx2gene/main.nf index b6e9df662..150e74afb 100644 --- a/modules/local/salmon_tx2gene/main.nf +++ b/modules/local/tx2gene/main.nf @@ -1,4 +1,4 @@ -process SALMON_TX2GENE { +process TX2GENE { tag "$gtf" label "process_low" @@ -8,7 +8,8 @@ process SALMON_TX2GENE { 'biocontainers/python:3.9--1' }" input: - path ("salmon/*") + path ("quants/*") + val quant_type path gtf output: @@ -20,12 +21,13 @@ process SALMON_TX2GENE { script: // This script is bundled with the pipeline, in nf-core/rnaseq/bin/ """ - salmon_tx2gene.py \\ + tx2gene.py \\ + --quant_type $quant_type \\ --gtf $gtf \\ - --salmon salmon \\ + --quants quants \\ --id $params.gtf_group_features \\ --extra $params.gtf_extra_attributes \\ - -o salmon_tx2gene.tsv + -o tx2gene.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/salmon_tximport/main.nf b/modules/local/tximport/main.nf similarity index 86% rename from modules/local/salmon_tximport/main.nf rename to modules/local/tximport/main.nf index 181c897cd..56d826a7d 100644 --- a/modules/local/salmon_tximport/main.nf +++ b/modules/local/tximport/main.nf @@ -1,4 +1,4 @@ -process SALMON_TXIMPORT { +process TXIMPORT { label "process_medium" conda "bioconda::bioconductor-tximeta=1.12.0" @@ -7,8 +7,9 @@ process SALMON_TXIMPORT { 'biocontainers/bioconductor-tximeta:1.12.0--r41hdfd78af_0' }" input: - path ("salmon/*") + path ("quants/*") path tx2gene + val quant_type output: path "*gene_tpm.tsv" , emit: tpm_gene @@ -23,11 +24,14 @@ process SALMON_TXIMPORT { task.ext.when == null || task.ext.when script: // This script is bundled with the pipeline, in nf-core/rnaseq/bin/ + prefix = task.ext.prefix ?: "${quant_type}.merged" """ - salmon_tximport.r \\ + tximport.r \\ NULL \\ - salmon \\ - salmon.merged + quants \\ + $prefix \\ + $quant_type \\ + $tx2gene cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/kallisto/index/environment.yml b/modules/nf-core/kallisto/index/environment.yml new file mode 100644 index 000000000..d62586580 --- /dev/null +++ b/modules/nf-core/kallisto/index/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kallisto=0.48.0 diff --git a/modules/nf-core/kallisto/index/main.nf b/modules/nf-core/kallisto/index/main.nf new file mode 100644 index 000000000..28a47dbeb --- /dev/null +++ b/modules/nf-core/kallisto/index/main.nf @@ -0,0 
+1,44 @@ +process KALLISTO_INDEX { + tag "$fasta" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kallisto:0.48.0--h15996b6_2': + 'biocontainers/kallisto:0.48.0--h15996b6_2' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("kallisto") , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + kallisto \\ + index \\ + $args \\ + -i kallisto \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//') + END_VERSIONS + """ + + stub: + """ + touch kallisto + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto 2>&1) | sed 's/^kallisto //; s/Usage.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/kallisto/index/meta.yml b/modules/nf-core/kallisto/index/meta.yml new file mode 100644 index 000000000..d366aeb45 --- /dev/null +++ b/modules/nf-core/kallisto/index/meta.yml @@ -0,0 +1,41 @@ +name: kallisto_index +description: Create kallisto index +keywords: + - kallisto + - kallisto/index + - index +tools: + - kallisto: + description: Quantifying abundances of transcripts from bulk and single-cell RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. + homepage: https://pachterlab.github.io/kallisto/ + documentation: https://pachterlab.github.io/kallisto/manual + tool_dev_url: https://github.com/pachterlab/kallisto + licence: ["BSD-2-Clause"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: genome fasta file + pattern: "*.{fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - index: + type: directory + description: Kallisto genome index + pattern: "*.idx" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ggabernet" +maintainers: + - "@ggabernet" diff --git a/modules/nf-core/kallisto/index/tests/main.nf.test b/modules/nf-core/kallisto/index/tests/main.nf.test new file mode 100644 index 000000000..97933d697 --- /dev/null +++ b/modules/nf-core/kallisto/index/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process KALLISTO_INDEX" + script "../main.nf" + process "KALLISTO_INDEX" + tag "modules" + tag "modules_nfcore" + tag "kallisto" + tag "kallisto/index" + + test("homo_sapiens genome_fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test_fasta' ], // meta map + [ file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/kallisto/index/tests/main.nf.test.snap b/modules/nf-core/kallisto/index/tests/main.nf.test.snap new file mode 100644 index 000000000..c0f45ac45 --- /dev/null +++ b/modules/nf-core/kallisto/index/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "homo_sapiens genome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test_fasta" + }, + "kallisto:md5,2dab84e1456201beca5a43f4c514d67c" + ] + ], + "1": [ + "versions.yml:md5,178f9b57d4228edc356911d571b958a4" + ], + "index": [ + [ + { + "id": "test_fasta" + }, + "kallisto:md5,2dab84e1456201beca5a43f4c514d67c" + ] + ], + "versions": [ + "versions.yml:md5,178f9b57d4228edc356911d571b958a4" + ] + } + ], + "timestamp": "2023-11-02T09:58:48.83625986" + } +} \ No newline at end of file diff --git a/modules/nf-core/kallisto/index/tests/tags.yml b/modules/nf-core/kallisto/index/tests/tags.yml new file mode 100644 index 000000000..9f47b88a1 --- /dev/null +++ b/modules/nf-core/kallisto/index/tests/tags.yml @@ -0,0 +1,2 @@ +kallisto/index: + - modules/nf-core/kallisto/index/** diff --git a/modules/nf-core/kallisto/quant/environment.yml b/modules/nf-core/kallisto/quant/environment.yml new file mode 100644 index 000000000..c2d6306bb --- /dev/null +++ b/modules/nf-core/kallisto/quant/environment.yml @@ -0,0 +1,7 @@ +name: kallisto_quant +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kallisto=0.48.0 diff --git a/modules/nf-core/kallisto/quant/main.nf b/modules/nf-core/kallisto/quant/main.nf new file mode 100644 index 000000000..2ad0bb78b --- /dev/null +++ b/modules/nf-core/kallisto/quant/main.nf @@ -0,0 +1,78 @@ +process KALLISTO_QUANT { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kallisto:0.48.0--h15996b6_2': + 'biocontainers/kallisto:0.48.0--h15996b6_2' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + path gtf + path chromosomes + val fragment_length + val fragment_length_sd + + output: + tuple val(meta), path("${prefix}") , emit: results + tuple val(meta), path("*.run_info.json") , emit: json_info + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def gtf_input = gtf ? 
"--gtf ${gtf}" : '' + def chromosomes_input = chromosomes ? "--chromosomes ${chromosomes}" : '' + + def single_end_params = '' + if (meta.single_end) { + if (!(fragment_length =~ /^\d+$/)) { + error "fragment_length must be set and numeric for single-end data" + } + if (!(fragment_length_sd =~ /^\d+$/)) { + error "fragment_length_sd must be set and numeric for single-end data" + } + single_end_params = "--single --fragment-length=${fragment_length} --sd=${fragment_length_sd}" + } + + def strandedness = '' + if (!args.contains('--fr-stranded') && !args.contains('--rf-stranded')) { + strandedness = (meta.strandedness == 'forward') ? '--fr-stranded' : + (meta.strandedness == 'reverse') ? '--rf-stranded' : '' + } + + """ + mkdir -p $prefix && kallisto quant \\ + --threads ${task.cpus} \\ + --index ${index} \\ + ${gtf_input} \\ + ${chromosomes_input} \\ + ${single_end_params} \\ + ${strandedness} \\ + ${args} \\ + -o $prefix \\ + ${reads} 2> >(tee -a ${prefix}/kallisto_quant.log >&2) + + cp ${prefix}/kallisto_quant.log ${prefix}.log + cp ${prefix}/run_info.json ${prefix}.run_info.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto version) | sed "s/kallisto, version //g" ) + END_VERSIONS + """ + + stub: + """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallisto: \$(echo \$(kallisto version) | sed "s/kallisto, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kallisto/quant/meta.yml b/modules/nf-core/kallisto/quant/meta.yml new file mode 100644 index 000000000..d5100290f --- /dev/null +++ b/modules/nf-core/kallisto/quant/meta.yml @@ -0,0 +1,77 @@ +name: "kallisto_quant" +description: Computes equivalence classes for reads and quantifies abundances +keywords: + - quant + - kallisto + - pseudoalignment +tools: + - "kallisto": + description: "Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads." + homepage: https://pachterlab.github.io/kallisto/ + documentation: https://pachterlab.github.io/kallisto/manual + tool_dev_url: https://github.com/pachterlab/kallisto + doi: "10.1038/nbt.3519" + licence: ["BSD_2_clause"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fastq,fastq.gz}" + - index: + type: file + description: Kallisto genome index. + pattern: "*.idx" + - gtf: + type: file + description: Optional gtf file for translation of transcripts into genomic coordinates. + pattern: "*.gtf" + - chromosomes: + type: file + description: Optional tab separated file with chromosome names and lengths. + pattern: "*.tsv" + - fragment_length: + type: integer + description: For single-end mode only, the estimated average fragment length. + - fragment_length_sd: + type: integer + description: For single-end mode only, the estimated standard deviation of the fragment length. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: File containing log information from running kallisto quant + pattern: "*.log.txt" + - abundance: + type: file + description: Plaintext file of the abundance estimates + pattern: "abundance.tsv" + - abundance_hdf5: + type: file + description: | + A HDF5 binary file containing run info, abundance estimates, bootstrap + estimates, and transcript length information + pattern: "abundance.h5" + - run_info: + type: file + description: A json file containing information about the run + pattern: "run_info.json" +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/nf-core/kallisto/quant/tests/main.nf.test b/modules/nf-core/kallisto/quant/tests/main.nf.test new file mode 100644 index 000000000..f716e5e6d --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/main.nf.test @@ -0,0 +1,92 @@ +nextflow_process { + + name "Test Process KALLISTO_QUANT" + script "../main.nf" + process "KALLISTO_QUANT" + tag "modules" + tag "modules_nfcore" + tag "kallisto" + tag "kallisto/quant" + + setup { + run("KALLISTO_INDEX") { + script "../../index/main.nf" + process { + """ + input[0] = [ + [ id:'test_fasta' ], // meta map + [ file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true) ] + ] + """ + } + } + } + + test("sarscov2 single-end") { + config "./single_end.config" + + when { + params{ + outdir = "$outputDir" + } + + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = KALLISTO_INDEX.out.index + input[2] = [] + input[3] = [] + input[4] = 150 + input[5] = 75 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(path("$outputDir/kallisto/test/abundance.tsv")).match("abundance_tsv_single") }, + { assert snapshot(process.out.log).match("se_log") }, + { assert snapshot(process.out.versions).match("se_versions") } + ) + } + } + + test("sarscov2 paired-end") { + config "./paired_end.config" + + when { + params{ + outdir = "$outputDir" + } + + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = KALLISTO_INDEX.out.index + input[2] = [] + input[3] = [] + input[4] = [] + input[5] = [] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(path("$outputDir/kallisto/test/abundance.tsv")).match("abundance_tsv_paired") }, + { assert snapshot(process.out.log).match("pe_log") }, + { assert snapshot(process.out.versions).match("pe_versions") } + ) + } + } +} diff --git a/modules/nf-core/kallisto/quant/tests/main.nf.test.snap b/modules/nf-core/kallisto/quant/tests/main.nf.test.snap new file mode 100644 index 000000000..d59d7f06f --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/main.nf.test.snap @@ -0,0 +1,58 @@ +{ + "pe_log": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.log:md5,8a5987f8e779cd12ca708e2212f771f5" + ] + ] + ], + "timestamp": "2023-11-02T09:16:05.163403975" + }, + "se_versions": { + "content": [ + [ + "versions.yml:md5,f981ad0cc089194a8fb00a47948eea94" + ] + ], + "timestamp": "2023-10-30T22:11:17.982220232" + }, + "abundance_tsv_paired": { + 
"content": [ + "abundance.tsv:md5,f0a9a2543f8fc0c8442be0a939d70f66" + ], + "timestamp": "2023-11-02T09:16:05.157883165" + }, + "se_log": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.log:md5,9c166f0c50cd4fdbdbf1bff9d5d8aba2" + ] + ] + ], + "timestamp": "2023-10-30T22:11:17.973230763" + }, + "abundance_tsv_single": { + "content": [ + "abundance.tsv:md5,8a4afe91e6a75b4e619daaf664eb7d9b" + ], + "timestamp": "2023-11-02T09:01:51.48615229" + }, + "pe_versions": { + "content": [ + [ + "versions.yml:md5,f981ad0cc089194a8fb00a47948eea94" + ] + ], + "timestamp": "2023-11-02T09:16:05.168753684" + } +} diff --git a/modules/nf-core/kallisto/quant/tests/paired_end.config b/modules/nf-core/kallisto/quant/tests/paired_end.config new file mode 100644 index 000000000..8730f1c4b --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/paired_end.config @@ -0,0 +1,5 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + +} diff --git a/modules/nf-core/kallisto/quant/tests/single_end.config b/modules/nf-core/kallisto/quant/tests/single_end.config new file mode 100644 index 000000000..7022246bc --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/single_end.config @@ -0,0 +1,5 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + +} diff --git a/modules/nf-core/kallisto/quant/tests/tags.yml b/modules/nf-core/kallisto/quant/tests/tags.yml new file mode 100644 index 000000000..460936583 --- /dev/null +++ b/modules/nf-core/kallisto/quant/tests/tags.yml @@ -0,0 +1,3 @@ +kallisto/quant: + - modules/nf-core/kallisto/index/** + - modules/nf-core/kallisto/quant/** diff --git a/nextflow.config b/nextflow.config index f786d1daa..2c8b29691 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,6 +66,9 @@ params { min_mapped_reads = 5 extra_star_align_args = null extra_salmon_quant_args = null + extra_kallisto_quant_args = null + kallisto_quant_fraglen = 200 + kallisto_quant_fraglen_sd = 200 save_merged_fastq = false save_unaligned = false save_align_intermeds = false diff --git a/nextflow_schema.json b/nextflow_schema.json index a22baecef..ddbe77de4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -149,6 +149,13 @@ "fa_icon": "fas fa-bezier-curve", "description": "Path to directory or tar.gz archive for pre-built Salmon index." }, + "kallisto_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built Kallisto index." + }, "hisat2_build_memory": { "type": "string", "default": "200.GB", @@ -352,7 +359,7 @@ "type": "string", "description": "Specifies the pseudo aligner to use - available options are 'salmon'. Runs in addition to '--aligner'.", "fa_icon": "fas fa-hamburger", - "enum": ["salmon"] + "enum": ["salmon", "kallisto"] }, "bam_csi_index": { "type": "boolean", @@ -368,11 +375,29 @@ "type": "string", "fa_icon": "fas fa-fast-forward", "description": " Override Salmon library type inferred based on strandedness defined in meta object.", - "help_text": "See [Salmon docs](https://salmon.readthedocs.io/en/latest/library_type.html)." 
+                    "help_text": "See [Salmon docs](https://salmon.readthedocs.io/en/latest/library_type.html).",
+                    "enum": [
+                        "A",
+                        "IS",
+                        "ISF",
+                        "ISR",
+                        "IU",
+                        "MS",
+                        "MSF",
+                        "MSR",
+                        "MU",
+                        "OS",
+                        "OSF",
+                        "OSR",
+                        "OU",
+                        "SF",
+                        "SR",
+                        "U"
+                    ]
                 },
                 "min_mapped_reads": {
                     "type": "number",
-                    "default": 5,
+                    "default": 5.0,
                     "fa_icon": "fas fa-percentage",
                     "description": "Minimum percentage of uniquely mapped reads below which samples are removed from further processing.",
                     "help_text": "Some downstream steps in the pipeline will fail if this threshold is too low."
@@ -396,6 +421,23 @@
                     "type": "string",
                     "description": "Extra arguments to pass to Salmon quant command in addition to defaults defined by the pipeline.",
                     "fa_icon": "fas fa-plus"
+                },
+                "extra_kallisto_quant_args": {
+                    "type": "string",
+                    "description": "Extra arguments to pass to Kallisto quant command in addition to defaults defined by the pipeline.",
+                    "fa_icon": "fas fa-plus"
+                },
+                "kallisto_quant_fraglen": {
+                    "type": "integer",
+                    "description": "In single-end mode, Kallisto requires an estimated fragment length. Specify a default value for that here. TODO: use existing RSeQC results to do this dynamically.",
+                    "default": 200,
+                    "fa_icon": "fas fa-ruler-horizontal"
+                },
+                "kallisto_quant_fraglen_sd": {
+                    "type": "integer",
+                    "description": "In single-end mode, Kallisto requires an estimated standard deviation of the fragment length. Specify a default value for that here. TODO: use existing RSeQC results to do this dynamically.",
+                    "default": 200,
+                    "fa_icon": "fas fa-sort-amount-up-alt"
+                }
             }
         },
@@ -503,7 +545,7 @@
                 "skip_pseudo_alignment": {
                     "type": "boolean",
                     "fa_icon": "fas fa-fast-forward",
-                    "description": "Skip all of the pseudo-alignment-based processes within the pipeline."
+                    "description": "Skip all of the pseudoalignment-based processes within the pipeline."
}, "skip_markduplicates": { "type": "boolean", diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index eceb2c1ce..ca598a199 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -14,6 +14,7 @@ include { UNTAR as UNTAR_STAR_INDEX } from '../../../modules/nf-core/unt include { UNTAR as UNTAR_RSEM_INDEX } from '../../../modules/nf-core/untar' include { UNTAR as UNTAR_HISAT2_INDEX } from '../../../modules/nf-core/untar' include { UNTAR as UNTAR_SALMON_INDEX } from '../../../modules/nf-core/untar' +include { UNTAR as UNTAR_KALLISTO_INDEX } from '../../../modules/nf-core/untar' include { CUSTOM_GETCHROMSIZES } from '../../../modules/nf-core/custom/getchromsizes' include { GFFREAD } from '../../../modules/nf-core/gffread' @@ -22,6 +23,7 @@ include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/sta include { HISAT2_EXTRACTSPLICESITES } from '../../../modules/nf-core/hisat2/extractsplicesites' include { HISAT2_BUILD } from '../../../modules/nf-core/hisat2/build' include { SALMON_INDEX } from '../../../modules/nf-core/salmon/index' +include { KALLISTO_INDEX } from '../../../modules/nf-core/kallisto/index' include { RSEM_PREPAREREFERENCE as RSEM_PREPAREREFERENCE_GENOME } from '../../../modules/nf-core/rsem/preparereference' include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from '../../../modules/nf-core/rsem/preparereference' @@ -44,6 +46,7 @@ workflow PREPARE_GENOME { star_index // directory: /path/to/star/index/ rsem_index // directory: /path/to/rsem/index/ salmon_index // directory: /path/to/salmon/index/ + kallisto_index // directory: /path/to/kallisto/index/ hisat2_index // directory: /path/to/hisat2/index/ bbsplit_index // directory: /path/to/rsem/index/ gencode // boolean: whether the genome is from GENCODE @@ -257,6 +260,24 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(SALMON_INDEX.out.versions) } } + + // + // Uncompress Kallisto index or generate from scratch if required + // + ch_kallisto_index = Channel.empty() + if (kallisto_index) { + if (kallisto_index.endsWith('.tar.gz')) { + ch_kallisto_index = UNTAR_KALLISTO_INDEX ( [ [:], kallisto_index ] ).untar + ch_versions = ch_versions.mix(UNTAR_KALLISTO_INDEX.out.versions) + } else { + ch_kallisto_index = Channel.value([[:], file(kallisto_index)]) + } + } else { + if ('kallisto' in prepare_tool_indices) { + ch_kallisto_index = KALLISTO_INDEX ( ch_transcript_fasta.map{[ [:], it]} ).index + ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions) + } + } emit: fasta = ch_fasta // channel: path(genome.fasta) @@ -271,6 +292,7 @@ workflow PREPARE_GENOME { rsem_index = ch_rsem_index // channel: path(rsem/index/) hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) salmon_index = ch_salmon_index // channel: path(salmon/index/) + kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ] versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } diff --git a/subworkflows/local/quantify_pseudo/main.nf b/subworkflows/local/quantify_pseudo/main.nf new file mode 100644 index 000000000..886b45e7b --- /dev/null +++ b/subworkflows/local/quantify_pseudo/main.nf @@ -0,0 +1,98 @@ +// +// Pseudoalignment and quantification with Salmon or Kallisto +// + +include { SALMON_QUANT } from '../../../modules/nf-core/salmon/quant' +include { KALLISTO_QUANT } from '../../../modules/nf-core/kallisto/quant' +include { TX2GENE } from '../../../modules/local/tx2gene' +include { TXIMPORT } from 
'../../../modules/local/tximport'
+
+include { SUMMARIZEDEXPERIMENT as SE_GENE               } from '../../../modules/local/summarizedexperiment'
+include { SUMMARIZEDEXPERIMENT as SE_GENE_LENGTH_SCALED } from '../../../modules/local/summarizedexperiment'
+include { SUMMARIZEDEXPERIMENT as SE_GENE_SCALED        } from '../../../modules/local/summarizedexperiment'
+include { SUMMARIZEDEXPERIMENT as SE_TRANSCRIPT         } from '../../../modules/local/summarizedexperiment'
+
+workflow QUANTIFY_PSEUDO_ALIGNMENT {
+    take:
+    reads                     // channel: [ val(meta), [ reads ] ]
+    index                     // channel: /path/to/<pseudo_aligner>/index/
+    transcript_fasta          // channel: /path/to/transcript.fasta
+    gtf                       // channel: /path/to/genome.gtf
+    pseudo_aligner            //     val: kallisto or salmon
+    alignment_mode            //    bool: Run Salmon in alignment mode
+    lib_type                  //     val: String to override Salmon library type
+    kallisto_quant_fraglen    //     val: Estimated fragment length required by Kallisto in single-end mode
+    kallisto_quant_fraglen_sd //     val: Estimated standard deviation of the fragment length required by Kallisto in single-end mode
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    //
+    // Quantify and merge counts across samples
+    //
+    // NOTE: MultiQC expects the full results directory from Salmon, but only the run log from Kallisto
+    if (pseudo_aligner == 'salmon') {
+        SALMON_QUANT ( reads, index, gtf, transcript_fasta, alignment_mode, lib_type )
+        ch_pseudo_results = SALMON_QUANT.out.results
+        ch_pseudo_multiqc = ch_pseudo_results
+        ch_versions = ch_versions.mix(SALMON_QUANT.out.versions.first())
+    } else {
+        KALLISTO_QUANT ( reads, index, gtf, [], kallisto_quant_fraglen, kallisto_quant_fraglen_sd )
+        ch_pseudo_results = KALLISTO_QUANT.out.results
+        ch_pseudo_multiqc = KALLISTO_QUANT.out.log
+        ch_versions = ch_versions.mix(KALLISTO_QUANT.out.versions.first())
+    }
+
+    TX2GENE ( ch_pseudo_results.collect{it[1]}, pseudo_aligner, gtf )
+    ch_versions = ch_versions.mix(TX2GENE.out.versions)
+
+    TXIMPORT ( ch_pseudo_results.collect{it[1]}, TX2GENE.out.tsv.collect(), pseudo_aligner )
+    ch_versions = ch_versions.mix(TXIMPORT.out.versions)
+
+    SE_GENE (
+        TXIMPORT.out.counts_gene,
+        TXIMPORT.out.tpm_gene,
+        TX2GENE.out.tsv.collect()
+    )
+    ch_versions = ch_versions.mix(SE_GENE.out.versions)
+
+    SE_GENE_LENGTH_SCALED (
+        TXIMPORT.out.counts_gene_length_scaled,
+        TXIMPORT.out.tpm_gene,
+        TX2GENE.out.tsv.collect()
+    )
+
+    SE_GENE_SCALED (
+        TXIMPORT.out.counts_gene_scaled,
+        TXIMPORT.out.tpm_gene,
+        TX2GENE.out.tsv.collect()
+    )
+
+    SE_TRANSCRIPT (
+        TXIMPORT.out.counts_transcript,
+        TXIMPORT.out.tpm_transcript,
+        TX2GENE.out.tsv.collect()
+    )
+
+    emit:
+    results = ch_pseudo_results // channel: [ val(meta), results_dir ]
+    multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ]
+
+    tpm_gene                  = TXIMPORT.out.tpm_gene                  // channel: [ val(meta), counts ]
+    counts_gene               = TXIMPORT.out.counts_gene               // channel: [ val(meta), counts ]
+    counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ]
+    counts_gene_scaled        = TXIMPORT.out.counts_gene_scaled        // channel: [ val(meta), counts ]
+    tpm_transcript            = TXIMPORT.out.tpm_transcript            // channel: [ val(meta), counts ]
+    counts_transcript         = TXIMPORT.out.counts_transcript         // channel: [ val(meta), counts ]
+
+    merged_gene_rds               = SE_GENE.out.rds               // path: *.rds
+    merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds
+    merged_gene_rds_scaled        = SE_GENE_SCALED.out.rds        // path: *.rds
+
+    merged_counts_transcript = TXIMPORT.out.counts_transcript // path: *.transcript_counts.tsv
+    merged_tpm_transcript    = TXIMPORT.out.tpm_transcript    // path: 
*.transcript_tpm.tsv + merged_transcript_rds = SE_TRANSCRIPT.out.rds // path: *.rds + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/quantify_salmon/main.nf b/subworkflows/local/quantify_salmon/main.nf deleted file mode 100644 index 1b936d6fe..000000000 --- a/subworkflows/local/quantify_salmon/main.nf +++ /dev/null @@ -1,83 +0,0 @@ -// -// Pseudo-alignment and quantification with Salmon -// - -include { SALMON_QUANT } from '../../../modules/nf-core/salmon/quant' -include { SALMON_TX2GENE } from '../../../modules/local/salmon_tx2gene' -include { SALMON_TXIMPORT } from '../../../modules/local/salmon_tximport' - -include { SALMON_SUMMARIZEDEXPERIMENT as SALMON_SE_GENE } from '../../../modules/local/salmon_summarizedexperiment' -include { SALMON_SUMMARIZEDEXPERIMENT as SALMON_SE_GENE_LENGTH_SCALED } from '../../../modules/local/salmon_summarizedexperiment' -include { SALMON_SUMMARIZEDEXPERIMENT as SALMON_SE_GENE_SCALED } from '../../../modules/local/salmon_summarizedexperiment' -include { SALMON_SUMMARIZEDEXPERIMENT as SALMON_SE_TRANSCRIPT } from '../../../modules/local/salmon_summarizedexperiment' - -workflow QUANTIFY_SALMON { - take: - reads // channel: [ val(meta), [ reads ] ] - index // channel: /path/to/salmon/index/ - transcript_fasta // channel: /path/to/transcript.fasta - gtf // channel: /path/to/genome.gtf - alignment_mode // bool: Run Salmon in alignment mode - lib_type // val: String to override salmon library type - - main: - - ch_versions = Channel.empty() - - // - // Quantify and merge counts across samples - // - SALMON_QUANT ( reads, index, gtf, transcript_fasta, alignment_mode, lib_type ) - ch_versions = ch_versions.mix(SALMON_QUANT.out.versions.first()) - - SALMON_TX2GENE ( SALMON_QUANT.out.results.collect{it[1]}, gtf ) - ch_versions = ch_versions.mix(SALMON_TX2GENE.out.versions) - - SALMON_TXIMPORT ( SALMON_QUANT.out.results.collect{it[1]}, SALMON_TX2GENE.out.tsv.collect() ) - ch_versions = ch_versions.mix(SALMON_TXIMPORT.out.versions) - - SALMON_SE_GENE ( - SALMON_TXIMPORT.out.counts_gene, - SALMON_TXIMPORT.out.tpm_gene, - SALMON_TX2GENE.out.tsv.collect() - ) - ch_versions = ch_versions.mix(SALMON_SE_GENE.out.versions) - - SALMON_SE_GENE_LENGTH_SCALED ( - SALMON_TXIMPORT.out.counts_gene_length_scaled, - SALMON_TXIMPORT.out.tpm_gene, - SALMON_TX2GENE.out.tsv.collect() - ) - - SALMON_SE_GENE_SCALED ( - SALMON_TXIMPORT.out.counts_gene_scaled, - SALMON_TXIMPORT.out.tpm_gene, - SALMON_TX2GENE.out.tsv.collect() - ) - - SALMON_SE_TRANSCRIPT ( - SALMON_TXIMPORT.out.counts_transcript, - SALMON_TXIMPORT.out.tpm_transcript, - SALMON_TX2GENE.out.tsv.collect() - ) - - emit: - results = SALMON_QUANT.out.results // channel: [ val(meta), results_dir ] - - tpm_gene = SALMON_TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ] - counts_gene = SALMON_TXIMPORT.out.counts_gene // channel: [ val(meta), counts ] - counts_gene_length_scaled = SALMON_TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ] - counts_gene_scaled = SALMON_TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ] - tpm_transcript = SALMON_TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ] - counts_transcript = SALMON_TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ] - - merged_gene_rds = SALMON_SE_GENE.out.rds // path: *.rds - merged_gene_rds_length_scaled = SALMON_SE_GENE_LENGTH_SCALED.out.rds // path: *.rds - merged_gene_rds_scaled = SALMON_SE_GENE_SCALED.out.rds // path: *.rds - - merged_counts_transcript = 
SALMON_TXIMPORT.out.counts_transcript // path: *.transcript_counts.tsv - merged_tpm_transcript = SALMON_TXIMPORT.out.tpm_transcript // path: *.transcript_tpm.tsv - merged_transcript_rds = SALMON_SE_TRANSCRIPT.out.rds // path: *.rds - - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/tower.yml b/tower.yml index b6983530a..21f821a53 100644 --- a/tower.yml +++ b/tower.yml @@ -21,6 +21,16 @@ reports: display: "All samples Salmon merged transcript raw counts" "**/salmon/salmon.merged.transcript_tpm.tsv": display: "All samples Salmon merged transcript TPM counts" + "**/kallisto/**/deseq2.plots.pdf": + display: "All samples Kallisto DESeq2 QC PDF plots" + "**/kallisto/kallisto.merged.gene_counts.tsv": + display: "All samples Kallisto merged gene raw counts" + "**/kallisto/kallisto.merged.gene_tpm.tsv": + display: "All samples Kallisto merged gene TPM counts" + "**/kallisto/kallisto.merged.transcript_counts.tsv": + display: "All samples Kallisto merged transcript raw counts" + "**/kallisto/kallisto.merged.transcript_tpm.tsv": + display: "All samples Kallisto merged transcript TPM counts" "**/star_rsem/**/deseq2.plots.pdf": display: "All samples STAR RSEM DESeq2 QC PDF plots" "**/star_rsem/rsem.merged.gene_counts.tsv": diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 955a727c3..66ee061f5 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -88,7 +88,7 @@ ch_biotypes_header_multiqc = file("$projectDir/assets/multiqc/biotypes_header. include { BEDTOOLS_GENOMECOV } from '../modules/local/bedtools_genomecov' include { DESEQ2_QC as DESEQ2_QC_STAR_SALMON } from '../modules/local/deseq2_qc' include { DESEQ2_QC as DESEQ2_QC_RSEM } from '../modules/local/deseq2_qc' -include { DESEQ2_QC as DESEQ2_QC_SALMON } from '../modules/local/deseq2_qc' +include { DESEQ2_QC as DESEQ2_QC_PSEUDO } from '../modules/local/deseq2_qc' include { DUPRADAR } from '../modules/local/dupradar' include { MULTIQC } from '../modules/local/multiqc' include { MULTIQC_CUSTOM_BIOTYPE } from '../modules/local/multiqc_custom_biotype' @@ -97,11 +97,11 @@ include { UMITOOLS_PREPAREFORRSEM as UMITOOLS_PREPAREFORSALMON } from '../module // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' -include { ALIGN_STAR } from '../subworkflows/local/align_star' -include { QUANTIFY_RSEM } from '../subworkflows/local/quantify_rsem' -include { QUANTIFY_SALMON as QUANTIFY_STAR_SALMON } from '../subworkflows/local/quantify_salmon' -include { QUANTIFY_SALMON as QUANTIFY_SALMON } from '../subworkflows/local/quantify_salmon' +include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' +include { ALIGN_STAR } from '../subworkflows/local/align_star' +include { QUANTIFY_RSEM } from '../subworkflows/local/quantify_rsem' +include { QUANTIFY_PSEUDO_ALIGNMENT as QUANTIFY_STAR_SALMON } from '../subworkflows/local/quantify_pseudo' +include { QUANTIFY_PSEUDO_ALIGNMENT } from '../subworkflows/local/quantify_pseudo' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -169,6 +169,7 @@ workflow RNASEQ { params.star_index, params.rsem_index, params.salmon_index, + params.kallisto_index, params.hisat2_index, params.bbsplit_index, params.gencode, @@ -235,7 +236,7 @@ workflow RNASEQ { .set { ch_strand_fastq } // - // SUBWORKFLOW: Sub-sample FastQ files and pseudo-align with Salmon to auto-infer strandedness + // SUBWORKFLOW: Sub-sample FastQ files and pseudoalign with Salmon to auto-infer 
strandedness // // Return empty channel if ch_strand_fastq.auto_strand is empty so salmon index isn't created PREPARE_GENOME.out.fasta @@ -474,8 +475,11 @@ workflow RNASEQ { ch_dummy_file, PREPARE_GENOME.out.transcript_fasta, PREPARE_GENOME.out.gtf, + 'salmon', true, - params.salmon_quant_libtype ?: '' + params.salmon_quant_libtype ?: '', + params.kallisto_quant_fraglen, + params.kallisto_quant_fraglen_sd ) ch_versions = ch_versions.mix(QUANTIFY_STAR_SALMON.out.versions) @@ -788,35 +792,47 @@ workflow RNASEQ { } // - // SUBWORKFLOW: Pseudo-alignment and quantification with Salmon + // SUBWORKFLOW: Pseudoalignment and quantification with Salmon // - ch_salmon_multiqc = Channel.empty() + ch_pseudo_multiqc = Channel.empty() ch_pseudoaligner_pca_multiqc = Channel.empty() ch_pseudoaligner_clustering_multiqc = Channel.empty() - if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') { - QUANTIFY_SALMON ( + + if (!params.skip_pseudo_alignment) { + + if (params.pseudo_aligner == 'salmon') { + ch_pseudo_index = PREPARE_GENOME.out.salmon_index + } else { + ch_pseudo_index = PREPARE_GENOME.out.kallisto_index + } + + QUANTIFY_PSEUDO_ALIGNMENT ( ch_filtered_reads, - PREPARE_GENOME.out.salmon_index, + ch_pseudo_index, ch_dummy_file, PREPARE_GENOME.out.gtf, + params.pseudo_aligner, false, - params.salmon_quant_libtype ?: '' + params.salmon_quant_libtype ?: '', + params.kallisto_quant_fraglen, + params.kallisto_quant_fraglen_sd ) - ch_salmon_multiqc = QUANTIFY_SALMON.out.results - ch_versions = ch_versions.mix(QUANTIFY_SALMON.out.versions) + ch_pseudo_multiqc = QUANTIFY_PSEUDO_ALIGNMENT.out.multiqc + ch_counts_gene_length_scaled = QUANTIFY_PSEUDO_ALIGNMENT.out.counts_gene_length_scaled + ch_versions = ch_versions.mix(QUANTIFY_PSEUDO_ALIGNMENT.out.versions) if (!params.skip_qc & !params.skip_deseq2_qc) { - DESEQ2_QC_SALMON ( - QUANTIFY_SALMON.out.counts_gene_length_scaled, + DESEQ2_QC_PSEUDO ( + ch_counts_gene_length_scaled, ch_pca_header_multiqc, ch_clustering_header_multiqc ) - ch_pseudoaligner_pca_multiqc = DESEQ2_QC_SALMON.out.pca_multiqc - ch_pseudoaligner_clustering_multiqc = DESEQ2_QC_SALMON.out.dists_multiqc - ch_versions = ch_versions.mix(DESEQ2_QC_SALMON.out.versions) + ch_pseudoaligner_pca_multiqc = DESEQ2_QC_PSEUDO.out.pca_multiqc + ch_pseudoaligner_clustering_multiqc = DESEQ2_QC_PSEUDO.out.dists_multiqc + ch_versions = ch_versions.mix(DESEQ2_QC_PSEUDO.out.versions) } } - + // // MODULE: Pipeline reporting // @@ -851,7 +867,7 @@ workflow RNASEQ { ch_star_multiqc.collect{it[1]}.ifEmpty([]), ch_hisat2_multiqc.collect{it[1]}.ifEmpty([]), ch_rsem_multiqc.collect{it[1]}.ifEmpty([]), - ch_salmon_multiqc.collect{it[1]}.ifEmpty([]), + ch_pseudo_multiqc.collect{it[1]}.ifEmpty([]), ch_samtools_stats.collect{it[1]}.ifEmpty([]), ch_samtools_flagstat.collect{it[1]}.ifEmpty([]), ch_samtools_idxstats.collect{it[1]}.ifEmpty([]),