Skip to content

Commit

Permalink
Merge pull request #562 from FriederikeHanssen/automatic_restart
Browse files Browse the repository at this point in the history
Automatic restart & General csv file updates
  • Loading branch information
FriederikeHanssen authored Jun 9, 2022
2 parents ba1b991 + a84b939 commit 226263b
Show file tree
Hide file tree
Showing 32 changed files with 234 additions and 216 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#571](https://github.com/nf-core/sarek/pull/571) - Including and using GATK4's mergeVcfs
- [#572](https://github.com/nf-core/sarek/pull/572) - Adjusted subway map svg for firefox compatibility
- [#578](https://github.com/nf-core/sarek/pull/578) - Updated module deeptools/bamcoverage
- [#562](https://github.com/nf-core/sarek/pull/562) - Restart from `--step annotate` is now also requiring a CSV file.

### Fixed

Expand Down Expand Up @@ -94,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#539](https://github.com/nf-core/sarek/pull/539) - `--cadd_cache`, `--cadd_indels`, `--cadd_indels_tbi`, `--cadd_wg_snvs`, `--cadd_wg_snvs_tbi` have been removed
- [#539](https://github.com/nf-core/sarek/pull/539) - `--genesplicer` has been removed
- [#539](https://github.com/nf-core/sarek/pull/539) - `conf/genomes.config` and `params.genomes_base` have been removed
- [#562](https://github.com/nf-core/sarek/pull/562) - Restart from `--step annotate` from folder is removed. Use a `csv` file instead
- [#571](https://github.com/nf-core/sarek/pull/571) - Removed the local module `concat_vcf`.

## [2.7.1](https://github.com/nf-core/sarek/releases/tag/2.7.1) - Pårtejekna
Expand Down
20 changes: 7 additions & 13 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,7 @@ process{
// VCF QC
withName: 'BCFTOOLS_STATS'{
ext.when = { !(params.skip_tools && params.skip_tools.contains('bcftools')) }
ext.prefix = { meta.type ? "${meta.variantcaller}_${vcf.baseName.minus(".vcf")}_${meta.type}" : "${meta.variantcaller}_${vcf.baseName.minus(".vcf")}" }
ext.prefix = { "${vcf.baseName.minus(".vcf")}" }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/reports/bcftools" },
Expand All @@ -998,7 +998,7 @@ process{

withName: 'VCFTOOLS_.*'{
ext.when = { !(params.skip_tools && params.skip_tools.contains('vcftools')) }
ext.prefix = { meta.type ? "${meta.variantcaller}_${variant_file.baseName.minus(".vcf")}_${meta.type}" : "${meta.variantcaller}_${variant_file.baseName.minus(".vcf")}" }
ext.prefix = { "${variant_file.baseName.minus(".vcf")}" }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/reports/vcftools" },
Expand All @@ -1021,7 +1021,7 @@ process{
// ANNOTATE

withName: 'ENSEMBLVEP' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_VEP.ann.vcf" : "${meta.variantcaller}_${meta.id}_VEP.ann.vcf" : "${meta.id}_VEP.ann.vcf" }
ext.prefix = { "${vcf.baseName.minus(".vcf")}_VEP" }
ext.args = [
'--everything --filter_common --per_gene --total_length --offline',
(params.vep_dbnsfp && params.dbnsfp) ? '--plugin dbNSFP,dbNSFP.gz,rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF' : '',
Expand All @@ -1038,11 +1038,12 @@ process{
}

withName: ".*:ANNOTATION_MERGE:ENSEMBLVEP" {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_snpEff_VEP.ann.vcf" : "${meta.variantcaller}_${meta.id}_snpEff_VEP.ann.vcf" : "${meta.id}_snpEff_VEP.ann.vcf" }
// Output file will have format *_snpEff_VEP.ann.vcf
ext.prefix = { "${vcf.baseName.minus(".ann.vcf")}_VEP" }
}

withName: 'SNPEFF' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_snpEff.ann.vcf" : "${meta.variantcaller}_${meta.id}_snpEff.ann.vcf" : "${meta.id}_snpEff.ann.vcf" }
ext.prefix = { "${vcf.baseName.minus(".vcf")}_snpEff" }
ext.args = '-nodownload -canon -v'
if (!params.snpeff_cache) container = { params.snpeff_genome ? "nfcore/snpeff:5.0.${params.snpeff_genome}" : "nfcore/snpeff:5.0.${params.genome}" }
publishDir = [
Expand All @@ -1054,6 +1055,7 @@ process{
}

withName: "NFCORE_SAREK:SAREK:ANNOTATE:.*:TABIX_BGZIPTABIX" {
ext.prefix = { "${input}" }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/annotation/${meta.id}/${meta.variantcaller}" },
Expand All @@ -1062,7 +1064,6 @@ process{
}

withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_SNPEFF:TABIX_BGZIPTABIX' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_snpEff.ann.vcf" : "${meta.variantcaller}_${meta.id}_snpEff.ann.vcf" : "${meta.id}_snpEff.ann.vcf" }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/annotation/${meta.id}/${meta.variantcaller}" },
Expand All @@ -1071,13 +1072,6 @@ process{
]
}

withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_ENSEMBLVEP:TABIX_BGZIPTABIX' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_VEP.ann.vcf" : "${meta.variantcaller}_${meta.id}_VEP.ann.vcf" : "${meta.id}_VEP.ann.vcf" }
}

withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_MERGE:TABIX_BGZIPTABIX' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_snpEff_VEP.ann.vcf" : "${meta.variantcaller}_${meta.id}_snpEff_VEP.ann.vcf" : "${meta.id}_snpEff_VEP.ann.vcf" }
}

// MULTIQC

Expand Down
75 changes: 55 additions & 20 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ Multiple `CSV` files can be specified if the path is enclosed in quotes.
| `cram` | Full path to CRAM file |
| `crai` | Full path to CRAM index file |
| `table` | Full path to recalibration table file |
| `mpileup` | Full path to pileup file |
| `vcf` | Full path to vcf file |

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

#### Start with mapping (`--step mapping` [default])

This step can be started either from `fastq` files or (u)`bam`s. The `CSV` must contain at least the columns `patient`, `sample`, `lane`, and either `fastq_1/fastq_2` or `bam`.
This step can be started either from `fastq` files or `(u)bam`s. The `CSV` must contain at least the columns `patient`, `sample`, `lane`, and either `fastq_1/fastq_2` or `bam`.

##### Examples

Expand Down Expand Up @@ -144,6 +144,26 @@ patient,sample,cram,crai
patient1,test_sample,test_mapped.cram,test_mapped.cram.crai
```

The `Sarek`-generated `CSV` file is stored under `results/csv/mapped.csv` if in a previous run `--save_bam_mapped` was set and will automatically be used as an input when specifying the parameter `--step markduplicates`. Otherwise this file will need to be manually generated.

##### Full samplesheet

In this example, all possible columns are used including the `gender` and `status` information per patient:

```console
patient,gender,status,sample,bam,bai
patient1,XX,0,test_sample,test_mapped.bam,test_mapped.bam.bai
patient1,XX,1,tumor_sample,test2_mapped.bam,test2_mapped.bam.bai
patient1,XX,1,relapse_sample,test3_mapped.bam,test3_mapped.bam.bai
```

```console
patient,gender,status,sample,cram,crai
patient1,XX,0,normal_sample,test_mapped.cram,test_mapped.cram.crai
patient1,XX,1,tumor_sample,test2_mapped.cram,test2_mapped.cram.crai
patient1,XX,1,relapse_sample,test3_mapped.cram,test3_mapped.cram.crai
```

##### Prepare Recalibration

For starting directly from preparing recalibration, the `CSV` file must contain at least the columns `patient`, `sample`, `bam`, `bai` or `patient`, `sample`, `cram`, `crai`.
Expand All @@ -152,37 +172,37 @@ Example:

```console
patient,sample,bam,bai
patient1,test_sample,test_mapped.bam,test_mapped.bam.bai
patient1,test_sample,test_md.bam,test_md.bam.bai
```

```console
patient,sample,cram,crai
patient1,test_sample,test_mapped.cram,test_mapped.cram.crai
patient1,test_sample,test_md.cram,test_md.cram.crai
```

The `Sarek`-generated `CSV` file is stored under `results/Preprocessing/CSV/duplicates_marked_no_table.csv` and will automatically be used as an input when specifying the parameter `--step prepare_recalibration`.
The `Sarek`-generated `CSV` file is stored under `results/csv/markduplicates_no_table.csv` and will automatically be used as an input when specifying the parameter `--step prepare_recalibration`.

##### Full samplesheet

In this example, all possible columns are used including the `gender` and `status` information per patient:

```console
patient,gender,status,sample,bam,bai
patient1,XX,0,test_sample,test_mapped.bam,test_mapped.bam.bai
patient1,XX,1,tumor_sample,test2_mapped.bam,test2_mapped.bam.bai
patient1,XX,1,relapse_sample,test3_mapped.bam,test3_mapped.bam.bai
patient1,XX,0,test_sample,test_md.bam,test_md.bam.bai
patient1,XX,1,tumor_sample,test2_md.bam,test2_md.bam.bai
patient1,XX,1,relapse_sample,test3_md.bam,test3_md.bam.bai
```

```console
patient,gender,status,sample,cram,crai
patient1,XX,0,normal_sample,test_mapped.cram,test_mapped.cram.crai
patient1,XX,1,tumor_sample,test2_mapped.cram,test2_mapped.cram.crai
patient1,XX,1,relapse_sample,test3_mapped.cram,test3_mapped.cram.crai
patient1,XX,0,normal_sample,test_md.cram,test_md.cram.crai
patient1,XX,1,tumor_sample,test2_md.cram,test2_md.cram.crai
patient1,XX,1,relapse_sample,test3_md.cram,test3_md.cram.crai
```

#### Start with base quality recalibration (`--step recalibrate`)
#### Start with base quality score recalibration (`--step recalibrate`)

For starting from base quality recalibration the `CSV` file must contain at least the columns `patient`, `sample`, `bam`, `bai`, `table` or `patient`, `sample`, `cram`, `crai`, `table` containing the paths to _non-recalibrated CRAM/BAM_ files and the associated recalibration table.
For starting from base quality score recalibration the `CSV` file must contain at least the columns `patient`, `sample`, `bam`, `bai`, `table` or `patient`, `sample`, `cram`, `crai`, `table` containing the paths to _non-recalibrated CRAM/BAM_ files and the associated recalibration table.

Example:

Expand All @@ -196,7 +216,7 @@ patient,sample,cram,crai,table
patient1,test_sample,test_mapped.cram,test_mapped.cram.crai,test.table
```

The `Sarek`-generated `CSV` file is stored under `results/Preprocessing/CSV/duplicates_marked.csv` and will automatically be used as an input when specifying the parameter `--step recalibrate`.
The `Sarek`-generated `CSV` file is stored under `results/csv/markduplicates.csv` and will automatically be used as an input when specifying the parameter `--step recalibrate`.

##### Full samplesheet

Expand Down Expand Up @@ -225,7 +245,7 @@ patient,sample,cram,crai
patient1,test_sample,test_mapped.cram,test_mapped.cram.crai
```

The `Sarek`-generated `CSV` file is stored under `results/Preprocessing/CSV/recalibrated.csv` and will automatically be used as an input when specifying the parameter `--step variant_calling`.
The `Sarek`-generated `CSV` file is stored under `results/csv/recalibrated.csv` and will automatically be used as an input when specifying the parameter `--step variant_calling`.

##### Full samplesheet

Expand All @@ -240,13 +260,28 @@ patient1,XX,1,relapse_sample,test3_mapped.cram,test3_mapped.cram.crai

#### Start with annotation (`--step annotate`)

Starting with annotation, is a special case in that it doesn't require an input sample sheet. The input files for Sarek can be specified using the path to a `VCF` file given to the `--input` command only with the annotation step (`--step annotate`).
For starting from the annotation step, the `CSV` file must contain at least the columns `patient`, `sample`, `vcf`.

As `Sarek` will use `bgzip` and `tabix` to compress and index the annotated `VCF` files, it expects the input `VCF` files to be sorted.
Multiple `VCF` files can be specified, using a [glob path](https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob), if enclosed in quotes.
For example:

```bash
--step annotate --input "results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,Strelka,TIDDIT}/*.vcf.gz"
Example:

```console
patient,sample,vcf
patient1,test_sample,test.vcf
```

The `Sarek`-generated `CSV` file is stored under `results/csv/variantcalled.csv` and will automatically be used as an input when specifying the parameter `--step annotate`.

##### Full samplesheet

In this example, all possible columns are used including the `variantcaller` information per sample:

```console
patient,sample,variantcaller,vcf
test,sample3,strelka,sample3.variants.vcf.gz
test,sample4_vs_sample3,manta,sample4_vs_sample3.diploid_sv.vcf.gz
test,sample4_vs_sample3,manta,sample4_vs_sample3.somatic_sv.vcf.gz
```

### Updating the pipeline
Expand Down
4 changes: 2 additions & 2 deletions lib/WorkflowMain.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ class WorkflowMain {

// Check input has been provided
if (!params.input) {
log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'"
System.exit(1)
log.warn "No samplesheet specified, attempting to restart from csv files present in ${params.outdir}"
WorkflowSarek.retrieveInput(params, log)
}
}

Expand Down
26 changes: 26 additions & 0 deletions lib/WorkflowSarek.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,30 @@ class WorkflowSarek {
System.exit(1)
}
}

//
// Resolve the input CSV samplesheet for a restart when --input was not given.
//
// For each restartable step, points params.input at the CSV file that a
// previous Sarek run wrote under ${params.outdir}/csv/. Exits the pipeline
// when the step cannot be restarted without an explicit samplesheet
// ('mapping', or an unknown step).
//
// @param params the pipeline params object (mutated via params.replace)
// @param log    the Nextflow logger
// @return the path of the CSV file used as input, or null if the process exits
//
public static String retrieveInput(params, log) {
    def input = null
    switch (params.step) {
        case 'mapping':
            // Mapping starts from raw fastq/(u)bam: no Sarek-generated CSV exists for it
            log.error "Can't start with step $params.step without a samplesheet. Please provide one with e.g. '--input samplesheet.csv'"
            System.exit(1)
            break
        case 'markduplicates':
            input = "${params.outdir}/csv/mapped.csv"
            break
        case 'prepare_recalibration':
            input = "${params.outdir}/csv/markduplicates_no_table.csv"
            break
        case 'recalibrate':
            input = "${params.outdir}/csv/markduplicates.csv"
            break
        case 'variant_calling':
            input = "${params.outdir}/csv/recalibrated.csv"
            break
        // case 'controlfreec': input = "${params.outdir}/variant_calling/csv/control-freec_mpileup.csv"; break
        case 'annotate':
            input = "${params.outdir}/csv/variantcalled.csv"
            break
        default:
            // 'exit 1, msg' is Nextflow script DSL and is not valid inside a Groovy
            // class, so fail explicitly with System.exit after logging
            log.error "Unknown step $params.step. Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'"
            System.exit(1)
    }
    log.warn "Using file ${input}"
    params.replace("input", input)
    return input
}
}
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ params {
no_intervals = false // Intervals will be built from the fasta file
nucleotides_per_second = 1000 // Default interval size
sentieon = false // Not using Sentieon by default
target_bed = null // No default TargetBED file for targeted sequencing
tools = null // No default Variant_Calling or Annotation tools
skip_tools = null // All tools (markduplicates + baserecalibrator + QC) are used by default

Expand Down
Loading

0 comments on commit 226263b

Please sign in to comment.