Skip to content

Commit

Permalink
Merge pull request #562 from FriederikeHanssen/automatic_restart
Browse files Browse the repository at this point in the history
Automatic restart & General csv file updates
  • Loading branch information
FriederikeHanssen authored Jun 9, 2022
2 parents ba1b991 + a84b939 commit 226263b
Show file tree
Hide file tree
Showing 32 changed files with 234 additions and 216 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#571](https://github.com/nf-core/sarek/pull/571) - Including and using GATK4's mergeVcfs
- [#572](https://github.com/nf-core/sarek/pull/572) - Adjusted subway map svg for firefox compatibility
- [#578](https://github.com/nf-core/sarek/pull/578) - Updated module deeptools/bamcoverage
- [#562](https://github.com/nf-core/sarek/pull/562) - Restart from `--step annotate` is now also requiring a CSV file.

### Fixed

Expand Down Expand Up @@ -94,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#539](https://github.com/nf-core/sarek/pull/539) - `--cadd_cache`, `--cadd_indels`, `--cadd_indels_tbi`, `--cadd_wg_snvs`, `--cadd_wg_snvs_tbi` have been removed
- [#539](https://github.com/nf-core/sarek/pull/539) - `--genesplicer` has been removed
- [#539](https://github.com/nf-core/sarek/pull/539) - `conf/genomes.config` and `params.genomes_base` have been removed
- [#562](https://github.com/nf-core/sarek/pull/562) - Restart from `--step annotate` from folder is removed. Use a `csv` file instead
- [#571](https://github.com/nf-core/sarek/pull/571) - Removed the local module `concat_vcf`.

## [2.7.1](https://github.com/nf-core/sarek/releases/tag/2.7.1) - Pårtejekna
Expand Down
20 changes: 7 additions & 13 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,7 @@ process{
// VCF QC
withName: 'BCFTOOLS_STATS'{
ext.when = { !(params.skip_tools && params.skip_tools.contains('bcftools')) }
ext.prefix = { meta.type ? "${meta.variantcaller}_${vcf.baseName.minus(".vcf")}_${meta.type}" : "${meta.variantcaller}_${vcf.baseName.minus(".vcf")}" }
ext.prefix = { "${vcf.baseName.minus(".vcf")}" }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/reports/bcftools" },
Expand All @@ -998,7 +998,7 @@ process{

withName: 'VCFTOOLS_.*'{
ext.when = { !(params.skip_tools && params.skip_tools.contains('vcftools')) }
ext.prefix = { meta.type ? "${meta.variantcaller}_${variant_file.baseName.minus(".vcf")}_${meta.type}" : "${meta.variantcaller}_${variant_file.baseName.minus(".vcf")}" }
ext.prefix = { "${variant_file.baseName.minus(".vcf")}" }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/reports/vcftools" },
Expand All @@ -1021,7 +1021,7 @@ process{
// ANNOTATE

withName: 'ENSEMBLVEP' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_VEP.ann.vcf" : "${meta.variantcaller}_${meta.id}_VEP.ann.vcf" : "${meta.id}_VEP.ann.vcf" }
ext.prefix = { "${vcf.baseName.minus(".vcf")}_VEP" }
ext.args = [
'--everything --filter_common --per_gene --total_length --offline',
(params.vep_dbnsfp && params.dbnsfp) ? '--plugin dbNSFP,dbNSFP.gz,rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF' : '',
Expand All @@ -1038,11 +1038,12 @@ process{
}

withName: ".*:ANNOTATION_MERGE:ENSEMBLVEP" {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_snpEff_VEP.ann.vcf" : "${meta.variantcaller}_${meta.id}_snpEff_VEP.ann.vcf" : "${meta.id}_snpEff_VEP.ann.vcf" }
// Output file will have format *_snpEff_VEP.ann.vcf
ext.prefix = { "${vcf.baseName.minus(".ann.vcf")}_VEP" }
}

withName: 'SNPEFF' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_snpEff.ann.vcf" : "${meta.variantcaller}_${meta.id}_snpEff.ann.vcf" : "${meta.id}_snpEff.ann.vcf" }
ext.prefix = { "${vcf.baseName.minus(".vcf")}_snpEff" }
ext.args = '-nodownload -canon -v'
if (!params.snpeff_cache) container = { params.snpeff_genome ? "nfcore/snpeff:5.0.${params.snpeff_genome}" : "nfcore/snpeff:5.0.${params.genome}" }
publishDir = [
Expand All @@ -1054,6 +1055,7 @@ process{
}

withName: "NFCORE_SAREK:SAREK:ANNOTATE:.*:TABIX_BGZIPTABIX" {
ext.prefix = { "${input}" }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/annotation/${meta.id}/${meta.variantcaller}" },
Expand All @@ -1062,7 +1064,6 @@ process{
}

withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_SNPEFF:TABIX_BGZIPTABIX' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_snpEff.ann.vcf" : "${meta.variantcaller}_${meta.id}_snpEff.ann.vcf" : "${meta.id}_snpEff.ann.vcf" }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/annotation/${meta.id}/${meta.variantcaller}" },
Expand All @@ -1071,13 +1072,6 @@ process{
]
}

withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_ENSEMBLVEP:TABIX_BGZIPTABIX' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_VEP.ann.vcf" : "${meta.variantcaller}_${meta.id}_VEP.ann.vcf" : "${meta.id}_VEP.ann.vcf" }
}

withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_MERGE:TABIX_BGZIPTABIX' {
ext.prefix = { meta.variantcaller ? meta.type ? "${meta.variantcaller}_${meta.id}_${meta.type}_snpEff_VEP.ann.vcf" : "${meta.variantcaller}_${meta.id}_snpEff_VEP.ann.vcf" : "${meta.id}_snpEff_VEP.ann.vcf" }
}

// MULTIQC

Expand Down
75 changes: 55 additions & 20 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ Multiple `CSV` files can be specified if the path is enclosed in quotes.
| `cram` | Full path to CRAM file |
| `crai` | Full path to CRAM index file |
| `table` | Full path to recalibration table file |
| `mpileup` | Full path to pileup file |
| `vcf` | Full path to vcf file |

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

#### Start with mapping (`--step mapping` [default])

This step can be started either from `fastq` files or (u)`bam`s. The `CSV` must contain at least the columns `patient`, `sample`, `lane`, and either `fastq_1/fastq_2` or `bam`.
This step can be started either from `fastq` files or `(u)bam`s. The `CSV` must contain at least the columns `patient`, `sample`, `lane`, and either `fastq_1/fastq_2` or `bam`.

##### Examples

Expand Down Expand Up @@ -144,6 +144,26 @@ patient,sample,cram,crai
patient1,test_sample,test_mapped.cram,test_mapped.cram.crai
```

The `Sarek`-generated `CSV` file is stored under `results/csv/mapped.csv` if in a previous run `--save_bam_mapped` was set and will automatically be used as an input when specifying the parameter `--step markduplicates`. Otherwise this file will need to be manually generated.

##### Full samplesheet

In this example, all possible columns are used including the `gender` and `status` information per patient:

```console
patient,gender,status,sample,bam,bai
patient1,XX,0,test_sample,test_mapped.bam,test_mapped.bam.bai
patient1,XX,1,tumor_sample,test2_mapped.bam,test2_mapped.bam.bai
patient1,XX,1,relapse_sample,test3_mapped.bam,test3_mapped.bam.bai
```

```console
patient,gender,status,sample,cram,crai
patient1,XX,0,normal_sample,test_mapped.cram,test_mapped.cram.crai
patient1,XX,1,tumor_sample,test2_mapped.cram,test2_mapped.cram.crai
patient1,XX,1,relapse_sample,test3_mapped.cram,test3_mapped.cram.crai
```

##### Prepare Recalibration

For starting directly from preparing recalibration, the `CSV` file must contain at least the columns `patient`, `sample`, `bam`, `bai` or `patient`, `sample`, `cram`, `crai`.
Expand All @@ -152,37 +172,37 @@ Example:

```console
patient,sample,bam,bai
patient1,test_sample,test_mapped.bam,test_mapped.bam.bai
patient1,test_sample,test_md.bam,test_md.bam.bai
```

```console
patient,sample,cram,crai
patient1,test_sample,test_mapped.cram,test_mapped.cram.crai
patient1,test_sample,test_md.cram,test_md.cram.crai
```

The `Sarek`-generated `CSV` file is stored under `results/Preprocessing/CSV/duplicates_marked_no_table.csv` and will automatically be used as an input when specifying the parameter `--step prepare_recalibration`.
The `Sarek`-generated `CSV` file is stored under `results/csv/markduplicates_no_table.csv` and will automatically be used as an input when specifying the parameter `--step prepare_recalibration`.

##### Full samplesheet

In this example, all possible columns are used including the `gender` and `status` information per patient:

```console
patient,gender,status,sample,bam,bai
patient1,XX,0,test_sample,test_mapped.bam,test_mapped.bam.bai
patient1,XX,1,tumor_sample,test2_mapped.bam,test2_mapped.bam.bai
patient1,XX,1,relapse_sample,test3_mapped.bam,test3_mapped.bam.bai
patient1,XX,0,test_sample,test_md.bam,test_md.bam.bai
patient1,XX,1,tumor_sample,test2_md.bam,test2_md.bam.bai
patient1,XX,1,relapse_sample,test3_md.bam,test3_md.bam.bai
```

```console
patient,gender,status,sample,cram,crai
patient1,XX,0,normal_sample,test_mapped.cram,test_mapped.cram.crai
patient1,XX,1,tumor_sample,test2_mapped.cram,test2_mapped.cram.crai
patient1,XX,1,relapse_sample,test3_mapped.cram,test3_mapped.cram.crai
patient1,XX,0,normal_sample,test_md.cram,test_md.cram.crai
patient1,XX,1,tumor_sample,test2_md.cram,test2_md.cram.crai
patient1,XX,1,relapse_sample,test3_md.cram,test3_md.cram.crai
```

#### Start with base quality recalibration (`--step recalibrate`)
#### Start with base quality score recalibration (`--step recalibrate`)

For starting from base quality recalibration the `CSV` file must contain at least the columns `patient`, `sample`, `bam`, `bai`, `table` or `patient`, `sample`, `cram`, `crai`, `table` containing the paths to _non-recalibrated CRAM/BAM_ files and the associated recalibration table.
For starting from base quality score recalibration the `CSV` file must contain at least the columns `patient`, `sample`, `bam`, `bai`, `table` or `patient`, `sample`, `cram`, `crai`, `table` containing the paths to _non-recalibrated CRAM/BAM_ files and the associated recalibration table.

Example:

Expand All @@ -196,7 +216,7 @@ patient,sample,cram,crai,table
patient1,test_sample,test_mapped.cram,test_mapped.cram.crai,test.table
```

The `Sarek`-generated `CSV` file is stored under `results/Preprocessing/CSV/duplicates_marked.csv` and will automatically be used as an input when specifying the parameter `--step recalibrate`.
The `Sarek`-generated `CSV` file is stored under `results/csv/markduplicates.csv` and will automatically be used as an input when specifying the parameter `--step recalibrate`.

##### Full samplesheet

Expand Down Expand Up @@ -225,7 +245,7 @@ patient,sample,cram,crai
patient1,test_sample,test_mapped.cram,test_mapped.cram.crai
```

The `Sarek`-generated `CSV` file is stored under `results/Preprocessing/CSV/recalibrated.csv` and will automatically be used as an input when specifying the parameter `--step variant_calling`.
The `Sarek`-generated `CSV` file is stored under `results/csv/recalibrated.csv` and will automatically be used as an input when specifying the parameter `--step variant_calling`.

##### Full samplesheet

Expand All @@ -240,13 +260,28 @@ patient1,XX,1,relapse_sample,test3_mapped.cram,test3_mapped.cram.crai

#### Start with annotation (`--step annotate`)

Starting with annotation, is a special case in that it doesn't require an input sample sheet. The input files for Sarek can be specified using the path to a `VCF` file given to the `--input` command only with the annotation step (`--step annotate`).
For starting from the annotation step, the `CSV` file must contain at least the columns `patient`, `sample`, `vcf`.

As `Sarek` will use `bgzip` and `tabix` to compress and index the annotated `VCF` files, it expects the input `VCF` files to be sorted.
Multiple `VCF` files can be specified, using a [glob path](https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob), if enclosed in quotes.
For example:

```bash
--step annotate --input "results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,Strelka,TIDDIT}/*.vcf.gz"
Example:

```console
patient,sample,vcf
patient1,test_sample,test.vcf
```

The `Sarek`-generated `CSV` file is stored under `results/csv/variantcalled.csv` and will automatically be used as an input when specifying the parameter `--step annotate`.

##### Full samplesheet

In this example, all possible columns are used including the `variantcaller` information per sample:

```console
patient,sample,variantcaller,vcf
test,sample3,strelka,sample3.variants.vcf.gz
test,sample4_vs_sample3,manta,sample4_vs_sample3.diploid_sv.vcf.gz
test,sample4_vs_sample3,manta,sample4_vs_sample3.somatic_sv.vcf.gz
```

### Updating the pipeline
Expand Down
4 changes: 2 additions & 2 deletions lib/WorkflowMain.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ class WorkflowMain {

// Check input has been provided
if (!params.input) {
log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'"
System.exit(1)
log.warn "No samplesheet specified, attempting to restart from csv files present in ${params.outdir}"
WorkflowSarek.retrieveInput(params, log)
}
}

Expand Down
26 changes: 26 additions & 0 deletions lib/WorkflowSarek.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,30 @@ class WorkflowSarek {
System.exit(1)
}
}

//
// Resolve the input CSV samplesheet for a restart when --input was not given.
//
// For each restartable step, points params.input at the CSV file that a
// previous Sarek run wrote under ${params.outdir}/csv/. Exits the pipeline
// when the step cannot be restarted without an explicit samplesheet
// ('mapping', or an unknown step).
//
// @param params the pipeline params object (mutated via params.replace)
// @param log    the Nextflow logger
// @return the path of the CSV file used as input, or null if the process exits
//
public static String retrieveInput(params, log) {
    def input = null
    switch (params.step) {
        case 'mapping':
            // Mapping starts from raw fastq/(u)bam: no Sarek-generated CSV exists for it
            log.error "Can't start with step $params.step without a samplesheet. Please provide one with e.g. '--input samplesheet.csv'"
            System.exit(1)
            break
        case 'markduplicates':
            input = "${params.outdir}/csv/mapped.csv"
            break
        case 'prepare_recalibration':
            input = "${params.outdir}/csv/markduplicates_no_table.csv"
            break
        case 'recalibrate':
            input = "${params.outdir}/csv/markduplicates.csv"
            break
        case 'variant_calling':
            input = "${params.outdir}/csv/recalibrated.csv"
            break
        // case 'controlfreec': input = "${params.outdir}/variant_calling/csv/control-freec_mpileup.csv"; break
        case 'annotate':
            input = "${params.outdir}/csv/variantcalled.csv"
            break
        default:
            // 'exit 1, msg' is Nextflow script DSL and is not valid inside a Groovy
            // class, so fail explicitly with System.exit after logging
            log.error "Unknown step $params.step. Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'"
            System.exit(1)
    }
    log.warn "Using file ${input}"
    params.replace("input", input)
    return input
}
}
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ params {
no_intervals = false // Intervals will be built from the fasta file
nucleotides_per_second = 1000 // Default interval size
sentieon = false // Not using Sentieon by default
target_bed = null // No default TargetBED file for targeted sequencing
tools = null // No default Variant_Calling or Annotation tools
skip_tools = null // All tools (markduplicates + baserecalibrator + QC) are used by default

Expand Down
Loading

0 comments on commit 226263b

Please sign in to comment.