diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e8c8b9603..313e89c4ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,17 +43,17 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) - [#21](https://github.com/nf-core/sarek/pull/21) - Add tests for latest Nextflow version as well - [#21](https://github.com/nf-core/sarek/pull/21) - Add `genomes.config` for genomes without AWS iGenomes - [#24](https://github.com/nf-core/sarek/pull/24) - Added GATK4 Mutect2 calling and filtering -- [#XXX](https://github.com/nf-core/sarek/pull/XXX) - Use Github actions for CI +- [#27](https://github.com/nf-core/sarek/pull/27), [#30](https://github.com/nf-core/sarek/pull/30) - Use Github actions for CI, linting and branch protection - [#31](https://github.com/nf-core/sarek/pull/31) - Add nf-core lint - [#31](https://github.com/nf-core/sarek/pull/31) - Add extra CI to GitHub Actions nf-core extra CI ### `Changed` -- [#1](https://github.com/nf-core/sarek/pull/1), [#2](https://github.com/nf-core/sarek/pull/2), [#3](https://github.com/nf-core/sarek/pull/3), [#4](https://github.com/nf-core/sarek/pull/4), [#5](https://github.com/nf-core/sarek/pull/5), [#6](https://github.com/nf-core/sarek/pull/6), [#7](https://github.com/nf-core/sarek/pull/7), [#8](https://github.com/nf-core/sarek/pull/8), [#9](https://github.com/nf-core/sarek/pull/9), [#10](https://github.com/nf-core/sarek/pull/10), [#11](https://github.com/nf-core/sarek/pull/11), [#12](https://github.com/nf-core/sarek/pull/12), [#18](https://github.com/nf-core/sarek/pull/18), [#20](https://github.com/nf-core/sarek/pull/20), [#21](https://github.com/nf-core/sarek/pull/21), [#29](https://github.com/nf-core/sarek/pull/29) - Update docs +- [#1](https://github.com/nf-core/sarek/pull/1), [#2](https://github.com/nf-core/sarek/pull/2), [#3](https://github.com/nf-core/sarek/pull/3), [#4](https://github.com/nf-core/sarek/pull/4), [#5](https://github.com/nf-core/sarek/pull/5), [#6](https://github.com/nf-core/sarek/pull/6), [#7](https://github.com/nf-core/sarek/pull/7), [#8](https://github.com/nf-core/sarek/pull/8), [#9](https://github.com/nf-core/sarek/pull/9), [#10](https://github.com/nf-core/sarek/pull/10), [#11](https://github.com/nf-core/sarek/pull/11), [#12](https://github.com/nf-core/sarek/pull/12), [#18](https://github.com/nf-core/sarek/pull/18), [#20](https://github.com/nf-core/sarek/pull/20), [#21](https://github.com/nf-core/sarek/pull/21), [#23](https://github.com/nf-core/sarek/pull/23), [#29](https://github.com/nf-core/sarek/pull/29) - Update docs - [#4](https://github.com/nf-core/sarek/pull/4) - Update `cancerit-allelecount` from `2.1.2` to `4.0.2` - [#4](https://github.com/nf-core/sarek/pull/4) - Update `gatk4` from `4.1.1.0` to `4.1.2.0` -- [#7](https://github.com/nf-core/sarek/pull/7) - `--sampleDir` is now deprecated, use `--sample` instead -- [#7](https://github.com/nf-core/sarek/pull/8) - `--annotateVCF` is now deprecated, use `--sample` instead +- [#7](https://github.com/nf-core/sarek/pull/7), [#23](https://github.com/nf-core/sarek/pull/23) - `--sampleDir` is now deprecated, use `--input` instead +- [#7](https://github.com/nf-core/sarek/pull/8), [#23](https://github.com/nf-core/sarek/pull/23) - `--annotateVCF` is now deprecated, use `--input` instead - [#8](https://github.com/nf-core/sarek/pull/8), [#12](https://github.com/nf-core/sarek/pull/12) - Improve helper script `build.nf` for downloading and building reference files - [#9](https://github.com/nf-core/sarek/pull/9) - ApplyBQSR is now parallelized - 
[#9](https://github.com/nf-core/sarek/pull/9) - Fastq files are named following "${idRun}_R1.fastq.gz" in the FastQC output for easier reporting @@ -73,7 +73,13 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) - [#18](https://github.com/nf-core/sarek/pull/18), [#29](https://github.com/nf-core/sarek/pull/29) - `--noReports` is now `--skipQC all` - [#18](https://github.com/nf-core/sarek/pull/18), [#21](https://github.com/nf-core/sarek/pull/21) - Update logo - [#21](https://github.com/nf-core/sarek/pull/21) - Moved smallGRCh37 path to `genomes.config` +- [#23](https://github.com/nf-core/sarek/pull/23) - Rename `genomeFile`, `genomeIndex` and `genomeDict` by `fasta`, `fastaFai` and `dict` +- [#23](https://github.com/nf-core/sarek/pull/23) - `--sample` is now deprecated, use `--input` instead +- [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeFile` is now deprecated, use `--fasta` instead +- [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeIndex` is now deprecated, use `--fastaFai` instead +- [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeDict` is now deprecated, use `--dict` instead - [#24](https://github.com/nf-core/sarek/pull/24) - iGenomes config now contains germline resource for GATK4 Mutect2 +- [#30](https://github.com/nf-core/sarek/pull/30) - Simplify code for `MapReads` process - [#31](https://github.com/nf-core/sarek/pull/31) - Move extra CI to GitHub Actions nf-core extra CI - [#32](https://github.com/nf-core/sarek/pull/32), [#33](https://github.com/nf-core/sarek/pull/33) - Install `ASCAT` with `conda` in the `environment.yml` file - [#33](https://github.com/nf-core/sarek/pull/33) - use workflow.manifest.version to specify workflow version in path to R scripts for control-FREEC and VEP processes @@ -91,7 +97,7 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### `Fixed` - [#3](https://github.com/nf-core/sarek/pull/3) - Fix Docker ownership -- [#11](https://github.com/nf-core/sarek/pull/11) - Fix MergeMpileup PublishDir +- [#11](https://github.com/nf-core/sarek/pull/11) - Fix `MergeMpileup` PublishDir - [#13](https://github.com/nf-core/sarek/pull/13) - Fix merge in annotation - [#14](https://github.com/nf-core/sarek/pull/14) - Fix output name for vcf files - [#16](https://github.com/nf-core/sarek/pull/16) - Fix path to Rscript @@ -99,7 +105,8 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) - [#18](https://github.com/nf-core/sarek/pull/18) - Use same font for nf-core and sarek in ascii art - [#20](https://github.com/nf-core/sarek/pull/20) - Use new logo in README - [#20](https://github.com/nf-core/sarek/pull/20) - Fix path to references genomes -- [#22](https://github.com/nf-core/sarek/pull/22) - Fix --singleCPUMem issue +- [#22](https://github.com/nf-core/sarek/pull/22) - Fix `--singleCPUMem` issue +- [#30](https://github.com/nf-core/sarek/pull/30) - fix choice between `inputPairReadsFastQC` and `inputBAMFastQC` channels - [#31](https://github.com/nf-core/sarek/pull/31) - Fix badges according to nf-core lint - [#31](https://github.com/nf-core/sarek/pull/31) - Fix rcolorbrewer version according to nf-core lint - [#33](https://github.com/nf-core/sarek/pull/33) - Fix MD Linting diff --git a/build.nf b/build.nf index 3f23f8a1da..b70e1530f1 100644 --- a/build.nf +++ b/build.nf @@ -186,9 +186,6 @@ process DecompressFile { ch_decompressedFiles = ch_decompressedFiles.dump(tag:'DecompressedFile') ch_fastaFile = Channel.create() -ch_fastaForBWA = 
Channel.create() -ch_fastaReference = Channel.create() -ch_fastaForSAMTools = Channel.create() ch_otherFile = Channel.create() ch_vcfFile = Channel.create() diff --git a/conf/genomes.config b/conf/genomes.config index 78ec23c8e6..c22824a932 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -15,9 +15,9 @@ params { bwaIndex = "${params.genomes_base}/human_g1k_v37_decoy.fasta.{amb,ann,bwt,pac,sa}" dbsnp = "${params.genomes_base}/dbsnp_138.b37.vcf" dbsnpIndex = "${params.genomes_base}/dbsnp_138.b37.vcf.idx" - genomeDict = "${params.genomes_base}/human_g1k_v37_decoy.dict" - genomeFile = "${params.genomes_base}/human_g1k_v37_decoy.fasta" - genomeIndex = "${params.genomes_base}/human_g1k_v37_decoy.fasta.fai" + dict = "${params.genomes_base}/human_g1k_v37_decoy.dict" + fasta = "${params.genomes_base}/human_g1k_v37_decoy.fasta" + fastaFai = "${params.genomes_base}/human_g1k_v37_decoy.fasta.fai" intervals = "${params.genomes_base}/wgs_calling_regions_Sarek.list" knownIndels = "${params.genomes_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf" knownIndelsIndex = "${params.genomes_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" @@ -32,9 +32,9 @@ params { dbsnpIndex = "${params.genomes_base}/dbsnp_146.hg38.vcf.gz.tbi" germlineResource = "${params.genomes_base}/GCF_000001405.38.AUTOSOMESXY.COMMON.BIALLELIC.SNPs.with.AF.vcf.gz" germlineResourceIndex = "${params.genomes_base}/GCF_000001405.38.AUTOSOMESXY.COMMON.BIALLELIC.SNPs.with.AF.vcf.gz.tbi" - genomeDict = "${params.genomes_base}/Homo_sapiens_assembly38.dict" - genomeFile = "${params.genomes_base}/Homo_sapiens_assembly38.fasta" - genomeIndex = "${params.genomes_base}/Homo_sapiens_assembly38.fasta.fai" + dict = "${params.genomes_base}/Homo_sapiens_assembly38.dict" + fasta = "${params.genomes_base}/Homo_sapiens_assembly38.fasta" + fastaFai = "${params.genomes_base}/Homo_sapiens_assembly38.fasta.fai" intervals = "${params.genomes_base}/wgs_calling_regions.hg38.bed" knownIndels = "${params.genomes_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" knownIndelsIndex = "${params.genomes_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" @@ -49,9 +49,9 @@ params { dbsnpIndex = "${params.genomes_base}/dbsnp_138.b37.small.vcf.idx" germlineResource = "${params.genomes_base}/dbsnp_138.b37.small.vcf" germlineResourceIndex = "${params.genomes_base}/dbsnp_138.b37.small.vcf.idx" - genomeDict = "${params.genomes_base}/human_g1k_v37_decoy.small.dict" - genomeFile = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta" - genomeIndex = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta.fai" + dict = "${params.genomes_base}/human_g1k_v37_decoy.small.dict" + fasta = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta" + fastaFai = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta.fai" intervals = "${params.genomes_base}/small.intervals" knownIndels = "${params.genomes_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.small.vcf" knownIndelsIndex = "${params.genomes_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.small.vcf.idx" diff --git a/conf/igenomes.config b/conf/igenomes.config index 95fa91e27c..fd12e8200a 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -17,9 +17,9 @@ dbsnpIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/dbsnp_138.b37.vcf.idx" germlineResource = 
"${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GermlineResource/gnomAD.r2.1.1.GRCh37.PASS.AC.AF.only.vcf.gz" germlineResourceIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GermlineResource/gnomAD.r2.1.1.GRCh37.PASS.AC.AF.only.vcf.gz.tbi" - genomeDict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" - genomeFile = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" - genomeIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" + dict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" + fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" + fastaFai = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/intervals/wgs_calling_regions_Sarek.list" knownIndels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf" knownIndelsIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" @@ -34,9 +34,9 @@ dbsnpIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi" germlineResource = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GermlineResource/gnomAD.r2.1.1.GRCh38.PASS.AC.AF.only.vcf.gz" germlineResourceIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GermlineResource/gnomAD.r2.1.1.GRCh38.PASS.AC.AF.only.vcf.gz.tbi" - genomeDict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict" - genomeFile = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" - genomeIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" + dict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict" + fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" + fastaFai = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/intervals/wgs_calling_regions.hg38.bed" knownIndels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" knownIndelsIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" diff --git a/conf/test.config b/conf/test.config index b0fd3b6fab..fb92b47f10 100644 --- a/conf/test.config +++ b/conf/test.config @@ -15,7 +15,7 @@ params { max_memory = 6.GB max_time = 48.h // Input data - sample = 'https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-manta-https.tsv' + input = 'https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-manta-https.tsv' // Small reference genome // To be build with: `nextflow run build.nf --build -profile docker --outdir references` 
igenomesIgnore = true diff --git a/docs/input.md b/docs/input.md index 094ca38169..18e6695855 100644 --- a/docs/input.md +++ b/docs/input.md @@ -2,7 +2,7 @@ ## Information about the TSV files -Input files for Sarek can be specified using a TSV file given to the `--sample` command. +Input files for Sarek can be specified using a TSV file given to the `--input` command. The TSV file is a Tab Separated Value file with columns: - `subject gender status sample lane fastq1 fastq2` for step `mapping` with paired-end FASTQs @@ -49,10 +49,10 @@ G15511 XX 1 D0ENMT D0ENM_2 pathToFiles/D0ENMACXX111207.2_1.fastq. ## Path to a FASTQ directory for a single normal sample (step mapping) -Input files for Sarek can be specified using the path to a FASTQ directory given to the `--sample` command only with the `mapping` step. +Input files for Sarek can be specified using the path to a FASTQ directory given to the `--input` command only with the `mapping` step. ```bash -nextflow run nf-core/sarek --sample pathToDirectory ... +nextflow run nf-core/sarek --input pathToDirectory ... ``` ### Input FASTQ file name best practices @@ -128,9 +128,9 @@ G15511 XX 1 D0ENMT pathToFiles/G15511.D0ENMT.md.recal.bam pathToF ## VCF files for annotation -Input files for Sarek can be specified using the path to a VCF directory given to the `--sample` command only with the `annotate` step. +Input files for Sarek can be specified using the path to a VCF directory given to the `--input` command only with the `annotate` step. Multiple VCF files can be specified if the path is enclosed in quotes. ```bash -nextflow run nf-core/sarek --step annotate --sample "results/VariantCalling/*/.vcf.gz" ... +nextflow run nf-core/sarek --step annotate --input "results/VariantCalling/*/.vcf.gz" ... ``` diff --git a/docs/usage.md b/docs/usage.md index c737e92688..6d179111e4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -11,10 +11,14 @@ * [Reproducibility](#reproducibility) * [Main arguments](#main-arguments) * [`-profile`](#-profile) + * [`--input`](#--input) * [`--sample`](#--sample) + * [`--sampleDir`](#--sampledir) + * [`--annotateVCF`](#--annotatevcf) * [`--noGVCF`](#--nogvcf) + * [`--skipQC`](#--skipqc) + * [`--noReports`](#--noreports) * [`--nucleotidesPerSecond`](#--nucleotidespersecond) - * [`--skipQC`](#--skipQC) * [`--step`](#--step) * [`--tools`](#--tools) * [`--noStrelkaBP`](#--nostrelkabp) @@ -26,16 +30,19 @@ * [`--bwaIndex`](#--bwaindex) * [`--dbsnp`](#--dbsnp) * [`--dbsnpIndex`](#--dbsnpindex) + * [`--dict`](#--dict) + * [`--fasta`](#--fasta) + * [`--fastaFai`](#--fastafai) * [`--genomeDict`](#--genomedict) * [`--genomeFile`](#--genomefile) * [`--genomeIndex`](#--genomeindex) - * [`--germlineResource`](#--germlineResource) - * [`--germlineResourceIndex`](#--germlineResourceIndex) + * [`--germlineResource`](#--germlineresource) + * [`--germlineResourceIndex`](#--germlineresourceindex) * [`--intervals`](#--intervals) * [`--knownIndels`](#--knownindels) * [`--knownIndelsIndex`](#--knownindelsindex) - * [`--snpeffDb`](#--snpeffdb) * [`--pon`](#--pon) + * [`--snpeffDb`](#--snpeffdb) * [`--vepCacheVersion`](#--vepcacheversion) * [`--igenomesIgnore`](#--igenomesignore) * [Job resources](#job-resources) @@ -80,7 +87,7 @@ NXF_OPTS='-Xms1g -Xmx4g' The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/sarek --sample sample.tsv -profile docker +nextflow run nf-core/sarek --input sample.tsv -profile docker ``` This will launch the pipeline with the `docker` configuration profile. 
@@ -145,8 +152,38 @@ If `-profile` is not specified at all the pipeline will be run locally and expec * A profile with a complete configuration for automated testing * Includes links to test data so needs no other parameters +### `--input` + +Use this to specify the location of your input TSV file, on `mapping`, `recalibrate` and `variantcalling` steps. +For example: + +```bash +--input sample.tsv +``` + +Multiple TSV files can be specified, but the path must be enclosed in quotes + +Use this to specify the location of a directory on `mapping` step with a single germline sample only. +For example: + +```bash +--input PathToDirectory +``` + +Use this to specify the location of your VCF input file on `annotate` step. +For example: + +```bash +--input sample.vcf +``` + +Multiple VCF files can be specified, but the path must be enclosed in quotes + ### `--sample` +> :warning: This parameter is deprecated -- it will be removed in a future release. +> Please check: [`--input`](#--input) + Use this to specify the location of your input TSV file, on `mapping`, `recalibrate` and `variantcalling` steps. For example: @@ -172,6 +209,32 @@ For example: Multiple VCF files can be specified if the path must be enclosed in quotes +### `--sampleDir` + +> :warning: This parameter is deprecated -- it will be removed in a future release. +> Please check: [`--input`](#--input) + +Use this to specify the location of a directory on `mapping` step with a single germline sample only. +For example: + +```bash +--sampleDir PathToDirectory +``` + +### `--annotateVCF` + +> :warning: This parameter is deprecated -- it will be removed in a future release. +> Please check: [`--input`](#--input) + +Use this to specify the location of your VCF input file on `annotate` step. +For example: + +```bash +--annotateVCF sample.vcf +``` + +Multiple VCF files can be specified, but the path must be enclosed in quotes + ### `--noGVCF` Use this to disable g.vcf from `HaplotypeCaller`. @@ -182,6 +245,13 @@ Use this to disable specific QC and Reporting tools. Available: `all`, `bamQC`, `BCFtools`, `FastQC`, `MultiQC`, `samtools`, `vcftools`, `versions` Default: `None` +### `--noReports` + +> :warning: This parameter is deprecated -- it will be removed in a future release. +> Please check: [`--skipQC`](#--skipqc) + +Use this to disable all QC and Reporting tools. + ### `--nucleotidesPerSecond` Use this to estimate of how many seconds it will take to call variants on any interval, the default value is `1000` is it's not specified in the `.bed` file. @@ -236,9 +306,9 @@ params { bwaIndex = '' dbsnp = '' dbsnpIndex = '' - genomeDict = '' - genomeFile = '' - genomeIndex = '' + dict = '' + fasta = '' + fastaFai = '' intervals = '' knownIndels = '' knownIndelsIndex = '' @@ -290,28 +360,61 @@ If you prefer, you can specify the full path to your reference genome when you r --dbsnpIndex '[path to the dbsnp index]' ``` +### `--dict` + +If you prefer, you can specify the full path to your reference genome when you run the pipeline: + +```bash +--dict '[path to the dict file]' +``` + +### `--fasta` + +If you prefer, you can specify the full path to your reference genome when you run the pipeline: + +```bash +--fasta '[path to the reference fasta file]' +``` + +### `--fastaFai` + +If you prefer, you can specify the full path to your reference genome when you run the pipeline: + +```bash +--fastaFai '[path to the reference index]' +``` + ### `--genomeDict` +> :warning: This parameter is deprecated -- it will be removed in a future release. 
+> Please check: [`--dict`](#--dict) + If you prefer, you can specify the full path to your reference genome when you run the pipeline: ```bash ---genomeDict '[path to the genomeDict file]' +--dict '[path to the dict file]' ``` ### `--genomeFile` +> :warning: This parameter is deprecated -- it will be removed in a future release. +> Please check: [`--fasta`](#--fasta) + If you prefer, you can specify the full path to your reference genome when you run the pipeline: ```bash ---genomeFile '[path to the genome file]' +--fasta '[path to the reference fasta file]' ``` ### `--genomeIndex` +> :warning: This parameter is deprecated -- it will be removed in a future release. +> Please check: [`--fastaFai`](#--fastafai) + If you prefer, you can specify the full path to your reference genome when you run the pipeline: ```bash ---genomeIndex '[path to the genome Index]' +--fastaFai '[path to the reference index]' ``` ### `--germlineResource` diff --git a/docs/use_cases.md b/docs/use_cases.md index c47ad93387..2491ad69e6 100644 --- a/docs/use_cases.md +++ b/docs/use_cases.md @@ -5,7 +5,7 @@ Using the `mapping` directive one will have a pair of mapped, deduplicated and r This is the usual option you have to give when you are starting from raw FASTQ data: ```bash -nextflow run nf-core/sarek/main.nf --sample mysample.tsv --tools +nextflow run nf-core/sarek/main.nf --input mysample.tsv --tools ``` `mapping` will start by default, you do not have to give any additional parameters, only the TSV file describing the sample (see below). @@ -20,7 +20,7 @@ Also, older version are renamed with incremented numbers. The workflow should be started in this case with the smallest set of options as written above: ```bash -nextflow run nf-core/sarek/main.nf --sample mysample.tsv --tools +nextflow run nf-core/sarek/main.nf --input mysample.tsv --tools ``` The TSV file should look like: @@ -33,22 +33,22 @@ See the [input files documentation](docs/input.md) for more information. ## Starting from raw FASTQ - a directory with normal sample only -The `--sample` option can be also used to point Sarek to a directory with FASTQ files: +The `--input` option can also be used to point Sarek to a directory with FASTQ files: ```bash -nextflow run nf-core/sarek/main.nf --sample path/to/FASTQ/files --tools +nextflow run nf-core/sarek/main.nf --input path/to/FASTQ/files --tools ``` The given directory is searched recursively for FASTQ files that are named `*_R1_*.fastq.gz`, and a matching pair with the same name except `_R2_` instead of `_R1_` is expected to exist alongside. All of the found FASTQ files are considered to belong to the sample. Each FASTQ file pair gets its own read group (`@RG`) in the resulting BAM file. -### Metadata when using `--sample` with a directory +### Metadata when using `--input` with a directory -When using `--sample` with a directory, the metadata about the sample that are written to the BAM header in the `@RG` tag are determined in the following way. +When using `--input` with a directory, the metadata about the sample that are written to the BAM header in the `@RG` tag are determined in the following way. -- The sample name (`SM`) is derived from the the last component of the path given to `--sample`. -That is, you should make sure that that directory has a meaningful name! For example, with `--sample=/my/fastqs/sample123`, the sample name will be `sample123`. +- The sample name (`SM`) is derived from the last component of the path given to `--input`. 
+That is, you should make sure that that directory has a meaningful name! For example, with `--input=/my/fastqs/sample123`, the sample name will be `sample123`. - The read group id is set to *flowcell.samplename.lane*. The flowcell id and lane number are auto-detected from the name of the first read in the FASTQ file. @@ -78,7 +78,7 @@ See the [input files documentation](docs/input.md) for more information. ## Starting from recalibration ```bash -nextflow run nf-core/sarek/main.nf --sample mysample.tsv --step recalibrate --tools +nextflow run nf-core/sarek/main.nf --input mysample.tsv --step recalibrate --tools ``` And the corresponding TSV file should be like: @@ -121,5 +121,5 @@ It is adviced to pad the variant calling regions (exons or the target) to some e To add the target BED file configure the flow like: ```bash -nextflow run nf-core/sarek/main.nf --tools haplotypecaller,strelka,mutect2 --targetBED targets.bed --sample my_panel.tsv +nextflow run nf-core/sarek/main.nf --tools haplotypecaller,strelka,mutect2 --targetBED targets.bed --input my_panel.tsv ``` diff --git a/main.nf b/main.nf index 62f73b9474..d54965cf63 100644 --- a/main.nf +++ b/main.nf @@ -27,10 +27,10 @@ def helpMessage() { The typical command for running the pipeline is as follows: - nextflow run nf-core/sarek --sample sample.tsv -profile docker + nextflow run nf-core/sarek --input sample.tsv -profile docker Mandatory arguments: - --sample Path to input TSV file on mapping, recalibrate and variantcalling steps + --input Path to input TSV file on mapping, recalibrate and variantcalling steps Multiple TSV files can be specified with quotes Works also with the path to a directory on mapping step with a single germline sample only Alternatively, path to VCF input file on annotate step @@ -70,9 +70,9 @@ def helpMessage() { --bwaIndex bwa indexes --dbsnp dbsnp file --dbsnpIndex dbsnp index - --genomeDict genome dict - --genomeFile genome file - --genomeIndex genome index + --dict dict from the fasta reference + --fasta fasta reference + --fastafai reference index --intervals intervals --knownIndels knownIndels file --knownIndelsIndex knownIndels index @@ -102,6 +102,22 @@ def helpMessage() { // Show help message if (params.help) exit 0, helpMessage() +// Handle deprecation +params.noReports = null +if (params.noReports) log.warn "The params `--noReports` is deprecated -- it will be removed in a future release.\n\tPlease check: https://github.com/nf-core/sarek/blob/master/docs/usage.md#--skipQC" +params.annotateVCF = null +if (params.annotateVCF) log.warn "The params `--annotateVCF` is deprecated -- it will be removed in a future release.\n\tPlease check: https://github.com/nf-core/sarek/blob/master/docs/usage.md#--input" +params.genomeDict = null +if (params.genomeDict) log.warn "The params `--genomeDict` is deprecated -- it will be removed in a future release.\n\tPlease check: https://github.com/nf-core/sarek/blob/master/docs/usage.md#--dict" +params.genomeFile = null +if (params.genomeFile) log.warn "The params `--genomeFile` is deprecated -- it will be removed in a future release.\n\tPlease check: https://github.com/nf-core/sarek/blob/master/docs/usage.md#--fasta" +params.genomeIndex = null +if (params.genomeIndex) log.warn "The params `--genomeIndex` is deprecated -- it will be removed in a future release.\n\tPlease check: https://github.com/nf-core/sarek/blob/master/docs/usage.md#--fastaFai" +params.sample = null +if (params.sample) log.warn "The params `--sample` is deprecated -- it will be removed in a future 
release.\n\tPlease check: https://github.com/nf-core/sarek/blob/master/docs/usage.md#--input" +params.sampleDir = null +if (params.sampleDir) log.warn "The params `--sampleDir` is deprecated -- it will be removed in a future release.\n\tPlease check: https://github.com/nf-core/sarek/blob/master/docs/usage.md#--input" + // Check if genome exists in the config file if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" @@ -116,13 +132,13 @@ params.cadd_WG_SNVs = null params.cadd_WG_SNVs_tbi = null params.cadd_cache = null params.genesplicer = null +params.input = null params.monochrome_logs = null params.multiqc_config = null params.noGVCF = null params.noStrelkaBP = null params.nucleotidesPerSecond = 1000.0 params.pon = null -params.sample = null params.sequencing_center = null params.skipQC = null params.snpEff_cache = null @@ -145,6 +161,9 @@ skipQClist = defineSkipQClist() skipQC = params.skipQC ? params.skipQC == 'all' ? skipQClist : params.skipQC.split(',').collect{it.trim().toLowerCase()} : [] if (!checkParameterList(skipQC, skipQClist)) exit 1, 'Unknown QC tool(s), see --help for more information' +// Handle deprecation +if (params.noReports) skipQC = skipQClist + annoList = defineAnnoList() annotateTools = params.annotateTools ? params.annotateTools.split(',').collect{it.trim().toLowerCase()} : [] if (!checkParameterList(annotateTools,annoList)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' @@ -171,12 +190,17 @@ if (workflow.profile == 'awsbatch') { ch_output_docs = Channel.fromPath("${baseDir}/docs/output.md") tsvPath = null -if (params.sample) if (hasExtension(params.sample, "tsv") || hasExtension(params.sample, "vcf") || hasExtension(params.sample, "vcf.gz")) tsvPath = params.sample -if (params.sample) if (hasExtension(params.sample, "vcf") || hasExtension(params.sample, "vcf.gz")) step = "annotate" +if (params.input && (hasExtension(params.input, "tsv") || hasExtension(params.input, "vcf") || hasExtension(params.input, "vcf.gz"))) tsvPath = params.input +if (params.input && (hasExtension(params.input, "vcf") || hasExtension(params.input, "vcf.gz"))) step = "annotate" + +// Handle deprecation +if (params.annotateVCF) tsvPath = params.annotateVCF +if (params.sample) tsvPath = params.sample +if (params.sampleDir) tsvPath = params.sampleDir // If no input file specified, trying to get TSV files corresponding to step in the TSV directory // only for steps recalibrate and variantCalling -if (!params.sample && step != 'mapping' && step != 'annotate') { +if (!params.input && step != 'mapping' && step != 'annotate') { tsvPath = step == 'recalibrate' ? 
"${params.outdir}/Preprocessing/TSV/duplicateMarked.tsv": "${params.outdir}/Preprocessing/TSV/recalibrated.tsv" } @@ -190,16 +214,16 @@ if (tsvPath) { case 'annotate': break default: exit 1, "Unknown step ${step}" } -} else if (params.sample) if (!hasExtension(params.sample, "tsv")) { +} else if (params.input && !hasExtension(params.input, "tsv")) { println "No TSV file" if (step != 'mapping') exit 1, 'No other step than "mapping" support a dir as an input' - println "Reading ${params.sample} directory" - inputSample = extractFastqFromDir(params.sample) + println "Reading ${params.input} directory" + inputSample = extractFastqFromDir(params.input) (inputSample, fastqTMP) = inputSample.into(2) fastqTMP.toList().subscribe onNext: { - if (it.size() == 0) exit 1, "No FASTQ files found in --sample directory '${params.sample}'" + if (it.size() == 0) exit 1, "No FASTQ files found in --input directory '${params.input}'" } - tsvFile = params.sample // used in the reports + tsvFile = params.input // used in the reports } else if (step == 'annotate') { println "Annotating ${tsvFile}" } else exit 1, 'No sample were defined, see --help' @@ -212,15 +236,15 @@ def summary = [:] if (workflow.revision) summary['Pipeline Release'] = workflow.revision summary['Run Name'] = custom_runName ?: workflow.runName summary['Max Resources'] = "${params.max_memory} memory, ${params.max_cpus} cpus, ${params.max_time} time per job" -if (workflow.containerEngine) summary['Container'] = "${workflow.containerEngine} - ${workflow.container}" -if (params.sample) summary['Sample'] = params.sample -if (params.targetBED) summary['Target BED'] = params.targetBED -if (params.step) summary['Step'] = params.step -if (params.tools) summary['Tools'] = tools.join(', ') -if (params.skipQC) summary['QC tools skip'] = skipQC.join(', ') -if (params.noGVCF) summary['No GVCF'] = params.noGVCF -if (params.noStrelkaBP) summary['No Strelka BP'] = params.noStrelkaBP -if (params.sequencing_center) summary['Sequenced by '] = params.sequencing_center +if (workflow.containerEngine) summary['Container'] = "${workflow.containerEngine} - ${workflow.container}" +if (params.input) summary['Input'] = params.input +if (params.targetBED) summary['Target BED'] = params.targetBED +if (params.step) summary['Step'] = params.step +if (params.tools) summary['Tools'] = tools.join(', ') +if (params.skipQC) summary['QC tools skip'] = skipQC.join(', ') +if (params.noGVCF) summary['No GVCF'] = params.noGVCF +if (params.noStrelkaBP) summary['No Strelka BP'] = params.noStrelkaBP +if (params.sequencing_center) summary['Sequenced by '] = params.sequencing_center if (params.pon) summary['Panel of normals '] = params.pon summary['Nucleotides/s'] = params.nucleotidesPerSecond summary['Output dir'] = params.outdir @@ -437,7 +461,7 @@ process MapReads { input: set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from inputReads - set file(genomeFile), file(bwaIndex) from Channel.value([referenceMap.genomeFile, referenceMap.bwaIndex]) + set file(fasta), file(bwaIndex) from Channel.value([referenceMap.fasta, referenceMap.bwaIndex]) output: set idPatient, idSample, idRun, file("${idRun}.bam") into bamMapped @@ -460,7 +484,7 @@ process MapReads { input = hasExtension(inputFile1, "bam") ? 
"-p /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2)" : "${inputFile1} ${inputFile2}" """ ${convertToFastq} - bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${genomeFile} \ + bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ ${input} | \ samtools sort --threads ${task.cpus} -m 2G - > ${idRun}.bam """ @@ -560,10 +584,10 @@ process BaseRecalibrator { input: set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamBaseRecalibrator - set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex), file(knownIndels), file(knownIndelsIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, + set file(fasta), file(fastaFai), file(dict), file(dbsnp), file(dbsnpIndex), file(knownIndels), file(knownIndelsIndex) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict, referenceMap.dbsnp, referenceMap.dbsnpIndex, referenceMap.knownIndels, @@ -584,7 +608,7 @@ process BaseRecalibrator { -I ${bam} \ -O ${intervalBed.baseName}_${idSample}.recal.table \ --tmp-dir /tmp \ - -R ${genomeFile} \ + -R ${fasta} \ -L ${intervalBed} \ --known-sites ${dbsnp} \ ${known} \ @@ -658,10 +682,10 @@ process ApplyBQSR { input: set idPatient, idSample, file(bam), file(bai), file(recalibrationReport), file(intervalBed) from bamApplyBQSR - set file(genomeFile), file(genomeIndex), file(genomeDict)from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict + set file(fasta), file(fastaFai), file(dict)from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict ]) output: @@ -671,7 +695,7 @@ process ApplyBQSR { """ gatk --java-options -Xmx${task.memory.toGiga()}g \ ApplyBQSR \ - -R ${genomeFile} \ + -R ${fasta} \ --input ${bam} \ --output ${intervalBed.baseName}_${idSample}.recal.bam \ -L ${intervalBed} \ @@ -820,10 +844,10 @@ process HaplotypeCaller { input: set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamHaplotypeCaller - set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, + set file(fasta), file(fastaFai), file(dict), file(dbsnp), file(dbsnpIndex) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict, referenceMap.dbsnp, referenceMap.dbsnpIndex ]) @@ -838,7 +862,7 @@ process HaplotypeCaller { """ gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ HaplotypeCaller \ - -R ${genomeFile} \ + -R ${fasta} \ -I ${bam} \ -L ${intervalBed} \ -D ${dbsnp} \ @@ -859,10 +883,10 @@ process GenotypeGVCFs { input: set idPatient, idSample, file(intervalBed), file(gvcf) from gvcfGenotypeGVCFs - set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, + set file(fasta), file(fastaFai), file(dict), file(dbsnp), file(dbsnpIndex) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict, referenceMap.dbsnp, referenceMap.dbsnpIndex ]) @@ -880,7 +904,7 @@ process GenotypeGVCFs { gatk --java-options -Xmx${task.memory.toGiga()}g \ GenotypeGVCFs \ - -R ${genomeFile} \ + -R ${fasta} \ -L ${intervalBed} \ -D ${dbsnp} \ -V ${gvcf} \ @@ -903,9 +927,9 @@ process StrelkaSingle { input: set idPatient, 
idSample, file(bam), file(bai) from bamStrelkaSingle file(targetBED) from Channel.value(params.targetBED ? file(params.targetBED) : "null") - set file(genomeFile), file(genomeIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex + set file(fasta), file(fastaFai) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai ]) output: @@ -920,7 +944,7 @@ process StrelkaSingle { ${beforeScript} configureStrelkaGermlineWorkflow.py \ --bam ${bam} \ - --referenceFasta ${genomeFile} \ + --referenceFasta ${fasta} \ ${options} \ --runDir Strelka @@ -952,9 +976,9 @@ process MantaSingle { input: set idPatient, idSample, file(bam), file(bai) from bamMantaSingle file(targetBED) from Channel.value(params.targetBED ? file(params.targetBED) : "null") - set file(genomeFile), file(genomeIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex + set file(fasta), file(fastaFai) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai ]) output: @@ -972,7 +996,7 @@ process MantaSingle { ${beforeScript} configManta.py \ ${inputbam} ${bam} \ - --reference ${genomeFile} \ + --reference ${fasta} \ ${options} \ --runDir Manta @@ -1010,9 +1034,9 @@ process TIDDIT { input: set idPatient, idSample, file(bam), file(bai) from bamTIDDIT - set file(genomeFile), file(genomeIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex + set file(fasta), file(fastaFai) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai ]) output: @@ -1023,7 +1047,7 @@ process TIDDIT { script: """ - tiddit --sv -o TIDDIT_${idSample} --bam ${bam} --ref ${genomeFile} + tiddit --sv -o TIDDIT_${idSample} --bam ${bam} --ref ${fasta} mv TIDDIT_${idSample}.vcf TIDDIT_${idSample}.old.vcf @@ -1080,8 +1104,10 @@ process FreeBayes { input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamFreeBayes - file(genomeFile) from Channel.value(referenceMap.genomeFile) - file(genomeIndex) from Channel.value(referenceMap.genomeIndex) + set file(fasta), file(fastaFai) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai + ]) output: set val("FreeBayes"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into vcfFreeBayes @@ -1091,7 +1117,7 @@ process FreeBayes { script: """ freebayes \ - -f ${genomeFile} \ + -f ${fasta} \ --pooled-continuous \ --pooled-discrete \ --genotype-qualities \ @@ -1116,10 +1142,10 @@ process Mutect2 { input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamMutect2 - set file(genomeFile), file(genomeIndex), file(genomeDict), file(intervals), file(germlineResource), file(germlineResourceIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, + set file(fasta), file(fastaFai), file(dict), file(intervals), file(germlineResource), file(germlineResourceIndex) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict, referenceMap.intervals, referenceMap.germlineResource, referenceMap.germlineResourceIndex @@ -1147,7 +1173,7 @@ process Mutect2 { # this case we are getting raw calls only for the intervals, we also have to concatenate them gatk --java-options "-Xmx${task.memory.toGiga()}g" \ Mutect2 \ - -R ${genomeFile}\ + -R ${fasta}\ -I ${bamTumor} -tumor ${idSampleTumor} \ -I 
${bamNormal} -normal ${idSampleNormal} \ -L ${intervalBed} \ @@ -1177,10 +1203,10 @@ process MergeMutect2Stats { idSampleTumor, idSampleNormal, file(statsFiles) from mutect2Stats // the actual stats files - set file(genomeFile), file(genomeIndex), file(genomeDict), file(intervals), file(germlineResource), file(germlineResourceIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, + set file(fasta), file(fastaFai), file(dict), file(intervals), file(germlineResource), file(germlineResourceIndex) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict, referenceMap.intervals, referenceMap.germlineResource, referenceMap.germlineResourceIndex @@ -1217,7 +1243,7 @@ process ConcatVCF { input: set variantCaller, idPatient, idSample, file(vcFiles) from vcfConcatenateVCFs - file(genomeIndex) from Channel.value(referenceMap.genomeIndex) + file(fastaFai) from Channel.value(referenceMap.fastaFai) file(targetBED) from Channel.value(params.targetBED ? file(params.targetBED) : "null") output: @@ -1235,13 +1261,15 @@ process ConcatVCF { outputFile = "${variantCaller}_${idSample}.vcf" options = params.targetBED ? "-t ${targetBED}" : "" """ - concatenateVCFs.sh -i ${genomeIndex} -c ${task.cpus} -o ${outputFile} ${options} + concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} """ } (vcfConcatenated, vcfConcatenatedForFilter) = vcfConcatenated.into(2) vcfConcatenated = vcfConcatenated.dump(tag:'VCF') +// STEP GATK MUTECT2 + process PileupSummariesForMutect2 { tag {idSampleTumor + "_vs_" + idSampleNormal + "_" + intervalBed.baseName } label 'cpus_1' @@ -1286,7 +1314,7 @@ process MergePileupSummaries { input: set idPatient, idSampleTumor, file(pileupSums) from pileupSummaries - file(genomeDict) from Channel.value([referenceMap.genomeDict]) + file(dict) from Channel.value([referenceMap.dict]) output: file("${idSampleTumor}_pileupsummaries.table.tsv") into mergedPileupFile @@ -1297,7 +1325,7 @@ process MergePileupSummaries { """ gatk --java-options "-Xmx${task.memory.toGiga()}g" \ GatherPileupSummaries \ - --sequence-dictionary ${genomeDict} \ + --sequence-dictionary ${dict} \ ${allPileups} \ -O ${idSampleTumor}_pileupsummaries.table.tsv """ @@ -1340,10 +1368,10 @@ process FilterMutect2Calls { idSampleTN, file(unfiltered), file(unfilteredIndex) from vcfConcatenatedForFilter - set file(genomeFile), file(genomeIndex), file(genomeDict), file(intervals), file(germlineResource), file(germlineResourceIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, + set file(fasta), file(fastaFai), file(dict), file(intervals), file(germlineResource), file(germlineResourceIndex) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict, referenceMap.intervals, referenceMap.germlineResource, referenceMap.germlineResourceIndex @@ -1369,7 +1397,7 @@ process FilterMutect2Calls { -V $unfiltered \ --contamination-table ${idSampleTN}_contamination.table \ --stats ${idSampleTN}.vcf.gz.stats \ - -R ${genomeFile} \ + -R ${fasta} \ -O filtered_${variantCaller}_${idSampleTN}.vcf.gz """ } @@ -1387,10 +1415,10 @@ process Strelka { input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamStrelka file(targetBED) from Channel.value(params.targetBED ? 
file(params.targetBED) : "null") - set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict + set file(fasta), file(fastaFai), file(dict) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict ]) output: @@ -1406,7 +1434,7 @@ process Strelka { configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ --normal ${bamNormal} \ - --referenceFasta ${genomeFile} \ + --referenceFasta ${fasta} \ ${options} \ --runDir Strelka @@ -1438,9 +1466,9 @@ process Manta { input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamManta file(targetBED) from Channel.value(params.targetBED ? file(params.targetBED) : "null") - set file(genomeFile), file(genomeIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex + set file(fasta), file(fastaFai) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai ]) output: @@ -1457,7 +1485,7 @@ process Manta { configManta.py \ --normalBam ${bamNormal} \ --tumorBam ${bamTumor} \ - --reference ${genomeFile} \ + --reference ${fasta} \ ${options} \ --runDir Manta @@ -1506,10 +1534,10 @@ process StrelkaBP { input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(mantaCSI), file(mantaCSIi) from pairBamStrelkaBP file(targetBED) from Channel.value(params.targetBED ? file(params.targetBED) : "null") - set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict + set file(fasta), file(fastaFai), file(dict) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict ]) output: @@ -1525,7 +1553,7 @@ process StrelkaBP { configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ --normal ${bamNormal} \ - --referenceFasta ${genomeFile} \ + --referenceFasta ${fasta} \ --indelCandidates ${mantaCSI} \ ${options} \ --runDir Strelka @@ -1556,11 +1584,11 @@ process AlleleCounter { input: set idPatient, idSample, file(bam), file(bai) from bamAscat - set file(acLoci), file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ + set file(acLoci), file(fasta), file(fastaFai), file(dict) from Channel.value([ referenceMap.acLoci, - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict + referenceMap.fasta, + referenceMap.fastaFai, + referenceMap.dict ]) output: @@ -1572,7 +1600,7 @@ process AlleleCounter { """ alleleCounter \ -l ${acLoci} \ - -r ${genomeFile} \ + -r ${fasta} \ -b ${bam} \ -o ${idSample}.alleleCount; """ @@ -1657,9 +1685,9 @@ process Mpileup { input: set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamMpileup - set file(genomeFile), file(genomeIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex + set file(fasta), file(fastaFai) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai ]) output: @@ -1670,7 +1698,7 @@ process Mpileup { script: """ samtools mpileup \ - -f ${genomeFile} ${bam} \ + -f ${fasta} ${bam} \ -l ${intervalBed} \ | bgzip --threads ${task.cpus} -c > ${intervalBed.baseName}_${idSample}.pileup.gz """ @@ -1732,9 +1760,9 @@ process ControlFREEC { input: set idPatient, idSampleNormal, idSampleTumor, file(mpileupNormal), file(mpileupTumor) from mpileupOut - set file(genomeFile), file(genomeIndex), file(dbsnp), file(dbsnpIndex), 
file(chrDir), file(chrLength) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, + set file(fasta), file(fastaFai), file(dbsnp), file(dbsnpIndex), file(chrDir), file(chrLength) from Channel.value([ + referenceMap.fasta, + referenceMap.fastaFai, referenceMap.dbsnp, referenceMap.dbsnpIndex, referenceMap.chrDir, @@ -2434,6 +2462,11 @@ def checkParameterList(list, realList) { // Check if params.item exists and return params.genomes[params.genome].item otherwise def checkParamReturnFile(item) { + // Handle deprecation + if (params.genomeDict && item == "dict") return file(params.genomeDict) + if (params.genomeFile && item == "fasta") return file(params.genomeFile) + if (params.genomeIndex && item == "fastaFai") return file(params.genomeIndex) + params."${item}" = params.genomes[params.genome]."${item}" return file(params."${item}") } @@ -2462,9 +2495,9 @@ def checkReferenceMap(referenceMap) { // Define map of reference depending of tools and step def defineReferenceMap(step, tools) { def referenceMap = [ - 'genomeDict' : checkParamReturnFile("genomeDict"), - 'genomeFile' : checkParamReturnFile("genomeFile"), - 'genomeIndex' : checkParamReturnFile("genomeIndex"), + 'dict' : checkParamReturnFile("dict"), + 'fasta' : checkParamReturnFile("fasta"), + 'fastaFai' : checkParamReturnFile("fastaFai"), 'intervals' : checkParamReturnFile("intervals") ] if ('mapping' in step) { @@ -2554,6 +2587,12 @@ def defineToolList() { ] } +// Print deprecation message +def deprecationMessage(oldItem, newItem = null) { + extra = newItem == null ? "": ", please use `${newItem}` instead" + log.warn "The ${oldItem} is deprecated${extra} -- it will be removed in a future release" +} + // Channeling the TSV file containing BAM. // Format is: "subject gender status sample bam bai" def extractBam(tsvFile) { diff --git a/nextflow.config b/nextflow.config index 686593b934..ec1ca9b05c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -100,7 +100,7 @@ dag { manifest { name = 'nf-core/sarek' - author = 'Maxime Garcia' + author = 'Maxime Garcia, Szilveszter Juhos' homePage = 'https://github.com/nf-core/sarek' description = 'An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing' mainScript = 'main.nf' diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 6e125e3c9a..2f852a6655 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -109,21 +109,21 @@ esac case $TEST in ANNOTATE) - run_sarek --step annotate --tools ${ANNOTATOR} --sample ${PATHTOSAMPLE}/vcf/Strelka_1234N_variants.vcf.gz + run_sarek --step annotate --tools ${ANNOTATOR} --input ${PATHTOSAMPLE}/vcf/Strelka_1234N_variants.vcf.gz ;; GERMLINE) - run_sarek --tools=false --sample data/testdata/tiny/normal - run_sarek --tools=false --sample results/Preprocessing/TSV/duplicateMarked.tsv --step recalibrate - run_sarek --tools HaplotypeCaller --sample results/Preprocessing/TSV/recalibrated.tsv --step variantCalling + run_sarek --tools=false --input data/testdata/tiny/normal + run_sarek --tools=false --input results/Preprocessing/TSV/duplicateMarked.tsv --step recalibrate + run_sarek --tools HaplotypeCaller --input results/Preprocessing/TSV/recalibrated.tsv --step variantCalling ;; MULTIPLE) - run_sarek ${OPTIONS},snpEff,VEP,merge --sample ${PATHTOSAMPLE}/tsv/tiny-multiple${SUFFIX}.tsv + run_sarek ${OPTIONS},snpEff,VEP,merge --input ${PATHTOSAMPLE}/tsv/tiny-multiple${SUFFIX}.tsv ;; SOMATIC) - run_sarek ${OPTIONS} --sample ${PATHTOSAMPLE}/tsv/tiny-manta${SUFFIX}.tsv + run_sarek 
${OPTIONS} --input ${PATHTOSAMPLE}/tsv/tiny-manta${SUFFIX}.tsv ;; TARGETED) - run_sarek ${OPTIONS} --sample ${PATHTOSAMPLE}/tsv/tiny-manta${SUFFIX}.tsv --targetBED ${PATHTOSAMPLE}/target.bed + run_sarek ${OPTIONS} --input ${PATHTOSAMPLE}/tsv/tiny-manta${SUFFIX}.tsv --targetBED ${PATHTOSAMPLE}/target.bed ;; esac
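
For reference, the renamed parameters introduced in this changeset can be combined into a single launch command, sketched below. This is only an illustrative example: the TSV and reference file paths are placeholders, and the deprecated spellings (`--sample`, `--sampleDir`, `--annotateVCF`, `--genomeFile`, `--genomeIndex`, `--genomeDict`, `--noReports`) remain recognised by `main.nf` but now log a deprecation warning pointing to the new names.

```bash
# Illustrative launch command using the renamed parameters:
#   --input    replaces --sample / --sampleDir / --annotateVCF
#   --fasta    replaces --genomeFile
#   --fastaFai replaces --genomeIndex
#   --dict     replaces --genomeDict
#   --skipQC   replaces --noReports (`--skipQC all` disables all QC and reporting tools)
# The TSV and reference paths below are placeholders.
nextflow run nf-core/sarek -profile docker \
    --input sample.tsv \
    --fasta references/human_g1k_v37_decoy.fasta \
    --fastaFai references/human_g1k_v37_decoy.fasta.fai \
    --dict references/human_g1k_v37_decoy.dict \
    --skipQC all
```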