diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 83dee22c5e..18695a1d22 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,47 +19,30 @@ jobs: NXF_ANSI_LOG: false strategy: matrix: - # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['20.04.0', ''] + # Nextflow versions: check pipeline minimum + nxf_ver: ['20.11.0-edge'] steps: - name: Check out pipeline code uses: actions/checkout@v2 - - name: Check if Dockerfile or Conda environment changed - uses: technote-space/get-diff-action@v4 - with: - FILES: | - Dockerfile - environment.yml - - name: Build new docker image - if: env.MATCHED_FILES - run: docker build --no-cache . -t nfcore/sarek:dev - - - name: Pull docker image - if: ${{ !env.MATCHED_FILES }} - run: | - docker pull nfcore/sarek:dev - docker tag nfcore/sarek:dev nfcore/sarek:dev - - name: Install Nextflow env: CAPSULE_LOG: none run: | wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - name: Run pipeline with test data run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker - annotation: + aligner: + name: Run aligner tests env: NXF_ANSI_LOG: false runs-on: ubuntu-latest strategy: matrix: - tools: [snpeff, vep] - species: [WBcel235] + aligner: [bwa-mem, bwa-mem2] steps: - uses: actions/checkout@v2 - name: Install Nextflow @@ -68,15 +51,12 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '20.04.0' - - name: Pull docker image - run: | - docker pull nfcore/sarek:dev - docker pull nfcore/sarek${{ matrix.tools }}:dev.${{ matrix.species }} - - name: Run annotation test - run: nextflow run ${GITHUB_WORKSPACE} -profile test_annotation,docker --tools ${{ matrix.tools }} + NXF_VER: '20.11.0-edge' + - name: Run ${{ matrix.profile }} test + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{ matrix.aligner }} germline: + name: Run input from a folder test and restart from step tests env: NXF_ANSI_LOG: false runs-on: ubuntu-latest @@ -91,99 +71,99 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '20.04.0' - - name: Pull docker image - run: docker pull nfcore/sarek:dev + NXF_VER: '20.11.0-edge' - name: Get test data run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - - name: Run germline test - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --saved_bam_mapped - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling + - name: Run germline test with ${{ matrix.markduplicates }} --step mapping + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --save_bam_mapped + - name: Run germline test with ${{ matrix.markduplicates }} --step prepare_recalibration + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume + - name: Run germline test with ${{ matrix.markduplicates }} --step recalibrate + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} 
--input=false --step recalibrate -resume + - name: Run germline test with ${{ matrix.markduplicates }} --step variantCalling + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling - minimal: - env: - NXF_ANSI_LOG: false - runs-on: ubuntu-latest - strategy: - matrix: - genome: [smallerGRCh37, minimalGRCh37] - intervals: [--no_intervals, ''] - steps: - - uses: actions/checkout@v2 - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - env: - # Only check Nextflow pipeline minimum version - NXF_VER: '20.04.0' - - name: Pull docker image - run: docker pull nfcore/sarek:dev - - name: Run test for minimal genomes - run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --skipQC all --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes + # annotation: + # env: + # NXF_ANSI_LOG: false + # runs-on: ubuntu-latest + # strategy: + # matrix: + # tools: [snpeff, vep] + # species: [WBcel235] + # steps: + # - uses: actions/checkout@v2 + # - name: Install Nextflow + # run: | + # wget -qO- get.nextflow.io | bash + # sudo mv nextflow /usr/local/bin/ + # env: + # # Only check Nextflow pipeline minimum version + # NXF_VER: '20.11.0-edge' + # - name: Pull docker image + # run: | + # docker pull nfcore/sarek${{ matrix.tools }}:dev.${{ matrix.species }} + # - name: Run annotation test + # run: nextflow run ${GITHUB_WORKSPACE} -profile test_annotation,docker --tools ${{ matrix.tools }} - profile: - env: - NXF_ANSI_LOG: false - runs-on: ubuntu-latest - strategy: - matrix: - profile: [test_split_fastq, test_targeted, test_trimming, test_use_gatk_spark, test_umi_tso, test_umi_qiaseq] - steps: - - uses: actions/checkout@v2 - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - env: - # Only check Nextflow pipeline minimum version - NXF_VER: '20.04.0' - - name: Pull docker image - run: docker pull nfcore/sarek:dev - - name: Run ${{ matrix.profile }} test - run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker + # minimal: + # env: + # NXF_ANSI_LOG: false + # runs-on: ubuntu-latest + # strategy: + # matrix: + # genome: [smallerGRCh37, minimalGRCh37] + # intervals: [--no_intervals, ''] + # steps: + # - uses: actions/checkout@v2 + # - name: Install Nextflow + # run: | + # wget -qO- get.nextflow.io | bash + # sudo mv nextflow /usr/local/bin/ + # env: + # # Only check Nextflow pipeline minimum version + # NXF_VER: '20.11.0-edge' + # - name: Run test for minimal genomes + # run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --skipQC all --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes - aligner: - env: - NXF_ANSI_LOG: false - runs-on: ubuntu-latest - strategy: - matrix: - aligner: [bwa-mem, bwa-mem2] - steps: - - uses: actions/checkout@v2 - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - env: - # Only check Nextflow pipeline minimum version - NXF_VER: '20.04.0' - - name: Pull docker image - run: docker pull nfcore/sarek:dev - - name: Run ${{ matrix.profile }} test - run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{ matrix.aligner }} + # profile: + # env: + # NXF_ANSI_LOG: false + # runs-on: ubuntu-latest + # strategy: + # matrix: + # profile: [test_split_fastq, test_targeted, test_trimming, test_no_gatk_spark, test_umi_tso, test_umi_qiaseq] + # steps: + # 
- uses: actions/checkout@v2 + # - name: Install Nextflow + # run: | + # wget -qO- get.nextflow.io | bash + # sudo mv nextflow /usr/local/bin/ + # env: + # # Only check Nextflow pipeline minimum version + # NXF_VER: '20.11.0-edge' + # - name: Run ${{ matrix.profile }} test + # run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker tools: + name: Run tool tests env: NXF_ANSI_LOG: false runs-on: ubuntu-latest strategy: matrix: - tool: [Haplotypecaller, Freebayes, Manta, mpileup, MSIsensor, Strelka, TIDDIT] + # tool: [Haplotypecaller, Freebayes, Manta, mpileup, MSIsensor, Strelka, TIDDIT] + tool: [Haplotypecaller, Strelka] intervals: [--no_intervals, ''] exclude: - - tool: Manta - intervals: --no_intervals - - tool: MSIsensor - intervals: --no_intervals + # - tool: Manta + # intervals: --no_intervals + # - tool: MSIsensor + # intervals: --no_intervals - tool: Strelka intervals: --no_intervals - - tool: TIDDIT - intervals: --no_intervals + # - tool: TIDDIT + # intervals: --no_intervals steps: - uses: actions/checkout@v2 - name: Install Nextflow @@ -192,8 +172,6 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '20.04.0' - - name: Pull docker image - run: docker pull nfcore/sarek:dev + NXF_VER: '20.11.0-edge' - name: Run ${{ matrix.tool }} test run: nextflow run ${GITHUB_WORKSPACE} -profile test_tool,docker --tools ${{ matrix.tool }} ${{ matrix.intervals }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 45dd4e4bd3..9e6478f170 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- [#238](https://github.com/nf-core/sarek/pull/238) -Add subworkflow for building all the indices +- [#241](https://github.com/nf-core/sarek/pull/241) -Add modules and workflows parts for preprocessing steps + ### Changed +- [#233](https://github.com/nf-core/sarek/pull/233) - Switch `bwa 0.7.17` for `bwa-mem2 2.0` +- [#234](https://github.com/nf-core/sarek/pull/234) -Switching to DSL2 +- [#234](https://github.com/nf-core/sarek/pull/234) - Update Nextflow `19.10.0` -> `20.04.1` - [#333](https://github.com/nf-core/sarek/pull/333) - Bump `Sarek` version to `3.0dev` ### Fixed diff --git a/README.md b/README.md index 14546f22b6..07a8557837 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > **An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing** -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.0-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A520.11.0--edge-brightgreen.svg)](https://www.nextflow.io/) [![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/) [![DOI](https://zenodo.org/badge/184289291.svg)](https://zenodo.org/badge/latestdoi/184289291) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 2865e59a99..49b2e00887 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -19,6 +19,7 @@ top_modules: name: 'FastQC' path_filters_exclude: - '*trimmed_fastqc*' +- 'cutadapt' - 'fastqc': name: 'FastQC after trimming' info: 'FastQC after applying TrimGalore.' 
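Side note on the CI changes above: the workflow now pins the pipeline minimum Nextflow version (20.11.0-edge) and for the moment only keeps the basic test, aligner, germline and reduced tools jobs active, with the other jobs commented out. A minimal sketch for reproducing the active checks locally, assuming Docker is installed and the commands are run from the pipeline root (they simply mirror the workflow steps above, with a local `nextflow run .` standing in for `nextflow run ${GITHUB_WORKSPACE}`):

    # Use the same Nextflow version the CI pins
    export NXF_VER='20.11.0-edge'
    wget -qO- get.nextflow.io | bash && sudo mv nextflow /usr/local/bin/
    # Basic test profile and the aligner matrix
    nextflow run . -profile test,docker
    nextflow run . -profile test,docker --aligner bwa-mem
    nextflow run . -profile test,docker --aligner bwa-mem2
    # One entry of the tools matrix
    nextflow run . -profile test_tool,docker --tools Haplotypecaller --no_intervals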
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index ac12fb994b..1318be7b88 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -6,54 +6,60 @@ regexes = { "ASCAT": ["v_ascat.txt", r"Version: (\S+)"], "AlleleCount": ["v_allelecount.txt", r"(\S+)"], + "bcftools": ["v_bcftools.txt", r"bcftools (\S+)"], "BWA": ["v_bwa.txt", r"Version: (\S+)"], + "BWA-MEM2": ["v_bwamem2.txt", r"(\S+)"], "CNVkit": ["v_cnvkit.txt", r"(\S+)"], "Control-FREEC": ["v_controlfreec.txt", r"Control-FREEC\s(\S+)"], "FastQC": ["v_fastqc.txt", r"FastQC v(\S+)"], "FreeBayes": ["v_freebayes.txt", r"version: v(\d\.\d\.\d+)"], "GATK": ["v_gatk.txt", r"Version:(\S+)"], + "htslib": ["v_samtools.txt", r"htslib (\S+)"], "Manta": ["v_manta.txt", r"([0-9.]+)"], "MultiQC": ["v_multiqc.txt", r"multiqc, version (\S+)"], + "msisensor": ["v_msisensor.txt", r"Version: v(\S+)"], "Nextflow": ["v_nextflow.txt", r"(\S+)"], + "nf-core/sarek": ["v_pipeline.txt", r"(\S+)"], "QualiMap": ["v_qualimap.txt", r"QualiMap v.(\S+)"], "R": ["v_r.txt", r"R version (\S+)"], + "samtools": ["v_samtools.txt", r"samtools (\S+)"], "SnpEff": ["v_snpeff.txt", r"SnpEff\s(\S+)"], "Strelka": ["v_strelka.txt", r"([0-9.]+)"], "TIDDIT": ["v_tiddit.txt", r"TIDDIT-(\S+)"], "Trim Galore": ["v_trim_galore.txt", r"version (\S+)"], - "VEP": ["v_vep.txt", r"ensembl-vep : (\S+)"], - "bcftools": ["v_bcftools.txt", r"bcftools (\S+)"], - "htslib": ["v_samtools.txt", r"htslib (\S+)"], - "msisensor": ["v_msisensor.txt", r"Version: v(\S+)"], - "nf-core/sarek": ["v_pipeline.txt", r"(\S+)"], - "samtools": ["v_samtools.txt", r"samtools (\S+)"], - "vcftools": ["v_vcftools.txt", r"([0-9.]+)"] + "vcftools": ["v_vcftools.txt", r"([0-9.]+)"], + "VEP": ["v_vep.txt", r"ensembl-vep : (\S+)"] } results = OrderedDict() results["nf-core/sarek"] = 'N/A' results["Nextflow"] = 'N/A' + results["BWA"] = 'N/A' +results["BWA-MEM2"] = 'N/A' results["GATK"] = 'N/A' -results["FreeBayes"] = 'N/A' +results["htslib"] = 'N/A' results["samtools"] = 'N/A' -results["Strelka"] = 'N/A' -results["Manta"] = 'N/A' -results["TIDDIT"] = 'N/A' -results["AlleleCount"] = 'N/A' + results["ASCAT"] = 'N/A' +results["AlleleCount"] = 'N/A' +results["CNVkit"] = 'N/A' results["Control-FREEC"] = 'N/A' +results["FreeBayes"] = 'N/A' +results["Manta"] = 'N/A' results["msisensor"] = 'N/A' +results["R"] = 'N/A' +results["Strelka"] = 'N/A' +results["TIDDIT"] = 'N/A' + results["SnpEff"] = 'N/A' results["VEP"] = 'N/A' -results["MultiQC"] = 'N/A' -results["FastQC"] = 'N/A' + results["bcftools"] = 'N/A' -results["CNVkit"] = 'N/A' -results["htslib"] = 'N/A' +results["FastQC"] = 'N/A' +results["MultiQC"] = 'N/A' results["QualiMap"] = 'N/A' results["Trim Galore"] = 'N/A' results["vcftools"] = 'N/A' -results["R"] = 'N/A' # Search each file using its regex for k, v in regexes.items(): diff --git a/conf/base.config b/conf/base.config index ac13fe4927..3381bed5ff 100644 --- a/conf/base.config +++ b/conf/base.config @@ -41,55 +41,55 @@ process { time = { check_resource( 20.h * task.attempt) } } - withLabel:cpus_1 { + withLabel:CPUS_1 { cpus = {check_resource(1)} } - withLabel:cpus_2 { + withLabel:CPUS_2 { cpus = {check_resource(2)} } - withLabel:cpus_4 { + withLabel:CPUS_4 { cpus = {check_resource(4)} } - withLabel:cpus_8 { + withLabel:CPUS_8 { cpus = {check_resource(8)} } - withLabel:cpus_16 { + withLabel:CPUS_16 { cpus = {check_resource(16)} } - withLabel:cpus_max { + withLabel:CPUS_MAX { cpus = {params.max_cpus} } - withLabel:memory_singleCPU_2_task { + 
withLabel:MEMORY_SINGLECPU_2_TASK { memory = {check_resource((params.single_cpu_mem as nextflow.util.MemoryUnit) * 2 * task.attempt)} } - withLabel:memory_singleCPU_task_sq { + withLabel:MEMORY_SINGLECPU_TASK_SQ { memory = {check_resource((params.single_cpu_mem as nextflow.util.MemoryUnit) * task.attempt * task.attempt)} } - withLabel:memory_max { + withLabel:MEMORY_MAX { memory = {params.max_memory} } - withName:get_software_versions { + withName:GET_SOFTWARE_VERSIONS { cache = false } - withLabel:concat_vcf { - // For unknown reasons, ConcatVCF sometimes fails with SIGPIPE + withName:CONCAT_VCF { + // For unknown reasons, CONCAT_VCF sometimes fails with SIGPIPE // (exit code 141). Rerunning the process will usually work. errorStrategy = {task.exitStatus == 141 ? 'retry' : 'terminate'} } - withLabel:FastQC { + withLabel:FASTQC { errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} } - withName:MapReads { + withName:BWAMEM2_MEM { memory = {check_resource(60.GB * task.attempt)} time = {check_resource(48.h * task.attempt)} } - withName:MultiQC { + withName:MULTIQC { errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} } - withName:Snpeff { + withName:SNPEFF { container = {(params.annotation_cache && params.snpeff_cache) ? 'nfcore/sarek:dev' : "nfcore/sareksnpeff:dev.${params.genome}"} errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} } diff --git a/conf/modules.config b/conf/modules.config new file mode 100644 index 0000000000..cb73ab1d10 --- /dev/null +++ b/conf/modules.config @@ -0,0 +1,163 @@ +/* + * ------------------------------------------------- + * Nextflow config file for module specific-options + * ------------------------------------------------- + */ + +params { + modules { +// BUILD_INDICES + 'build_intervals' { + publish_dir = "reference" + publish_files = "false" + } + 'bwa_index' { + publish_dir = "reference" + publish_files = "false" + } + 'bwamem2_index' { + publish_dir = "reference" + publish_files = "false" + } + 'create_intervals_bed' { + publish_dir = "reference" + publish_files = "false" + } + 'dict' { + publish_dir = "reference" + publish_files = "false" + } + 'samtools_faidx' { + publish_dir = "reference" + publish_files = "false" + } + 'tabix_dbsnp' { + publish_dir = "reference" + publish_files = "false" + } + 'tabix_germline_resource' { + publish_dir = "reference" + publish_files = "false" + } + 'tabix_known_indels' { + publish_dir = "reference" + publish_files = "false" + } + 'tabix_pon' { + publish_dir = "reference" + publish_files = "false" + } +// MAPPING + 'bwa_mem1_mem' { + args = "-K 100000000 -M" + publish_files = "false" + } + 'bwa_mem2_mem' { + args = "-K 100000000 -M" + publish_files = "false" + } + 'merge_bam_mapping' { + publish_by_id = "true" + publish_dir = "preprocessing" + } + 'qualimap_bamqc_mapping' { + publish_by_id = "true" + publish_dir = "reports/qualimap" + } + 'samtools_index_mapping' { + publish_by_id = "true" + publish_dir = "preprocessing" + } + 'samtools_stats_mapping' { + publish_by_id = "true" + publish_dir = "reports/samtools_stats" + } +// MARKDUPLICATES + 'markduplicates' { + args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" + suffix = ".md" + publish_by_id = "true" + publish_dir = "preprocessing" + publish_files = ['md.bam': 'markduplicates', 'md.bam.bai': 'markduplicates'] + } +// PREPARE_RECALIBRATION + 'baserecalibrator' { + publish_by_id = "true" + publish_dir = "preprocessing" + publish_files = ['recal.table':'markduplicates'] + } + 'gatherbqsrreports' 
{ + publish_by_id = "true" + publish_dir = "preprocessing" + publish_files = ['recal.table':'markduplicates'] + } +// RECALIBRATE + 'applybqsr' { + suffix = "recal" + publish_by_id = "true" + publish_dir = "preprocessing" + } + 'merge_bam_recalibrate' { + suffix = "recal" + publish_by_id = "true" + publish_dir = "preprocessing" + } + 'qualimap_bamqc_recalibrate' { + publish_by_id = "true" + publish_dir = "reports/qualimap" + } + 'samtools_index_recalibrate' { + suffix = "recal" + publish_by_id = "true" + publish_dir = "preprocessing" + publish_files = ['recal.bam':'recalibrated', 'recal.bam.bai':'recalibrated'] + } + 'samtools_stats_recalibrate' { + publish_by_id = "true" + publish_dir = "reports/samtools_stats" + } +// GERMLINE_VARIANT_CALLING + 'haplotypecaller' { + publish_by_id = "true" + publish_dir = "variant_calling" + publish_files = "false" + } + 'genotypegvcf' { + publish_by_id = "true" + publish_dir = "variant_calling" + publish_files = "false" + } + 'concat_haplotypecaller' { + suffix = "haplotypecaller" + publish_by_id = "true" + publish_dir = "variant_calling" + publish_files = ['vcf.gz':'haplotypecaller', 'vcf.gz.tbi':'haplotypecaller'] + } + 'concat_gvcf' { + suffix = "haplotypecaller_gvcf" + publish_by_id = "true" + publish_dir = "variant_calling" + publish_files = ['vcf.gz':'haplotypecaller_gvcf', 'vcf.gz.tbi':'haplotypecaller_gvcf'] + } + 'strelka_germline' { + publish_by_id = "true" + publish_dir = "variant_calling" + publish_files = ['vcf.gz':'strelka', 'vcf.gz.tbi':'strelka'] + } +// QC_TRIM + 'fastqc' { + args = "--quiet" + publish_by_id = "true" + publish_dir = "reports/fastqc" + } + 'trimgalore' { + args = "--fastqc" + publish_by_id = "true" + publish_dir = "trimgalore" + } +// OTHERS + 'get_software_versions' { + publish_dir = "pipeline_info" + } + } +} diff --git a/conf/test.config b/conf/test.config index 50f523c9b2..bbbf1a0f94 100644 --- a/conf/test.config +++ b/conf/test.config @@ -28,13 +28,16 @@ params { vep_cache_version = '99' } -process { - withName:Snpeff { - container = 'nfcore/sareksnpeff:dev.WBcel235' - maxForks = 1 - } - withLabel:VEP { - container = 'nfcore/sarekvep:dev.WBcel235' - maxForks = 1 - } +/* + * TODO: uncomment when ready + process { + withName:Snpeff { + container = 'nfcore/sareksnpeff:dev.WBcel235' + maxForks = 1 + } + withLabel:VEP { + container = 'nfcore/sarekvep:dev.WBcel235' + maxForks = 1 + } } +*/ diff --git a/conf/test_germline_variantcalling.config b/conf/test_germline_variantcalling.config new file mode 100644 index 0000000000..7cec01feca --- /dev/null +++ b/conf/test_germline_variantcalling.config @@ -0,0 +1,15 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. 
Use as follows: + * nextflow run nf-core/sarek -profile test_tool + */ + +includeConfig 'test.config' + +params { + // Input data + tools = 'haplotypecaller,strelka' +} \ No newline at end of file diff --git a/containers/snpeff/Dockerfile b/containers/snpeff/Dockerfile index 6af8b3f089..85e488fc8e 100644 --- a/containers/snpeff/Dockerfile +++ b/containers/snpeff/Dockerfile @@ -9,7 +9,7 @@ COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-sarek-snpeff-dev/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-sarek-snpeff-3.0dev/bin:$PATH # Setup default ARG variables ARG GENOME=GRCh38 @@ -19,4 +19,8 @@ ARG SNPEFF_CACHE_VERSION=86 RUN snpEff download -v ${GENOME}.${SNPEFF_CACHE_VERSION} # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-sarek-snpeff-dev > nf-core-sarek-snpeff-dev.yml +RUN conda env export --name nf-core-sarek-snpeff-3.0dev > nf-core-sarek-snpeff-3.0dev.yml + +# Instruct R processes to use these empty files instead of clashing with a local version +RUN touch .Rprofile +RUN touch .Renviron diff --git a/containers/snpeff/environment.yml b/containers/snpeff/environment.yml index 424e23e305..f04b058328 100644 --- a/containers/snpeff/environment.yml +++ b/containers/snpeff/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: -# conda env create -f environment.yml -name: nf-core-sarek-snpeff-dev +# conda env create -f environment.yml +name: nf-core-sarek-snpeff-3.0dev channels: - conda-forge - bioconda diff --git a/containers/vep/Dockerfile b/containers/vep/Dockerfile index d34936609e..65b7ee9f9f 100644 --- a/containers/vep/Dockerfile +++ b/containers/vep/Dockerfile @@ -9,7 +9,7 @@ COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-sarek-vep-dev/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-sarek-vep-3.0dev/bin:$PATH # Setup default ARG variables ARG GENOME=GRCh38 @@ -27,4 +27,8 @@ RUN vep_install \ --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-sarek-vep-dev > nf-core-sarek-vep-dev.yml +RUN conda env export --name nf-core-sarek-vep-3.0dev > nf-core-sarek-vep-3.0dev.yml + +# Instruct R processes to use these empty files instead of clashing with a local version +RUN touch .Rprofile +RUN touch .Renviron diff --git a/containers/vep/environment.yml b/containers/vep/environment.yml index e90fff10e5..a0095be4c5 100644 --- a/containers/vep/environment.yml +++ b/containers/vep/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: -# conda env create -f environment.yml -name: nf-core-sarek-vep-dev +# conda env create -f environment.yml +name: nf-core-sarek-vep-3.0dev channels: - conda-forge - bioconda diff --git a/docs/output.md b/docs/output.md index 6daa45b311..f81855f154 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,6 +10,9 @@ This document describes the output produced by the pipeline. Most of the plots a The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
+The directories listed below will be created in the results directory after the pipeline has finished. +All paths are relative to the top-level results directory. + ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) @@ -17,7 +20,7 @@ and processes data using the following steps: - [Preprocessing](#preprocessing) - [Map to Reference](#map-to-reference) - - [bwa](#bwa) + - [BWA](#bwa) - [BWA-mem2](#bwa-mem2) - [Mark Duplicates](#mark-duplicates) - [GATK MarkDuplicates](#gatk-markduplicates) @@ -71,9 +74,9 @@ and processes data using the following steps: ### Map to Reference -#### bwa +#### BWA -[bwa](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome. +[BWA](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome. Such files are intermediate and not kept in the final files delivered to users. @@ -87,9 +90,9 @@ Such files are intermediate and not kept in the final files delivered to users. #### GATK MarkDuplicates -By default, `Sarek` will use [GATK MarkDuplicatesSpark](https://gatk.broadinstitute.org/hc/en-us/articles/360042912511-MarkDuplicatesSpark), `Spark` implementation of [GATK MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/360042477492-MarkDuplicates-Picard), which locates and tags duplicate reads in a `BAM` or `SAM` file, where duplicate reads are defined as originating from a single fragment of DNA. +By default, `Sarek` will use [GATK MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/360042477492-MarkDuplicates-Picard), which locates and tags duplicate reads in a `BAM` or `SAM` file, where duplicate reads are defined as originating from a single fragment of DNA. -Specify `--no_gatk_spark` to use `GATK MarkDuplicates` instead. +Specify `--use_gatk_spark` to use [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/360042912511-MarkDuplicatesSpark) instead, `Spark` implementation of `GATK MarkDuplicates`. This directory is the location for the `BAM` files delivered to users. Besides the `duplicates-marked BAM` files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create `recalibrated BAM` files. diff --git a/docs/usage.md b/docs/usage.md index 7f53cd73f5..1805bd5f32 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -52,18 +52,21 @@ This version number will be logged in reports when you run the pipeline, so that ### `-profile` -Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. +Use this parameter to choose a configuration profile. +Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Conda) - see below. +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (`Docker`, `Singularity`, `Podman`, `Conda`) - see below. -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +> We highly recommend the use of `Docker` or `Singularity` containers for full pipeline reproducibility, however when this is not possible, `Conda` is also supported. 
-The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). +The pipeline also dynamically loads configurations from [github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. +For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! They are loaded in sequence, so later profiles can overwrite earlier profiles. -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended. +If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. +This is _not_ recommended. * `docker` * A generic configuration profile to be used with [Docker](https://docker.com/) @@ -496,36 +499,7 @@ This tool is enabled within `Sarek` if both `--sentieon` and `--tools DNAscope` [![sarek-docker status](https://img.shields.io/docker/automated/nfcore/sarek.svg)](https://hub.docker.com/r/nfcore/sarek) -Based on [nfcore/base:1.12.1](https://hub.docker.com/r/nfcore/base/tags), it contains: - -* **[ASCAT](https://github.com/Crick-CancerGenomics/ascat)** 2.5.2 -* **[AlleleCount](https://github.com/cancerit/alleleCount)** 4.0.2 -* **[BCFTools](https://github.com/samtools/bcftools)** 1.9 -* **[bwa](https://github.com/lh3/bwa)** 0.7.17 -* **[bwa-mem2](https://github.com/bwa-mem2/bwa-mem2)** 2.0 -* **[CNVkit](https://github.com/etal/cnvkit)** 0.9.6 -* **[Control-FREEC](https://github.com/BoevaLab/FREEC)** 11.6 -* **[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)** 0.11.9 -* **[fgbio](https://github.com/fulcrumgenomics/fgbio)** 1.1.0 -* **[FreeBayes](https://github.com/ekg/freebayes)** 1.3.2 -* **[GATK4-spark](https://github.com/broadinstitute/gatk)** 4.1.7.0 -* **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0 -* **[ggplot2](https://github.com/tidyverse/ggplot2)** 3.3.0 -* **[HTSlib](https://github.com/samtools/htslib)** 1.9 -* **[Manta](https://github.com/Illumina/manta)** 1.6.0 -* **[msisensor](https://github.com/ding-lab/msisensor)** 0.5 -* **[MultiQC](https://github.com/ewels/MultiQC/)** 1.8 -* **[Qualimap](http://qualimap.bioinfo.cipf.es)** 2.2.2d -* **[SAMBLASTER](https://github.com/GregoryFaust/samblaster)** 0.1.24 -* **[samtools](https://github.com/samtools/samtools)** 1.9 -* **[snpEff](http://snpeff.sourceforge.net/)** 4.3.1t -* **[Strelka2](https://github.com/Illumina/strelka)** 2.9.10 -* **[TIDDIT](https://github.com/SciLifeLab/TIDDIT)** 2.7.1 -* **[pigz](https://zlib.net/pigz/)** 2.3.4 -* **[Trim Galore](https://github.com/FelixKrueger/TrimGalore)** 0.6.5 -* **[VCFanno](https://github.com/brentp/vcfanno)** 0.3.2 -* **[VCFtools](https://vcftools.github.io/index.html)** 0.1.16 -* **[VEP](https://github.com/Ensembl/ensembl-vep)** 99.2 +Based on [nfcore/base:1.12.1](https://hub.docker.com/r/nfcore/base/tags): For annotation, the main container 
can be used, but then cache has to be downloaded, or additional containers are available with cache. diff --git a/environment.yml b/environment.yml index 730b0674cd..b05d4d3789 100644 --- a/environment.yml +++ b/environment.yml @@ -6,36 +6,6 @@ channels: - bioconda - defaults dependencies: - - conda-forge::llvm-openmp=8.0.1=hc9558a2_0 - - conda-forge::openmp=8.0.1=0 - - conda-forge::markdown=3.1.1=py_0 - - conda-forge::pymdown-extensions=6.0=py_0 - - conda-forge::pygments=2.5.2=py_0 - - bioconda::ascat=2.5.2=r40_2 - - bioconda::bcftools=1.9=ha228f0b_4 - - bioconda::bwa-mem2=2.0=he513fc3_1 - - bioconda::bwa=0.7.17=hed695b0_7 - - bioconda::cancerit-allelecount=4.0.2=ha228f0b_1 - - bioconda::cnvkit=0.9.6=py27_1 - - bioconda::control-freec=11.6=he1b5a44_0 - - bioconda::ensembl-vep=99.2=pl526hecc5488_0 - - bioconda::fastqc=0.11.9=0 - - bioconda::fgbio=1.1.0=0 - - bioconda::freebayes=1.3.2=py27h49fb759_2 - - bioconda::gatk4-spark=4.1.7.0=0 - - bioconda::genesplicer=1.0=1 - - bioconda::htslib=1.9=ha228f0b_7 - - bioconda::manta=1.6.0=py27_0 - - bioconda::msisensor=0.5=h25a10a7_1 - - bioconda::multiqc=1.8=py_2 - - bioconda::qualimap=2.2.2d=1 - - bioconda::samblaster=0.1.24=hc9558a2_3 - - bioconda::samtools=1.9=h10a08f8_12 - - bioconda::snpeff=4.3.1t=0 - - bioconda::strelka=2.9.10=0 - - bioconda::tiddit=2.7.1=py27hb3f55d8_1 - - bioconda::trim-galore=0.6.5=0 - - bioconda::vcfanno=0.3.2=0 - - bioconda::vcftools=0.1.16=he513fc3_4 - - conda-forge::pigz=2.3.4=hed695b0_1 - - conda-forge::r-ggplot2=3.3.0=r40h6115d3f_1 \ No newline at end of file + - conda-forge::markdown=3.1.1 + - conda-forge::pymdown-extensions=6.0 + - conda-forge::pygments=2.5.2 diff --git a/lib/Checks.groovy b/lib/Checks.groovy new file mode 100644 index 0000000000..0c912c401b --- /dev/null +++ b/lib/Checks.groovy @@ -0,0 +1,36 @@ +/* + * This file holds several functions used to perform standard checks for the nf-core pipeline template. + */ + +class Checks { + + static void aws_batch(workflow, params) { + if (workflow.profile.contains('awsbatch')) { + assert params.awsqueue && params.awsregion : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" + // Check outdir paths to be S3 buckets if running on AWSBatch + // related: https://github.com/nextflow-io/nextflow/issues/813 + assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + // Prevent trace files to be stored on S3 since S3 does not support rolling files. + assert !params.tracedir.startsWith('s3:') : "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles."
+ } + } + + static void hostname(workflow, params, log) { + Map colors = Headers.log_colours(params.monochrome_logs) + if (params.hostnames) { + def hostname = "hostname".execute().text.trim() + params.hostnames.each { prof, hnames -> + hnames.each { hname -> + if (hostname.contains(hname) && !workflow.profile.contains(prof)) { + log.info "=${colors.yellow}====================================================${colors.reset}=\n" + + "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + + " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + + " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + + "=${colors.yellow}====================================================${colors.reset}=" + } + } + } + } + } +} + diff --git a/lib/Completion.groovy b/lib/Completion.groovy new file mode 100644 index 0000000000..996276b8e6 --- /dev/null +++ b/lib/Completion.groovy @@ -0,0 +1,118 @@ +/* + * Functions to be run on completion of pipeline + */ + +class Completion { + static void email(workflow, params, summary, run_name, baseDir, multiqc_report, log) { + + // Set up the e-mail variables + def subject = "[$workflow.manifest.name] Successful: $workflow.runName" + if (!workflow.success) { + subject = "[$workflow.manifest.name] FAILED: $workflow.runName" + } + + def email_fields = [:] + email_fields['version'] = workflow.manifest.version + email_fields['runName'] = run_name ?: workflow.runName + email_fields['success'] = workflow.success + email_fields['dateComplete'] = workflow.complete + email_fields['duration'] = workflow.duration + email_fields['exitStatus'] = workflow.exitStatus + email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + email_fields['errorReport'] = (workflow.errorReport ?: 'None') + email_fields['commandLine'] = workflow.commandLine + email_fields['projectDir'] = workflow.projectDir + email_fields['summary'] = summary + email_fields['summary']['Date Started'] = workflow.start + email_fields['summary']['Date Completed'] = workflow.complete + email_fields['summary']['Pipeline script file path'] = workflow.scriptFile + email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId + if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository + if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId + if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision + email_fields['summary']['Nextflow Version'] = workflow.nextflow.version + email_fields['summary']['Nextflow Build'] = workflow.nextflow.build + email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + // On success try attach the multiqc report + def mqc_report = null + try { + if (workflow.success) { + mqc_report = multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList) { + log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" + mqc_report = mqc_report[0] + } + } + } catch (all) { + log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" + } + + // Check if we are only sending emails on failure + def email_address = params.email + if (!params.email && params.email_on_fail && !workflow.success) { + email_address = params.email_on_fail + } + + // Render the TXT template + def engine = new groovy.text.GStringTemplateEngine() + def tf = new File("$baseDir/assets/email_template.txt") + def txt_template = 
engine.createTemplate(tf).make(email_fields) + def email_txt = txt_template.toString() + + // Render the HTML template + def hf = new File("$baseDir/assets/email_template.html") + def html_template = engine.createTemplate(hf).make(email_fields) + def email_html = html_template.toString() + + // Render the sendmail template + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] + def sf = new File("$baseDir/assets/sendmail_template.txt") + def sendmail_template = engine.createTemplate(sf).make(smail_fields) + def sendmail_html = sendmail_template.toString() + + // Send the HTML e-mail + if (email_address) { + try { + if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + // Try to send HTML e-mail using sendmail + [ 'sendmail', '-t' ].execute() << sendmail_html + log.info "[$workflow.manifest.name] Sent summary e-mail to $email_address (sendmail)" + } catch (all) { + // Catch failures and try with plaintext + def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] + if ( mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) { + mail_cmd += [ '-A', mqc_report ] + } + mail_cmd.execute() << email_html + log.info "[$workflow.manifest.name] Sent summary e-mail to $email_address (mail)" + } + } + + // Write summary e-mail HTML to a file + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() + } + def output_hf = new File(output_d, "pipeline_report.html") + output_hf.withWriter { w -> w << email_html } + def output_tf = new File(output_d, "pipeline_report.txt") + output_tf.withWriter { w -> w << email_txt } + } + + static void summary(workflow, params, log) { + Map colors = Headers.log_colours(params.monochrome_logs) + if (workflow.stats.ignoredCount > 0 && workflow.success) { + log.info "-${colors.purple}Warning, pipeline completed, but with errored process(es) ${colors.reset}-" + log.info "-${colors.red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${colors.reset}-" + log.info "-${colors.green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${colors.reset}-" + } + if (workflow.success) { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" + } else { + Checks.hostname() + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" + } + } +} + diff --git a/lib/Headers.groovy b/lib/Headers.groovy new file mode 100644 index 0000000000..ee3817cfde --- /dev/null +++ b/lib/Headers.groovy @@ -0,0 +1,46 @@ +/* + * This file holds several functions used to render the nf-core ANSI header. + */ + +class Headers { + + private static Map log_colours(Boolean monochrome_logs) { + Map colorcodes = [:] + colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" + colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" + colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" + colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" + colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" + colorcodes['yellow_bold'] = monochrome_logs ? '' : "\033[1;93m" + colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" + colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" + colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" + colorcodes['white'] = monochrome_logs ? 
'' : "\033[0;37m" + colorcodes['red'] = monochrome_logs ? '' : "\033[1;91m" + return colorcodes + } + + static String nf_core(workflow, monochrome_logs) { + Map colors = log_colours(monochrome_logs) + String.format( +""" +-${colors.dim}----------------------------------------------------${colors.reset}- + ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} + ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} + ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} + ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} + ${colors.green}`._,._,\'${colors.reset} + ${colors.white}____${colors.reset} + ${colors.white}.´ _ `.${colors.reset} + ${colors.white}/ ${colors.green}|\\${colors.white}`-_ \\${colors.reset} ${colors.blue} __ __ ___ ${colors.reset} + ${colors.white}| ${colors.green}| \\${colors.white} `-|${colors.reset} ${colors.blue}|__` /\\ |__) |__ |__/${colors.reset} + ${colors.white}\\ ${colors.green}| \\${colors.white} /${colors.reset} ${colors.blue}.__| /¯¯\\ | \\ |___ | \\${colors.reset} + ${colors.white}`${colors.green}|${colors.white}____${colors.green}\\${colors.white}´${colors.reset} + + ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} +-${colors.dim}--------------------------------------------------${colors.reset}- +""".stripIndent() + ) + } +} + diff --git a/lib/Schema.groovy b/lib/Schema.groovy new file mode 100644 index 0000000000..4c7215e699 --- /dev/null +++ b/lib/Schema.groovy @@ -0,0 +1,226 @@ +/* + * This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. + */ + +import groovy.json.JsonSlurper + +class JSON { + /* + * This method tries to read a JSON params file + */ + private static LinkedHashMap params_get(String path) { + def usage = new LinkedHashMap() + try { + usage = params_try(path) + } catch (Exception e) { + println "Could not read parameters settings from JSON. $e" + usage = new LinkedHashMap() + } + return usage + } + + /* + Method to actually read in JSON file using Groovy. + Group (as Key), values are all parameters + - Parameter1 as Key, Description as Value + - Parameter2 as Key, Description as Value + .... 
+ Group + - + */ + private static LinkedHashMap params_try(String path) throws Exception { + + def json = new File(path).text + def Map usage = (Map) new JsonSlurper().parseText(json).get('properties') + + /* Tree looks like this in nf-core schema + * properties <- this is what the first get('properties') gets us + group 1 + properties + description + group 2 + properties + description + group 3 + properties + description + */ + def output_map = new LinkedHashMap() + + // Lets go deeper + usage.each { key, val -> + def Map submap = usage."$key".properties // Gets the property object of the group + def sub_params = new LinkedHashMap() + submap.each { innerkey, value -> + sub_params.put("$innerkey", "$value.description") + } + output_map.put("$key", sub_params) + } + return output_map + } + + static String params_help(path, command) { + String output = "Typical pipeline command:\n\n" + output += " ${command}\n\n" + output += params_beautify(params_get(path)) + } + + static String params_beautify(usage) { + String output = "" + for (group in usage.keySet()) { + output += group + "\n" + def params = usage.get(group) // This gets the parameters of that particular group + for (par in params.keySet()) { + output+= " \u001B[1m" + par.padRight(27) + "\u001B[1m" + params.get(par) + "\n" + } + output += "\n" + } + return output + } + + private static LinkedHashMap params_summary(workflow, params, run_name, step, tools, skip_qc, annotate_tools) { + def Map summary = [:] + if (workflow.revision) summary['Pipeline Release'] = workflow.revision + summary['Run Name'] = run_name ?: workflow.runName + summary['Max Resources'] = "${params.max_memory} memory, ${params.max_cpus} cpus, ${params.max_time} time per job" + if (workflow.containerEngine) summary['Container'] = "${workflow.containerEngine} - ${workflow.container}" + summary['Input'] = params.input + summary['Step'] = step + summary['Genome'] = params.genome + if (params.no_intervals && step != 'annotate') summary['Intervals'] = 'Do not use' + summary['Nucleotides/s'] = params.nucleotides_per_second + if (params.sentieon) summary['Sention'] = "Using Sentieon for Preprocessing and/or Variant Calling" + if (params.skip_qc) summary['QC tools skipped'] = skip_qc.join(', ') + if (params.target_bed) summary['Target BED'] = params.target_bed + if (params.tools) summary['Tools'] = tools.join(', ') + if (params.trim_fastq || params.split_fastq) summary['Modify fastqs'] = "trim and/or split" + + if (params.trim_fastq) { + summary['Fastq trim'] = "Fastq trim selected" + summary['Trim R1'] = "${params.clip_r1} bp" + summary['Trim R2'] = "${params.clip_r2} bp" + summary["Trim 3 R1"] = "${params.three_prime_clip_r1} bp" + summary["Trim 3 R2"] = "${params.three_prime_clip_r2} bp" + summary['NextSeq Trim'] = "${params.trim_nextseq} bp" + summary['Saved Trimmed Fastq'] = params.save_trimmed ? 'Yes' : 'No' + } + if (params.split_fastq) summary['Reads in fastq'] = params.split_fastq + + summary['MarkDuplicates'] = "Options" + summary['Java options'] = params.markdup_java_options + summary['GATK Spark'] = params.no_gatk_spark ? 'No' : 'Yes' + + summary['Save BAMs mapped'] = params.save_bam_mapped ? 'Yes' : 'No' + summary['Skip MarkDuplicates'] = params.skip_markduplicates ? 
'Yes' : 'No' + + if ('ascat' in tools) { + summary['ASCAT'] = "Options" + if (params.ascat_purity) summary['purity'] = params.ascat_purity + if (params.ascat_ploidy) summary['ploidy'] = params.ascat_ploidy + } + + if ('controlfreec' in tools) { + summary['Control-FREEC'] = "Options" + if (params.cf_window) summary['window'] = params.cf_window + if (params.cf_coeff) summary['coeff of variation'] = params.cf_coeff + if (params.cf_ploidy) summary['ploidy'] = params.cf_ploidy + } + + if ('haplotypecaller' in tools) summary['GVCF'] = params.no_gvcf ? 'No' : 'Yes' + if ('strelka' in tools && 'manta' in tools) summary['Strelka BP'] = params.no_strelka_bp ? 'No' : 'Yes' + if (params.pon && ('mutect2' in tools || (params.sentieon && 'tnscope' in tools))) summary['Panel of normals'] = params.pon + + if (params.annotate_tools) summary['Tools to annotate'] = annotate_tools.join(', ') + + if (params.annotation_cache) { + summary['Annotation cache'] = "Enabled" + if (params.snpeff_cache) summary['snpEff cache'] = params.snpeff_cache + if (params.vep_cache) summary['VEP cache'] = params.vep_cache + } + + if (params.cadd_cache) { + summary['CADD cache'] = "Enabled" + if (params.cadd_indels) summary['CADD indels'] = params.cadd_indels + if (params.cadd_wg_snvs) summary['CADD wg snvs'] = params.cadd_wg_snvs + } + + if (params.genesplicer) summary['genesplicer'] = "Enabled" + + if (params.igenomes_base && !params.igenomes_ignore) summary['AWS iGenomes base'] = params.igenomes_base + if (params.igenomes_ignore) summary['AWS iGenomes'] = "Do not use" + if (params.genomes_base && !params.igenomes_ignore) summary['Genomes base'] = params.genomes_base + + summary['Save Reference'] = params.save_reference ? 'Yes' : 'No' + + if (params.ac_loci) summary['Loci'] = params.ac_loci + if (params.ac_loci_gc) summary['Loci GC'] = params.ac_loci_gc + if (params.bwa) summary['BWA indexes'] = params.bwa + if (params.chr_dir) summary['Chromosomes'] = params.chr_dir + if (params.chr_length) summary['Chromosomes length'] = params.chr_length + if (params.dbsnp) summary['dbsnp'] = params.dbsnp + if (params.dbsnp_index) summary['dbsnp index'] = params.dbsnp_index + if (params.dict) summary['dict'] = params.dict + if (params.fasta) summary['fasta reference'] = params.fasta + if (params.fasta_fai) summary['fasta index'] = params.fasta_fai + if (params.germline_resource) summary['germline resource'] = params.germline_resource + if (params.germline_resource_index) summary['germline resource index'] = params.germline_resource_index + if (params.intervals) summary['intervals'] = params.intervals + if (params.known_indels) summary['known indels'] = params.known_indels + if (params.known_indels_index) summary['known indels index'] = params.known_indels_index + if (params.mappability) summary['Mappability'] = params.mappability + if (params.snpeff_cache) summary['snpEff cache'] = params.snpeff_cache + if (params.snpeff_db) summary['snpEff DB'] = params.snpeff_db + if (params.species) summary['snpEff species'] = params.species + if (params.vep_cache) summary['VEP cache'] = params.vep_cache + if (params.vep_cache_version) summary['VEP cache version'] = params.vep_cache_version + + summary['Output dir'] = params.outdir + summary['Publish dir mode'] = params.publish_dir_mode + if (params.sequencing_center) summary['Sequenced by'] = params.sequencing_center + + summary['Launch dir'] = workflow.launchDir + summary['Working dir'] = workflow.workDir + summary['Script dir'] = workflow.projectDir + summary['User'] = workflow.userName + + if 
(params.multiqc_config) summary['MultiQC config'] = params.multiqc_config + + summary['Config Profile'] = workflow.profile + + if (params.config_profile_description) summary['Description'] = params.config_profile_description + if (params.config_profile_contact) summary['Contact'] = params.config_profile_contact + if (params.config_profile_url) summary['URL'] = params.config_profile_url + + summary['Config Files'] = workflow.configFiles.join(', ') + + if (params.email || params.email_on_fail) { + summary['E-mail Address'] = params.email + summary['E-mail on failure'] = params.email_on_fail + summary['MultiQC maxsize'] = params.max_multiqc_email_size + } + + if (workflow.profile.contains('awsbatch')) { + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue + summary['AWS CLI'] = params.awscli + } + + return summary + } + + static String params_mqc_summary(summary) { + String yaml_file_text = """ + id: 'nf-core-sarek-summary' + description: " - this information is collected when the pipeline is started." + section_name: 'nf-core/sarek Workflow Summary' + section_href: 'https://github.com/nf-core/sarek' + plot_type: 'html' + data: | +
<dl class=\"dl-horizontal\">
+ ${summary.collect { k,v -> "<dt>$k</dt><dd><samp>${v ?: 'N/A'}</samp></dd>" }.join("\n")}
+ </dl>
+ """.stripIndent() + + return yaml_file_text + } +} diff --git a/main.nf b/main.nf index d6a8842ce4..cf3b756857 100644 --- a/main.nf +++ b/main.nf @@ -1,11 +1,12 @@ #!/usr/bin/env nextflow /* -================================================================================ +-------------------------------------------------------------------------------- nf-core/sarek -================================================================================ +-------------------------------------------------------------------------------- Started March 2016. Ported to nf-core May 2019. +Ported to DSL 2 July 2020. -------------------------------------------------------------------------------- nf-core/sarek: An open-source analysis pipeline to detect germline or somatic variants @@ -15,179 +16,58 @@ nf-core/sarek: https://nf-co.re/sarek -------------------------------------------------------------------------------- @Documentation - https://nf-co.re/sarek/docs + https://nf-co.re/sarek/latest/usage -------------------------------------------------------------------------------- */ -def helpMessage() { - log.info nfcoreHeader() - log.info""" - - Usage: - - The typical command for running the pipeline is as follows: - - nextflow run nf-core/sarek --input sample.tsv -profile docker - - Mandatory arguments: - --input [file] Path to input TSV file on mapping, prepare_recalibration, recalibrate, variant_calling and Control-FREEC steps - Multiple TSV files can be specified surrounded with quotes - Works also with the path to a directory on mapping step with a single germline sample only - Alternatively, path to VCF input file on annotate step - Multiple VCF files can be specified surrounded with quotes - -profile [str] Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, test, awsbatch, and more - --step [list] Specify starting step (only one) - Available: mapping, prepare_recalibration, recalibrate, variant_calling, annotate, ControlFREEC - Default: ${params.step} - --genome [str] Name of iGenomes reference - Default: ${params.genome} - - Main options: - --help [bool] You're reading it - --no_intervals [bool] Disable usage of intervals - Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling - --nucleotides_per_second [int] To estimate interval size - Default: ${params.nucleotides_per_second} - --sentieon [bool] If sentieon is available, will enable it for Preprocessing, and Variant Calling - Adds the following options for --tools: DNAseq, DNAscope and TNscope - --skip_qc [str] Specify which QC tools to skip when running Sarek (multiple separated with commas) - Available: all, bamQC, BaseRecalibrator, BCFtools, Documentation - FastQC, MultiQC, samtools, vcftools, versions - Default: None - --target_bed [file] Target BED file for whole exome or targeted sequencing - Default: None - --tools [str] Specify tools to use for variant calling (multiple separated with commas): - Available: ASCAT, CNVkit, ControlFREEC, FreeBayes, HaplotypeCaller - Manta, mpileup, MSIsensor, Mutect2, Strelka, TIDDIT - and/or for annotation: - snpEff, VEP, merge - Default: None - - Modify fastqs (trim/split): - --trim_fastq [bool] Run Trim Galore - --clip_r1 [int] Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end reads) - --clip_r2 [int] Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads only) - --three_prime_clip_r1 [int] Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER 
adapter/quality trimming has been performed - --three_prime_clip_r2 [int] Instructs Trim Galore to remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed - --trim_nextseq [int] Instructs Trim Galore to apply the --nextseq=X option, to trim based on quality after removing poly-G tails - --save_trimmed [bool] Save trimmed FastQ file intermediates - --split_fastq [int] Specify how many reads should be contained in the split fastq file - Default: no split - - Preprocessing: - --markdup_java_options [str] Establish values for markDuplicates memory consumption - Default: ${params.markdup_java_options} - --use_gatk_spark [bool] Enable usage of GATK Spark implementation of their tools in local mode - --save_bam_mapped [bool] Save Mapped BAMs - --skip_markduplicates [bool] Skip MarkDuplicates - - Variant Calling: - --ascat_ploidy [int] Use this parameter to overwrite default behavior from ASCAT regarding ploidy - Requires that --ascat_purity is set - --ascat_purity [int] Use this parameter to overwrite default behavior from ASCAT regarding purity - Requires that --ascat_ploidy is set - --cf_coeff [str] Control-FREEC coefficientOfVariation - Default: ${params.cf_coeff} - --cf_ploidy [str] Control-FREEC ploidy - Default: ${params.cf_ploidy} - --cf_window [int] Control-FREEC window size - Default: Disabled - --generate_gvcf [bool] Enable g.vcf output from GATK HaplotypeCaller - --no_strelka_bp [bool] Will not use Manta candidateSmallIndels for Strelka (not recommended by Best Practices) - --pon [file] Panel-of-normals VCF (bgzipped) for GATK Mutect2 / Sentieon TNscope - See: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_CreateSomaticPanelOfNormals.php - --pon_index [file] Index of pon panel-of-normals VCF - If none provided, will be generated automatically from the PON - --ignore_soft_clipped_bases [bool] Do not analyze soft clipped bases in the reads for GATK Mutect2 - Default: Do not use - --umi [bool] If provided, UMIs steps will be run to extract and annotate the reads with UMI and create consensus reads - --read_structure1 [string] When processing UMIs, a read structure should always be provided for each of the fastq files. If the read does not contain any UMI, the structure will be +T (i.e. only template of any length). - See: https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures - --read_structure2 [string] When processing UMIs, a read structure should always be provided for each of the fastq files. If the read does not contain any UMI, the structure will be +T (i.e. only template of any length). 
- See: https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures - - Annotation: - --annotate_tools [str] Specify from which tools Sarek should look for VCF files to annotate, only for step Annotate - Available: HaplotypeCaller, Manta, Mutect2, Strelka, TIDDIT - Default: None - --annotation_cache [bool] Enable the use of cache for annotation, to be used with --snpeff_cache and/or --vep_cache - --snpeff_cache [file] Specity the path to snpEff cache, to be used with --annotation_cache - --vep_cache [file] Specity the path to VEP cache, to be used with --annotation_cache - --cadd_cache [bool] Enable CADD cache - --cadd_indels [file] Path to CADD InDels file - --cadd_indels_tbi [file] Path to CADD InDels index - --cadd_wg_snvs [file] Path to CADD SNVs file - --cadd_wg_snvs_tbi [file] Path to CADD SNVs index - --genesplicer [file] Enable genesplicer within VEP - - References options: - --igenomes_base [file] Specify base path to AWS iGenomes - Default: ${params.igenomes_base} - --igenomes_ignore [bool] Do not use AWS iGenomes. Will load genomes.config instead of igenomes.config - --genomes_base [file] Specify base path to reference genome - --save_reference [bool] Save built references - - References: If not specified in the configuration file or you wish to overwrite any of the references. - --ac_loci [file] Loci file for ASCAT - --ac_loci_gc [file] Loci GC file for ASCAT - --bwa [file] BWA indexes - If none provided, will be generated automatically from the fasta reference - --chr_dir [file] Chromosomes folder - --chr_length [file] Chromosomes length file - --dbsnp [file] Dbsnp file - --dbsnp_index [file] Dbsnp index - If none provided, will be generated automatically if a dbsnp file is provided - --dict [file] Fasta dictionary file - If none provided, will be generated automatically from the fasta reference - --fasta [file] Fasta reference - --fasta_fai [file] Fasta reference index - If none provided, will be generated automatically from the fasta reference - --germline_resource [file] Germline Resource File for GATK Mutect2 - --germline_resource_index [file] Germline Resource Index for GATK Mutect2 - if none provided, will be generated automatically if a germlineResource file is provided - --intervals [file] Intervals - If none provided, will be generated automatically from the fasta reference - Use --no_intervals to disable automatic generation - --known_indels [file] Known indels file - --known_indels_index [file] Known indels index - If none provided, will be generated automatically if a knownIndels file is provided - --mappability [file] Mappability file for Control-FREEC - --snpeff_db [str] snpEff Database version - --species [str] Species for VEP - --vep_cache_version [int] VEP cache version - - Other options: - --outdir [file] The output directory where the results will be saved - --publish_dir_mode [list] Mode for publishing results in the output directory (only one) - Available: symlink, rellink, link, copy, copyNoFollow, move - Default: copy - --sequencing_center [str] Name of sequencing center to be displayed in BAM file - --multiqc_config [file] Specify a custom config file for MultiQC - --monochrome_logs [bool] Logs will be without colors - --email [str] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail [str] Same as --email, except only send mail if the workflow is not successful - --plaintext_email [bool] Enable plaintext email - --max_multiqc_email_size [str] Threshold size for 
MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached - Default: 25MB - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic - - AWSBatch options: - --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch - --awsregion [str] The AWS Region for your AWS Batch job to run on - --awscli [str] Path to the AWS CLI tool - """.stripIndent() +nextflow.enable.dsl=2 + +// Print help message if required + +if (params.help) { + def command = "nextflow run nf-core/sarek -profile docker --input sample.tsv" + log.info Schema.params_help("$projectDir/nextflow_schema.json", command) + exit 0 } -// Show help message -if (params.help) exit 0, helpMessage() +/* +-------------------------------------------------------------------------------- + INCLUDE SAREK FUNCTIONS +-------------------------------------------------------------------------------- +*/ + +include { + check_parameter_existence; + check_parameter_list; + define_anno_list; + define_skip_qc_list; + define_step_list; + define_tool_list; + extract_bam; + extract_fastq; + extract_fastq_from_dir; + extract_recal; + has_extension +} from './modules/local/functions' /* -================================================================================ +-------------------------------------------------------------------------------- SET UP CONFIGURATION VARIABLES -================================================================================ +-------------------------------------------------------------------------------- */ +// Check parameters + +Checks.aws_batch(workflow, params) // Check AWS batch settings +Checks.hostname(workflow, params, log) // Check the hostnames against configured profiles + +// MultiQC - Stage config files + +multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) +multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() +output_docs = file("$projectDir/docs/output.md", checkIfExists: true) +output_docs_images = file("$projectDir/docs/images/", checkIfExists: true) + // Check if genome exists in the config file if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) { exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" @@ -195,59 +75,34 @@ if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igen exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" } -stepList = defineStepList() +step_list = define_step_list() step = params.step ? params.step.toLowerCase().replaceAll('-', '').replaceAll('_', '') : '' -// Handle deprecation -if (step == 'preprocessing') step = 'mapping' - if (step.contains(',')) exit 1, 'You can choose only one step, see --help for more information' -if (!checkParameterExistence(step, stepList)) exit 1, "Unknown step ${step}, see --help for more information" +if (!check_parameter_existence(step, step_list)) exit 1, "Unknown step ${step}, see --help for more information" -toolList = defineToolList() +tool_list = define_tool_list() tools = params.tools ? 
params.tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] if (step == 'controlfreec') tools = ['controlfreec'] -if (!checkParameterList(tools, toolList)) exit 1, 'Unknown tool(s), see --help for more information' +if (!check_parameter_list(tools, tool_list)) exit 1, 'Unknown tool(s), see --help for more information' -skipQClist = defineSkipQClist() -skipQC = params.skip_qc ? params.skip_qc == 'all' ? skipQClist : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] -if (!checkParameterList(skipQC, skipQClist)) exit 1, 'Unknown QC tool(s), see --help for more information' +skip_qc_list = define_skip_qc_list() +skip_qc = params.skip_qc ? params.skip_qc == 'all' ? skip_qc_list : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] +if (!check_parameter_list(skip_qc, skip_qc_list)) exit 1, 'Unknown QC tool(s), see --help for more information' -annoList = defineAnnoList() +anno_list = define_anno_list() annotate_tools = params.annotate_tools ? params.annotate_tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '')} : [] -if (!checkParameterList(annotate_tools,annoList)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' +if (!check_parameter_list(annotate_tools,anno_list)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' -// Check parameters +// // Check parameters if ((params.ascat_ploidy && !params.ascat_purity) || (!params.ascat_ploidy && params.ascat_purity)) exit 1, 'Please specify both --ascat_purity and --ascat_ploidy, or none of them' +if (params.cf_window && params.cf_coeff) exit 1, 'Please specify either --cf_window OR --cf_coeff, but not both of them' if (params.umi && !(params.read_structure1 && params.read_structure2)) exit 1, 'Please specify both --read_structure1 and --read_structure2, when using --umi' -// Has the run name been specified by the user? -// This has the bonus effect of catching both -name and --name -custom_runName = params.name -if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) custom_runName = workflow.runName - -// Check AWS batch settings -if (workflow.profile.contains('awsbatch')) { - // AWSBatch sanity checking - if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" - // Check outdir paths to be S3 buckets if running on AWSBatch - // related: https://github.com/nextflow-io/nextflow/issues/813 - if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" - // Prevent trace files to be stored on S3 since S3 does not support rolling files. - if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." -} - -// MultiQC -// Stage config files -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() -ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) -ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true) - // Handle input -tsvPath = null -if (params.input && (hasExtension(params.input, "tsv") || hasExtension(params.input, "vcf") || hasExtension(params.input, "vcf.gz"))) tsvPath = params.input -if (params.input && (hasExtension(params.input, "vcf") || hasExtension(params.input, "vcf.gz"))) step = "annotate" +tsv_path = null +if (params.input && (has_extension(params.input, "tsv") || has_extension(params.input, "vcf") || has_extension(params.input, "vcf.gz"))) tsv_path = params.input +if (params.input && (has_extension(params.input, "vcf") || has_extension(params.input, "vcf.gz"))) step = "annotate" save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false @@ -256,4120 +111,2014 @@ save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? t if (!params.input && params.sentieon) { switch (step) { case 'mapping': break - case 'recalibrate': tsvPath = "${params.outdir}/Preprocessing/TSV/sentieon_deduped.tsv"; break - case 'variantcalling': tsvPath = "${params.outdir}/Preprocessing/TSV/sentieon_recalibrated.tsv"; break + case 'recalibrate': tsv_path = "${params.outdir}/preprocessing/tsv/sentieon_deduped.tsv"; break + case 'variantcalling': tsv_path = "${params.outdir}/preprocessing/tsv/sentieon_recalibrated.tsv"; break case 'annotate': break default: exit 1, "Unknown step ${step}" } } else if (!params.input && !params.sentieon && !params.skip_markduplicates) { switch (step) { case 'mapping': break - case 'preparerecalibration': tsvPath = "${params.outdir}/Preprocessing/TSV/duplicates_marked_no_table.tsv"; break - case 'recalibrate': tsvPath = "${params.outdir}/Preprocessing/TSV/duplicates_marked.tsv"; break - case 'variantcalling': tsvPath = "${params.outdir}/Preprocessing/TSV/recalibrated.tsv"; break - case 'controlfreec': tsvPath = "${params.outdir}/VariantCalling/TSV/control-freec_mpileup.tsv"; break + case 'preparerecalibration': tsv_path = "${params.outdir}/preprocessing/tsv/markduplicates_no_table.tsv"; break + case 'recalibrate': tsv_path = "${params.outdir}/preprocessing/tsv/markduplicates.tsv"; break + case 'variantcalling': tsv_path = "${params.outdir}/preprocessing/tsv/recalibrated.tsv"; break + case 'controlfreec': tsv_path = "${params.outdir}/variant_calling/tsv/control-freec_mpileup.tsv"; break case 'annotate': break default: exit 1, "Unknown step ${step}" } } else if (!params.input && !params.sentieon && params.skip_markduplicates) { switch (step) { case 'mapping': break - case 'preparerecalibration': tsvPath = "${params.outdir}/Preprocessing/TSV/mapped.tsv"; break - case 'recalibrate': tsvPath = "${params.outdir}/Preprocessing/TSV/mapped_no_duplicates_marked.tsv"; break - case 'variantcalling': tsvPath = "${params.outdir}/Preprocessing/TSV/recalibrated.tsv"; break - case 'controlfreec': tsvPath = "${params.outdir}/VariantCalling/TSV/control-freec_mpileup.tsv"; break + case 'preparerecalibration': tsv_path = "${params.outdir}/preprocessing/tsv/mapped.tsv"; break + case 'recalibrate': tsv_path = "${params.outdir}/preprocessing/tsv/mapped_no_markduplicates.tsv"; break + case 'variantcalling': tsv_path = "${params.outdir}/preprocessing/tsv/recalibrated.tsv"; break + case 'controlfreec': tsv_path = "${params.outdir}/variant_calling/tsv/control-freec_mpileup.tsv"; break case 'annotate': break default: 
exit 1, "Unknown step ${step}" } } -inputSample = Channel.empty() -if (tsvPath) { - tsvFile = file(tsvPath) +input_sample = Channel.empty() +if (tsv_path) { + tsv_file = file(tsv_path) switch (step) { - case 'mapping': inputSample = extractFastq(tsvFile); break - case 'preparerecalibration': inputSample = extractBam(tsvFile); break - case 'recalibrate': inputSample = extractRecal(tsvFile); break - case 'variantcalling': inputSample = extractBam(tsvFile); break - case 'controlfreec': inputSample = extractPileup(tsvFile); break + case 'mapping': input_sample = extract_fastq(tsv_file); break + case 'preparerecalibration': input_sample = extract_bam(tsv_file); break + case 'recalibrate': input_sample = extract_recal(tsv_file); break + case 'variantcalling': input_sample = extract_bam(tsv_file); break + case 'controlfreec': input_sample = extract_pileup(tsv_file); break case 'annotate': break default: exit 1, "Unknown step ${step}" } -} else if (params.input && !hasExtension(params.input, "tsv")) { +} else if (params.input && !has_extension(params.input, "tsv")) { log.info "No TSV file" if (step != 'mapping') exit 1, 'No step other than "mapping" supports a directory as an input' log.info "Reading ${params.input} directory" log.warn "[nf-core/sarek] in ${params.input} directory, all fastqs are assuming to be from the same sample, which is assumed to be a germline one" - inputSample = extractFastqFromDir(params.input) - (inputSample, fastqTMP) = inputSample.into(2) - fastqTMP.toList().subscribe onNext: { - if (it.size() == 0) exit 1, "No FASTQ files found in --input directory '${params.input}'" - } - tsvFile = params.input // used in the reports -} else if (tsvPath && step == 'annotate') { - log.info "Annotating ${tsvPath}" + input_sample = extract_fastq_from_dir(params.input) + tsv_file = params.input // used in the reports +} else if (tsv_path && step == 'annotate') { + log.info "Annotating ${tsv_path}" } else if (step == 'annotate') { log.info "Trying automatic annotation on files in the VariantCalling/ directory" } else exit 1, 'No sample were defined, see --help' -(genderMap, statusMap, inputSample) = extractInfos(inputSample) +/* +-------------------------------------------------------------------------------- + UPDATE MODULES OPTIONS BASED ON PARAMS +-------------------------------------------------------------------------------- +*/ + +modules = params.modules + +if (params.save_reference) modules['build_intervals'].publish_files = ['bed':'intervals'] +if (params.save_reference) modules['bwa_index'].publish_files = ['amb':'bwa', 'ann':'bwa', 'bwt':'bwa', 'pac':'bwa', 'sa':'bwa'] +if (params.save_reference) modules['bwamem2_index'].publish_files = ['0123':'bwamem2', 'amb':'bwamem2', 'ann':'bwamem2', 'bwt.2bit.64':'bwamem2', 'bwt.8bit.32':'bwamem2', 'pac':'bwamem2'] +if (params.save_reference) modules['create_intervals_bed'].publish_files = ['bed':'intervals'] +if (params.save_reference) modules['dict'].publish_files = ['dict':'dict'] +if (params.save_reference) modules['samtools_faidx'].publish_files = ['fai':'fai'] +if (params.save_reference) modules['tabix_dbsnp'].publish_files = ['vcf.gz.tbi':'dbsnp'] +if (params.save_reference) modules['tabix_germline_resource'].publish_files = ['vcf.gz.tbi':'germline_resource'] +if (params.save_reference) modules['tabix_known_indels'].publish_files = ['vcf.gz.tbi':'known_indels'] +if (params.save_reference) modules['tabix_pon'].publish_files = ['vcf.gz.tbi':'pon'] +if (save_bam_mapped) modules['samtools_index_mapping'].publish_files = 
['bam':'mapped', 'bai':'mapped'] +if (params.skip_markduplicates) modules['baserecalibrator'].publish_files = ['recal.table':'mapped'] +if (params.skip_markduplicates) modules['gatherbqsrreports'].publish_files = ['recal.table':'mapped'] /* -================================================================================ +-------------------------------------------------------------------------------- CHECKING REFERENCES -================================================================================ +-------------------------------------------------------------------------------- */ // Initialize each params in params.genomes, catch the command line first if it was defined -// params.fasta has to be the first one -params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null -// The rest can be sorted -params.ac_loci = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci ?: null : null -params.ac_loci_gc = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci_gc ?: null : null -params.bwa = params.genome && params.fasta && 'mapping' in step ? params.genomes[params.genome].bwa ?: null : null -params.chr_dir = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_dir ?: null : null -params.chr_length = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_length ?: null : null -params.dbsnp = params.genome && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? params.genomes[params.genome].dbsnp ?: null : null -params.dbsnp_index = params.genome && params.dbsnp ? params.genomes[params.genome].dbsnp_index ?: null : null -params.dict = params.genome && params.fasta ? params.genomes[params.genome].dict ?: null : null -params.fasta_fai = params.genome && params.fasta ? params.genomes[params.genome].fasta_fai ?: null : null -params.germline_resource = params.genome && 'mutect2' in tools ? params.genomes[params.genome].germline_resource ?: null : null -params.germline_resource_index = params.genome && params.germline_resource ? params.genomes[params.genome].germline_resource_index ?: null : null -params.intervals = params.genome && !('annotate' in step) ? params.genomes[params.genome].intervals ?: null : null -params.known_indels = params.genome && ('mapping' in step || 'preparerecalibration' in step) ? params.genomes[params.genome].known_indels ?: null : null -params.known_indels_index = params.genome && params.known_indels ? params.genomes[params.genome].known_indels_index ?: null : null -params.mappability = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].mappability ?: null : null -params.snpeff_db = params.genome && ('snpeff' in tools || 'merge' in tools) ? params.genomes[params.genome].snpeff_db ?: null : null -params.species = params.genome && ('vep' in tools || 'merge' in tools) ? params.genomes[params.genome].species ?: null : null -params.vep_cache_version = params.genome && ('vep' in tools || 'merge' in tools) ? params.genomes[params.genome].vep_cache_version ?: null : null - -// Initialize channels with files based on params -ch_ac_loci = params.ac_loci && 'ascat' in tools ? Channel.value(file(params.ac_loci)) : "null" -ch_ac_loci_gc = params.ac_loci_gc && 'ascat' in tools ? Channel.value(file(params.ac_loci_gc)) : "null" -ch_chr_dir = params.chr_dir && 'controlfreec' in tools ? 
Channel.value(file(params.chr_dir)) : "null" -ch_chr_length = params.chr_length && 'controlfreec' in tools ? Channel.value(file(params.chr_length)) : "null" -ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" -ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" -ch_fai = params.fasta_fai && !('annotate' in step) ? Channel.value(file(params.fasta_fai)) : "null" -ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" -ch_intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" -ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" -ch_mappability = params.mappability && 'controlfreec' in tools ? Channel.value(file(params.mappability)) : "null" - -// Initialize channels with values based on params -ch_snpeff_cache = params.snpeff_cache ? Channel.value(file(params.snpeff_cache)) : "null" -ch_snpeff_db = params.snpeff_db ? Channel.value(params.snpeff_db) : "null" -ch_vep_cache_version = params.vep_cache_version ? Channel.value(params.vep_cache_version) : "null" -ch_vep_cache = params.vep_cache ? Channel.value(file(params.vep_cache)) : "null" - -// Optional files, not defined within the params.genomes[params.genome] scope -ch_cadd_indels = params.cadd_indels ? Channel.value(file(params.cadd_indels)) : "null" -ch_cadd_indels_tbi = params.cadd_indels_tbi ? Channel.value(file(params.cadd_indels_tbi)) : "null" -ch_cadd_wg_snvs = params.cadd_wg_snvs ? Channel.value(file(params.cadd_wg_snvs)) : "null" -ch_cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.value(file(params.cadd_wg_snvs_tbi)) : "null" -ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" -ch_target_bed = params.target_bed ? Channel.value(file(params.target_bed)) : "null" +params.ac_loci = params.genome ? params.genomes[params.genome].ac_loci ?: false : false +params.ac_loci_gc = params.genome ? params.genomes[params.genome].ac_loci_gc ?: false : false +params.bwa = params.genome ? params.genomes[params.genome].bwa ?: false : false +params.chr_dir = params.genome ? params.genomes[params.genome].chr_dir ?: false : false +params.chr_length = params.genome ? params.genomes[params.genome].chr_length ?: false : false +params.dbsnp = params.genome ? params.genomes[params.genome].dbsnp ?: false : false +params.dbsnp_index = params.genome ? params.genomes[params.genome].dbsnp_index ?: false : false +params.dict = params.genome ? params.genomes[params.genome].dict ?: false : false +params.fasta = params.genome ? params.genomes[params.genome].fasta ?: false : false +params.fasta_fai = params.genome ? params.genomes[params.genome].fasta_fai ?: false : false +params.germline_resource = params.genome ? params.genomes[params.genome].germline_resource ?: false : false +params.germline_resource_index = params.genome ? params.genomes[params.genome].germline_resource_index ?: false : false +params.intervals = params.genome ? params.genomes[params.genome].intervals ?: false : false +params.known_indels = params.genome ? params.genomes[params.genome].known_indels ?: false : false +params.known_indels_index = params.genome ? 
params.genomes[params.genome].known_indels_index ?: false : false +params.mappability = params.genome ? params.genomes[params.genome].mappability ?: false : false +params.snpeff_db = params.genome ? params.genomes[params.genome].snpeff_db ?: false : false +params.species = params.genome ? params.genomes[params.genome].species ?: false : false +params.vep_cache_version = params.genome ? params.genomes[params.genome].vep_cache_version ?: false : false + +file("${params.outdir}/no_file").text = "no_file\n" + +// Initialize file channels based on params, defined in the params.genomes[params.genome] scope +chr_dir = params.chr_dir ? file(params.chr_dir) : file("${params.outdir}/no_file") +chr_length = params.chr_length ? file(params.chr_length) : file("${params.outdir}/no_file") +dbsnp = params.dbsnp ? file(params.dbsnp) : file("${params.outdir}/no_file") +fasta = params.fasta ? file(params.fasta) : file("${params.outdir}/no_file") +germline_resource = params.germline_resource ? file(params.germline_resource) : file("${params.outdir}/no_file") +known_indels = params.known_indels ? file(params.known_indels) : file("${params.outdir}/no_file") +loci = params.ac_loci ? file(params.ac_loci) : file("${params.outdir}/no_file") +loci_gc = params.ac_loci_gc ? file(params.ac_loci_gc) : file("${params.outdir}/no_file") +mappability = params.mappability ? file(params.mappability) : file("${params.outdir}/no_file") + +// Initialize value channels based on params, defined in the params.genomes[params.genome] scope +snpeff_db = params.snpeff_db ?: Channel.empty() +snpeff_species = params.species ?: Channel.empty() +vep_cache_version = params.vep_cache_version ?: Channel.empty() + +// Initialize files channels based on params, not defined within the params.genomes[params.genome] scope +cadd_indels = params.cadd_indels ? file(params.cadd_indels) : file("${params.outdir}/no_file") +cadd_indels_tbi = params.cadd_indels_tbi ? file(params.cadd_indels_tbi) : file("${params.outdir}/no_file") +cadd_wg_snvs = params.cadd_wg_snvs ? file(params.cadd_wg_snvs) : file("${params.outdir}/no_file") +cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? file(params.cadd_wg_snvs_tbi) : file("${params.outdir}/no_file") +pon = params.pon ? file(params.pon) : file("${params.outdir}/no_file") +snpeff_cache = params.snpeff_cache ? file(params.snpeff_cache) : file("${params.outdir}/no_file") +target_bed = params.target_bed ? file(params.target_bed) : file("${params.outdir}/no_file") +vep_cache = params.vep_cache ? file(params.vep_cache) : file("${params.outdir}/no_file") + +// Initialize value channels based on params, not defined within the params.genomes[params.genome] scope +read_structure1 = params.read_structure1 ?: Channel.empty() +read_structure2 = params.read_structure2 ?: Channel.empty() // Optional values, not defined within the params.genomes[params.genome] scope ch_read_structure1 = params.read_structure1 ? Channel.value(params.read_structure1) : "null" ch_read_structure2 = params.read_structure2 ? 
Channel.value(params.read_structure2) : "null" /* -================================================================================ +-------------------------------------------------------------------------------- PRINTING SUMMARY -================================================================================ +-------------------------------------------------------------------------------- */ -// Header log info -log.info nfcoreHeader() -def summary = [:] -if (workflow.revision) summary['Pipeline Release'] = workflow.revision -summary['Run Name'] = custom_runName ?: workflow.runName -summary['Max Resources'] = "${params.max_memory} memory, ${params.max_cpus} cpus, ${params.max_time} time per job" -if (workflow.containerEngine) summary['Container'] = "${workflow.containerEngine} - ${workflow.container}" - -summary['Input'] = params.input -summary['Step'] = step -summary['Genome'] = params.genome - -if (params.no_intervals && step != 'annotate') summary['Intervals'] = 'Do not use' -summary['Nucleotides/s'] = params.nucleotides_per_second -if (params.sentieon) summary['Sention'] = "Using Sentieon for Preprocessing and/or Variant Calling" -if (params.skip_qc) summary['QC tools skipped'] = skipQC.join(', ') -if (params.target_bed) summary['Target BED'] = params.target_bed -if (params.tools) summary['Tools'] = tools.join(', ') - -if (params.trim_fastq || params.split_fastq) summary['Modify fastqs (trim/split)'] = "" - -if (params.trim_fastq) { - summary['Fastq trim'] = "Fastq trim selected" - summary['Trim R1'] = "${params.clip_r1} bp" - summary['Trim R2'] = "${params.clip_r2} bp" - summary["Trim 3' R1"] = "${params.three_prime_clip_r1} bp" - summary["Trim 3' R2"] = "${params.three_prime_clip_r2} bp" - summary['NextSeq Trim'] = "${params.trim_nextseq} bp" - summary['Saved Trimmed Fastq'] = params.save_trimmed ? 'Yes' : 'No' -} -if (params.split_fastq) summary['Reads in fastq'] = params.split_fastq - -summary['MarkDuplicates'] = "Options" -summary['Java options'] = params.markdup_java_options -summary['GATK Spark'] = params.use_gatk_spark ? 'Yes' : 'No' - -summary['Save BAMs mapped'] = params.save_bam_mapped ? 'Yes' : 'No' -summary['Skip MarkDuplicates'] = params.skip_markduplicates ? 'Yes' : 'No' - -if ('ascat' in tools) { - summary['ASCAT'] = "Options" - if (params.ascat_purity) summary['purity'] = params.ascat_purity - if (params.ascat_ploidy) summary['ploidy'] = params.ascat_ploidy -} - -if ('controlfreec' in tools) { - summary['Control-FREEC'] = "Options" - if (params.cf_window) summary['window'] = params.cf_window - if (params.cf_coeff) summary['coefficientOfVariation'] = params.cf_coeff - if (params.cf_ploidy) summary['ploidy'] = params.cf_ploidy -} - -if ('haplotypecaller' in tools) summary['GVCF'] = params.generate_gvcf ? 'Yes' : 'No' -if ('strelka' in tools && 'manta' in tools) summary['Strelka BP'] = params.no_strelka_bp ? 
'No' : 'Yes' -if (params.pon && ('mutect2' in tools || (params.sentieon && 'tnscope' in tools))) summary['Panel of normals'] = params.pon - -if (params.annotate_tools) summary['Tools to annotate'] = annotate_tools.join(', ') - -if (params.annotation_cache) { - summary['Annotation cache'] = "Enabled" - if (params.snpeff_cache) summary['snpEff cache'] = params.snpeff_cache - if (params.vep_cache) summary['VEP cache'] = params.vep_cache -} - -if (params.cadd_cache) { - summary['CADD cache'] = "Enabled" - if (params.cadd_indels) summary['CADD indels'] = params.cadd_indels - if (params.cadd_wg_snvs) summary['CADD wg snvs'] = params.cadd_wg_snvs -} - -if (params.genesplicer) summary['genesplicer'] = "Enabled" - -if (params.igenomes_base && !params.igenomes_ignore) summary['AWS iGenomes base'] = params.igenomes_base -if (params.igenomes_ignore) summary['AWS iGenomes'] = "Do not use" -if (params.genomes_base && !params.igenomes_ignore) summary['Genomes base'] = params.genomes_base - -summary['Save Reference'] = params.save_reference ? 'Yes' : 'No' - -if (params.ac_loci) summary['Loci'] = params.ac_loci -if (params.ac_loci_gc) summary['Loci GC'] = params.ac_loci_gc -if (params.bwa) summary['BWA indexes'] = params.bwa -if (params.chr_dir) summary['Chromosomes'] = params.chr_dir -if (params.chr_length) summary['Chromosomes length'] = params.chr_length -if (params.dbsnp) summary['dbsnp'] = params.dbsnp -if (params.dbsnp_index) summary['dbsnpIndex'] = params.dbsnp_index -if (params.dict) summary['dict'] = params.dict -if (params.fasta) summary['fasta reference'] = params.fasta -if (params.fasta_fai) summary['fasta index'] = params.fasta_fai -if (params.germline_resource) summary['germline resource'] = params.germline_resource -if (params.germline_resource_index) summary['germline resource index'] = params.germline_resource_index -if (params.intervals) summary['intervals'] = params.intervals -if (params.known_indels) summary['known indels'] = params.known_indels -if (params.known_indels_index) summary['known indels index'] = params.known_indels_index -if (params.mappability) summary['Mappability'] = params.mappability -if (params.snpeff_cache) summary['snpEff cache'] = params.snpeff_cache -if (params.snpeff_db) summary['snpEff DB'] = params.snpeff_db -if (params.species) summary['species'] = params.species -if (params.vep_cache) summary['VEP cache'] = params.vep_cache -if (params.vep_cache_version) summary['VEP cache version'] = params.vep_cache_version - -summary['Output dir'] = params.outdir -summary['Publish dir mode'] = params.publish_dir_mode -if (params.sequencing_center) summary['Sequenced by'] = params.sequencing_center - -summary['Launch dir'] = workflow.launchDir -summary['Working dir'] = workflow.workDir -summary['Script dir'] = workflow.projectDir -summary['User'] = workflow.userName - -if (params.multiqc_config) summary['MultiQC config'] = params.multiqc_config - -summary['Config Profile'] = workflow.profile - -if (params.config_profile_description) summary['Config Description'] = params.config_profile_description -if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact -if (params.config_profile_url) summary['Config URL'] = params.config_profile_url - -summary['Config Files'] = workflow.configFiles.join(', ') - - -if (workflow.profile.contains('awsbatch')) { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue - summary['AWS CLI'] = params.awscli +// Has the run name been specified by the user? 
+// This has the bonus effect of catching both -name and --name +run_name = params.name +if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { + run_name = workflow.runName } +summary = Schema.params_summary(workflow, params, run_name, step, tools, skip_qc, annotate_tools) +log.info Headers.nf_core(workflow, params.monochrome_logs) +log.info summary.collect { k,v -> "${k.padRight(20)}: $v" }.join("\n") +log.info "-\033[2m----------------------------------------------------\033[0m-" -if (params.email || params.email_on_fail) { - summary['E-mail Address'] = params.email - summary['E-mail on failure'] = params.email_on_fail - summary['MultiQC maxsize'] = params.max_multiqc_email_size -} -log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") -log.info "-\033[2m--------------------------------------------------\033[0m-" +// params summary for MultiQC +workflow_summary = Schema.params_mqc_summary(summary) +workflow_summary = Channel.value(workflow_summary) if ('mutect2' in tools && !(params.pon)) log.warn "[nf-core/sarek] Mutect2 was requested, but as no panel of normals were given, results will not be optimal" if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works if Sentieon is available where nf-core/sarek is run" -// Check the hostnames against configured profiles -checkHostname() - -Channel.from(summary.collect{ [it.key, it.value] }) - .map { k,v -> "
<dt>$k</dt><dd><samp>${v ?: 'N/A'}</samp></dd>
" } - .reduce { a, b -> return [a, b].join("\n ") } - .map { x -> """ - id: 'sarek-summary' - description: " - this information is collected when the pipeline is started." - section_name: 'nf-core/sarek Workflow Summary' - section_href: 'https://github.com/nf-core/sarek' - plot_type: 'html' - data: | - <dl class=\"dl-horizontal\">
- $x - </dl>
- """.stripIndent() } - .set { ch_workflow_summary } - -// Parse software version numbers - -process get_software_versions { - publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode, - saveAs: {it.indexOf(".csv") > 0 ? it : null} - - output: - file 'software_versions_mqc.yaml' into ch_software_versions_yaml - file "software_versions.csv" - - when: !('versions' in skipQC) - - script: - aligner = params.aligner == "bwa-mem2" ? "bwa-mem2" : "bwa" - """ - alleleCounter --version &> v_allelecount.txt 2>&1 || true - bcftools --version &> v_bcftools.txt 2>&1 || true - ${aligner} version &> v_bwa.txt 2>&1 || true - cnvkit.py version &> v_cnvkit.txt 2>&1 || true - configManta.py --version &> v_manta.txt 2>&1 || true - configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true - echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true - echo "${workflow.nextflow.version}" &> v_nextflow.txt 2>&1 || true - snpEff -version &> v_snpeff.txt 2>&1 || true - fastqc --version &> v_fastqc.txt 2>&1 || true - freebayes --version &> v_freebayes.txt 2>&1 || true - freec &> v_controlfreec.txt 2>&1 || true - gatk ApplyBQSR --help &> v_gatk.txt 2>&1 || true - msisensor &> v_msisensor.txt 2>&1 || true - multiqc --version &> v_multiqc.txt 2>&1 || true - qualimap --version &> v_qualimap.txt 2>&1 || true - R --version &> v_r.txt 2>&1 || true - R -e "library(ASCAT); help(package='ASCAT')" &> v_ascat.txt 2>&1 || true - samtools --version &> v_samtools.txt 2>&1 || true - tiddit &> v_tiddit.txt 2>&1 || true - trim_galore -v &> v_trim_galore.txt 2>&1 || true - vcftools --version &> v_vcftools.txt 2>&1 || true - vep --help &> v_vep.txt 2>&1 || true - - scrape_software_versions.py &> software_versions_mqc.yaml - """ -} - -ch_software_versions_yaml = ch_software_versions_yaml.dump(tag:'SOFTWARE VERSIONS') - /* -================================================================================ - BUILDING INDEXES -================================================================================ +-------------------------------------------------------------------------------- + INCLUDE LOCAL MODULES +-------------------------------------------------------------------------------- */ -// And then initialize channels based on params or indexes that were just built - -process BuildBWAindexes { - tag "${fasta}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/BWAIndex/${it}" : null } - - input: - file(fasta) from ch_fasta - - output: - file("${fasta}.*") into bwa_built - - when: !(params.bwa) && params.fasta && 'mapping' in step - - script: - aligner = params.aligner == "bwa-mem2" ? "bwa-mem2" : "bwa" - """ - ${aligner} index ${fasta} - """ -} - -ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : bwa_built - -process BuildDict { - tag "${fasta}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(fasta) from ch_fasta - - output: - file("${fasta.baseName}.dict") into dictBuilt - - when: !(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step) - - script: - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - CreateSequenceDictionary \ - --REFERENCE ${fasta} \ - --OUTPUT ${fasta.baseName}.dict - """ -} - -ch_dict = params.dict ? 
Channel.value(file(params.dict)) : dictBuilt - -process BuildFastaFai { - tag "${fasta}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(fasta) from ch_fasta - - output: - file("${fasta}.fai") into fai_built - - when: !(params.fasta_fai) && params.fasta && !('annotate' in step) - - script: - """ - samtools faidx ${fasta} - """ -} - -ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : fai_built - -process BuildDbsnpIndex { - tag "${dbsnp}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(dbsnp) from ch_dbsnp - - output: - file("${dbsnp}.tbi") into dbsnp_tbi - - when: !(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools) - - script: - """ - tabix -p vcf ${dbsnp} - """ -} - -ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" - -process BuildGermlineResourceIndex { - tag "${germlineResource}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(germlineResource) from ch_germline_resource - - output: - file("${germlineResource}.tbi") into germline_resource_tbi - - when: !(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools - - script: - """ - tabix -p vcf ${germlineResource} - """ -} - -ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" - -process BuildKnownIndelsIndex { - tag "${knownIndels}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - each file(knownIndels) from ch_known_indels - - output: - file("${knownIndels}.tbi") into known_indels_tbi - - when: !(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step) - - script: - """ - tabix -p vcf ${knownIndels} - """ -} - -ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" - -process BuildPonIndex { - tag "${pon}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(pon) from ch_pon - - output: - file("${pon}.tbi") into pon_tbi - - when: !(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools) - - script: - """ - tabix -p vcf ${pon} - """ -} - -ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" - -process BuildIntervals { - tag "${fastaFai}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? 
"reference_genome/${it}" : null } - - input: - file(fastaFai) from ch_fai - - output: - file("${fastaFai.baseName}.bed") into intervalBuilt - - when: !(params.intervals) && !('annotate' in step) && !('controlfreec' in step) - - script: - """ - awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fastaFai} > ${fastaFai.baseName}.bed - """ -} +/* +-------------------------------------------------------------------------------- + INCLUDE LOCAL SUBWORKFLOWS +-------------------------------------------------------------------------------- +*/ -ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : intervalBuilt +include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' addParams( + build_intervals_options: modules['build_intervals'], + bwa_index_options: modules['bwa_index'], + bwamem2_index_options: modules['bwamem2_index'], + create_intervals_bed_options: modules['create_intervals_bed'], + gatk_dict_options: modules['dict'], + samtools_faidx_options: modules['samtools_faidx'], + tabix_dbsnp_options: modules['tabix_dbsnp'], + tabix_germline_resource_options: modules['tabix_germline_resource'], + tabix_known_indels_options: modules['tabix_known_indels'], + tabix_pon_options: modules['tabix_pon'] +) +include { MAPPING } from './modules/local/subworkflow/mapping' addParams( + bwamem1_mem_options: modules['bwa_mem1_mem'], + bwamem2_mem_options: modules['bwa_mem2_mem'], + merge_bam_options: modules['merge_bam_mapping'], + qualimap_bamqc_options: modules['qualimap_bamqc_mapping'], + samtools_index_options: modules['samtools_index_mapping'], + samtools_stats_options: modules['samtools_stats_mapping'] +) +include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' addParams( + markduplicates_options: modules['markduplicates'] +) +include { PREPARE_RECALIBRATION } from './modules/local/subworkflow/prepare_recalibration' addParams( + baserecalibrator_options: modules['baserecalibrator'], + gatherbqsrreports_options: modules['gatherbqsrreports'] +) +include { RECALIBRATE } from './modules/local/subworkflow/recalibrate' addParams( + applybqsr_options: modules['applybqsr'], + merge_bam_options: modules['merge_bam_recalibrate'], + qualimap_bamqc_options: modules['qualimap_bamqc_recalibrate'], + samtools_index_options: modules['samtools_index_recalibrate'], + samtools_stats_options: modules['samtools_stats_recalibrate'] +) +include { GERMLINE_VARIANT_CALLING } from './modules/local/subworkflow/germline_variant_calling' addParams( + haplotypecaller_options: modules['haplotypecaller'], + genotypegvcf_options: modules['genotypegvcf'], + concat_gvcf_options: modules['concat_gvcf'], + concat_haplotypecaller_options: modules['concat_haplotypecaller'], + strelka_options: modules['strelka_germline'] +) /* -================================================================================ - PREPROCESSING -================================================================================ +-------------------------------------------------------------------------------- + INCLUDE nf-core MODULES +-------------------------------------------------------------------------------- */ -// STEP 0: CREATING INTERVALS FOR PARALLELIZATION (PREPROCESSING AND VARIANT CALLING) - -process CreateIntervalBeds { - tag "${intervals}" - - input: - file(intervals) from ch_intervals - - output: - file '*.bed' into bedIntervals mode flatten - - when: (!params.no_intervals) && step != 'annotate' - - script: - // If the interval file is 
BED format, the fifth column is interpreted to - // contain runtime estimates, which is then used to combine short-running jobs - if (hasExtension(intervals, "bed")) - """ - awk -vFS="\t" '{ - t = \$5 # runtime estimate - if (t == "") { - # no runtime estimate in this row, assume default value - t = (\$3 - \$2) / ${params.nucleotides_per_second} - } - if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) { - # start a new chunk - name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) - chunk = 0 - longest = 0 - } - if (t > longest) - longest = t - chunk += t - print \$0 > name - }' ${intervals} - """ - else if (hasExtension(intervals, "interval_list")) - """ - grep -v '^@' ${intervals} | awk -vFS="\t" '{ - name = sprintf("%s_%d-%d", \$1, \$2, \$3); - printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" - }' - """ - else - """ - awk -vFS="[:-]" '{ - name = sprintf("%s_%d-%d", \$1, \$2, \$3); - printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" - }' ${intervals} - """ -} - -bedIntervals = bedIntervals - .map { intervalFile -> - def duration = 0.0 - for (line in intervalFile.readLines()) { - final fields = line.split('\t') - if (fields.size() >= 5) duration += fields[4].toFloat() - else { - start = fields[1].toInteger() - end = fields[2].toInteger() - duration += (end - start) / params.nucleotides_per_second - } - } - [duration, intervalFile] - }.toSortedList({ a, b -> b[0] <=> a[0] }) - .flatten().collate(2) - .map{duration, intervalFile -> intervalFile} - -bedIntervals = bedIntervals.dump(tag:'bedintervals') - -if (params.no_intervals && step != 'annotate') { - file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" - bedIntervals = Channel.from(file("${params.outdir}/no_intervals.bed")) -} +include { MULTIQC } from './modules/nf-core/software/multiqc' -(intBaseRecalibrator, intApplyBQSR, intHaplotypeCaller, intFreebayesSingle, intMpileup, bedIntervals) = bedIntervals.into(6) +/* +-------------------------------------------------------------------------------- + INCLUDE nf-core SUBWORKFLOWS +-------------------------------------------------------------------------------- +*/ +include { QC_TRIM } from './modules/nf-core/subworkflow/qc_trim' addParams( + fastqc_options: modules['fastqc'], + trimgalore_options: modules['trimgalore'] +) // PREPARING CHANNELS FOR PREPROCESSING AND QC -inputBam = Channel.create() -inputPairReads = Channel.create() - -if (step in ['preparerecalibration', 'recalibrate', 'variantcalling', 'controlfreec', 'annotate']) { - inputBam.close() - inputPairReads.close() -} else inputSample.choice(inputPairReads, inputBam) {hasExtension(it[3], "bam") ? 
1 : 0} - -(inputBam, inputBamFastQC) = inputBam.into(2) - -// Removing inputFile2 which is null in case of uBAM -inputBamFastQC = inputBamFastQC.map { - idPatient, idSample, idRun, inputFile1, inputFile2 -> - [idPatient, idSample, idRun, inputFile1] -} - -if (params.split_fastq){ - inputPairReads = inputPairReads - // newly splitfastq are named based on split, so the name is easier to catch - .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true) - .map {idPatient, idSample, idRun, reads1, reads2 -> - // The split fastq read1 is the 4th element (indexed 3) its name is split_3 - // The split fastq read2's name is split_4 - // It's followed by which split it's acutally based on the mother fastq file - // Index start at 1 - // Extracting the index to get a new IdRun - splitIndex = reads1.fileName.toString().minus("split_3.").minus(".gz") - newIdRun = idRun + "_" + splitIndex - // Giving the files a new nice name - newReads1 = file("${idSample}_${newIdRun}_R1.fastq.gz") - newReads2 = file("${idSample}_${newIdRun}_R2.fastq.gz") - [idPatient, idSample, newIdRun, reads1, reads2]} -} - -inputPairReads = inputPairReads.dump(tag:'INPUT') - -(inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC, inputPairReadsUMI) = inputPairReads.into(4) - -if (params.umi) inputPairReads.close() -else inputPairReadsUMI.close() - -if (params.trim_fastq) inputPairReads.close() -else inputPairReadsTrimGalore.close() - -// STEP 0.5: QC ON READS - -// TODO: Use only one process for FastQC for FASTQ files and uBAM files -// FASTQ and uBAM files are renamed based on the sample name - -process FastQCFQ { - label 'FastQC' - label 'cpus_2' - - tag "${idPatient}-${idRun}" - - publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode - - input: - set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsFastQC - - output: - file("*.{html,zip}") into fastQCFQReport - - when: !('fastqc' in skipQC) - - script: - """ - fastqc -t 2 -q ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz - """ -} - -process FastQCBAM { - label 'FastQC' - label 'cpus_2' +// input_bam = Channel.empty() +// input_pair_reads = Channel.empty() + +// if (step in ['preparerecalibration', 'recalibrate', 'variantcalling', 'controlfreec', 'annotate']) { +// input_bam.close() +// input_pair_reads.close() +// } else input_sample.branch(input_pair_reads, input_bam) {has_extension(it[3], "bam") ? 
1 : 0} + +// (input_bam, input_bam_fastqc) = input_bam.into(2) + +// // Removing inputFile2 which is null in case of uBAM +// input_bam_fastqc = input_bam_fastqc.map { +// idPatient, idSample, idRun, inputFile1, inputFile2 -> +// [idPatient, idSample, idRun, inputFile1] +// } + +// if (params.split_fastq){ +// input_pair_reads = input_pair_reads +// // newly splitfastq are named based on split, so the name is easier to catch +// .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true) +// .map {idPatient, idSample, idRun, reads1, reads2 -> +// // The split fastq read1 is the 4th element (indexed 3) its name is split_3 +// // The split fastq read2's name is split_4 +// // It's followed by which split it's acutally based on the mother fastq file +// // Index start at 1 +// // Extracting the index to get a new IdRun +// splitIndex = reads1.fileName.toString().minus("split_3.").minus(".gz") +// newIdRun = idRun + "_" + splitIndex +// // Giving the files a new nice name +// newReads1 = file("${idSample}_${newIdRun}_R1.fastq.gz") +// newReads2 = file("${idSample}_${newIdRun}_R2.fastq.gz") +// [idPatient, idSample, newIdRun, reads1, reads2]} +//} + +// input_pair_reads.dump(tag:'INPUT') + +// (input_pair_reads, input_pair_readstrimgalore, input_pair_readsfastqc) = input_pair_reads.into(3) - tag "${idPatient}-${idRun}" - - publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode - - input: - set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") from inputBamFastQC - - output: - file("*.{html,zip}") into fastQCBAMReport - - when: !('fastqc' in skipQC) - - script: - """ - fastqc -t 2 -q ${idSample}_${idRun}.bam - """ -} - -fastQCReport = fastQCFQReport.mix(fastQCBAMReport) - -fastQCReport = fastQCReport.dump(tag:'FastQC') - -process TrimGalore { - label 'TrimGalore' - - tag "${idPatient}-${idRun}" - - publishDir "${params.outdir}/Reports/${idSample}/TrimGalore/${idSample}_${idRun}", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf("_fastqc") > 0) "FastQC/$filename" - else if (filename.indexOf("trimming_report.txt") > 0) "logs/$filename" - else if (params.save_trimmed) filename - else null - } - - input: - set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsTrimGalore - - output: - file("*.{html,zip,txt}") into trimGaloreReport - set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1_val_1.fq.gz"), file("${idSample}_${idRun}_R2_val_2.fq.gz") into outputPairReadsTrimGalore - - when: params.trim_fastq - - script: - // Calculate number of --cores for TrimGalore based on value of task.cpus - // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019 - // See: https://github.com/nf-core/atacseq/pull/65 - def cores = 1 - if (task.cpus) { - cores = (task.cpus as int) - 4 - if (cores < 1) cores = 1 - if (cores > 4) cores = 4 - } - c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : '' - c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : '' - tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : '' - tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : '' - nextseq = params.trim_nextseq > 0 ? 
"--nextseq ${params.trim_nextseq}" : '' - """ - trim_galore \ - --cores ${cores} \ - --paired \ - --fastqc \ - --gzip \ - ${c_r1} ${c_r2} \ - ${tpc_r1} ${tpc_r2} \ - ${nextseq} \ - ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz - - mv *val_1_fastqc.html "${idSample}_${idRun}_R1.trimmed_fastqc.html" - mv *val_2_fastqc.html "${idSample}_${idRun}_R2.trimmed_fastqc.html" - mv *val_1_fastqc.zip "${idSample}_${idRun}_R1.trimmed_fastqc.zip" - mv *val_2_fastqc.zip "${idSample}_${idRun}_R2.trimmed_fastqc.zip" - """ -} /* -================================================================================ - UMIs PROCESSING -================================================================================ +-------------------------------------------------------------------------------- + RUN THE WORKFLOW +-------------------------------------------------------------------------------- */ -// UMI - STEP 1 - ANNOTATE -// the process needs to convert fastq to unmapped bam -// and while doing the conversion, tag the bam field RX with the UMI sequence - -process UMIFastqToBAM { - publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publish_dir_mode - - input: - set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsUMI - val read_structure1 from ch_read_structure1 - val read_structure2 from ch_read_structure2 - - output: - tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi_converted.bam") into umi_converted_bams_ch - - when: params.umi - - // tmp folder for fgbio might be solved more elengantly? - - script: - """ - mkdir tmp - - fgbio --tmp-dir=${PWD}/tmp \ - FastqToBam \ - -i "${idSample}_${idRun}_R1.fastq.gz" "${idSample}_${idRun}_R2.fastq.gz" \ - -o "${idSample}_umi_converted.bam" \ - --read-structures ${read_structure1} ${read_structure2} \ - --sample ${idSample} \ - --library ${idSample} - """ -} - -// UMI - STEP 2 - MAP THE BAM FILE -// this is necessary because the UMI groups are created based on -// mapping position + same UMI tag - -process UMIMapBamFile { - input: - set idPatient, idSample, idRun, file(convertedBam) from umi_converted_bams_ch - file(bwaIndex) from ch_bwa - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi_unsorted.bam") into umi_aligned_bams_ch - - when: params.umi - - script: - aligner = params.aligner == "bwa-mem2" ? 
"bwa-mem2" : "bwa" - """ - samtools bam2fq -T RX ${convertedBam} | \ - ${aligner} mem -p -t ${task.cpus} -C -M -R \"@RG\\tID:${idSample}\\tSM:${idSample}\\tPL:Illumina\" \ - ${fasta} - | \ - samtools view -bS - > ${idSample}_umi_unsorted.bam - """ -} - -// UMI - STEP 3 - GROUP READS BY UMIs -// We have chose the Adjacency method, following the nice paper and blog explanation integrated in both -// UMItools and FGBIO -// https://cgatoxford.wordpress.com/2015/08/14/unique-molecular-identifiers-the-problem-the-solution-and-the-proof/ -// alternatively we can define this as input for the user to choose from - -process GroupReadsByUmi { - publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publish_dir_mode - - input: - set idPatient, idSample, idRun, file(alignedBam) from umi_aligned_bams_ch - - output: - file("${idSample}_umi_histogram.txt") into umi_histogram_ch - tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi-grouped.bam") into umi_grouped_bams_ch - - when: params.umi - - script: - """ - mkdir tmp - - samtools view -h ${alignedBam} | \ - samblaster -M --addMateTags | \ - samtools view -Sb - >${idSample}_unsorted_tagged.bam - - fgbio --tmp-dir=${PWD}/tmp \ - GroupReadsByUmi \ - -s Adjacency \ - -i ${idSample}_unsorted_tagged.bam \ - -o ${idSample}_umi-grouped.bam \ - -f ${idSample}_umi_histogram.txt - """ -} - -// UMI - STEP 4 - CALL MOLECULAR CONSENSUS -// Now that the reads are organised by UMI groups a molecular consensus will be created -// the resulting bam file will be again unmapped and therefore can be fed into the -// existing workflow from the step mapping - -process CallMolecularConsensusReads { - publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publish_dir_mode - - input: - set idPatient, idSample, idRun, file(groupedBamFile) from umi_grouped_bams_ch +workflow { - output: - tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi-consensus.bam"), val("null") into consensus_bam_ch - - when: params.umi - - script: - """ - mkdir tmp - - fgbio --tmp-dir=${PWD}/tmp \ - CallMolecularConsensusReads \ - -i $groupedBamFile \ - -o ${idSample}_umi-consensus.bam \ - -M 1 -S Coordinate - """ -} - -// ################# END OF UMI READS PRE-PROCESSING -// from this moment on the generated uBam files can feed into the existing tools - -// STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM - -input_pair_reads_sentieon = Channel.empty() +/* +-------------------------------------------------------------------------------- + BUILD INDICES +-------------------------------------------------------------------------------- +*/ -if (params.umi) { - inputPairReads = inputPairReads.dump(tag:'INPUT before mapping') - if (params.sentieon) input_pair_reads_sentieon = consensus_bam_ch - else inputPairReads = consensus_bam_ch -} -else { - if (params.trim_fastq) inputPairReads = outputPairReadsTrimGalore - else inputPairReads = inputPairReads.mix(inputBam) - inputPairReads = inputPairReads.dump(tag:'INPUT before mapping') - - (inputPairReads, input_pair_reads_sentieon) = inputPairReads.into(2) - if (params.sentieon) inputPairReads.close() - else input_pair_reads_sentieon.close() -} + BUILD_INDICES( + dbsnp, + fasta, + germline_resource, + known_indels, + pon, + step, + tools) -process MapReads { - label 'cpus_max' - - tag "${idPatient}-${idRun}" - - input: - set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from inputPairReads - file(bwaIndex) from ch_bwa - file(fasta) from 
ch_fasta - file(fastaFai) from ch_fai - - output: - set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bamMapped - set idPatient, val("${idSample}_${idRun}"), file("${idSample}_${idRun}.bam") into bamMappedBamQC - - when: !(params.sentieon) - - script: - // -K is an hidden option, used to fix the number of reads processed by bwa mem - // Chunk size can affect bwa results, if not specified, - // the number of threads can change which can give not deterministic result. - // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md - // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29 - CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" - readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina" - // adjust mismatch penalty for tumor samples - status = statusMap[idPatient, idSample] - extra = status == 1 ? "-B 3" : "" - convertToFastq = hasExtension(inputFile1, "bam") ? "gatk --java-options -Xmx${task.memory.toGiga()}g SamToFastq --INPUT=${inputFile1} --FASTQ=/dev/stdout --INTERLEAVE=true --NON_PF=true | \\" : "" - input = hasExtension(inputFile1, "bam") ? "-p /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2)" : "${inputFile1} ${inputFile2}" - aligner = params.aligner == "bwa-mem2" ? "bwa-mem2" : "bwa" - """ - ${convertToFastq} - ${aligner} mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ - ${input} | \ - samtools sort --threads ${task.cpus} -m 2G - > ${idSample}_${idRun}.bam - """ -} + intervals = BUILD_INDICES.out.intervals -bamMapped = bamMapped.dump(tag:'Mapped BAM') -// Sort BAM whether they are standalone or should be merged + bwa = params.bwa ? file(params.bwa) : BUILD_INDICES.out.bwa + dict = params.dict ? file(params.dict) : BUILD_INDICES.out.dict + fai = params.fasta_fai ? file(params.fasta_fai) : BUILD_INDICES.out.fai -singleBam = Channel.create() -multipleBam = Channel.create() -bamMapped.groupTuple(by:[0, 1]) - .choice(singleBam, multipleBam) {it[2].size() > 1 ? 1 : 0} -singleBam = singleBam.map { - idPatient, idSample, idRun, bam -> - [idPatient, idSample, bam] -} -singleBam = singleBam.dump(tag:'Single BAM') - -// STEP 1': MAPPING READS TO REFERENCE GENOME WITH SENTIEON BWA MEM - -process Sentieon_MapReads { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag "${idPatient}-${idRun}" - - input: - set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from input_pair_reads_sentieon - file(bwaIndex) from ch_bwa - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bam_sentieon_mapped - - when: params.sentieon - - script: - // -K is an hidden option, used to fix the number of reads processed by bwa mem - // Chunk size can affect bwa results, if not specified, - // the number of threads can change which can give not deterministic result. - // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md - // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29 - CN = params.sequencing_center ? 
"CN:${params.sequencing_center}\\t" : "" - readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina" - // adjust mismatch penalty for tumor samples - status = statusMap[idPatient, idSample] - extra = status == 1 ? "-B 3" : "" - """ - sentieon bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ - ${inputFile1} ${inputFile2} | \ - sentieon util sort -r ${fasta} -o ${idSample}_${idRun}.bam -t ${task.cpus} --sam2bam -i - - """ -} + dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? file(params.dbsnp_index) : BUILD_INDICES.out.dbsnp_tbi : file("${params.outdir}/no_file") + germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? file(params.germline_resource_index) : BUILD_INDICES.out.germline_resource_tbi : file("${params.outdir}/no_file") + known_indels_tbi = params.known_indels ? params.known_indels_index ? file(params.known_indels_index) : BUILD_INDICES.out.known_indels_tbi.collect() : file("${params.outdir}/no_file") + pon_tbi = params.pon ? params.pon_index ? file(params.pon_index) : BUILD_INDICES.out.pon_tbi : file("${params.outdir}/no_file") -bam_sentieon_mapped = bam_sentieon_mapped.dump(tag:'Sentieon Mapped BAM') -// Sort BAM whether they are standalone or should be merged +/* +-------------------------------------------------------------------------------- + PREPROCESSING +-------------------------------------------------------------------------------- +*/ -singleBamSentieon = Channel.create() -multipleBamSentieon = Channel.create() -bam_sentieon_mapped.groupTuple(by:[0, 1]) - .choice(singleBamSentieon, multipleBamSentieon) {it[2].size() > 1 ? 1 : 0} -singleBamSentieon = singleBamSentieon.map { - idPatient, idSample, idRun, bam -> - [idPatient, idSample, bam] + bam_mapped = Channel.empty() + bam_mapped_qc = Channel.empty() + bam_recalibrated_qc = Channel.empty() + input_reads = Channel.empty() + qc_reports = Channel.empty() + + // STEP 0: QC & TRIM + // `--skip_qc fastqc` to skip fastqc + // trim only with `--trim_fastq` + // additional options to be set up + + QC_TRIM( + input_sample, + ('fastqc' in skip_qc || step != "mapping"), + !(params.trim_fastq)) + + reads_input = QC_TRIM.out.reads + + qc_reports = qc_reports.mix( + QC_TRIM.out.fastqc_html, + QC_TRIM.out.fastqc_zip, + QC_TRIM.out.trimgalore_html, + QC_TRIM.out.trimgalore_log, + QC_TRIM.out.trimgalore_zip) + + // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA-MEM + + MAPPING( + ('bamqc' in skip_qc), + ('samtools' in skip_qc), + bwa, + fai, + fasta, + reads_input, + save_bam_mapped, + step, + target_bed) + + bam_mapped = MAPPING.out.bam + bam_mapped_qc = MAPPING.out.qc + + qc_reports = qc_reports.mix(bam_mapped_qc) + + // STEP 2: MARKING DUPLICATES + + MARKDUPLICATES( + bam_mapped, + step) + + bam_markduplicates = MARKDUPLICATES.out.bam + + if (step == 'preparerecalibration') bam_markduplicates = input_sample + + // STEP 3: CREATING RECALIBRATION TABLES + + PREPARE_RECALIBRATION( + bam_markduplicates, + dbsnp, + dbsnp_tbi, + dict, + fai, + fasta, + intervals, + known_indels, + known_indels_tbi, + step) + + table_bqsr = PREPARE_RECALIBRATION.out.table_bqsr + + // STEP 4: RECALIBRATING + bam_applybqsr = bam_markduplicates.join(table_bqsr) + + if (step == 'recalibrate') bam_applybqsr = input_sample + + RECALIBRATE( + ('bamqc' in skip_qc), + ('samtools' in skip_qc), + bam_applybqsr, + dict, + fai, + fasta, + intervals, + step, + target_bed) + + bam_recalibrated = RECALIBRATE.out.bam + bam_recalibrated_qc = RECALIBRATE.out.qc + + 
qc_reports = qc_reports.mix(bam_recalibrated_qc) + + bam_variant_calling = bam_recalibrated + + if (step == 'variantcalling') bam_variant_calling = input_sample + + /* + -------------------------------------------------------------------------------- + GERMLINE VARIANT CALLING + -------------------------------------------------------------------------------- + */ + + GERMLINE_VARIANT_CALLING( + bam_variant_calling, + dbsnp, + dbsnp_tbi, + dict, + fai, + fasta, + intervals, + target_bed, + tools) + + /* + -------------------------------------------------------------------------------- + SOMATIC VARIANT CALLING + -------------------------------------------------------------------------------- + */ + + /* + -------------------------------------------------------------------------------- + ANNOTATION + -------------------------------------------------------------------------------- + */ + + + /* + -------------------------------------------------------------------------------- + MultiQC + -------------------------------------------------------------------------------- + */ + + // GET_SOFTWARE_VERSIONS() + + MULTIQC( + // GET_SOFTWARE_VERSIONS.out.yml, + multiqc_config, + multiqc_custom_config.ifEmpty([]), + workflow_summary, + qc_reports.collect()) } -singleBamSentieon = singleBamSentieon.dump(tag:'Single BAM') - -// STEP 1.5: MERGING BAM FROM MULTIPLE LANES - -multipleBam = multipleBam.mix(multipleBamSentieon) - -process MergeBamMapped { - label 'cpus_8' - tag "${idPatient}-${idSample}" - - input: - set idPatient, idSample, idRun, file(bam) from multipleBam - - output: - set idPatient, idSample, file("${idSample}.bam") into bam_mapped_merged +/* +-------------------------------------------------------------------------------- + SEND COMPLETION EMAIL +-------------------------------------------------------------------------------- + */ - script: - """ - samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} - """ +workflow.onComplete { + def multiqc_report = [] + Completion.email(workflow, params, summary, run_name, projectDir, multiqc_report, log) + Completion.summary(workflow, params, log) } -bam_mapped_merged = bam_mapped_merged.dump(tag:'Merged BAM') - -bam_mapped_merged = bam_mapped_merged.mix(singleBam,singleBamSentieon) - -(bam_mapped_merged, bam_sentieon_mapped_merged) = bam_mapped_merged.into(2) +// /* +// -------------------------------------------------------------------------------- +// GERMLINE VARIANT CALLING +// -------------------------------------------------------------------------------- +// */ -if (!params.sentieon) bam_sentieon_mapped_merged.close() -else bam_mapped_merged.close() +// // STEP MANTA.1 - SINGLE MODE -bam_mapped_merged = bam_mapped_merged.dump(tag:'BAMs for MD') -bam_sentieon_mapped_merged = bam_sentieon_mapped_merged.dump(tag:'Sentieon BAMs to Index') +// process MantaSingle { +// label 'cpus_max' +// label 'memory_max' -process IndexBamMergedForSentieon { - label 'cpus_8' +// tag "${idSample}" - tag "${idPatient}-${idSample}" +// publishDir "${params.outdir}/VariantCalling/${idSample}/Manta", mode: params.publish_dir_mode - input: - set idPatient, idSample, file("${idSample}.bam") from bam_sentieon_mapped_merged +// input: +// set idPatient, idSample, file(bam), file(bai) from bamMantaSingle +// file(fasta) from fasta +// file(fastaFai) from fai +// file(targetBED) from ch_target_bed - output: - set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bam_sentieon_mapped_merged_indexed +// output: +// set val("Manta"), 
idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfMantaSingle - script: - """ - samtools index ${idSample}.bam - """ -} - -(bam_mapped_merged, bam_mapped_merged_to_index) = bam_mapped_merged.into(2) - -process IndexBamFile { - label 'cpus_8' - - tag "${idPatient}-${idSample}" +// when: 'manta' in tools - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (save_bam_mapped) "Preprocessing/${idSample}/Mapped/${it}" - else null - } +// script: +// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" +// options = params.target_bed ? "--exome --callRegions call_targets.bed.gz" : "" +// status = status_map[idPatient, idSample] +// input_bam = status == 0 ? "--bam" : "--tumorBam" +// vcftype = status == 0 ? "diploid" : "tumor" +// """ +// ${beforeScript} +// configManta.py \ +// ${input_bam} ${bam} \ +// --reference ${fasta} \ +// ${options} \ +// --runDir Manta - input: - set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged_to_index +// python Manta/runWorkflow.py -m local -j ${task.cpus} + +// mv Manta/results/variants/candidateSmallIndels.vcf.gz \ +// Manta_${idSample}.candidateSmallIndels.vcf.gz +// mv Manta/results/variants/candidateSmallIndels.vcf.gz.tbi \ +// Manta_${idSample}.candidateSmallIndels.vcf.gz.tbi +// mv Manta/results/variants/candidateSV.vcf.gz \ +// Manta_${idSample}.candidateSV.vcf.gz +// mv Manta/results/variants/candidateSV.vcf.gz.tbi \ +// Manta_${idSample}.candidateSV.vcf.gz.tbi +// mv Manta/results/variants/${vcftype}SV.vcf.gz \ +// Manta_${idSample}.${vcftype}SV.vcf.gz +// mv Manta/results/variants/${vcftype}SV.vcf.gz.tbi \ +// Manta_${idSample}.${vcftype}SV.vcf.gz.tbi +// """ +// } - output: - set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bam_mapped_merged_indexed - set idPatient, idSample into tsv_bam_indexed +// vcfMantaSingle = vcfMantaSingle.dump(tag:'Single Manta') - when: save_bam_mapped || !(params.known_indels) +// // STEP TIDDIT - script: - """ - samtools index ${idSample}.bam - """ -} - -if (!save_bam_mapped) tsv_bam_indexed.close() +// process TIDDIT { +// tag "${idSample}" -(tsv_bam_indexed, tsv_bam_indexed_sample) = tsv_bam_indexed.into(2) - -// Creating a TSV file to restart from this step -tsv_bam_indexed.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -}.collectFile( - name: 'mapped.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -) +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: { +// if (it == "TIDDIT_${idSample}.vcf") "VariantCalling/${idSample}/TIDDIT/${it}" +// else "Reports/${idSample}/TIDDIT/${it}" +// } -tsv_bam_indexed_sample - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" - ["mapped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -} -// STEP 2: MARKING DUPLICATES - -process MarkDuplicates { - label 'cpus_16' - - tag "${idPatient}-${idSample}" - - 
publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" - else "Preprocessing/${idSample}/DuplicatesMarked/${it}" - } - - input: - set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged - - output: - set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into bam_duplicates_marked - set idPatient, idSample into tsv_bam_duplicates_marked - file ("${idSample}.bam.metrics") optional true into duplicates_marked_report - - when: !(params.skip_markduplicates) - - script: - markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" - metrics = 'markduplicates' in skipQC ? '' : "-M ${idSample}.bam.metrics" - if (params.use_gatk_spark) - """ - gatk --java-options ${markdup_java_options} \ - MarkDuplicatesSpark \ - -I ${idSample}.bam \ - -O ${idSample}.md.bam \ - ${metrics} \ - --tmp-dir . \ - --create-output-bam-index true \ - --spark-master local[${task.cpus}] - """ - else - """ - gatk --java-options ${markdup_java_options} \ - MarkDuplicates \ - --INPUT ${idSample}.bam \ - --METRICS_FILE ${idSample}.bam.metrics \ - --TMP_DIR . \ - --ASSUME_SORT_ORDER coordinate \ - --CREATE_INDEX true \ - --OUTPUT ${idSample}.md.bam - - mv ${idSample}.md.bai ${idSample}.md.bam.bai - """ -} +// input: +// set idPatient, idSample, file(bam), file(bai) from bamTIDDIT +// file(fasta) from fasta +// file(fastaFai) from fai -(tsv_bam_duplicates_marked, tsv_bam_duplicates_marked_sample) = tsv_bam_duplicates_marked.into(2) - -// Creating a TSV file to restart from this step -tsv_bam_duplicates_marked.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -}.collectFile( - name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -) +// output: +// set val("TIDDIT"), idPatient, idSample, file("*.vcf.gz"), file("*.tbi") into vcfTIDDIT +// set file("TIDDIT_${idSample}.old.vcf"), file("TIDDIT_${idSample}.ploidy.tab"), file("TIDDIT_${idSample}.signals.tab"), file("TIDDIT_${idSample}.wig"), file("TIDDIT_${idSample}.gc.wig") into tidditOut -tsv_bam_duplicates_marked_sample - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" - ["duplicates_marked_no_table_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -} +// when: 'tiddit' in tools -if ('markduplicates' in skipQC) duplicates_marked_report.close() +// script: +// """ +// tiddit --sv -o TIDDIT_${idSample} --bam ${bam} --ref ${fasta} -if (step == 'preparerecalibration') bam_duplicates_marked = inputSample +// mv TIDDIT_${idSample}.vcf TIDDIT_${idSample}.old.vcf -bam_duplicates_marked = bam_duplicates_marked.dump(tag:'MD BAM') -duplicates_marked_report = duplicates_marked_report.dump(tag:'MD Report') +// grep -E "#|PASS" TIDDIT_${idSample}.old.vcf > TIDDIT_${idSample}.vcf 
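// [editor's note, not part of the diff] the grep above keeps only the VCF
// header lines (containing "#") and the PASS calls from TIDDIT; the
// unfiltered calls are preserved alongside as TIDDIT_${idSample}.old.vcf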
-if (params.skip_markduplicates) bam_duplicates_marked = bam_mapped_merged_indexed +// bgzip --threads ${task.cpus} -c TIDDIT_${idSample}.vcf > TIDDIT_${idSample}.vcf.gz -(bamMD, bamMDToJoin, bam_duplicates_marked) = bam_duplicates_marked.into(3) +// tabix TIDDIT_${idSample}.vcf.gz +// """ +// } -bamBaseRecalibrator = bamMD.combine(intBaseRecalibrator) +// vcfTIDDIT = vcfTIDDIT.dump(tag:'TIDDIT') -bamBaseRecalibrator = bamBaseRecalibrator.dump(tag:'BAM FOR BASERECALIBRATOR') +// // STEP FREEBAYES SINGLE MODE -// STEP 2': SENTIEON DEDUP +// process FreebayesSingle { +// tag "${idSample}-${intervalBed.baseName}" -process Sentieon_Dedup { - label 'cpus_max' - label 'memory_max' - label 'sentieon' +// label 'cpus_1' - tag "${idPatient}-${idSample}" +// input: +// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamFreebayesSingle +// file(fasta) from fasta +// file(fastaFai) from ch_software_versions_yaml - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "${idSample}_*.txt" && 'sentieon' in skipQC) null - else if (it == "${idSample}_*.txt") "Reports/${idSample}/Sentieon/${it}" - else "Preprocessing/${idSample}/DedupedSentieon/${it}" - } +// output: +// set val("FreeBayes"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfFreebayesSingle + +// when: 'freebayes' in tools + +// script: +// intervalsOptions = params.no_intervals ? "" : "-t ${intervalBed}" +// """ +// freebayes \ +// -f ${fasta} \ +// --min-alternate-fraction 0.1 \ +// --min-mapping-quality 1 \ +// ${intervalsOptions} \ +// ${bam} > ${intervalBed.baseName}_${idSample}.vcf +// """ +// } - input: - set idPatient, idSample, file(bam), file(bai) from bam_sentieon_mapped_merged_indexed - file(fasta) from ch_fasta - file(fastaFai) from ch_fai +// vcfFreebayesSingle = vcfFreebayesSingle.groupTuple(by: [0,1,2]) + +// /* +// -------------------------------------------------------------------------------- +// SOMATIC VARIANT CALLING +// -------------------------------------------------------------------------------- +// */ +// // Ascat, pileup, pileups with no intervals, recalibrated BAMs +// (bamAscat, bamMpileup, bamMpileupNoInt, bamRecalAll) = bamRecalAll.into(4) - output: - set idPatient, idSample, file("${idSample}.deduped.bam"), file("${idSample}.deduped.bam.bai") into bam_sentieon_dedup +// // separate BAM by status +// bamNormal = Channel.create() +// bamTumor = Channel.create() - when: params.sentieon +// bamRecalAll +// .choice(bamTumor, bamNormal) {status_map[it[0], it[1]] == 0 ? 
1 : 0} - script: - """ - sentieon driver \ - -t ${task.cpus} \ - -i ${bam} \ - -r ${fasta} \ - --algo GCBias --summary ${idSample}_gc_summary.txt ${idSample}_gc_metric.txt \ - --algo MeanQualityByCycle ${idSample}_mq_metric.txt \ - --algo QualDistribution ${idSample}_qd_metric.txt \ - --algo InsertSizeMetricAlgo ${idSample}_is_metric.txt \ - --algo AlignmentStat ${idSample}_aln_metric.txt +// // Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling +// // Remapping channel to remove common key idPatient +// pairBam = bamNormal.cross(bamTumor).map { +// normal, tumor -> +// [normal[0], normal[1], normal[2], normal[3], tumor[1], tumor[2], tumor[3]] +// } - sentieon driver \ - -t ${task.cpus} \ - -i ${bam} \ - --algo LocusCollector \ - --fun score_info ${idSample}_score.gz +// pairBam = pairBam.dump(tag:'BAM Somatic Pair') - sentieon driver \ - -t ${task.cpus} \ - -i ${bam} \ - --algo Dedup \ - --rmdup \ - --score_info ${idSample}_score.gz \ - --metrics ${idSample}_dedup_metric.txt ${idSample}.deduped.bam - """ -} +// // Manta, Strelka, Mutect2, MSIsensor +// (pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamMsisensor, pairBamCNVkit, pairBam) = pairBam.into(8) -// STEP 3: CREATING RECALIBRATION TABLES - -process BaseRecalibrator { - label 'cpus_1' - - tag "${idPatient}-${idSample}-${intervalBed.baseName}" - - input: - set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamBaseRecalibrator - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(dict) from ch_dict - file(fastaFai) from ch_fai - file(knownIndels) from ch_known_indels - file(knownIndelsIndex) from ch_known_indels_tbi - - output: - set idPatient, idSample, file("${prefix}${idSample}.recal.table") into tableGatherBQSRReports - set idPatient, idSample into recalTableTSVnoInt - - when: params.known_indels - - script: - dbsnpOptions = params.dbsnp ? "--known-sites ${dbsnp}" : "" - knownOptions = params.known_indels ? knownIndels.collect{"--known-sites ${it}"}.join(' ') : "" - prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - // TODO: --use-original-qualities ??? - """ - gatk --java-options -Xmx${task.memory.toGiga()}g \ - BaseRecalibrator \ - -I ${bam} \ - -O ${prefix}${idSample}.recal.table \ - --tmp-dir . \ - -R ${fasta} \ - ${intervalsOptions} \ - ${dbsnpOptions} \ - ${knownOptions} \ - --verbosity INFO - """ -} +// // Making Pair Bam for Sention -if (!params.no_intervals) tableGatherBQSRReports = tableGatherBQSRReports.groupTuple(by:[0, 1]) +// // separate BAM by status +// bam_sention_normal = Channel.create() +// bam_sentieon_tumor = Channel.create() + +// bam_sentieon_all +// .choice(bam_sentieon_tumor, bam_sention_normal) {status_map[it[0], it[1]] == 0 ? 
1 : 0} + +// // Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling +// // Remapping channel to remove common key idPatient + +// bam_pair_sentieon_TNscope = bam_sention_normal.cross(bam_sentieon_tumor).map { +// normal, tumor -> +// [normal[0], normal[1], normal[2], normal[3], normal[4], tumor[1], tumor[2], tumor[3], tumor[4]] +// } + +// intervalPairBam = pairBam.spread(bedIntervals) -tableGatherBQSRReports = tableGatherBQSRReports.dump(tag:'BQSR REPORTS') +// bamMpileup = bamMpileup.spread(intMpileup) + +// // intervals for Mutect2 calls, FreeBayes and pileups for Mutect2 filtering +// (pairBamMutect2, pairBamFreeBayes, pairBamPileupSummaries) = intervalPairBam.into(3) + +// // STEP FREEBAYES + +// process FreeBayes { +// tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" + +// label 'cpus_1' + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamFreeBayes +// file(fasta) from fasta +// file(fastaFai) from fai + +// output: +// set val("FreeBayes"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into vcfFreeBayes + +// when: 'freebayes' in tools + +// script: +// intervalsOptions = params.no_intervals ? "" : "-t ${intervalBed}" +// """ +// freebayes \ +// -f ${fasta} \ +// --pooled-continuous \ +// --pooled-discrete \ +// --genotype-qualities \ +// --report-genotype-likelihood-max \ +// --allele-balance-priors-off \ +// --min-alternate-fraction 0.03 \ +// --min-repeat-entropy 1 \ +// --min-alternate-count 2 \ +// ${intervalsOptions} \ +// ${bamTumor} \ +// ${bamNormal} > ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf +// """ +// } + +// vcfFreeBayes = vcfFreeBayes.groupTuple(by:[0,1,2]) + +// // STEP GATK MUTECT2.1 - RAW CALLS + +// process Mutect2 { +// tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" + +// label 'cpus_1' + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamMutect2 +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(germlineResource) from germline_resource +// file(germlineResourceIndex) from germline_resource_tbi +// file(intervals) from intervals +// file(pon) from pon +// file(ponIndex) from pon_tbi + +// output: +// set val("Mutect2"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into mutect2Output +// set idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf.stats") optional true into intervalStatsFiles +// set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf.stats"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") optional true into mutect2Stats + +// when: 'mutect2' in tools + +// script: +// // please make a panel-of-normals, using at least 40 samples +// // https://gatkforums.broadinstitute.org/gatk/discussion/11136/how-to-call-somatic-mutations-using-gatk4-mutect2 +// PON = params.pon ? "--panel-of-normals ${pon}" : "" +// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" +// softClippedOption = params.ignore_soft_clipped_bases ? 
"--dont-use-soft-clipped-bases true" : "" +// """ +// # Get raw calls +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// Mutect2 \ +// -R ${fasta}\ +// -I ${bamTumor} -tumor ${idSampleTumor} \ +// -I ${bamNormal} -normal ${idSampleNormal} \ +// ${intervalsOptions} \ +// ${softClippedOption} \ +// --germline-resource ${germlineResource} \ +// ${PON} \ +// -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf +// """ +// } + +// mutect2Output = mutect2Output.groupTuple(by:[0,1,2]) +// mutect2Stats = mutect2Stats.groupTuple(by:[0,1]) + +// // STEP GATK MUTECT2.2 - MERGING STATS + +// process MergeMutect2Stats { +// tag "${idSamplePair}" + +// publishDir "${params.outdir}/VariantCalling/${idSamplePair}/Mutect2", mode: params.publish_dir_mode + +// input: +// set idPatient, idSamplePair, file(statsFiles), file(vcf) from mutect2Stats // Actual stats files and corresponding VCF chunks +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(germlineResource) from germline_resource +// file(germlineResourceIndex) from germline_resource_tbi +// file(intervals) from intervals + +// output: +// set idPatient, idSamplePair, file("${idSamplePair}.vcf.gz.stats") into mergedStatsFile + +// when: 'mutect2' in tools + +// script: +// stats = statsFiles.collect{ "-stats ${it} " }.join(' ') +// """ +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// MergeMutectStats \ +// ${stats} \ +// -O ${idSamplePair}.vcf.gz.stats +// """ +// } + +// // we are merging the VCFs that are called separatelly for different intervals +// // so we can have a single sorted VCF containing all the calls for a given caller + +// // STEP MERGING VCF - FREEBAYES & GATK HAPLOTYPECALLER + +// vcfConcatenateVCFs = vcfFreeBayes.mix(vcfFreebayesSingle, vcfGenotypeGVCFs, gvcfHaplotypeCaller) +// vcfConcatenateVCFs = vcfConcatenateVCFs.dump(tag:'VCF to merge') + +// process ConcatVCF { +// label 'cpus_8' + +// tag "${variantCaller}-${idSample}" + +// publishDir "${params.outdir}/VariantCalling/${idSample}/${"$variantCaller"}", mode: params.publish_dir_mode + +// input: +// set variantCaller, idPatient, idSample, file(vcf) from vcfConcatenateVCFs +// file(fastaFai) from fai +// file(targetBED) from ch_target_bed -if (params.no_intervals) { - (tableGatherBQSRReports, tableGatherBQSRReportsNoInt) = tableGatherBQSRReports.into(2) - recalTable = tableGatherBQSRReportsNoInt -} else recalTableTSVnoInt.close() +// output: +// // we have this funny *_* pattern to avoid copying the raw calls to publishdir +// set variantCaller, idPatient, idSample, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated + +// when: ('haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools) + +// script: +// if (variantCaller == 'HaplotypeCallerGVCF') +// outputFile = "HaplotypeCaller_${idSample}.g.vcf" +// else +// outputFile = "${variantCaller}_${idSample}.vcf" +// options = params.target_bed ? "-t ${targetBED}" : "" +// intervalsOptions = params.no_intervals ? 
"-n" : "" +// """ +// concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions} +// """ +// } -// STEP 3.5: MERGING RECALIBRATION TABLES +// vcfConcatenated = vcfConcatenated.dump(tag:'VCF') -process GatherBQSRReports { - label 'memory_singleCPU_2_task' - label 'cpus_2' +// // STEP MERGING VCF - GATK MUTECT2 (UNFILTERED) - tag "${idPatient}-${idSample}" +// mutect2Output = mutect2Output.dump(tag:'Mutect2 output VCF to merge') + +// process ConcatVCF_Mutect2 { +// label 'cpus_8' + +// tag "${idSample}" - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "${idSample}.recal.table" && !params.skip_markduplicates) "Preprocessing/${idSample}/DuplicatesMarked/${it}" - else "Preprocessing/${idSample}/Mapped/${it}" - } +// publishDir "${params.outdir}/VariantCalling/${idSample}/Mutect2", mode: params.publish_dir_mode - input: - set idPatient, idSample, file(recal) from tableGatherBQSRReports +// input: +// set variantCaller, idPatient, idSample, file(vcf) from mutect2Output +// file(fastaFai) from fai +// file(targetBED) from ch_target_bed - output: - set idPatient, idSample, file("${idSample}.recal.table") into recalTable - file("${idSample}.recal.table") into baseRecalibratorReport - set idPatient, idSample into recalTableTSV +// output: +// // we have this funny *_* pattern to avoid copying the raw calls to publishdir +// set variantCaller, idPatient, idSample, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenatedForFilter - when: !(params.no_intervals) +// when: ('haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools) - script: - input = recal.collect{"-I ${it}"}.join(' ') - """ - gatk --java-options -Xmx${task.memory.toGiga()}g \ - GatherBQSRReports \ - ${input} \ - -O ${idSample}.recal.table \ - """ -} +// script: +// outputFile = "Mutect2_unfiltered_${idSample}.vcf" +// options = params.target_bed ? "-t ${targetBED}" : "" +// intervalsOptions = params.no_intervals ? 
"-n" : "" +// """ +// concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions} +// """ +// } -if ('baserecalibrator' in skipQC) baseRecalibratorReport.close() - -recalTable = recalTable.dump(tag:'RECAL TABLE') - -(recalTableTSV, recalTableSampleTSV) = recalTableTSV.mix(recalTableTSVnoInt).into(2) - -// Create TSV files to restart from this step -if (params.skip_markduplicates) { - recalTableTSV.map { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" - recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" - }.collectFile( - name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" - ) - - recalTableSampleTSV - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { - idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" - recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" - ["mapped_no_duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] - } -} else { - recalTableTSV.map { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" - recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" - - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" - }.collectFile( - name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" - ) - - recalTableSampleTSV - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { - idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" - recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" - ["duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] - } -} +// vcfConcatenatedForFilter = vcfConcatenatedForFilter.dump(tag:'Mutect2 unfiltered VCF') -bamApplyBQSR = bamMDToJoin.join(recalTable, by:[0,1]) +// // STEP GATK MUTECT2.3 - GENERATING PILEUP SUMMARIES -if (step == 'recalibrate') bamApplyBQSR = inputSample +// pairBamPileupSummaries = pairBamPileupSummaries.map{ +// idPatient, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor, intervalBed -> +// [idPatient, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor, intervalBed] +// }.join(intervalStatsFiles, by:[0,1,2]) -bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE') +// process PileupSummariesForMutect2 { +// tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" -bamApplyBQSR = 
bamApplyBQSR.combine(intApplyBQSR) +// label 'cpus_1' -bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE + INT') +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(intervalBed), file(statsFile) from pairBamPileupSummaries +// file(germlineResource) from germline_resource +// file(germlineResourceIndex) from germline_resource_tbi -// STEP 4: RECALIBRATING +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table") into pileupSummaries -process ApplyBQSR { - label 'memory_singleCPU_2_task' - label 'cpus_2' +// when: 'mutect2' in tools - tag "${idPatient}-${idSample}-${intervalBed.baseName}" +// script: +// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" +// """ +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// GetPileupSummaries \ +// -I ${bamTumor} \ +// -V ${germlineResource} \ +// ${intervalsOptions} \ +// -O ${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table +// """ +// } - input: - set idPatient, idSample, file(bam), file(bai), file(recalibrationReport), file(intervalBed) from bamApplyBQSR - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai +// pileupSummaries = pileupSummaries.groupTuple(by:[0,1,2]) - output: - set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bam_recalibrated_to_merge +// // STEP GATK MUTECT2.4 - MERGING PILEUP SUMMARIES - script: - prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - """ - gatk --java-options -Xmx${task.memory.toGiga()}g \ - ApplyBQSR \ - -R ${fasta} \ - --input ${bam} \ - --output ${prefix}${idSample}.recal.bam \ - ${intervalsOptions} \ - --bqsr-recal-file ${recalibrationReport} - """ -} +// process MergePileupSummaries { +// label 'cpus_1' -(bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) - -// STEP 4': SENTIEON BQSR - -bam_sentieon_dedup = bam_sentieon_dedup.dump(tag:'deduped.bam') - -process Sentieon_BQSR { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag "${idPatient}-${idSample}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "${idSample}_recal_result.csv" && 'sentieon' in skipQC) "Reports/${idSample}/Sentieon/${it}" - else "Preprocessing/${idSample}/RecalSentieon/${it}" - } - - input: - set idPatient, idSample, file(bam), file(bai) from bam_sentieon_dedup - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(dict) from ch_dict - file(fastaFai) from ch_fai - file(knownIndels) from ch_known_indels - file(knownIndelsIndex) from ch_known_indels_tbi - - output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_sentieon_recal - set idPatient, idSample, file(bam), file(bai), file("${idSample}.recal.table") into bam_sentieon_deduped_table - set idPatient, idSample into tsv_sentieon - - when: params.sentieon - - script: - known = knownIndels.collect{"--known-sites ${it}"}.join(' ') - """ - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${idSample}.deduped.bam \ - --algo QualCal \ - -k ${dbsnp} \ - ${idSample}.recal.table - - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${idSample}.deduped.bam \ - -q ${idSample}.recal.table \ - --algo QualCal \ - -k ${dbsnp} \ - ${idSample}.table.post 
\ - --algo ReadWriter ${idSample}.recal.bam - - sentieon driver \ - -t ${task.cpus} \ - --algo QualCal \ - --plot \ - --before ${idSample}.recal.table \ - --after ${idSample}.table.post \ - ${idSample}_recal_result.csv - """ -} +// tag "${idPatient}_${idSampleTumor}" -(tsv_sentieon_deduped, tsv_sentieon_deduped_sample, tsv_sentieon_recal, tsv_sentieon_recal_sample) = tsv_sentieon.into(4) - -// Creating a TSV file to restart from this step -tsv_sentieon_deduped.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" - table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n" -}.collectFile( - name: 'sentieon_deduped.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -) +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Mutect2", mode: params.publish_dir_mode -tsv_sentieon_deduped_sample - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" - table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" - ["sentieon_deduped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n"] -} +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(pileupSums) from pileupSummaries +// file(dict) from dict -// Creating a TSV file to restart from this step -tsv_sentieon_recal.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -}.collectFile( - name: 'sentieon_recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -) +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}_pileupsummaries.table") into mergedPileupFile -tsv_sentieon_recal_sample - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" - ["sentieon_recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -} +// when: 'mutect2' in tools -// STEP 4.5: MERGING THE RECALIBRATED BAM FILES +// script: +// allPileups = pileupSums.collect{ "-I ${it} " }.join(' ') +// """ +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// GatherPileupSummaries \ +// --sequence-dictionary ${dict} \ +// ${allPileups} \ +// -O ${idSampleTumor}_pileupsummaries.table +// """ +// } -process MergeBamRecal { - label 'cpus_8' +// // STEP GATK MUTECT2.5 - CALCULATING CONTAMINATION - tag "${idPatient}-${idSample}" +// pairBamCalculateContamination = 
pairBamCalculateContamination.map{ +// idPatient, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor -> +// [idPatient, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor] +// }.join(mergedPileupFile, by:[0,1,2]) + +// process CalculateContamination { +// label 'cpus_1' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Mutect2", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(mergedPileup) from pairBamCalculateContamination + +// output: +// set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${idSampleTumor}_contamination.table") into contaminationTable + +// when: 'mutect2' in tools + +// script: +// """ +// # calculate contamination +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// CalculateContamination \ +// -I ${idSampleTumor}_pileupsummaries.table \ +// -O ${idSampleTumor}_contamination.table +// """ +// } - publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode +// // STEP GATK MUTECT2.6 - FILTERING CALLS + +// mutect2CallsToFilter = vcfConcatenatedForFilter.map{ +// variantCaller, idPatient, idSamplePair, vcf, tbi -> +// [idPatient, idSamplePair, vcf, tbi] +// }.join(mergedStatsFile, by:[0,1]).join(contaminationTable, by:[0,1]) + +// process FilterMutect2Calls { +// label 'cpus_1' - input: - set idPatient, idSample, file(bam) from bam_recalibrated_to_merge +// tag "${idSamplePair}" + +// publishDir "${params.outdir}/VariantCalling/${idSamplePair}/Mutect2", mode: params.publish_dir_mode + +// input: +// set idPatient, idSamplePair, file(unfiltered), file(unfilteredIndex), file(stats), file(contaminationTable) from mutect2CallsToFilter +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(germlineResource) from germline_resource +// file(germlineResourceIndex) from germline_resource_tbi +// file(intervals) from intervals + +// output: +// set val("Mutect2"), idPatient, idSamplePair, file("Mutect2_filtered_${idSamplePair}.vcf.gz"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.tbi"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.filteringStats.tsv") into filteredMutect2Output + +// when: 'mutect2' in tools + +// script: +// """ +// # do the actual filtering +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// FilterMutectCalls \ +// -V ${unfiltered} \ +// --contamination-table ${contaminationTable} \ +// --stats ${stats} \ +// -R ${fasta} \ +// -O Mutect2_filtered_${idSamplePair}.vcf.gz +// """ +// } + +// // STEP SENTIEON TNSCOPE + +// process Sentieon_TNscope { +// label 'cpus_max' +// label 'memory_max' +// label 'sentieon' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), file(recalNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(recalTumor) from bam_pair_sentieon_TNscope +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(dbsnp) from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(pon) from pon +// file(ponIndex) from pon_tbi + +// output: +// set val("SentieonTNscope"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf") into vcf_sentieon_TNscope + +// when: 'tnscope' in tools && params.sentieon + +// script: +// PON = params.pon ? 
"--pon ${pon}" : "" +// """ +// sentieon driver \ +// -t ${task.cpus} \ +// -r ${fasta} \ +// -i ${bamTumor} \ +// -q ${recalTumor} \ +// -i ${bamNormal} \ +// -q ${recalNormal} \ +// --algo TNscope \ +// --tumor_sample ${idSampleTumor} \ +// --normal_sample ${idSampleNormal} \ +// --dbsnp ${dbsnp} \ +// ${PON} \ +// TNscope_${idSampleTumor}_vs_${idSampleNormal}.vcf +// """ +// } + +// vcf_sentieon_TNscope = vcf_sentieon_TNscope.dump(tag:'Sentieon TNscope') + +// vcf_sentieon = vcf_sentieon_DNAseq.mix(vcf_sentieon_DNAscope, vcf_sentieon_DNAscope_SV, vcf_sentieon_TNscope) + +// process CompressSentieonVCF { +// tag "${idSample} - ${vcf}" + +// publishDir "${params.outdir}/VariantCalling/${idSample}/${variantCaller}", mode: params.publish_dir_mode + +// input: +// set variantCaller, idPatient, idSample, file(vcf) from vcf_sentieon + +// output: +// set variantCaller, idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcf_sentieon_compressed + +// script: +// """ +// bgzip < ${vcf} > ${vcf}.gz +// tabix ${vcf}.gz +// """ +// } + +// vcf_sentieon_compressed = vcf_sentieon_compressed.dump(tag:'Sentieon VCF indexed') + +// // STEP STRELKA.2 - SOMATIC PAIR + +// process Strelka { +// label 'cpus_max' +// label 'memory_max' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Strelka", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamStrelka +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(targetBED) from ch_target_bed + +// output: +// set val("Strelka"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelka + +// when: 'strelka' in tools + +// script: +// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" +// options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" +// """ +// ${beforeScript} +// configureStrelkaSomaticWorkflow.py \ +// --tumor ${bamTumor} \ +// --normal ${bamNormal} \ +// --referenceFasta ${fasta} \ +// ${options} \ +// --runDir Strelka + +// python Strelka/runWorkflow.py -m local -j ${task.cpus} + +// mv Strelka/results/variants/somatic.indels.vcf.gz \ +// Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz +// mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ +// Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi +// mv Strelka/results/variants/somatic.snvs.vcf.gz \ +// Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz +// mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ +// Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi +// """ +// } + +// vcfStrelka = vcfStrelka.dump(tag:'Strelka') + +// // STEP MANTA.2 - SOMATIC PAIR + +// process Manta { +// label 'cpus_max' +// label 'memory_max' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Manta", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamManta +// file(fasta) from fasta +// file(fastaFai) from fai +// file(targetBED) from ch_target_bed + +// output: +// set val("Manta"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfManta +// set idPatient, idSampleNormal, idSampleTumor, file("*.candidateSmallIndels.vcf.gz"), file("*.candidateSmallIndels.vcf.gz.tbi") into mantaToStrelka + +// when: 'manta' in tools + +// script: +// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" +// options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" +// """ +// ${beforeScript} +// configManta.py \ +// --normalBam ${bamNormal} \ +// --tumorBam ${bamTumor} \ +// --reference ${fasta} \ +// ${options} \ +// --runDir Manta + +// python Manta/runWorkflow.py -m local -j ${task.cpus} + +// mv Manta/results/variants/candidateSmallIndels.vcf.gz \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSmallIndels.vcf.gz +// mv Manta/results/variants/candidateSmallIndels.vcf.gz.tbi \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSmallIndels.vcf.gz.tbi +// mv Manta/results/variants/candidateSV.vcf.gz \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSV.vcf.gz +// mv Manta/results/variants/candidateSV.vcf.gz.tbi \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSV.vcf.gz.tbi +// mv Manta/results/variants/diploidSV.vcf.gz \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.diploidSV.vcf.gz +// mv Manta/results/variants/diploidSV.vcf.gz.tbi \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.diploidSV.vcf.gz.tbi +// mv Manta/results/variants/somaticSV.vcf.gz \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.somaticSV.vcf.gz +// mv Manta/results/variants/somaticSV.vcf.gz.tbi \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.somaticSV.vcf.gz.tbi +// """ +// } + +// vcfManta = vcfManta.dump(tag:'Manta') + +// // Remmaping channels to match input for StrelkaBP +// pairBamStrelkaBP = pairBamStrelkaBP.map { +// idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor -> +// [idPatientNormal, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor] +// }.join(mantaToStrelka, by:[0,1,2]).map { +// idPatientNormal, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor, mantaCSI, mantaCSIi -> +// [idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor, mantaCSI, mantaCSIi] +// } + +// // STEP STRELKA.3 - SOMATIC PAIR - BEST PRACTICES + +// process StrelkaBP { +// label 'cpus_max' +// label 'memory_max' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Strelka", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(mantaCSI), file(mantaCSIi) from pairBamStrelkaBP +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(targetBED) from ch_target_bed + +// output: +// set val("Strelka"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelkaBP + +// when: 'strelka' in tools && 'manta' in tools && !params.no_strelka_bp + +// script: +// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" +// options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" +// """ +// ${beforeScript} +// configureStrelkaSomaticWorkflow.py \ +// --tumor ${bamTumor} \ +// --normal ${bamNormal} \ +// --referenceFasta ${fasta} \ +// --indelCandidates ${mantaCSI} \ +// ${options} \ +// --runDir Strelka + +// python Strelka/runWorkflow.py -m local -j ${task.cpus} + +// mv Strelka/results/variants/somatic.indels.vcf.gz \ +// StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz +// mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ +// StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi +// mv Strelka/results/variants/somatic.snvs.vcf.gz \ +// StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz +// mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ +// StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi +// """ +// } + +// vcfStrelkaBP = vcfStrelkaBP.dump(tag:'Strelka BP') + +// // STEP CNVkit + +// process CNVkit { +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/CNVkit", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamCNVkit +// file(targetBED) from ch_target_bed +// file(fasta) from fasta + +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}*"), file("${idSampleNormal}*") into cnvkitOut + +// when: 'cnvkit' in tools && params.target_bed + +// script: +// """ +// cnvkit.py \ +// batch \ +// ${bamTumor} \ +// --normal ${bamNormal} \ +// --targets ${targetBED} \ +// --fasta ${fasta} \ +// --output-reference output_reference.cnn \ +// --output-dir ./ \ +// --diagram \ +// --scatter +// """ +// } + +// // STEP MSISENSOR.1 - SCAN + +// // Scan reference genome for microsatellites +// process MSIsensor_scan { +// label 'cpus_1' +// label 'memory_max' - output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated - set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_qc - set idPatient, idSample into tsv_bam_recalibrated +// tag "${fasta}" - when: !(params.no_intervals) +// input: +// file(fasta) from fasta +// file(fastaFai) from fai - script: - """ - samtools merge --threads ${task.cpus} ${idSample}.recal.bam ${bam} - samtools index ${idSample}.recal.bam - """ -} +// output: +// file "microsatellites.list" into msi_scan_ch -// STEP 4.5': INDEXING THE RECALIBRATED BAM FILES +// when: 'msisensor' in tools -process IndexBamRecal { - label 'cpus_8' +// script: +// """ +// msisensor scan -d ${fasta} -o microsatellites.list +// """ +// } - tag "${idPatient}-${idSample}" +// // STEP MSISENSOR.2 - SCORE - publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode +// // Score the normal vs somatic pair of bams - input: - set idPatient, idSample, file("${idSample}.recal.bam") from bam_recalibrated_to_index +// process MSIsensor_msi { +// label 'cpus_4' +// label 'memory_max' - output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated_indexed - set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_no_int_qc - set idPatient, idSample into tsv_bam_recalibrated_no_int +// tag "${idSampleTumor}_vs_${idSampleNormal}" - when: params.no_intervals +// publishDir 
"${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/MSIsensor", mode: params.publish_dir_mode - script: - """ - samtools index ${idSample}.recal.bam - """ -} +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamMsisensor +// file msiSites from msi_scan_ch -bam_recalibrated = bam_recalibrated.mix(bam_recalibrated_indexed) -bam_recalibrated_qc = bam_recalibrated_qc.mix(bam_recalibrated_no_int_qc) -tsv_bam_recalibrated = tsv_bam_recalibrated.mix(tsv_bam_recalibrated_no_int) - -(bam_recalibrated_bamqc, bam_recalibrated_samtools_stats) = bam_recalibrated_qc.into(2) -(tsv_bam_recalibrated, tsv_bam_recalibrated_sample) = tsv_bam_recalibrated.into(2) - -// Creating a TSV file to restart from this step -tsv_bam_recalibrated.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -}.collectFile( - name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -) +// output: +// set val("Msisensor"), idPatient, file("${idSampleTumor}_vs_${idSampleNormal}_msisensor"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_dis"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_germline"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_somatic") into msisensor_out_ch -tsv_bam_recalibrated_sample - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { - idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" - ["recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -} +// when: 'msisensor' in tools -// STEP 5: QC +// script: +// """ +// msisensor msi -d ${msiSites} \ +// -b 4 \ +// -n ${bamNormal} \ +// -t ${bamTumor} \ +// -o ${idSampleTumor}_vs_${idSampleNormal}_msisensor +// """ +// } -process SamtoolsStats { - label 'cpus_2' +// // STEP ASCAT.1 - ALLELECOUNTER - tag "${idPatient}-${idSample}" +// // Run commands and code from Malin Larsson +// // Based on Jesper Eisfeldt's code +// process AlleleCounter { +// label 'memory_singleCPU_2_task' - publishDir "${params.outdir}/Reports/${idSample}/SamToolsStats", mode: params.publish_dir_mode +// tag "${idSample}" - input: - set idPatient, idSample, file(bam) from bam_recalibrated_samtools_stats +// input: +// set idPatient, idSample, file(bam), file(bai) from bamAscat +// file(acLoci) from loci +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai - output: - file ("${bam}.samtools.stats.out") into samtoolsStatsReport +// output: +// set idPatient, idSample, file("${idSample}.alleleCount") into alleleCounterOut - when: !('samtools' in skipQC) +// when: 'ascat' in tools - script: - """ - samtools stats ${bam} > ${bam}.samtools.stats.out - """ -} +// script: +// """ +// alleleCounter \ +// -l ${acLoci} \ +// -r ${fasta} \ +// -b ${bam} \ +// -o ${idSample}.alleleCount; +// """ +// } -samtoolsStatsReport = samtoolsStatsReport.dump(tag:'SAMTools') +// alleleCountOutNormal = Channel.create() +// alleleCountOutTumor = Channel.create() 
-bamBamQC = bamMappedBamQC.mix(bam_recalibrated_bamqc) +// alleleCounterOut +// .choice(alleleCountOutTumor, alleleCountOutNormal) {status_map[it[0], it[1]] == 0 ? 1 : 0} -process BamQC { - label 'memory_max' - label 'cpus_16' +// alleleCounterOut = alleleCountOutNormal.combine(alleleCountOutTumor, by:0) - tag "${idPatient}-${idSample}" +// alleleCounterOut = alleleCounterOut.map { +// idPatientNormal, idSampleNormal, alleleCountOutNormal, +// idSampleTumor, alleleCountOutTumor -> +// [idPatientNormal, idSampleNormal, idSampleTumor, alleleCountOutNormal, alleleCountOutTumor] +// } - publishDir "${params.outdir}/Reports/${idSample}/bamQC", mode: params.publish_dir_mode +// // STEP ASCAT.2 - CONVERTALLELECOUNTS - input: - set idPatient, idSample, file(bam) from bamBamQC - file(targetBED) from ch_target_bed +// // R script from Malin Larssons bitbucket repo: +// // https://bitbucket.org/malinlarsson/somatic_wgs_pipeline +// process ConvertAlleleCounts { +// label 'memory_singleCPU_2_task' - output: - file("${bam.baseName}") into bamQCReport +// tag "${idSampleTumor}_vs_${idSampleNormal}" - when: !('bamqc' in skipQC) +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/ASCAT", mode: params.publish_dir_mode - script: - use_bed = params.target_bed ? "-gff ${targetBED}" : '' - """ - qualimap --java-mem-size=${task.memory.toGiga()}G \ - bamqc \ - -bam ${bam} \ - --paint-chromosome-limits \ - --genome-gc-distr HUMAN \ - $use_bed \ - -nt ${task.cpus} \ - -skip-duplicated \ - --skip-dup-mode 0 \ - -outdir ${bam.baseName} \ - -outformat HTML - """ -} +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(alleleCountNormal), file(alleleCountTumor) from alleleCounterOut -bamQCReport = bamQCReport.dump(tag:'BamQC') +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${idSampleNormal}.BAF"), file("${idSampleNormal}.LogR"), file("${idSampleTumor}.BAF"), file("${idSampleTumor}.LogR") into convertAlleleCountsOut -/* -================================================================================ - GERMLINE VARIANT CALLING -================================================================================ -*/ +// when: 'ascat' in tools + +// script: +// gender = gender_map[idPatient] +// """ +// convertAlleleCounts.r ${idSampleTumor} ${alleleCountTumor} ${idSampleNormal} ${alleleCountNormal} ${gender} +// """ +// } + +// // STEP ASCAT.3 - ASCAT + +// // R scripts from Malin Larssons bitbucket repo: +// // https://bitbucket.org/malinlarsson/somatic_wgs_pipeline +// process Ascat { +// label 'memory_singleCPU_2_task' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/ASCAT", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(bafNormal), file(logrNormal), file(bafTumor), file(logrTumor) from convertAlleleCountsOut +// file(acLociGC) from loci_gc + +// output: +// set val("ASCAT"), idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.*.{png,txt}") into ascatOut + +// when: 'ascat' in tools + +// script: +// gender = gender_map[idPatient] +// purity_ploidy = (params.ascat_purity && params.ascat_ploidy) ? 
"--purity ${params.ascat_purity} --ploidy ${params.ascat_ploidy}" : "" +// """ +// for f in *BAF *LogR; do sed 's/chr//g' \$f > tmpFile; mv tmpFile \$f;done +// run_ascat.r \ +// --tumorbaf ${bafTumor} \ +// --tumorlogr ${logrTumor} \ +// --normalbaf ${bafNormal} \ +// --normallogr ${logrNormal} \ +// --tumorname ${idSampleTumor} \ +// --basedir ${$projectDir} \ +// --gcfile ${acLociGC} \ +// --gender ${gender} \ +// ${purity_ploidy} +// """ +// } -// When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal -if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal +// ascatOut.dump(tag:'ASCAT') -// When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked -if (!params.known_indels && step in ['mapping', 'preparerecalibration']) bam_recalibrated = bam_duplicates_marked +// // STEP MPILEUP.1 -// When starting with variant calling, Channel bam_recalibrated is inputSample -if (step == 'variantcalling') bam_recalibrated = inputSample +// process Mpileup { +// label 'cpus_1' +// label 'memory_singleCPU_2_task' -bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling') +// tag "${idSample}-${intervalBed.baseName}" -// Here we have a recalibrated bam set -// The TSV file is formatted like: "idPatient status idSample bamFile baiFile" -// Manta will be run in Germline mode, or in Tumor mode depending on status -// HaplotypeCaller, TIDDIT and Strelka will be run for Normal and Tumor samples +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { it == "${idSample}.pileup" ? "VariantCalling/${idSample}/Control-FREEC/${it}" : null } -(bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(6) +// input: +// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamMpileup +// file(fasta) from fasta +// file(fastaFai) from fai -(bam_sentieon_DNAseq, bam_sentieon_DNAscope, bam_sentieon_all) = bam_sentieon_deduped_table.into(3) +// output: +// set idPatient, idSample, file("${prefix}${idSample}.pileup") into mpileupMerge +// set idPatient, idSample into tsv_mpileup -// To speed Variant Callers up we are chopping the reference into smaller pieces -// Do variant calling by this intervals, and re-merge the VCFs +// when: 'controlfreec' in tools || 'mpileup' in tools -bamHaplotypeCaller = bamHaplotypeCallerNoIntervals.combine(intHaplotypeCaller) -bamFreebayesSingle = bamFreebayesSingleNoIntervals.combine(intFreebayesSingle) +// script: +// prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" +// intervalsOptions = params.no_intervals ? "" : "-l ${intervalBed}" -// STEP GATK HAPLOTYPECALLER.1 +// """ +// # Control-FREEC reads uncompresses the zipped file TWICE in single-threaded mode. 
+// # we are therefore not using compressed pileups here +// samtools mpileup \ +// -f ${fasta} ${bam} \ +// ${intervalsOptions} > ${prefix}${idSample}.pileup +// """ +// } -process HaplotypeCaller { - label 'memory_singleCPU_task_sq' - label 'cpus_2' +// (tsv_mpileup, tsv_mpileup_sample) = tsv_mpileup.groupTuple(by:[0, 1]).into(2) - tag "${idSample}-${intervalBed.baseName}" - - input: - set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamHaplotypeCaller - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set val("HaplotypeCallerGVCF"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfHaplotypeCaller - set idPatient, idSample, file(intervalBed), file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfGenotypeGVCFs - - when: 'haplotypecaller' in tools - - script: - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ - HaplotypeCaller \ - -R ${fasta} \ - -I ${bam} \ - ${intervalsOptions} \ - ${dbsnpOptions} \ - -O ${intervalBed.baseName}_${idSample}.g.vcf \ - -ERC GVCF - """ -} - -gvcfHaplotypeCaller = gvcfHaplotypeCaller.groupTuple(by:[0, 1, 2]) - -if (!params.generate_gvcf) gvcfHaplotypeCaller.close() -else gvcfHaplotypeCaller = gvcfHaplotypeCaller.dump(tag:'GVCF HaplotypeCaller') - -// STEP GATK HAPLOTYPECALLER.2 - -process GenotypeGVCFs { - tag "${idSample}-${intervalBed.baseName}" - - input: - set idPatient, idSample, file(intervalBed), file(gvcf) from gvcfGenotypeGVCFs - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set val("HaplotypeCaller"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfGenotypeGVCFs - - when: 'haplotypecaller' in tools - - script: - // Using -L is important for speed and we have to index the interval files also - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - dbsnpOptions = params.dbsnp ? 
"--D ${dbsnp}" : "" - """ - gatk --java-options -Xmx${task.memory.toGiga()}g \ - IndexFeatureFile \ - -I ${gvcf} - - gatk --java-options -Xmx${task.memory.toGiga()}g \ - GenotypeGVCFs \ - -R ${fasta} \ - ${intervalsOptions} \ - ${dbsnpOptions} \ - -V ${gvcf} \ - -O ${intervalBed.baseName}_${idSample}.vcf - """ -} - -vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2]) - -// STEP SENTIEON DNAseq - -process Sentieon_DNAseq { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag "${idSample}" - - input: - set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAseq - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into vcf_sentieon_DNAseq - - when: 'dnaseq' in tools && params.sentieon - - script: - """ - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${bam} \ - -q ${recal} \ - --algo Haplotyper \ - -d ${dbsnp} \ - DNAseq_${idSample}.vcf - """ -} - -vcf_sentieon_DNAseq = vcf_sentieon_DNAseq.dump(tag:'sentieon DNAseq') - -// STEP SENTIEON DNAscope - -process Sentieon_DNAscope { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag "${idSample}" - - input: - set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAscope - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into vcf_sentieon_DNAscope - set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into vcf_sentieon_DNAscope_SV - - when: 'dnascope' in tools && params.sentieon - - script: - """ - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${bam} \ - -q ${recal} \ - --algo DNAscope \ - -d ${dbsnp} \ - DNAscope_${idSample}.vcf - - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta}\ - -i ${bam} \ - -q ${recal} \ - --algo DNAscope \ - --var_type bnd \ - -d ${dbsnp} \ - DNAscope_${idSample}.temp.vcf - - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta}\ - -q ${recal} \ - --algo SVSolver \ - -v DNAscope_${idSample}.temp.vcf \ - DNAscope_SV_${idSample}.vcf - """ -} - -vcf_sentieon_DNAscope = vcf_sentieon_DNAscope.dump(tag:'sentieon DNAscope') -vcf_sentieon_DNAscope_SV = vcf_sentieon_DNAscope_SV.dump(tag:'sentieon DNAscope SV') - -// STEP STRELKA.1 - SINGLE MODE - -process StrelkaSingle { - label 'cpus_max' - label 'memory_max' - - tag "${idSample}" - - publishDir "${params.outdir}/VariantCalling/${idSample}/Strelka", mode: params.publish_dir_mode - - input: - set idPatient, idSample, file(bam), file(bai) from bamStrelkaSingle - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set val("Strelka"), idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelkaSingle - - when: 'strelka' in tools - - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" - """ - ${beforeScript} - configureStrelkaGermlineWorkflow.py \ - --bam ${bam} \ - --referenceFasta ${fasta} \ - ${options} \ - --runDir Strelka - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - - mv Strelka/results/variants/genome.*.vcf.gz \ - Strelka_${idSample}_genome.vcf.gz - mv Strelka/results/variants/genome.*.vcf.gz.tbi \ - Strelka_${idSample}_genome.vcf.gz.tbi - mv Strelka/results/variants/variants.vcf.gz \ - Strelka_${idSample}_variants.vcf.gz - mv Strelka/results/variants/variants.vcf.gz.tbi \ - Strelka_${idSample}_variants.vcf.gz.tbi - """ -} - -vcfStrelkaSingle = vcfStrelkaSingle.dump(tag:'Strelka - Single Mode') - -// STEP MANTA.1 - SINGLE MODE - -process MantaSingle { - label 'cpus_max' - label 'memory_max' - - tag "${idSample}" - - publishDir "${params.outdir}/VariantCalling/${idSample}/Manta", mode: params.publish_dir_mode - - input: - set idPatient, idSample, file(bam), file(bai) from bamMantaSingle - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set val("Manta"), idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfMantaSingle - - when: 'manta' in tools - - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? "--exome --callRegions call_targets.bed.gz" : "" - status = statusMap[idPatient, idSample] - inputbam = status == 0 ? "--bam" : "--tumorBam" - vcftype = status == 0 ? "diploid" : "tumor" - """ - ${beforeScript} - configManta.py \ - ${inputbam} ${bam} \ - --reference ${fasta} \ - ${options} \ - --runDir Manta - - python Manta/runWorkflow.py -m local -j ${task.cpus} - - mv Manta/results/variants/candidateSmallIndels.vcf.gz \ - Manta_${idSample}.candidateSmallIndels.vcf.gz - mv Manta/results/variants/candidateSmallIndels.vcf.gz.tbi \ - Manta_${idSample}.candidateSmallIndels.vcf.gz.tbi - mv Manta/results/variants/candidateSV.vcf.gz \ - Manta_${idSample}.candidateSV.vcf.gz - mv Manta/results/variants/candidateSV.vcf.gz.tbi \ - Manta_${idSample}.candidateSV.vcf.gz.tbi - mv Manta/results/variants/${vcftype}SV.vcf.gz \ - Manta_${idSample}.${vcftype}SV.vcf.gz - mv Manta/results/variants/${vcftype}SV.vcf.gz.tbi \ - Manta_${idSample}.${vcftype}SV.vcf.gz.tbi - """ -} - -vcfMantaSingle = vcfMantaSingle.dump(tag:'Single Manta') - -// STEP TIDDIT - -process TIDDIT { - tag "${idSample}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "TIDDIT_${idSample}.vcf") "VariantCalling/${idSample}/TIDDIT/${it}" - else "Reports/${idSample}/TIDDIT/${it}" - } - - input: - set idPatient, idSample, file(bam), file(bai) from bamTIDDIT - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set val("TIDDIT"), idPatient, idSample, file("*.vcf.gz"), file("*.tbi") into vcfTIDDIT - set file("TIDDIT_${idSample}.old.vcf"), file("TIDDIT_${idSample}.ploidy.tab"), file("TIDDIT_${idSample}.signals.tab"), file("TIDDIT_${idSample}.wig"), file("TIDDIT_${idSample}.gc.wig") into tidditOut - - when: 'tiddit' in tools - - script: - """ - tiddit --sv -o TIDDIT_${idSample} --bam ${bam} --ref ${fasta} - - mv TIDDIT_${idSample}.vcf TIDDIT_${idSample}.old.vcf - - grep -E "#|PASS" TIDDIT_${idSample}.old.vcf > TIDDIT_${idSample}.vcf - - bgzip --threads ${task.cpus} -c TIDDIT_${idSample}.vcf > TIDDIT_${idSample}.vcf.gz - - tabix TIDDIT_${idSample}.vcf.gz - """ -} - -vcfTIDDIT = vcfTIDDIT.dump(tag:'TIDDIT') - -// STEP 
FREEBAYES SINGLE MODE - -process FreebayesSingle { - tag "${idSample}-${intervalBed.baseName}" - - label 'cpus_1' - - input: - set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamFreebayesSingle - file(fasta) from ch_fasta - file(fastaFai) from ch_software_versions_yaml - - output: - set val("FreeBayes"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfFreebayesSingle - - when: 'freebayes' in tools - - script: - intervalsOptions = params.no_intervals ? "" : "-t ${intervalBed}" - """ - freebayes \ - -f ${fasta} \ - --min-alternate-fraction 0.1 \ - --min-mapping-quality 1 \ - ${intervalsOptions} \ - ${bam} > ${intervalBed.baseName}_${idSample}.vcf - """ -} - -vcfFreebayesSingle = vcfFreebayesSingle.groupTuple(by: [0,1,2]) - -/* -================================================================================ - SOMATIC VARIANT CALLING -================================================================================ -*/ -// Ascat, pileup, pileups with no intervals, recalibrated BAMs -(bamAscat, bamMpileup, bamMpileupNoInt, bamRecalAll) = bamRecalAll.into(4) - -// separate BAM by status -bamNormal = Channel.create() -bamTumor = Channel.create() - -bamRecalAll - .choice(bamTumor, bamNormal) {statusMap[it[0], it[1]] == 0 ? 1 : 0} - -// Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling -// Remapping channel to remove common key idPatient -pairBam = bamNormal.cross(bamTumor).map { - normal, tumor -> - [normal[0], normal[1], normal[2], normal[3], tumor[1], tumor[2], tumor[3]] -} - -pairBam = pairBam.dump(tag:'BAM Somatic Pair') - -// Manta, Strelka, Mutect2, MSIsensor -(pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamMsisensor, pairBamCNVkit, pairBam) = pairBam.into(8) - -// Making Pair Bam for Sention - -// separate BAM by status -bam_sention_normal = Channel.create() -bam_sentieon_tumor = Channel.create() - -bam_sentieon_all - .choice(bam_sentieon_tumor, bam_sention_normal) {statusMap[it[0], it[1]] == 0 ? 1 : 0} - -// Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling -// Remapping channel to remove common key idPatient - -bam_pair_sentieon_TNscope = bam_sention_normal.cross(bam_sentieon_tumor).map { - normal, tumor -> - [normal[0], normal[1], normal[2], normal[3], normal[4], tumor[1], tumor[2], tumor[3], tumor[4]] -} - -intervalPairBam = pairBam.combine(bedIntervals) - -bamMpileup = bamMpileup.combine(intMpileup) - -// intervals for Mutect2 calls, FreeBayes and pileups for Mutect2 filtering -(pairBamMutect2, pairBamFreeBayes, pairBamPileupSummaries) = intervalPairBam.into(3) - -// STEP FREEBAYES - -process FreeBayes { - tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" - - label 'cpus_1' - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamFreeBayes - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set val("FreeBayes"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into vcfFreeBayes - - when: 'freebayes' in tools - - script: - intervalsOptions = params.no_intervals ? 
"" : "-t ${intervalBed}" - """ - freebayes \ - -f ${fasta} \ - --pooled-continuous \ - --pooled-discrete \ - --genotype-qualities \ - --report-genotype-likelihood-max \ - --allele-balance-priors-off \ - --min-alternate-fraction 0.03 \ - --min-repeat-entropy 1 \ - --min-alternate-count 2 \ - ${intervalsOptions} \ - ${bamTumor} \ - ${bamNormal} > ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf - """ -} - -vcfFreeBayes = vcfFreeBayes.groupTuple(by:[0,1,2]) - -// STEP GATK MUTECT2.1 - RAW CALLS - -process Mutect2 { - tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" - - label 'cpus_1' - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamMutect2 - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(germlineResource) from ch_germline_resource - file(germlineResourceIndex) from ch_germline_resource_tbi - file(intervals) from ch_intervals - file(pon) from ch_pon - file(ponIndex) from ch_pon_tbi - - output: - set val("Mutect2"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into mutect2Output - set idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf.stats") optional true into intervalStatsFiles - set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf.stats"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") optional true into mutect2Stats - - when: 'mutect2' in tools - - script: - // please make a panel-of-normals, using at least 40 samples - // https://gatkforums.broadinstitute.org/gatk/discussion/11136/how-to-call-somatic-mutations-using-gatk4-mutect2 - PON = params.pon ? "--panel-of-normals ${pon}" : "" - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - softClippedOption = params.ignore_soft_clipped_bases ? 
"--dont-use-soft-clipped-bases true" : "" - """ - # Get raw calls - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - Mutect2 \ - -R ${fasta}\ - -I ${bamTumor} -tumor ${idSampleTumor} \ - -I ${bamNormal} -normal ${idSampleNormal} \ - ${intervalsOptions} \ - ${softClippedOption} \ - --germline-resource ${germlineResource} \ - ${PON} \ - -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf - """ -} - -mutect2Output = mutect2Output.groupTuple(by:[0,1,2]) -mutect2Stats = mutect2Stats.groupTuple(by:[0,1]) - -// STEP GATK MUTECT2.2 - MERGING STATS - -process MergeMutect2Stats { - tag "${idSamplePair}" - - publishDir "${params.outdir}/VariantCalling/${idSamplePair}/Mutect2", mode: params.publish_dir_mode - - input: - set idPatient, idSamplePair, file(statsFiles), file(vcf) from mutect2Stats // Actual stats files and corresponding VCF chunks - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(germlineResource) from ch_germline_resource - file(germlineResourceIndex) from ch_germline_resource_tbi - file(intervals) from ch_intervals - - output: - set idPatient, idSamplePair, file("${idSamplePair}.vcf.gz.stats") into mergedStatsFile - - when: 'mutect2' in tools - - script: - stats = statsFiles.collect{ "-stats ${it} " }.join(' ') - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - MergeMutectStats \ - ${stats} \ - -O ${idSamplePair}.vcf.gz.stats - """ -} - -// we are merging the VCFs that are called separatelly for different intervals -// so we can have a single sorted VCF containing all the calls for a given caller - -// STEP MERGING VCF - FREEBAYES & GATK HAPLOTYPECALLER - -vcfConcatenateVCFs = vcfFreeBayes.mix(vcfFreebayesSingle, vcfGenotypeGVCFs, gvcfHaplotypeCaller) -vcfConcatenateVCFs = vcfConcatenateVCFs.dump(tag:'VCF to merge') - -process ConcatVCF { - label 'concat_vcf' - label 'cpus_8' - - tag "${variantCaller}-${idSample}" - - publishDir "${params.outdir}/VariantCalling/${idSample}/${"$variantCaller"}", mode: params.publish_dir_mode - - input: - set variantCaller, idPatient, idSample, file(vcf) from vcfConcatenateVCFs - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - // we have this funny *_* pattern to avoid copying the raw calls to publishdir - set variantCaller, idPatient, idSample, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated - - when: ('haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools) - - script: - if (variantCaller == 'HaplotypeCallerGVCF') - outputFile = "HaplotypeCaller_${idSample}.g.vcf" - else - outputFile = "${variantCaller}_${idSample}.vcf" - options = params.target_bed ? "-t ${targetBED}" : "" - intervalsOptions = params.no_intervals ? 
"-n" : "" - """ - concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions} - """ -} - -vcfConcatenated = vcfConcatenated.dump(tag:'VCF') - -// STEP MERGING VCF - GATK MUTECT2 (UNFILTERED) - -mutect2Output = mutect2Output.dump(tag:'Mutect2 output VCF to merge') - -process ConcatVCF_Mutect2 { - label 'concat_vcf' - label 'cpus_8' - - tag "${idSample}" - - publishDir "${params.outdir}/VariantCalling/${idSample}/Mutect2", mode: params.publish_dir_mode - - input: - set variantCaller, idPatient, idSample, file(vcf) from mutect2Output - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - // we have this funny *_* pattern to avoid copying the raw calls to publishdir - set variantCaller, idPatient, idSample, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenatedForFilter - - when: ('haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools) - - script: - outputFile = "Mutect2_unfiltered_${idSample}.vcf" - options = params.target_bed ? "-t ${targetBED}" : "" - intervalsOptions = params.no_intervals ? "-n" : "" - """ - concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions} - """ -} - -vcfConcatenatedForFilter = vcfConcatenatedForFilter.dump(tag:'Mutect2 unfiltered VCF') - -// STEP GATK MUTECT2.3 - GENERATING PILEUP SUMMARIES - -pairBamPileupSummaries = pairBamPileupSummaries.map{ - idPatient, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor, intervalBed -> - [idPatient, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor, intervalBed] -}.join(intervalStatsFiles, by:[0,1,2]) - -process PileupSummariesForMutect2 { - tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" - - label 'cpus_1' - - input: - set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(intervalBed), file(statsFile) from pairBamPileupSummaries - file(germlineResource) from ch_germline_resource - file(germlineResourceIndex) from ch_germline_resource_tbi - - output: - set idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table") into pileupSummaries - - when: 'mutect2' in tools - - script: - intervalsOptions = params.no_intervals ? 
"" : "-L ${intervalBed}" - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - GetPileupSummaries \ - -I ${bamTumor} \ - -V ${germlineResource} \ - ${intervalsOptions} \ - -O ${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table - """ -} - -pileupSummaries = pileupSummaries.groupTuple(by:[0,1,2]) - -// STEP GATK MUTECT2.4 - MERGING PILEUP SUMMARIES - -process MergePileupSummaries { - label 'cpus_1' - - tag "${idPatient}_${idSampleTumor}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Mutect2", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(pileupSums) from pileupSummaries - file(dict) from ch_dict - - output: - set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}_pileupsummaries.table") into mergedPileupFile - - when: 'mutect2' in tools - - script: - allPileups = pileupSums.collect{ "-I ${it} " }.join(' ') - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - GatherPileupSummaries \ - --sequence-dictionary ${dict} \ - ${allPileups} \ - -O ${idSampleTumor}_pileupsummaries.table - """ -} - -// STEP GATK MUTECT2.5 - CALCULATING CONTAMINATION - -pairBamCalculateContamination = pairBamCalculateContamination.map{ - idPatient, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor -> - [idPatient, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor] -}.join(mergedPileupFile, by:[0,1,2]) - -process CalculateContamination { - label 'cpus_1' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Mutect2", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(mergedPileup) from pairBamCalculateContamination - - output: - set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${idSampleTumor}_contamination.table") into contaminationTable - - when: 'mutect2' in tools - - script: - """ - # calculate contamination - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - CalculateContamination \ - -I ${idSampleTumor}_pileupsummaries.table \ - -O ${idSampleTumor}_contamination.table - """ -} - -// STEP GATK MUTECT2.6 - FILTERING CALLS - -mutect2CallsToFilter = vcfConcatenatedForFilter.map{ - variantCaller, idPatient, idSamplePair, vcf, tbi -> - [idPatient, idSamplePair, vcf, tbi] -}.join(mergedStatsFile, by:[0,1]).join(contaminationTable, by:[0,1]) - -process FilterMutect2Calls { - label 'cpus_1' - - tag "${idSamplePair}" - - publishDir "${params.outdir}/VariantCalling/${idSamplePair}/Mutect2", mode: params.publish_dir_mode - - input: - set idPatient, idSamplePair, file(unfiltered), file(unfilteredIndex), file(stats), file(contaminationTable) from mutect2CallsToFilter - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(germlineResource) from ch_germline_resource - file(germlineResourceIndex) from ch_germline_resource_tbi - file(intervals) from ch_intervals - - output: - set val("Mutect2"), idPatient, idSamplePair, file("Mutect2_filtered_${idSamplePair}.vcf.gz"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.tbi"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.filteringStats.tsv") into filteredMutect2Output - - when: 'mutect2' in tools - - script: - """ - # do the actual filtering - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - FilterMutectCalls \ - -V ${unfiltered} \ - --contamination-table ${contaminationTable} \ - --stats ${stats} \ - -R 
${fasta} \ - -O Mutect2_filtered_${idSamplePair}.vcf.gz - """ -} - -// STEP SENTIEON TNSCOPE - -process Sentieon_TNscope { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), file(recalNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(recalTumor) from bam_pair_sentieon_TNscope - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(pon) from ch_pon - file(ponIndex) from ch_pon_tbi - - output: - set val("SentieonTNscope"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf") into vcf_sentieon_TNscope - - when: 'tnscope' in tools && params.sentieon - - script: - PON = params.pon ? "--pon ${pon}" : "" - """ - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${bamTumor} \ - -q ${recalTumor} \ - -i ${bamNormal} \ - -q ${recalNormal} \ - --algo TNscope \ - --tumor_sample ${idSampleTumor} \ - --normal_sample ${idSampleNormal} \ - --dbsnp ${dbsnp} \ - ${PON} \ - TNscope_${idSampleTumor}_vs_${idSampleNormal}.vcf - """ -} - -vcf_sentieon_TNscope = vcf_sentieon_TNscope.dump(tag:'Sentieon TNscope') - -vcf_sentieon = vcf_sentieon_DNAseq.mix(vcf_sentieon_DNAscope, vcf_sentieon_DNAscope_SV, vcf_sentieon_TNscope) - -process CompressSentieonVCF { - tag "${idSample} - ${vcf}" - - publishDir "${params.outdir}/VariantCalling/${idSample}/${variantCaller}", mode: params.publish_dir_mode - - input: - set variantCaller, idPatient, idSample, file(vcf) from vcf_sentieon - - output: - set variantCaller, idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcf_sentieon_compressed - - script: - """ - bgzip < ${vcf} > ${vcf}.gz - tabix ${vcf}.gz - """ -} - -vcf_sentieon_compressed = vcf_sentieon_compressed.dump(tag:'Sentieon VCF indexed') - -// STEP STRELKA.2 - SOMATIC PAIR - -process Strelka { - label 'cpus_max' - label 'memory_max' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Strelka", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamStrelka - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set val("Strelka"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelka - - when: 'strelka' in tools - - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" - """ - ${beforeScript} - configureStrelkaSomaticWorkflow.py \ - --tumor ${bamTumor} \ - --normal ${bamNormal} \ - --referenceFasta ${fasta} \ - ${options} \ - --runDir Strelka - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - - mv Strelka/results/variants/somatic.indels.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz - mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi - mv Strelka/results/variants/somatic.snvs.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz - mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi - """ -} - -vcfStrelka = vcfStrelka.dump(tag:'Strelka') - -// STEP MANTA.2 - SOMATIC PAIR - -process Manta { - label 'cpus_max' - label 'memory_max' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Manta", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamManta - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set val("Manta"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfManta - set idPatient, idSampleNormal, idSampleTumor, file("*.candidateSmallIndels.vcf.gz"), file("*.candidateSmallIndels.vcf.gz.tbi") into mantaToStrelka - - when: 'manta' in tools - - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" - """ - ${beforeScript} - configManta.py \ - --normalBam ${bamNormal} \ - --tumorBam ${bamTumor} \ - --reference ${fasta} \ - ${options} \ - --runDir Manta - - python Manta/runWorkflow.py -m local -j ${task.cpus} - - mv Manta/results/variants/candidateSmallIndels.vcf.gz \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSmallIndels.vcf.gz - mv Manta/results/variants/candidateSmallIndels.vcf.gz.tbi \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSmallIndels.vcf.gz.tbi - mv Manta/results/variants/candidateSV.vcf.gz \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSV.vcf.gz - mv Manta/results/variants/candidateSV.vcf.gz.tbi \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSV.vcf.gz.tbi - mv Manta/results/variants/diploidSV.vcf.gz \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.diploidSV.vcf.gz - mv Manta/results/variants/diploidSV.vcf.gz.tbi \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.diploidSV.vcf.gz.tbi - mv Manta/results/variants/somaticSV.vcf.gz \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.somaticSV.vcf.gz - mv Manta/results/variants/somaticSV.vcf.gz.tbi \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.somaticSV.vcf.gz.tbi - """ -} - -vcfManta = vcfManta.dump(tag:'Manta') - -// Remmaping channels to match input for StrelkaBP -pairBamStrelkaBP = pairBamStrelkaBP.map { - idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor -> - [idPatientNormal, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor] -}.join(mantaToStrelka, by:[0,1,2]).map { - idPatientNormal, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor, mantaCSI, mantaCSIi -> - [idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor, mantaCSI, mantaCSIi] -} - -// STEP STRELKA.3 - SOMATIC PAIR - BEST PRACTICES - -process StrelkaBP { - label 'cpus_max' - label 'memory_max' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Strelka", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(mantaCSI), file(mantaCSIi) from pairBamStrelkaBP - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set val("Strelka"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelkaBP - - when: 'strelka' in tools && 'manta' in tools && !params.no_strelka_bp - - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" - """ - ${beforeScript} - configureStrelkaSomaticWorkflow.py \ - --tumor ${bamTumor} \ - --normal ${bamNormal} \ - --referenceFasta ${fasta} \ - --indelCandidates ${mantaCSI} \ - ${options} \ - --runDir Strelka - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - - mv Strelka/results/variants/somatic.indels.vcf.gz \ - StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz - mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ - StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi - mv Strelka/results/variants/somatic.snvs.vcf.gz \ - StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz - mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ - StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi - """ -} - -vcfStrelkaBP = vcfStrelkaBP.dump(tag:'Strelka BP') - -// STEP CNVkit - -process CNVkit { - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/CNVkit", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamCNVkit - file(targetBED) from ch_target_bed - file(fasta) from ch_fasta - - output: - set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}*"), file("${idSampleNormal}*") into cnvkitOut - - when: 'cnvkit' in tools && params.target_bed - - script: - """ - cnvkit.py \ - batch \ - ${bamTumor} \ - --normal ${bamNormal} \ - --targets ${targetBED} \ - --fasta ${fasta} \ - --output-reference output_reference.cnn \ - --output-dir ./ \ - --diagram \ - --scatter - """ -} - -// STEP MSISENSOR.1 - SCAN - -// Scan reference genome for microsatellites -process MSIsensor_scan { - label 'cpus_1' - label 'memory_max' - - tag "${fasta}" - - input: - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - file "microsatellites.list" into msi_scan_ch - - when: 'msisensor' in tools - - script: - """ - msisensor scan -d ${fasta} -o microsatellites.list - """ -} - -// STEP MSISENSOR.2 - SCORE - -// Score the normal vs somatic pair of bams - -process MSIsensor_msi { - label 'cpus_4' - label 'memory_max' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/MSIsensor", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamMsisensor - file msiSites from msi_scan_ch - - output: - set val("Msisensor"), idPatient, file("${idSampleTumor}_vs_${idSampleNormal}_msisensor"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_dis"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_germline"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_somatic") into msisensor_out_ch - - when: 'msisensor' in tools - - script: - """ - msisensor msi -d ${msiSites} \ - -b 4 \ - -n ${bamNormal} \ - -t ${bamTumor} \ - -o ${idSampleTumor}_vs_${idSampleNormal}_msisensor - """ -} - -// STEP ASCAT.1 - ALLELECOUNTER - -// Run commands and code from Malin Larsson -// Based on Jesper Eisfeldt's code -process AlleleCounter { - label 'memory_singleCPU_2_task' - - tag "${idSample}" - - input: - set idPatient, idSample, file(bam), file(bai) from bamAscat - file(acLoci) from ch_ac_loci - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set idPatient, idSample, 
file("${idSample}.alleleCount") into alleleCounterOut - - when: 'ascat' in tools - - script: - """ - alleleCounter \ - -l ${acLoci} \ - -r ${fasta} \ - -b ${bam} \ - -o ${idSample}.alleleCount; - """ -} - -alleleCountOutNormal = Channel.create() -alleleCountOutTumor = Channel.create() - -alleleCounterOut - .choice(alleleCountOutTumor, alleleCountOutNormal) {statusMap[it[0], it[1]] == 0 ? 1 : 0} - -alleleCounterOut = alleleCountOutNormal.combine(alleleCountOutTumor, by:0) - -alleleCounterOut = alleleCounterOut.map { - idPatientNormal, idSampleNormal, alleleCountOutNormal, - idSampleTumor, alleleCountOutTumor -> - [idPatientNormal, idSampleNormal, idSampleTumor, alleleCountOutNormal, alleleCountOutTumor] -} - -// STEP ASCAT.2 - CONVERTALLELECOUNTS - -// R script from Malin Larssons bitbucket repo: -// https://bitbucket.org/malinlarsson/somatic_wgs_pipeline -process ConvertAlleleCounts { - label 'memory_singleCPU_2_task' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/ASCAT", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(alleleCountNormal), file(alleleCountTumor) from alleleCounterOut - - output: - set idPatient, idSampleNormal, idSampleTumor, file("${idSampleNormal}.BAF"), file("${idSampleNormal}.LogR"), file("${idSampleTumor}.BAF"), file("${idSampleTumor}.LogR") into convertAlleleCountsOut - - when: 'ascat' in tools - - script: - gender = genderMap[idPatient] - """ - convertAlleleCounts.r ${idSampleTumor} ${alleleCountTumor} ${idSampleNormal} ${alleleCountNormal} ${gender} - """ -} - -// STEP ASCAT.3 - ASCAT - -// R scripts from Malin Larssons bitbucket repo: -// https://bitbucket.org/malinlarsson/somatic_wgs_pipeline -process Ascat { - label 'memory_singleCPU_2_task' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/ASCAT", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(bafNormal), file(logrNormal), file(bafTumor), file(logrTumor) from convertAlleleCountsOut - file(acLociGC) from ch_ac_loci_gc - - output: - set val("ASCAT"), idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.*.{png,txt}") into ascatOut - - when: 'ascat' in tools - - script: - gender = genderMap[idPatient] - purity_ploidy = (params.ascat_purity && params.ascat_ploidy) ? "--purity ${params.ascat_purity} --ploidy ${params.ascat_ploidy}" : "" - """ - for f in *BAF *LogR; do sed 's/chr//g' \$f > tmpFile; mv tmpFile \$f;done - run_ascat.r \ - --tumorbaf ${bafTumor} \ - --tumorlogr ${logrTumor} \ - --normalbaf ${bafNormal} \ - --normallogr ${logrNormal} \ - --tumorname ${idSampleTumor} \ - --basedir ${projectDir} \ - --gcfile ${acLociGC} \ - --gender ${gender} \ - ${purity_ploidy} - """ -} - -ascatOut.dump(tag:'ASCAT') - -// STEP MPILEUP.1 - -process Mpileup { - label 'cpus_1' - label 'memory_singleCPU_2_task' - - tag "${idSample}-${intervalBed.baseName}" - - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { it == "${idSample}.pileup" ? 
"VariantCalling/${idSample}/Control-FREEC/${it}" : null } - - input: - set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamMpileup - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set idPatient, idSample, file("${prefix}${idSample}.pileup") into mpileupMerge - set idPatient, idSample into tsv_mpileup - - when: 'controlfreec' in tools || 'mpileup' in tools - - script: - prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" - intervalsOptions = params.no_intervals ? "" : "-l ${intervalBed}" - - """ - # Control-FREEC reads uncompresses the zipped file TWICE in single-threaded mode. - # we are therefore not using compressed pileups here - samtools mpileup \ - -f ${fasta} ${bam} \ - ${intervalsOptions} > ${prefix}${idSample}.pileup - """ -} - -(tsv_mpileup, tsv_mpileup_sample) = tsv_mpileup.groupTuple(by:[0, 1]).into(2) - -// Creating a TSV file to restart from this step -tsv_mpileup.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n" -}.collectFile( - name: 'control-freec_mpileup.tsv', sort: true, storeDir: "${params.outdir}/VariantCalling/TSV" -) - -tsv_mpileup_sample - .collectFile(storeDir: "${params.outdir}/VariantCalling/TSV") { - idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" - ["control-freec_mpileup_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n"] -} - -if (!params.no_intervals) { - mpileupMerge = mpileupMerge.groupTuple(by:[0, 1]) - mpileupNoInt = Channel.empty() -} else { - (mpileupMerge, mpileupNoInt) = mpileupMerge.into(2) - mpileupMerge.close() -} - -// STEP MPILEUP.2 - MERGE -process MergeMpileup { - label 'cpus_1' - - tag "${idSample}" - - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { it == "${idSample}.pileup" ? "VariantCalling/${idSample}/Control-FREEC/${it}" : null } - - input: - set idPatient, idSample, file(mpileup) from mpileupMerge - - output: - set idPatient, idSample, file("${idSample}.pileup") into mpileupOut - - when: !(params.no_intervals) && 'controlfreec' in tools || 'mpileup' in tools - - script: - """ - for i in `ls -1v *.pileup`; - do cat \$i >> ${idSample}.pileup - done - """ -} - -mpileupOut = mpileupOut.mix(mpileupNoInt) -mpileupOut = mpileupOut.dump(tag:'mpileup') - -mpileupOutNormal = Channel.create() -mpileupOutTumor = Channel.create() - -if (step == 'controlfreec') mpileupOut = inputSample - -mpileupOut - .choice(mpileupOutTumor, mpileupOutNormal) {statusMap[it[0], it[1]] == 0 ? 
1 : 0} - -(mpileupOutSingle,mpileupOutTumor) = mpileupOutTumor.into(2) - -mpileupOut = mpileupOutNormal.combine(mpileupOutTumor, by:0) - -mpileupOut = mpileupOut.map { - idPatientNormal, idSampleNormal, mpileupOutNormal, - idSampleTumor, mpileupOutTumor -> - [idPatientNormal, idSampleNormal, idSampleTumor, mpileupOutNormal, mpileupOutTumor] -} - -// STEP CONTROLFREEC.1 - CONTROLFREEC - -process ControlFREEC { - label 'cpus_8' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Control-FREEC", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(mpileupNormal), file(mpileupTumor) from mpileupOut - file(chrDir) from ch_chr_dir - file(mappability) from ch_mappability - file(chrLength) from ch_chr_length - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.pileup_CNVs"), file("${idSampleTumor}.pileup_ratio.txt"), file("${idSampleTumor}.pileup_BAF.txt") into controlFreecViz - set file("*.pileup*"), file("${idSampleTumor}_vs_${idSampleNormal}.config.txt") into controlFreecOut - - when: 'controlfreec' in tools - - script: - config = "${idSampleTumor}_vs_${idSampleNormal}.config.txt" - gender = genderMap[idPatient] - // Window has higher priority than coefficientOfVariation if both given - window = params.cf_window ? "window = ${params.cf_window}" : "" - coeffvar = params.cf_coeff ? "coefficientOfVariation = ${params.cf_coeff}" : "" - use_bed = params.target_bed ? "captureRegions = ${targetBED}" : "" - // This parameter makes Control-FREEC unstable (still in Beta according to the developers) - // so we disable it by setting it to its default value (it is disabled by default) - //min_subclone = params.target_bed ? "30" : "20" - min_subclone = 100 - readCountThreshold = params.target_bed ? "50" : "10" - breakPointThreshold = params.target_bed ? "1.2" : "0.8" - breakPointType = params.target_bed ? "4" : "2" - mappabilitystr = params.mappability ? 
"gemMappabilityFile = \${PWD}/${mappability}" : "" - - """ - touch ${config} - echo "[general]" >> ${config} - echo "BedGraphOutput = TRUE" >> ${config} - echo "chrFiles = \${PWD}/${chrDir.fileName}" >> ${config} - echo "chrLenFile = \${PWD}/${chrLength.fileName}" >> ${config} - echo "forceGCcontentNormalization = 1" >> ${config} - echo "maxThreads = ${task.cpus}" >> ${config} - echo "minimalSubclonePresence = ${min_subclone}" >> ${config} - echo "ploidy = ${params.cf_ploidy}" >> ${config} - echo "sex = ${gender}" >> ${config} - echo "readCountThreshold = ${readCountThreshold}" >> ${config} - echo "breakPointThreshold = ${breakPointThreshold}" >> ${config} - echo "breakPointType = ${breakPointType}" >> ${config} - echo "${window}" >> ${config} - echo "${coeffvar}" >> ${config} - echo "${mappabilitystr}" >> ${config} - echo "" >> ${config} - - echo "[control]" >> ${config} - echo "inputFormat = pileup" >> ${config} - echo "mateFile = \${PWD}/${mpileupNormal}" >> ${config} - echo "mateOrientation = FR" >> ${config} - echo "" >> ${config} - - echo "[sample]" >> ${config} - echo "inputFormat = pileup" >> ${config} - echo "mateFile = \${PWD}/${mpileupTumor}" >> ${config} - echo "mateOrientation = FR" >> ${config} - echo "" >> ${config} - - echo "[BAF]" >> ${config} - echo "SNPfile = ${dbsnp.fileName}" >> ${config} - echo "" >> ${config} - - echo "[target]" >> ${config} - echo "${use_bed}" >> ${config} - - freec -conf ${config} - """ -} - -controlFreecOut.dump(tag:'ControlFREEC') - -process ControlFREECSingle { - label 'cpus_8' - - tag "${idSampleTumor}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Control-FREEC", mode: params.publish_dir_mode - - input: - set idPatient, idSampleTumor, file(mpileupTumor) from mpileupOutSingle - file(chrDir) from ch_chr_dir - file(mappability) from ch_mappability - file(chrLength) from ch_chr_length - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set idPatient, idSampleTumor, file("${idSampleTumor}.pileup_CNVs"), file("${idSampleTumor}.pileup_ratio.txt"), file("${idSampleTumor}.pileup_BAF.txt") into controlFreecVizSingle - set file("*.pileup*"), file("${idSampleTumor}.config.txt") into controlFreecOutSingle - - when: 'controlfreec' in tools - - script: - config = "${idSampleTumor}.config.txt" - gender = genderMap[idPatient] - // Window has higher priority than coefficientOfVariation if both given - window = params.cf_window ? "window = ${params.cf_window}" : "" - coeffvar = params.cf_coeff ? "coefficientOfVariation = ${params.cf_coeff}" : "" - use_bed = params.target_bed ? "captureRegions = ${targetBED}" : "" - // This parameter makes Control-FREEC unstable (still in Beta according to the developers) - // so we disable it by setting it to its default value (it is disabled by default) - //min_subclone = params.target_bed ? "30" : "20" - min_subclone = 100 - readCountThreshold = params.target_bed ? "50" : "10" - breakPointThreshold = params.target_bed ? "1.2" : "0.8" - breakPointType = params.target_bed ? "4" : "2" - mappabilitystr = params.mappability ? 
"gemMappabilityFile = \${PWD}/${mappability}" : "" - - """ - touch ${config} - echo "[general]" >> ${config} - echo "BedGraphOutput = TRUE" >> ${config} - echo "chrFiles = \${PWD}/${chrDir.fileName}" >> ${config} - echo "chrLenFile = \${PWD}/${chrLength.fileName}" >> ${config} - echo "forceGCcontentNormalization = 1" >> ${config} - echo "maxThreads = ${task.cpus}" >> ${config} - echo "minimalSubclonePresence = ${min_subclone}" >> ${config} - echo "ploidy = ${params.cf_ploidy}" >> ${config} - echo "sex = ${gender}" >> ${config} - echo "readCountThreshold = ${readCountThreshold}" >> ${config} - echo "breakPointThreshold = ${breakPointThreshold}" >> ${config} - echo "breakPointType = ${breakPointType}" >> ${config} - echo "${window}" >> ${config} - echo "${coeffvar}" >> ${config} - echo "${mappabilitystr}" >> ${config} - echo "" >> ${config} - - echo "[sample]" >> ${config} - echo "inputFormat = pileup" >> ${config} - echo "mateFile = \${PWD}/${mpileupTumor}" >> ${config} - echo "mateOrientation = FR" >> ${config} - echo "" >> ${config} - - echo "[BAF]" >> ${config} - echo "SNPfile = ${dbsnp.fileName}" >> ${config} - echo "" >> ${config} - - echo "[target]" >> ${config} - echo "${use_bed}" >> ${config} - - freec -conf ${config} - """ -} - -controlFreecOutSingle.dump(tag:'ControlFREECSingle') - -// STEP CONTROLFREEC.3 - VISUALIZATION - -process ControlFreecViz { - label 'memory_singleCPU_2_task' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Control-FREEC", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(cnvTumor), file(ratioTumor), file(bafTumor) from controlFreecViz - - output: - set file("*.txt"), file("*.png"), file("*.bed") into controlFreecVizOut - - when: 'controlfreec' in tools - - script: - """ - echo "############### Calculating significance values for TUMOR CNVs #############" - cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/assess_significance.R | R --slave --args ${cnvTumor} ${ratioTumor} - - echo "############### Creating graph for TUMOR ratios ###############" - cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/makeGraph.R | R --slave --args 2 ${ratioTumor} ${bafTumor} - - echo "############### Creating BED files for TUMOR ##############" - perl /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/freec2bed.pl -f ${ratioTumor} > ${idSampleTumor}.bed - """ -} - -controlFreecVizOut.dump(tag:'ControlFreecViz') - -process ControlFreecVizSingle { - label 'memory_singleCPU_2_task' - - tag "${idSampleTumor}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Control-FREEC", mode: params.publish_dir_mode - - input: - set idPatient, idSampleTumor, file(cnvTumor), file(ratioTumor), file(bafTumor) from controlFreecVizSingle - - output: - set file("*.txt"), file("*.png"), file("*.bed") into controlFreecVizOutSingle - - when: 'controlfreec' in tools - - script: - """ - echo "############### Calculating significance values for TUMOR CNVs #############" - cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/assess_significance.R | R --slave --args ${cnvTumor} ${ratioTumor} - - echo "############### Creating graph for TUMOR ratios ###############" - cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/makeGraph.R | R --slave --args 2 ${ratioTumor} ${bafTumor} - - echo "############### Creating BED files for TUMOR ##############" - perl 
/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/freec2bed.pl -f ${ratioTumor} > ${idSampleTumor}.bed - """ -} - -controlFreecVizOutSingle.dump(tag:'ControlFreecVizSingle') - -// Remapping channels for QC and annotation - -(vcfStrelkaIndels, vcfStrelkaSNVS) = vcfStrelka.into(2) -(vcfStrelkaBPIndels, vcfStrelkaBPSNVS) = vcfStrelkaBP.into(2) -(vcfMantaSomaticSV, vcfMantaDiploidSV) = vcfManta.into(2) - -vcfKeep = Channel.empty().mix( - filteredMutect2Output.map{ - variantCaller, idPatient, idSample, vcf, tbi, tsv -> - [variantCaller, idSample, vcf] - }, - vcfConcatenated.map{ - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf] - }, - vcf_sentieon_compressed.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf] - }, - vcfStrelkaSingle.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf[1]] - }, - vcfMantaSingle.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf[2]] - }, - vcfMantaDiploidSV.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf[2]] - }, - vcfMantaSomaticSV.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf[3]] - }, - vcfStrelkaIndels.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf[0]] - }, - vcfStrelkaSNVS.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf[1]] - }, - vcfStrelkaBPIndels.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf[0]] - }, - vcfStrelkaBPSNVS.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf[1]] - }, - vcfTIDDIT.map { - variantCaller, idPatient, idSample, vcf, tbi -> - [variantCaller, idSample, vcf] - }) - -(vcfBCFtools, vcfVCFtools, vcfAnnotation) = vcfKeep.into(3) - -// STEP VCF.QC - -process BcftoolsStats { - label 'cpus_1' - - tag "${variantCaller} - ${vcf}" - - publishDir "${params.outdir}/Reports/${idSample}/BCFToolsStats", mode: params.publish_dir_mode - - input: - set variantCaller, idSample, file(vcf) from vcfBCFtools - - output: - file ("*.bcf.tools.stats.out") into bcftoolsReport - - when: !('bcftools' in skipQC) - - script: - """ - bcftools stats ${vcf} > ${reduceVCF(vcf.fileName)}.bcf.tools.stats.out - """ -} - -bcftoolsReport = bcftoolsReport.dump(tag:'BCFTools') - -process Vcftools { - label 'cpus_1' - - tag "${variantCaller} - ${vcf}" - - publishDir "${params.outdir}/Reports/${idSample}/VCFTools", mode: params.publish_dir_mode - - input: - set variantCaller, idSample, file(vcf) from vcfVCFtools - - output: - file ("${reduceVCF(vcf.fileName)}.*") into vcftoolsReport - - when: !('vcftools' in skipQC) - - script: - """ - vcftools \ - --gzvcf ${vcf} \ - --TsTv-by-count \ - --out ${reduceVCF(vcf.fileName)} - - vcftools \ - --gzvcf ${vcf} \ - --TsTv-by-qual \ - --out ${reduceVCF(vcf.fileName)} - - vcftools \ - --gzvcf ${vcf} \ - --FILTER-summary \ - --out ${reduceVCF(vcf.fileName)} - """ -} - -vcftoolsReport = vcftoolsReport.dump(tag:'VCFTools') - -/* -================================================================================ - ANNOTATION -================================================================================ -*/ - -if (step == 'annotate') { - vcfToAnnotate = Channel.create() - vcfNoAnnotate = Channel.create() - - if (tsvPath == []) { - // Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory - // Excluding vcfs from 
FreeBayes, and g.vcf from HaplotypeCaller - // Basically it's: results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,SentieonDNAseq,SentieonDNAscope,SentieonTNscope,Strelka,TIDDIT}/*.vcf.gz - // Without *SmallIndels.vcf.gz from Manta, and *.genome.vcf.gz from Strelka - // The small snippet `vcf.minus(vcf.fileName)[-2]` catches idSample - // This field is used to output final annotated VCFs in the correct directory - Channel.empty().mix( - Channel.fromPath("${params.outdir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz") - .flatten().map{vcf -> ['HaplotypeCaller', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/Manta/*[!candidate]SV.vcf.gz") - .flatten().map{vcf -> ['Manta', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/Mutect2/*.vcf.gz") - .flatten().map{vcf -> ['Mutect2', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAseq/*.vcf.gz") - .flatten().map{vcf -> ['SentieonDNAseq', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAscope/*.vcf.gz") - .flatten().map{vcf -> ['SentieonDNAscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonTNscope/*.vcf.gz") - .flatten().map{vcf -> ['SentieonTNscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") - .flatten().map{vcf -> ['Strelka', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/TIDDIT/*.vcf.gz") - .flatten().map{vcf -> ['TIDDIT', vcf.minus(vcf.fileName)[-2].toString(), vcf]} - ).choice(vcfToAnnotate, vcfNoAnnotate) { - annotate_tools == [] || (annotate_tools != [] && it[0] in annotate_tools) ? 0 : 1 - } - } else if (annotate_tools == []) { - // Annotate user-submitted VCFs - // If user-submitted, Sarek assume that the idSample should be assumed automatically - vcfToAnnotate = Channel.fromPath(tsvPath) - .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]} - } else exit 1, "specify only tools or files to annotate, not both" - - vcfNoAnnotate.close() - vcfAnnotation = vcfAnnotation.mix(vcfToAnnotate) -} - -// as now have the list of VCFs to annotate, the first step is to annotate with allele frequencies, if there are any - -(vcfSnpeff, vcfVep) = vcfAnnotation.into(2) - -vcfVep = vcfVep.map { - variantCaller, idSample, vcf -> - [variantCaller, idSample, vcf, null] -} - -// STEP SNPEFF - -process Snpeff { - tag "${idSample} - ${variantCaller} - ${vcf}" - - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { - if (it == "${reducedVCF}_snpEff.ann.vcf") null - else "Reports/${idSample}/snpEff/${it}" - } - - input: - set variantCaller, idSample, file(vcf) from vcfSnpeff - file(dataDir) from ch_snpeff_cache - val snpeffDb from ch_snpeff_db - - output: - set file("${reducedVCF}_snpEff.genes.txt"), file("${reducedVCF}_snpEff.html"), file("${reducedVCF}_snpEff.csv") into snpeffReport - set variantCaller, idSample, file("${reducedVCF}_snpEff.ann.vcf") into snpeffVCF - - when: 'snpeff' in tools || 'merge' in tools - - script: - reducedVCF = reduceVCF(vcf.fileName) - cache = (params.snpeff_cache && params.annotation_cache) ? 
"-dataDir \${PWD}/${dataDir}" : "" - """ - snpEff -Xmx${task.memory.toGiga()}g \ - ${snpeffDb} \ - -csvStats ${reducedVCF}_snpEff.csv \ - -nodownload \ - ${cache} \ - -canon \ - -v \ - ${vcf} \ - > ${reducedVCF}_snpEff.ann.vcf - - mv snpEff_summary.html ${reducedVCF}_snpEff.html - """ -} - -snpeffReport = snpeffReport.dump(tag:'snpEff report') - -// STEP COMPRESS AND INDEX VCF.1 - SNPEFF - -process CompressVCFsnpEff { - tag "${idSample} - ${vcf}" - - publishDir "${params.outdir}/Annotation/${idSample}/snpEff", mode: params.publish_dir_mode - - input: - set variantCaller, idSample, file(vcf) from snpeffVCF - - output: - set variantCaller, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into (compressVCFsnpEffOut) - - script: - """ - bgzip < ${vcf} > ${vcf}.gz - tabix ${vcf}.gz - """ -} - -compressVCFsnpEffOut = compressVCFsnpEffOut.dump(tag:'VCF') - -// STEP VEP.1 - -process VEP { - label 'VEP' - label 'cpus_4' - - tag "${idSample} - ${variantCaller} - ${vcf}" - - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { - if (it == "${reducedVCF}_VEP.summary.html") "Reports/${idSample}/VEP/${it}" - else null - } - - input: - set variantCaller, idSample, file(vcf), file(idx) from vcfVep - file(dataDir) from ch_vep_cache - val cache_version from ch_vep_cache_version - file(cadd_InDels) from ch_cadd_indels - file(cadd_InDels_tbi) from ch_cadd_indels_tbi - file(cadd_WG_SNVs) from ch_cadd_wg_snvs - file(cadd_WG_SNVs_tbi) from ch_cadd_wg_snvs_tbi - - output: - set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCF - file("${reducedVCF}_VEP.summary.html") into vepReport - - when: 'vep' in tools - - script: - reducedVCF = reduceVCF(vcf.fileName) - genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome - - dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep" - cadd = (params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? "--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : "" - genesplicer = params.genesplicer ? 
"--plugin GeneSplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/genesplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline" - """ - mkdir ${reducedVCF} - - vep \ - -i ${vcf} \ - -o ${reducedVCF}_VEP.ann.vcf \ - --assembly ${genome} \ - --species ${params.species} \ - ${cadd} \ - ${genesplicer} \ - --cache \ - --cache_version ${cache_version} \ - --dir_cache ${dir_cache} \ - --everything \ - --filter_common \ - --fork ${task.cpus} \ - --format vcf \ - --per_gene \ - --stats_file ${reducedVCF}_VEP.summary.html \ - --total_length \ - --vcf - - rm -rf ${reducedVCF} - """ -} - -vepReport = vepReport.dump(tag:'VEP') - -// STEP VEP.2 - VEP AFTER SNPEFF - -process VEPmerge { - label 'VEP' - label 'cpus_4' - - tag "${idSample} - ${variantCaller} - ${vcf}" - - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { - if (it == "${reducedVCF}_VEP.summary.html") "Reports/${idSample}/VEP/${it}" - else null - } - - input: - set variantCaller, idSample, file(vcf), file(idx) from compressVCFsnpEffOut - file(dataDir) from ch_vep_cache - val cache_version from ch_vep_cache_version - file(cadd_InDels) from ch_cadd_indels - file(cadd_InDels_tbi) from ch_cadd_indels_tbi - file(cadd_WG_SNVs) from ch_cadd_wg_snvs - file(cadd_WG_SNVs_tbi) from ch_cadd_wg_snvs_tbi - - output: - set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCFmerge - file("${reducedVCF}_VEP.summary.html") into vepReportMerge - - when: 'merge' in tools - - script: - reducedVCF = reduceVCF(vcf.fileName) - genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome - dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep" - cadd = (params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? "--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : "" - genesplicer = params.genesplicer ? 
"--plugin GeneSplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/genesplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline" - """ - mkdir ${reducedVCF} - - vep \ - -i ${vcf} \ - -o ${reducedVCF}_VEP.ann.vcf \ - --assembly ${genome} \ - --species ${params.species} \ - ${cadd} \ - ${genesplicer} \ - --cache \ - --cache_version ${cache_version} \ - --dir_cache ${dir_cache} \ - --everything \ - --filter_common \ - --fork ${task.cpus} \ - --format vcf \ - --per_gene \ - --stats_file ${reducedVCF}_VEP.summary.html \ - --total_length \ - --vcf - - rm -rf ${reducedVCF} - """ -} - -vepReportMerge = vepReportMerge.dump(tag:'VEP') - -vcfCompressVCFvep = vepVCF.mix(vepVCFmerge) - -// STEP COMPRESS AND INDEX VCF.2 - VEP - -process CompressVCFvep { - tag "${idSample} - ${vcf}" - - publishDir "${params.outdir}/Annotation/${idSample}/VEP", mode: params.publish_dir_mode - - input: - set variantCaller, idSample, file(vcf) from vcfCompressVCFvep - - output: - set variantCaller, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into compressVCFOutVEP - - script: - """ - bgzip < ${vcf} > ${vcf}.gz - tabix ${vcf}.gz - """ -} - -compressVCFOutVEP = compressVCFOutVEP.dump(tag:'VCF') - -/* -================================================================================ - MultiQC -================================================================================ -*/ - -// STEP MULTIQC - -process MultiQC { - publishDir "${params.outdir}/Reports/MultiQC", mode: params.publish_dir_mode - - input: - file (multiqcConfig) from ch_multiqc_config - file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) - file (versions) from ch_software_versions_yaml.collect() - file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") - file ('bamQC/*') from bamQCReport.collect().ifEmpty([]) - file ('BCFToolsStats/*') from bcftoolsReport.collect().ifEmpty([]) - file ('FastQC/*') from fastQCReport.collect().ifEmpty([]) - file ('TrimmedFastQC/*') from trimGaloreReport.collect().ifEmpty([]) - file ('MarkDuplicates/*') from duplicates_marked_report.collect().ifEmpty([]) - file ('DuplicatesMarked/*.recal.table') from baseRecalibratorReport.collect().ifEmpty([]) - file ('SamToolsStats/*') from samtoolsStatsReport.collect().ifEmpty([]) - file ('snpEff/*') from snpeffReport.collect().ifEmpty([]) - file ('VCFTools/*') from vcftoolsReport.collect().ifEmpty([]) - - output: - file "*multiqc_report.html" into ch_multiqc_report - file "*_data" - file "multiqc_plots" - - when: !('multiqc' in skipQC) - - script: - rtitle = custom_runName ? "--title \"$custom_runName\"" : '' - rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' - custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' - """ - multiqc -f ${rtitle} ${rfilename} ${custom_config_file} . 
- """ -} - -ch_multiqc_report.dump(tag:'MultiQC') - -// Output Description HTML -process Output_documentation { - publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode - - input: - file output_docs from ch_output_docs - file images from ch_output_docs_images - - output: - file "results_description.html" - - when: !('documentation' in skipQC) - - script: - """ - markdown_to_html.py $output_docs -o results_description.html - """ -} - -// Completion e-mail notification -workflow.onComplete { - - // Set up the e-mail variables - def subject = "[nf-core/sarek] Successful: $workflow.runName" - if (!workflow.success) { - subject = "[nf-core/sarek] FAILED: $workflow.runName" - } - def email_fields = [:] - email_fields['version'] = workflow.manifest.version - email_fields['runName'] = custom_runName ?: workflow.runName - email_fields['success'] = workflow.success - email_fields['dateComplete'] = workflow.complete - email_fields['duration'] = workflow.duration - email_fields['exitStatus'] = workflow.exitStatus - email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') - email_fields['errorReport'] = (workflow.errorReport ?: 'None') - email_fields['commandLine'] = workflow.commandLine - email_fields['projectDir'] = workflow.projectDir - email_fields['summary'] = summary - email_fields['summary']['Date Started'] = workflow.start - email_fields['summary']['Date Completed'] = workflow.complete - email_fields['summary']['Pipeline script file path'] = workflow.scriptFile - email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId - if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision - email_fields['summary']['Nextflow Version'] = workflow.nextflow.version - email_fields['summary']['Nextflow Build'] = workflow.nextflow.build - email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - - def mqc_report = null - try { - if (workflow.success) { - mqc_report = ch_multiqc_report.getVal() - if (mqc_report.getClass() == ArrayList) { - log.warn "[nf-core/sarek] Found multiple reports from process 'multiqc', will use only one" - mqc_report = mqc_report[0] - } - } - } catch (all) { - log.warn "[nf-core/sarek] Could not attach MultiQC report to summary email" - } - - // Check if we are only sending emails on failure - email_address = params.email - if (!params.email && params.email_on_fail && !workflow.success) { - email_address = params.email_on_fail - } - - // Render the TXT template - def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$projectDir/assets/email_template.txt") - def txt_template = engine.createTemplate(tf).make(email_fields) - def email_txt = txt_template.toString() - - // Render the HTML template - def hf = new File("$projectDir/assets/email_template.html") - def html_template = engine.createTemplate(hf).make(email_fields) - def email_html = html_template.toString() - - // Render the sendmail template - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] - def sf = new File("$projectDir/assets/sendmail_template.txt") - def sendmail_template = engine.createTemplate(sf).make(smail_fields) - def sendmail_html = 
sendmail_template.toString() - - // Send the HTML e-mail - if (email_address) { - try { - if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } - // Try to send HTML e-mail using sendmail - [ 'sendmail', '-t' ].execute() << sendmail_html - log.info "[nf-core/sarek] Sent summary e-mail to $email_address (sendmail)" - } catch (all) { - // Catch failures and try with plaintext - def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if ( mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) { - mail_cmd += [ '-A', mqc_report ] - } - mail_cmd.execute() << email_html - log.info "[nf-core/sarek] Sent summary e-mail to $email_address (mail)" - } - } - - // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") - output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") - output_tf.withWriter { w -> w << email_txt } - - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_red = params.monochrome_logs ? '' : "\033[0;31m"; - c_reset = params.monochrome_logs ? '' : "\033[0m"; - - if (workflow.stats.ignoredCount > 0 && workflow.success) { - log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" - log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" - log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" - } - - if (workflow.success) { - log.info "-${c_purple}[nf-core/sarek]${c_green} Pipeline completed successfully${c_reset}-" - } else { - checkHostname() - log.info "-${c_purple}[nf-core/sarek]${c_red} Pipeline completed with errors${c_reset}-" - } -} - -/* -================================================================================ - nf-core functions -================================================================================ -*/ - -def create_workflow_summary(summary) { - def yaml_file = workDir.resolve('workflow_summary_mqc.yaml') - yaml_file.text = """ - id: 'nf-core-sarek-summary' - description: " - this information is collected when the pipeline is started." - section_name: 'nf-core/sarek Workflow Summary' - section_href: 'https://github.com/nf-core/sarek' - plot_type: 'html' - data: | -
-        <dl class=\"dl-horizontal\">
-${summary.collect { k, v -> "            <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</span>'}</samp></dd>" }.join("\n")}
-        </dl>
- """.stripIndent() - - return yaml_file -} - -def nfcoreHeader() { - // Log colors ANSI codes - c_black = params.monochrome_logs ? '' : "\033[0;30m"; - c_blue = params.monochrome_logs ? '' : "\033[0;34m"; - c_dim = params.monochrome_logs ? '' : "\033[2m"; - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_reset = params.monochrome_logs ? '' : "\033[0m"; - c_white = params.monochrome_logs ? '' : "\033[0;37m"; - c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; - - return """ -${c_dim}--------------------------------------------------${c_reset}- - ${c_green},--.${c_black}/${c_green},-.${c_reset} - ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} - ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} - ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} - ${c_green}`._,._,\'${c_reset} - ${c_white}____${c_reset} - ${c_white}.´ _ `.${c_reset} - ${c_white}/ ${c_green}|\\${c_reset}`-_ \\${c_reset} ${c_blue} __ __ ___ ${c_reset} - ${c_white}| ${c_green}| \\${c_reset} `-|${c_reset} ${c_blue}|__` /\\ |__) |__ |__/${c_reset} - ${c_white}\\ ${c_green}| \\${c_reset} /${c_reset} ${c_blue}.__| /¯¯\\ | \\ |___ | \\${c_reset} - ${c_white}`${c_green}|${c_reset}____${c_green}\\${c_reset}´${c_reset} - - ${c_purple} nf-core/sarek v${workflow.manifest.version}${c_reset} - -${c_dim}--------------------------------------------------${c_reset}- - """.stripIndent() -} - -def checkHostname() { - def c_reset = params.monochrome_logs ? '' : "\033[0m" - def c_white = params.monochrome_logs ? '' : "\033[0;37m" - def c_red = params.monochrome_logs ? '' : "\033[1;91m" - def c_yellow_bold = params.monochrome_logs ? '' : "\033[1;93m" - if (params.hostnames) { - def hostname = "hostname".execute().text.trim() - params.hostnames.each { prof, hnames -> - hnames.each { hname -> - if (hostname.contains(hname) && !workflow.profile.contains(prof)) { - log.error "====================================================\n" + - " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + - " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + - " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + - "============================================================" - } - } - } - } -} - -/* -================================================================================ - sarek functions -================================================================================ -*/ - -// Check if a row has the expected number of item -def checkNumberOfItem(row, number) { - if (row.size() != number) exit 1, "Malformed row in TSV file: ${row}, see --help for more information" - return true -} - -// Check parameter existence -def checkParameterExistence(it, list) { - if (!list.contains(it)) { - log.warn "Unknown parameter: ${it}" - return false - } - return true -} - -// Compare each parameter with a list of parameters -def checkParameterList(list, realList) { - return list.every{ checkParameterExistence(it, realList) } -} - -// Define list of available tools to annotate -def defineAnnoList() { - return [ - 'haplotypecaller', - 'manta', - 'mutect2', - 'strelka', - 'tiddit' - ] -} - -// Define list of skipable QC tools -def defineSkipQClist() { - return [ - 'bamqc', - 'baserecalibrator', - 'bcftools', - 'documentation', - 'fastqc', - 'markduplicates', - 'multiqc', - 'samtools', - 'sentieon', - 'vcftools', - 'versions' - ] -} - -// Define list of available step -def 
defineStepList() { - return [ - 'annotate', - 'controlfreec', - 'mapping', - 'preparerecalibration', - 'recalibrate', - 'variantcalling' - ] -} - -// Define list of available tools -def defineToolList() { - return [ - 'ascat', - 'cnvkit', - 'controlfreec', - 'dnascope', - 'dnaseq', - 'freebayes', - 'haplotypecaller', - 'manta', - 'merge', - 'mpileup', - 'mutect2', - 'snpeff', - 'strelka', - 'tiddit', - 'tnscope', - 'vep', - 'msisensor' - ] -} - -// Channeling the TSV file containing BAM. -// Format is: "subject gender status sample bam bai" -def extractBam(tsvFile) { - Channel.from(tsvFile) - .splitCsv(sep: '\t') - .map { row -> - checkNumberOfItem(row, 6) - def idPatient = row[0] - def gender = row[1] - def status = returnStatus(row[2].toInteger()) - def idSample = row[3] - def bamFile = returnFile(row[4]) - def baiFile = returnFile(row[5]) - - if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" - if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" - - return [idPatient, gender, status, idSample, bamFile, baiFile] - } -} - -// Create a channel of germline FASTQs from a directory pattern: "my_samples/*/" -// All FASTQ files in subdirectories are collected and emitted; -// they must have _R1_ and _R2_ in their names. -def extractFastqFromDir(pattern) { - def fastq = Channel.create() - // a temporary channel does all the work - Channel - .fromPath(pattern, type: 'dir') - .ifEmpty { error "No directories found matching pattern '${pattern}'" } - .subscribe onNext: { sampleDir -> - // the last name of the sampleDir is assumed to be a unique sample id - sampleId = sampleDir.getFileName().toString() - - for (path1 in file("${sampleDir}/**_R1_*.fastq.gz")) { - assert path1.getName().contains('_R1_') - path2 = file(path1.toString().replace('_R1_', '_R2_')) - if (!path2.exists()) error "Path '${path2}' not found" - (flowcell, lane) = flowcellLaneFromFastq(path1) - String random = org.apache.commons.lang.RandomStringUtils.random(8, true, true) // random string to avoid duplicate names - patient = sampleId - gender = 'ZZ' // unused - status = 0 // normal (not tumor) - rgId = "${flowcell}.${sampleId}.${lane}.${random}" - result = [patient, gender, status, sampleId, rgId, path1, path2] - fastq.bind(result) - } - }, onComplete: { fastq.close() } - fastq -} - -// Extract gender and status from Channel -def extractInfos(channel) { - def genderMap = [:] - def statusMap = [:] - channel = channel.map{ it -> - def idPatient = it[0] - def gender = it[1] - def status = it[2] - def idSample = it[3] - genderMap[idPatient] = gender - statusMap[idPatient, idSample] = status - [idPatient] + it[3..-1] - } - [genderMap, statusMap, channel] -} - -// Channeling the TSV file containing FASTQ or BAM -// Format is: "subject gender status sample lane fastq1 fastq2" -// or: "subject gender status sample lane bam" -def extractFastq(tsvFile) { - Channel.from(tsvFile) - .splitCsv(sep: '\t') - .map { row -> - def idPatient = row[0] - def gender = row[1] - def status = returnStatus(row[2].toInteger()) - def idSample = row[3] - def idRun = row[4] - def file1 = returnFile(row[5]) - def file2 = "null" - if (hasExtension(file1, "fastq.gz") || hasExtension(file1, "fq.gz") || hasExtension(file1, "fastq") || hasExtension(file1, "fq")) { - checkNumberOfItem(row, 7) - file2 = returnFile(row[6]) - if (!hasExtension(file2, "fastq.gz") && !hasExtension(file2, "fq.gz") && !hasExtension(file2, "fastq") && 
!hasExtension(file2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information" - if (hasExtension(file1, "fastq") || hasExtension(file1, "fq") || hasExtension(file2, "fastq") || hasExtension(file2, "fq")) { - exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." - } - } - else if (hasExtension(file1, "bam")) checkNumberOfItem(row, 6) - else "No recognisable extention for input file: ${file1}" - - [idPatient, gender, status, idSample, idRun, file1, file2] - } -} - -// Channeling the TSV file containing mpileup -// Format is: "subject gender status sample pileup" -def extractPileup(tsvFile) { - Channel.from(tsvFile) - .splitCsv(sep: '\t') - .map { row -> - checkNumberOfItem(row, 5) - def idPatient = row[0] - def gender = row[1] - def status = returnStatus(row[2].toInteger()) - def idSample = row[3] - def mpileup = returnFile(row[4]) - - if (!hasExtension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information" - - return [idPatient, gender, status, idSample, mpileup] - } -} - -// Channeling the TSV file containing Recalibration Tables. -// Format is: "subject gender status sample bam bai recalTable" -def extractRecal(tsvFile) { - Channel.from(tsvFile) - .splitCsv(sep: '\t') - .map { row -> - checkNumberOfItem(row, 7) - def idPatient = row[0] - def gender = row[1] - def status = returnStatus(row[2].toInteger()) - def idSample = row[3] - def bamFile = returnFile(row[4]) - def baiFile = returnFile(row[5]) - def recalTable = returnFile(row[6]) - - if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" - if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" - if (!hasExtension(recalTable, "recal.table")) exit 1, "File: ${recalTable} has the wrong extension. See --help for more information" - - [idPatient, gender, status, idSample, bamFile, baiFile, recalTable] - } -} - -// Parse first line of a FASTQ file, return the flowcell id and lane number. -def flowcellLaneFromFastq(path) { - // expected format: - // xx:yy:FLOWCELLID:LANE:... (seven fields) - // or - // FLOWCELLID:LANE:xx:... 
(five fields) - InputStream fileStream = new FileInputStream(path.toFile()) - InputStream gzipStream = new java.util.zip.GZIPInputStream(fileStream) - Reader decoder = new InputStreamReader(gzipStream, 'ASCII') - BufferedReader buffered = new BufferedReader(decoder) - def line = buffered.readLine() - assert line.startsWith('@') - line = line.substring(1) - def fields = line.split(' ')[0].split(':') - String fcid - int lane - if (fields.size() == 7) { - // CASAVA 1.8+ format - fcid = fields[2] - lane = fields[3].toInteger() - } else if (fields.size() == 5) { - fcid = fields[0] - lane = fields[1].toInteger() - } - [fcid, lane] -} - -// Check file extension -def hasExtension(it, extension) { - it.toString().toLowerCase().endsWith(extension.toLowerCase()) -} - -// Return file if it exists -def returnFile(it) { - if (!file(it).exists()) exit 1, "Missing file in TSV file: ${it}, see --help for more information" - return file(it) -} - -// Remove .ann .gz and .vcf extension from a VCF file -def reduceVCF(file) { - return file.fileName.toString().minus(".ann").minus(".vcf").minus(".gz") -} - -// Return status [0,1] -// 0 == Normal, 1 == Tumor -def returnStatus(it) { - if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information" - return it -} +// // Creating a TSV file to restart from this step +// tsv_mpileup.map { idPatient, idSample -> +// gender = gender_map[idPatient] +// status = status_map[idPatient, idSample] +// mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n" +// }.collectFile( +// name: 'control-freec_mpileup.tsv', sort: true, storeDir: "${params.outdir}/VariantCalling/TSV" +// ) + +// tsv_mpileup_sample +// .collectFile(storeDir: "${params.outdir}/VariantCalling/TSV") { +// idPatient, idSample -> +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] +// mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" +// ["control-freec_mpileup_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n"] +// } + +// if (!params.no_intervals) { +// mpileupMerge = mpileupMerge.groupTuple(by:[0, 1]) +// mpileupNoInt = Channel.empty() +// } else { +// (mpileupMerge, mpileupNoInt) = mpileupMerge.into(2) +// mpileupMerge.close() +// } + +// // STEP MPILEUP.2 - MERGE +// process MergeMpileup { +// label 'cpus_1' + +// tag "${idSample}" + +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { it == "${idSample}.pileup" ? "VariantCalling/${idSample}/Control-FREEC/${it}" : null } + +// input: +// set idPatient, idSample, file(mpileup) from mpileupMerge + +// output: +// set idPatient, idSample, file("${idSample}.pileup") into mpileupOut + +// when: !(params.no_intervals) && 'controlfreec' in tools || 'mpileup' in tools + +// script: +// """ +// for i in `ls -1v *.pileup`; +// do cat \$i >> ${idSample}.pileup +// done +// """ +// } + +// mpileupOut = mpileupOut.mix(mpileupNoInt) +// mpileupOut = mpileupOut.dump(tag:'mpileup') + +// mpileupOutNormal = Channel.create() +// mpileupOutTumor = Channel.create() + +// if (step == 'controlfreec') mpileupOut = input_sample + +// mpileupOut +// .choice(mpileupOutTumor, mpileupOutNormal) {status_map[it[0], it[1]] == 0 ? 
1 : 0} + +// mpileupOut = mpileupOutNormal.combine(mpileupOutTumor, by:0) + +// mpileupOut = mpileupOut.map { +// idPatientNormal, idSampleNormal, mpileupOutNormal, +// idSampleTumor, mpileupOutTumor -> +// [idPatientNormal, idSampleNormal, idSampleTumor, mpileupOutNormal, mpileupOutTumor] +// } + +// // STEP CONTROLFREEC.1 - CONTROLFREEC + +// process ControlFREEC { +// label 'cpus_max' +// //label 'memory_singleCPU_2_task' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Control-FREEC", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(mpileupNormal), file(mpileupTumor) from mpileupOut +// file(chrDir) from chr_dir +// file(mappability) from mappability +// file(chrLength) from chr_length +// file(dbsnp) from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(fasta) from fasta +// file(fastaFai) from fai + +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.pileup_CNVs"), file("${idSampleTumor}.pileup_ratio.txt"), file("${idSampleTumor}.pileup_normal_CNVs"), file("${idSampleTumor}.pileup_normal_ratio.txt"), file("${idSampleTumor}.pileup_BAF.txt"), file("${idSampleNormal}.pileup_BAF.txt") into controlFreecViz +// set file("*.pileup*"), file("${idSampleTumor}_vs_${idSampleNormal}.config.txt") into controlFreecOut + +// when: 'controlfreec' in tools + +// script: +// config = "${idSampleTumor}_vs_${idSampleNormal}.config.txt" +// gender = gender_map[idPatient] +// // if we are using coefficientOfVariation, we must delete the window parameter +// // it is "window = 20000" in the default settings, without coefficientOfVariation set, +// // but we do not like it. Note, it is not written in stone +// coeff_or_window = params.cf_window ? 
"window = ${params.cf_window}" : "coefficientOfVariation = ${params.cf_coeff}" + +// """ +// touch ${config} +// echo "[general]" >> ${config} +// echo "BedGraphOutput = TRUE" >> ${config} +// echo "chrFiles = \${PWD}/${chrDir.fileName}" >> ${config} +// echo "chrLenFile = \${PWD}/${chrLength.fileName}" >> ${config} +// echo "gemMappabilityFile = \${PWD}/${mappability}" >> ${config} +// echo "${coeff_or_window}" >> ${config} +// echo "contaminationAdjustment = TRUE" >> ${config} +// echo "forceGCcontentNormalization = 1" >> ${config} +// echo "maxThreads = ${task.cpus}" >> ${config} +// echo "minimalSubclonePresence = 20" >> ${config} +// echo "ploidy = ${params.cf_ploidy}" >> ${config} +// echo "sex = ${gender}" >> ${config} +// echo "" >> ${config} + +// echo "[control]" >> ${config} +// echo "inputFormat = pileup" >> ${config} +// echo "mateFile = \${PWD}/${mpileupNormal}" >> ${config} +// echo "mateOrientation = FR" >> ${config} +// echo "" >> ${config} + +// echo "[sample]" >> ${config} +// echo "inputFormat = pileup" >> ${config} +// echo "mateFile = \${PWD}/${mpileupTumor}" >> ${config} +// echo "mateOrientation = FR" >> ${config} +// echo "" >> ${config} + +// echo "[BAF]" >> ${config} +// echo "SNPfile = ${dbsnp.fileName}" >> ${config} + +// freec -conf ${config} +// """ +// } + +// controlFreecOut.dump(tag:'ControlFREEC') + +// // STEP CONTROLFREEC.3 - VISUALIZATION + +// process ControlFreecViz { +// label 'memory_singleCPU_2_task' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Control-FREEC", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(cnvTumor), file(ratioTumor), file(cnvNormal), file(ratioNormal), file(bafTumor), file(bafNormal) from controlFreecViz + +// output: +// set file("*.txt"), file("*.png"), file("*.bed") into controlFreecVizOut + +// when: 'controlfreec' in tools + +// """ +// echo "Shaping CNV files to make sure we can assess significance" +// awk 'NF==9{print}' ${cnvTumor} > TUMOR.CNVs +// awk 'NF==7{print}' ${cnvNormal} > NORMAL.CNVs + +// echo "############### Calculating significance values for TUMOR CNVs #############" +// cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/assess_significance.R | R --slave --args TUMOR.CNVs ${ratioTumor} + +// echo "############### Calculating significance values for NORMAL CNVs ############" +// cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/assess_significance.R | R --slave --args NORMAL.CNVs ${ratioNormal} + +// echo "############### Creating graph for TUMOR ratios ###############" +// cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/makeGraph.R | R --slave --args 2 ${ratioTumor} ${bafTumor} + +// echo "############### Creating graph for NORMAL ratios ##############" +// cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/makeGraph.R | R --slave --args 2 ${ratioNormal} ${bafNormal} + +// echo "############### Creating BED files for TUMOR ##############" +// perl /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/freec2bed.pl -f ${ratioTumor} > ${idSampleTumor}.bed + +// echo "############### Creating BED files for NORMAL #############" +// perl /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/freec2bed.pl -f ${ratioNormal} > ${idSampleNormal}.bed +// """ +// } + +// controlFreecVizOut.dump(tag:'ControlFreecViz') + +// // Remapping channels for QC and annotation + +// (vcfStrelkaIndels, 
vcfStrelkaSNVS) = vcfStrelka.into(2) +// (vcfStrelkaBPIndels, vcfStrelkaBPSNVS) = vcfStrelkaBP.into(2) +// (vcfMantaSomaticSV, vcfMantaDiploidSV) = vcfManta.into(2) + +// vcfKeep = Channel.empty().mix( +// filteredMutect2Output.map{ +// variantCaller, idPatient, idSample, vcf, tbi, tsv -> +// [variantcaller, idSample, vcf] +// }, +// vcfConcatenated.map{ +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf] +// }, +// vcf_sentieon_compressed.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf] +// }, +// vcfStrelkaSingle.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[1]] +// }, +// vcfMantaSingle.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[2]] +// }, +// vcfMantaDiploidSV.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[2]] +// }, +// vcfMantaSomaticSV.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[3]] +// }, +// vcfStrelkaIndels.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[0]] +// }, +// vcfStrelkaSNVS.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[1]] +// }, +// vcfStrelkaBPIndels.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[0]] +// }, +// vcfStrelkaBPSNVS.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[1]] +// }, +// vcfTIDDIT.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf] +// }) + +// (vcfBCFtools, vcfVCFtools, vcfAnnotation) = vcfKeep.into(3) + +// // STEP VCF.QC + +// process BcftoolsStats { +// label 'cpus_1' + +// tag "${variantCaller} - ${vcf}" + +// publishDir "${params.outdir}/Reports/${idSample}/BCFToolsStats", mode: params.publish_dir_mode + +// input: +// set variantCaller, idSample, file(vcf) from vcfBCFtools + +// output: +// file ("*.bcf.tools.stats.out") into bcftoolsReport + +// when: !('bcftools' in skip_qc) + +// script: +// """ +// bcftools stats ${vcf} > ${reduceVCF(vcf.fileName)}.bcf.tools.stats.out +// """ +// } + +// bcftoolsReport = bcftoolsReport.dump(tag:'BCFTools') + +// process Vcftools { +// label 'cpus_1' + +// tag "${variantCaller} - ${vcf}" + +// publishDir "${params.outdir}/Reports/${idSample}/VCFTools", mode: params.publish_dir_mode + +// input: +// set variantCaller, idSample, file(vcf) from vcfVCFtools + +// output: +// file ("${reduceVCF(vcf.fileName)}.*") into vcftoolsReport + +// when: !('vcftools' in skip_qc) + +// script: +// """ +// vcftools \ +// --gzvcf ${vcf} \ +// --TsTv-by-count \ +// --out ${reduceVCF(vcf.fileName)} + +// vcftools \ +// --gzvcf ${vcf} \ +// --TsTv-by-qual \ +// --out ${reduceVCF(vcf.fileName)} + +// vcftools \ +// --gzvcf ${vcf} \ +// --FILTER-summary \ +// --out ${reduceVCF(vcf.fileName)} +// """ +// } + +// vcftoolsReport = vcftoolsReport.dump(tag:'VCFTools') + +// /* +// -------------------------------------------------------------------------------- +// ANNOTATION +// -------------------------------------------------------------------------------- +// */ + +// if (step == 'annotate') { +// vcfToAnnotate = Channel.create() +// vcfNoAnnotate = Channel.create() + +// if (tsv_path == []) { +// // Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory +// // Excluding vcfs from FreeBayes, and g.vcf from 
HaplotypeCaller +// // Basically it's: results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,SentieonDNAseq,SentieonDNAscope,SentieonTNscope,Strelka,TIDDIT}/*.vcf.gz +// // Without *SmallIndels.vcf.gz from Manta, and *.genome.vcf.gz from Strelka +// // The small snippet `vcf.minus(vcf.fileName)[-2]` catches idSample +// // This field is used to output final annotated VCFs in the correct directory +// Channel.empty().mix( +// Channel.fromPath("${params.outdir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz") +// .flatten().map{vcf -> ['HaplotypeCaller', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/Manta/*[!candidate]SV.vcf.gz") +// .flatten().map{vcf -> ['Manta', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/Mutect2/*.vcf.gz") +// .flatten().map{vcf -> ['Mutect2', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAseq/*.vcf.gz") +// .flatten().map{vcf -> ['SentieonDNAseq', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAscope/*.vcf.gz") +// .flatten().map{vcf -> ['SentieonDNAscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonTNscope/*.vcf.gz") +// .flatten().map{vcf -> ['SentieonTNscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") +// .flatten().map{vcf -> ['Strelka', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/TIDDIT/*.vcf.gz") +// .flatten().map{vcf -> ['TIDDIT', vcf.minus(vcf.fileName)[-2].toString(), vcf]} +// ).choice(vcfToAnnotate, vcfNoAnnotate) { +// annotate_tools == [] || (annotate_tools != [] && it[0] in annotate_tools) ? 0 : 1 +// } +// } else if (annotate_tools == []) { +// // Annotate user-submitted VCFs +// // If user-submitted, Sarek assume that the idSample should be assumed automatically +// vcfToAnnotate = Channel.fromPath(tsv_path) +// .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]} +// } else exit 1, "specify only tools or files to annotate, not both" + +// vcfNoAnnotate.close() +// vcfAnnotation = vcfAnnotation.mix(vcfToAnnotate) +// } + +// // as now have the list of VCFs to annotate, the first step is to annotate with allele frequencies, if there are any + +// (vcfSnpeff, vcfVep) = vcfAnnotation.into(2) + +// vcfVep = vcfVep.map { +// variantCaller, idSample, vcf -> +// [variantCaller, idSample, vcf, null] +// } + +// // STEP SNPEFF + +// process Snpeff { +// tag "${idSample} - ${variantCaller} - ${vcf}" + +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { +// if (it == "${reducedVCF}_snpEff.ann.vcf") null +// else "Reports/${idSample}/snpEff/${it}" +// } + +// input: +// set variantCaller, idSample, file(vcf) from vcfSnpeff +// file(dataDir) from snpeff_cache +// val snpeffDb from snpeff_db + +// output: +// set file("${reducedVCF}_snpEff.genes.txt"), file("${reducedVCF}_snpEff.html"), file("${reducedVCF}_snpEff.csv") into snpeffReport +// set variantCaller, idSample, file("${reducedVCF}_snpEff.ann.vcf") into snpeffVCF + +// when: 'snpeff' in tools || 'merge' in tools + +// script: +// reducedVCF = reduceVCF(vcf.fileName) +// cache = (params.snpeff_cache && params.annotation_cache) ? 
"-dataDir \${PWD}/${dataDir}" : "" +// """ +// snpEff -Xmx${task.memory.toGiga()}g \ +// ${snpeffDb} \ +// -csvStats ${reducedVCF}_snpEff.csv \ +// -nodownload \ +// ${cache} \ +// -canon \ +// -v \ +// ${vcf} \ +// > ${reducedVCF}_snpEff.ann.vcf + +// mv snpEff_summary.html ${reducedVCF}_snpEff.html +// """ +// } + +// snpeffReport = snpeffReport.dump(tag:'snpEff report') + +// // STEP COMPRESS AND INDEX VCF.1 - SNPEFF + +// process CompressVCFsnpEff { +// tag "${idSample} - ${vcf}" + +// publishDir "${params.outdir}/Annotation/${idSample}/snpEff", mode: params.publish_dir_mode + +// input: +// set variantCaller, idSample, file(vcf) from snpeffVCF + +// output: +// set variantCaller, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into (compressVCFsnpEffOut) + +// script: +// """ +// bgzip < ${vcf} > ${vcf}.gz +// tabix ${vcf}.gz +// """ +// } + +// compressVCFsnpEffOut = compressVCFsnpEffOut.dump(tag:'VCF') + +// // STEP VEP.1 + +// process VEP { +// label 'VEP' +// label 'cpus_4' + +// tag "${idSample} - ${variantCaller} - ${vcf}" + +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { +// if (it == "${reducedVCF}_VEP.summary.html") "Reports/${idSample}/VEP/${it}" +// else null +// } + +// input: +// set variantCaller, idSample, file(vcf), file(idx) from vcfVep +// file(dataDir) from vep_cache +// val cache_version from vep_cache_version +// file(cadd_InDels) from cadd_indels +// file(cadd_InDels_tbi) from cadd_indels_tbi +// file(cadd_WG_SNVs) from cadd_wg_snvs +// file(cadd_WG_SNVs_tbi) from cadd_wg_snvs_tbi +// output: +// set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCF +// file("${reducedVCF}_VEP.summary.html") into vepReport + +// when: 'vep' in tools + +// script: +// reducedVCF = reduceVCF(vcf.fileName) +// genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome + +// dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep" +// cadd = (params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? "--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : "" +// genesplicer = params.genesplicer ? 
"--plugin GeneSplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/genesplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline" +// """ +// mkdir ${reducedVCF} + +// vep \ +// -i ${vcf} \ +// -o ${reducedVCF}_VEP.ann.vcf \ +// --assembly ${genome} \ +// --species ${params.species} \ +// ${cadd} \ +// ${genesplicer} \ +// --cache \ +// --cache_version ${cache_version} \ +// --dir_cache ${dir_cache} \ +// --everything \ +// --filter_common \ +// --fork ${task.cpus} \ +// --format vcf \ +// --per_gene \ +// --stats_file ${reducedVCF}_VEP.summary.html \ +// --total_length \ +// --vcf + +// rm -rf ${reducedVCF} +// """ +// } + +// vepReport = vepReport.dump(tag:'VEP') + +// // STEP VEP.2 - VEP AFTER SNPEFF + +// process VEPmerge { +// label 'VEP' +// label 'cpus_4' + +// tag "${idSample} - ${variantCaller} - ${vcf}" + +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { +// if (it == "${reducedVCF}_VEP.summary.html") "Reports/${idSample}/VEP/${it}" +// else null +// } + +// input: +// set variantCaller, idSample, file(vcf), file(idx) from compressVCFsnpEffOut +// file(dataDir) from vep_cache +// val cache_version from vep_cache_version +// file(cadd_InDels) from cadd_indels +// file(cadd_InDels_tbi) from cadd_indels_tbi +// file(cadd_WG_SNVs) from cadd_wg_snvs +// file(cadd_WG_SNVs_tbi) from cadd_wg_snvs_tbi +// output: +// set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCFmerge +// file("${reducedVCF}_VEP.summary.html") into vepReportMerge + +// when: 'merge' in tools + +// script: +// reducedVCF = reduceVCF(vcf.fileName) +// genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome +// dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep" +// cadd = (params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? "--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : "" +// genesplicer = params.genesplicer ? 
"--plugin GeneSplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/genesplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline" +// """ +// mkdir ${reducedVCF} + +// vep \ +// -i ${vcf} \ +// -o ${reducedVCF}_VEP.ann.vcf \ +// --assembly ${genome} \ +// --species ${params.species} \ +// ${cadd} \ +// ${genesplicer} \ +// --cache \ +// --cache_version ${cache_version} \ +// --dir_cache ${dir_cache} \ +// --everything \ +// --filter_common \ +// --fork ${task.cpus} \ +// --format vcf \ +// --per_gene \ +// --stats_file ${reducedVCF}_VEP.summary.html \ +// --total_length \ +// --vcf + +// rm -rf ${reducedVCF} +// """ +// } + +// vepReportMerge = vepReportMerge.dump(tag:'VEP') + +// vcfCompressVCFvep = vepVCF.mix(vepVCFmerge) + +// // STEP COMPRESS AND INDEX VCF.2 - VEP + +// process CompressVCFvep { +// tag "${idSample} - ${vcf}" + +// publishDir "${params.outdir}/Annotation/${idSample}/VEP", mode: params.publish_dir_mode + +// input: +// set variantCaller, idSample, file(vcf) from vcfCompressVCFvep + +// output: +// set variantCaller, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into compressVCFOutVEP + +// script: +// """ +// bgzip < ${vcf} > ${vcf}.gz +// tabix ${vcf}.gz +// """ +// } + +// compressVCFOutVEP = compressVCFOutVEP.dump(tag:'VCF') diff --git a/modules/local/functions.nf b/modules/local/functions.nf new file mode 100644 index 0000000000..7293db1f18 --- /dev/null +++ b/modules/local/functions.nf @@ -0,0 +1,263 @@ +/* + * This file holds several functions used to perform operation in Sarek + */ + +// Check if a row has the expected number of item +def check_number_of_item(row, number) { + if (row.size() != number) exit 1, "Malformed row in TSV file: ${row}, see --help for more information" + return true +} + +// Check parameter existence +def check_parameter_existence(it, list) { + if (!list.contains(it)) { + log.warn "Unknown parameter: ${it}" + return false + } + return true +} + +// Compare each parameter with a list of parameters +def check_parameter_list(list, realList) { + return list.every{ check_parameter_existence(it, realList) } +} + +// Define list of available tools to annotate +def define_anno_list() { + return [ + 'haplotypecaller', + 'manta', + 'mutect2', + 'strelka', + 'tiddit' + ] +} + +// Define list of skipable QC tools +def define_skip_qc_list() { + return [ + 'bamqc', + 'baserecalibrator', + 'bcftools', + 'documentation', + 'fastqc', + 'markduplicates', + 'multiqc', + 'samtools', + 'sentieon', + 'vcftools', + 'versions' + ] +} + +// Define list of available step +def define_step_list() { + return [ + 'annotate', + 'controlfreec', + 'mapping', + 'preparerecalibration', + 'recalibrate', + 'variantcalling' + ] +} + +// Define list of available tools +def define_tool_list() { + return [ + 'ascat', + 'cnvkit', + 'controlfreec', + 'dnascope', + 'dnaseq', + 'freebayes', + 'haplotypecaller', + 'manta', + 'merge', + 'mpileup', + 'mutect2', + 'snpeff', + 'strelka', + 'tiddit', + 'tnscope', + 'vep', + 'msisensor' + ] +} + +// Channeling the TSV file containing BAM. 
+// Format is: "patient gender status sample bam bai" +def extract_bam(tsvFile) { + Channel.from(tsvFile) + .splitCsv(sep: '\t') + .map { row -> + check_number_of_item(row, 6) + def meta = [:] + + meta.patient = row[0] + meta.gender = row[1] + meta.status = return_status(row[2].toInteger()) + meta.sample = row[3] + meta.id = meta.sample + def bam = return_file(row[4]) + def bai = return_file(row[5]) + + if (!has_extension(bam, "bam")) exit 1, "File: ${bam} has the wrong extension. See --help for more information" + if (!has_extension(bai, "bai")) exit 1, "File: ${bai} has the wrong extension. See --help for more information" + + return [meta, bam, bai] + } +} + +// Create a channel of germline FASTQs from a directory pattern: "my_samples/*/" +// All FASTQ files in subdirectories are collected and emitted; +// they must have _R1_ and _R2_ in their names. +// All FASTQ files are assumed to be from the same sample. +def extract_fastq_from_dir(folder) { + sample = file(folder).getFileName().toString() + + fastq = Channel.fromFilePairs(folder + '/*{_R1_,_R2_}*.fastq.gz') + .ifEmpty { error "No directories found matching folder '${folder}'" } + +// TODO check if flowcellLane_from_fastq is useful or not + + fastq = fastq.map{ run, pair -> + def meta = [:] + meta.patient = sample + meta.sample = meta.patient + meta.gender = 'ZZ' // unused + meta.status = 0 // normal (not tumor) + meta.run = run + meta.id = "${meta.sample}-${meta.run}" + def read1 = pair[0] + def read2 = pair[1] + + return [meta, [read1, read2]] + } +} + +// Channeling the TSV file containing FASTQ or BAM +// Format is: "patient gender status sample lane fastq1 fastq2" +// or: "patient gender status sample lane bam" +def extract_fastq(tsvFile) { + Channel.from(tsvFile) + .splitCsv(sep: '\t') + .map { row -> + def meta = [:] + meta.patient = row[0] + meta.gender = row[1] + meta.status = return_status(row[2].toInteger()) + meta.sample = row[3] + meta.run = row[4] + meta.id = "${meta.sample}-${meta.run}" + def read1 = return_file(row[5]) + def read2 = "null" + if (has_extension(read1, "fastq.gz") || has_extension(read1, "fq.gz") || has_extension(read1, "fastq") || has_extension(read1, "fq")) { + check_number_of_item(row, 7) + read2 = return_file(row[6]) + if (!has_extension(read2, "fastq.gz") && !has_extension(read2, "fq.gz") && !has_extension(read2, "fastq") && !has_extension(read2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information" + if (has_extension(read1, "fastq") || has_extension(read1, "fq") || has_extension(read2, "fastq") || has_extension(read2, "fq")) { + exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." + } + } + else if (has_extension(read1, "bam")) check_number_of_item(row, 6) + else exit 1, "No recognisable extention for input file: ${read1}" + + return [meta, [read1, read2]] + } +} + +// // Channeling the TSV file containing mpileup +// // Format is: "patient gender status sample pileup" +// def extract_pileup(tsvFile) { +// Channel.from(tsvFile) +// .splitCsv(sep: '\t') +// .map { row -> +// check_number_of_item(row, 5) +// def idPatient = row[0] +// def gender = row[1] +// def status = return_status(row[2].toInteger()) +// def idSample = row[3] +// def mpileup = return_file(row[4]) + +// if (!has_extension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information" + +// return [idPatient, gender, status, idSample, mpileup] +// } +// } + +// Channeling the TSV file containing Recalibration Tables. 
+// Format is: "patient gender status sample bam bai recalTable" +def extract_recal(tsvFile) { + Channel.from(tsvFile) + .splitCsv(sep: '\t') + .map { row -> + check_number_of_item(row, 7) + def meta = [:] + + meta.patient = row[0] + meta.gender = row[1] + meta.status = return_status(row[2].toInteger()) + meta.sample = row[3] + meta.id = meta.sample + def bam = return_file(row[4]) + def bai = return_file(row[5]) + def table = return_file(row[6]) + + if (!has_extension(bam, "bam")) exit 1, "File: ${bam} has the wrong extension. See --help for more information" + if (!has_extension(bai, "bai")) exit 1, "File: ${bai} has the wrong extension. See --help for more information" + if (!has_extension(table, "recal.table")) exit 1, "File: ${table} has the wrong extension. See --help for more information" + + return [meta, bam, bai, table] + } +} + +// // Parse first line of a FASTQ file, return the flowcell id and lane number. +// def flowcellLane_from_fastq(path) { +// // expected format: +// // xx:yy:FLOWCELLID:LANE:... (seven fields) +// // or +// // FLOWCELLID:LANE:xx:... (five fields) +// InputStream fileStream = new FileInputStream(path.toFile()) +// InputStream gzipStream = new java.util.zip.GZIPInputStream(fileStream) +// Reader decoder = new InputStreamReader(gzipStream, 'ASCII') +// BufferedReader buffered = new BufferedReader(decoder) +// def line = buffered.readLine() +// assert line.startsWith('@') +// line = line.substring(1) +// def fields = line.split(' ')[0].split(':') +// String fcid +// int lane +// if (fields.size() == 7) { +// // CASAVA 1.8+ format +// fcid = fields[2] +// lane = fields[3].toInteger() +// } else if (fields.size() == 5) { +// fcid = fields[0] +// lane = fields[1].toInteger() +// } +// [fcid, lane] +// } + +// Check file extension +def has_extension(it, extension) { + it.toString().toLowerCase().endsWith(extension.toLowerCase()) +} + +// Return file if it exists +def return_file(it) { + if (!file(it).exists()) exit 1, "Missing file in TSV file: ${it}, see --help for more information" + return file(it) +} + +// Remove .ann .gz and .vcf extension from a VCF file +def reduce_vcf(file) { + return file.fileName.toString().minus(".ann").minus(".vcf").minus(".gz") +} + +// Return status [0,1] +// 0 == Normal, 1 == Tumor +def return_status(it) { + if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information" + return it +} \ No newline at end of file diff --git a/modules/local/process/build_intervals.nf b/modules/local/process/build_intervals.nf new file mode 100644 index 0000000000..861c3c07b8 --- /dev/null +++ b/modules/local/process/build_intervals.nf @@ -0,0 +1,30 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"anaconda::gawk=5.1.0" : null +container = "quay.io/biocontainers/gawk:5.1.0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gawk:5.1.0" + +process BUILD_INTERVALS { + tag "${fai}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } + + conda environment + container container + + input: + path fai + + output: + path "${fai.baseName}.bed" + + script: + """ + awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fai} > ${fai.baseName}.bed + """ +} \ No newline at end of file diff --git a/modules/local/process/bwa_mem.nf b/modules/local/process/bwa_mem.nf new file mode 100644 index 0000000000..bcbd43d9ba --- /dev/null +++ b/modules/local/process/bwa_mem.nf @@ -0,0 +1,50 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" + +process BWA_MEM { + label 'process_high' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + + conda environment + container container + + input: + tuple val(meta), path(reads) + path bwa + path fasta + path fai + + output: + tuple val(meta), path("*.bam"), emit: bam + path "*.version.txt" , emit: version + + script: + CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" + readGroup = "@RG\\tID:${meta.run}\\t${CN}PU:${meta.run}\\tSM:${meta.sample}\\tLB:${meta.sample}\\tPL:ILLUMINA" + extra = meta.status == 1 ? "-B 3" : "" + """ + bwa mem \ + ${options.args} \ + -R \"${readGroup}\" \ + ${extra} \ + -t ${task.cpus} \ + ${fasta} ${reads} | \ + samtools sort --threads ${task.cpus} -m 2G - > ${meta.id}.bam + + # samtools index ${meta.id}.bam + + echo \$(bwa version 2>&1) > bwa.version.txt + """ +} diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf new file mode 100644 index 0000000000..f0117b5807 --- /dev/null +++ b/modules/local/process/bwamem2_mem.nf @@ -0,0 +1,47 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"bioconda::bwa-mem2=2.0 bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" + +process BWAMEM2_MEM { + label 'process_high' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(reads) + path bwa + path fasta + path fai + + output: + tuple val(meta), path("*.bam") + + script: + CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" + readGroup = "@RG\\tID:${meta.run}\\t${CN}PU:${meta.run}\\tSM:${meta.sample}\\tLB:${meta.sample}\\tPL:ILLUMINA" + extra = meta.status == 1 ? "-B 3" : "" + """ + bwa-mem2 mem \ + ${options.args} \ + -R \"${readGroup}\" \ + ${extra} \ + -t ${task.cpus} \ + ${fasta} ${reads} | \ + samtools sort --threads ${task.cpus} -m 2G - > ${meta.id}.bam + + # samtools index ${meta.id}.bam + + echo \$(bwa-mem2 version 2>&1) > bwa-mem2.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/process/concat_vcf.nf b/modules/local/process/concat_vcf.nf new file mode 100644 index 0000000000..aa1bc5387b --- /dev/null +++ b/modules/local/process/concat_vcf.nf @@ -0,0 +1,36 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::htslib=1.11" : null +container = "quay.io/biocontainers/htslib:1.11--hd3b49d5_0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/htslib:1.11--hd3b49d5_0" + +process CONCAT_VCF { + label 'cpus_8' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(vcf) + path fai + path bed + + output: + tuple val(meta), path("*_*.vcf.gz"), path("*_*.vcf.gz.tbi"), emit: vcf + + script: + name = options.suffix ? "${options.suffix}_${meta.id}" : "${meta.id}" + target_options = params.target_bed ? "-t ${bed}" : "" + interval_options = params.no_intervals ? "-n" : "" + """ + concatenateVCFs.sh -i ${fai} -c ${task.cpus} -o ${name}.vcf ${target_options} ${interval_options} + """ +} \ No newline at end of file diff --git a/modules/local/process/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf new file mode 100644 index 0000000000..6c6eecac95 --- /dev/null +++ b/modules/local/process/create_intervals_bed.nf @@ -0,0 +1,63 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' +include { has_extension } from '../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"anaconda::gawk=5.1.0" : null +container = "quay.io/biocontainers/gawk:5.1.0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gawk:5.1.0" + +process CREATE_INTERVALS_BED { + tag "${intervals}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } + + conda environment + container container + + input: + path intervals + + output: + path ('*.bed') + + script: + // If the interval file is BED format, the fifth column is interpreted to + // contain runtime estimates, which is then used to combine short-running jobs + if (has_extension(intervals, "bed")) + """ + awk -vFS="\t" '{ + t = \$5 # runtime estimate + if (t == "") { + # no runtime estimate in this row, assume default value + t = (\$3 - \$2) / ${params.nucleotides_per_second} + } + if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) { + # start a new chunk + name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) + chunk = 0 + longest = 0 + } + if (t > longest) + longest = t + chunk += t + print \$0 > name + }' ${intervals} + """ + else if (has_extension(intervals, "interval_list")) + """ + grep -v '^@' ${intervals} | awk -vFS="\t" '{ + name = sprintf("%s_%d-%d", \$1, \$2, \$3); + printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" + }' + """ + else + """ + awk -vFS="[:-]" '{ + name = sprintf("%s_%d-%d", \$1, \$2, \$3); + printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" + }' ${intervals} + """ +} \ No newline at end of file diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf new file mode 100644 index 0000000000..e190d8dbd1 --- /dev/null +++ b/modules/local/process/merge_bam.nf @@ -0,0 +1,33 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" + +process MERGE_BAM { + label 'cpus_8' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("${name}.bam"), emit: bam + val meta, emit: tsv + + script: + name = options.suffix ? 
"${meta.id}.${options.suffix}" : "${meta.id}" + """ + samtools merge --threads ${task.cpus} ${name}.bam ${bam} + """ +} diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf new file mode 100644 index 0000000000..0f4eaba448 --- /dev/null +++ b/modules/local/subworkflow/build_indices.nf @@ -0,0 +1,111 @@ +/* +================================================================================ + BUILDING INDICES +================================================================================ +*/ + +params.build_intervals_options = [:] +params.bwa_index_options = [:] +params.bwamem2_index_options = [:] +params.create_intervals_bed_options = [:] +params.gatk_dict_options = [:] +params.samtools_faidx_options = [:] +params.tabix_dbsnp_options = [:] +params.tabix_germline_resource_options = [:] +params.tabix_known_indels_options = [:] +params.tabix_pon_options = [:] + +// Initialize channels based on params or indices that were just built + +include { BUILD_INTERVALS } from '../process/build_intervals.nf' addParams(options: params.build_intervals_options) +include { BWA_INDEX as BWAMEM1_INDEX } from '../../nf-core/software/bwa/index/main.nf' addParams(options: params.bwa_index_options) +include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf' addParams(options: params.bwamem2_index_options) +include { CREATE_INTERVALS_BED } from '../process/create_intervals_bed.nf' addParams(options: params.create_intervals_bed_options) +include { GATK_CREATESEQUENCEDICTIONARY as GATK_DICT } from '../../nf-core/software/gatk/createsequencedictionary.nf' addParams(options: params.gatk_dict_options) +include { HTSLIB_TABIX as TABIX_DBSNP } from '../../nf-core/software/htslib_tabix' addParams(options: params.tabix_dbsnp_options) +include { HTSLIB_TABIX as TABIX_GERMLINE_RESOURCE } from '../../nf-core/software/htslib_tabix' addParams(options: params.tabix_germline_resource_options) +include { HTSLIB_TABIX as TABIX_KNOWN_INDELS } from '../../nf-core/software/htslib_tabix' addParams(options: params.tabix_known_indels_options) +include { HTSLIB_TABIX as TABIX_PON } from '../../nf-core/software/htslib_tabix' addParams(options: params.tabix_pon_options) +include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools/faidx.nf' addParams(options: params.samtools_faidx_options) + +workflow BUILD_INDICES{ + take: + dbsnp // channel: [optional] dbsnp + fasta // channel: [mandatory] fasta + germline_resource // channel: [optional] germline_resource + known_indels // channel: [optional] known_indels + pon // channel: [optional] pon + step // value: [mandatory] starting step + tools // list: [optional] tools to run + + main: + + result_bwa = Channel.empty() + version_bwa = Channel.empty() + if (!(params.bwa) && 'mapping' in step) + if (params.aligner == "bwa-mem") (result_bwa, version_bwa) = BWAMEM1_INDEX(fasta) + else result_bwa = BWAMEM2_INDEX(fasta) + + result_dict = Channel.empty() + if (!(params.dict) && !('annotate' in step) && !('controlfreec' in step)) + result_dict = GATK_DICT(fasta) + + result_fai = Channel.empty() + if (!(params.fasta_fai) && !('annotate' in step)) + result_fai = SAMTOOLS_FAIDX(fasta) + + result_dbsnp_tbi = Channel.empty() + if (!(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)) + result_dbsnp_tbi = TABIX_DBSNP(dbsnp) + + result_germline_resource_tbi = Channel.empty() + if 
(!(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools) + result_germline_resource_tbi = TABIX_GERMLINE_RESOURCE(germline_resource) + + result_known_indels_tbi = Channel.empty() + if (!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) + result_known_indels_tbi = TABIX_KNOWN_INDELS(known_indels) + + result_pon_tbi = Channel.empty() + if (!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) + result_pon_tbi = TABIX_PON(pon) + + if (params.no_intervals) { + file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" + result_intervals = Channel.from(file("${params.outdir}/no_intervals.bed")) + } else if (!('annotate' in step) && !('controlfreec' in step)) + if (!params.intervals) + result_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS(result_fai)) + else + result_intervals = CREATE_INTERVALS_BED(file(params.intervals)) + + if (!params.no_intervals) { + result_intervals = result_intervals.flatten() + .map { intervalFile -> + def duration = 0.0 + for (line in intervalFile.readLines()) { + final fields = line.split('\t') + if (fields.size() >= 5) duration += fields[4].toFloat() + else { + start = fields[1].toInteger() + end = fields[2].toInteger() + duration += (end - start) / params.nucleotides_per_second + } + } + [duration, intervalFile] + }.toSortedList({ a, b -> b[0] <=> a[0] }) + .flatten().collate(2) + .map{duration, intervalFile -> intervalFile} + } + + emit: + bwa = result_bwa + bwa_version = version_bwa + dbsnp_tbi = result_dbsnp_tbi + dict = result_dict + fai = result_fai + germline_resource_tbi = result_germline_resource_tbi + intervals = result_intervals + known_indels_tbi = result_known_indels_tbi + pon_tbi = result_pon_tbi +} \ No newline at end of file diff --git a/modules/local/subworkflow/germline_variant_calling.nf b/modules/local/subworkflow/germline_variant_calling.nf new file mode 100644 index 0000000000..1964b12a61 --- /dev/null +++ b/modules/local/subworkflow/germline_variant_calling.nf @@ -0,0 +1,125 @@ +/* +================================================================================ + GERMLINE VARIANT CALLING +================================================================================ +*/ + +params.haplotypecaller_options = [:] +params.genotypegvcf_options = [:] +params.concat_gvcf_options = [:] +params.concat_haplotypecaller_options = [:] +params.strelka_options = [:] + +include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../nf-core/software/gatk/haplotypecaller' addParams(options: params.haplotypecaller_options) +include { GATK_GENOTYPEGVCF as GENOTYPEGVCF } from '../../nf-core/software/gatk/genotypegvcf' addParams(options: params.genotypegvcf_options) +include { CONCAT_VCF as CONCAT_GVCF } from '../process/concat_vcf' addParams(options: params.concat_gvcf_options) +include { CONCAT_VCF as CONCAT_HAPLOTYPECALLER } from '../process/concat_vcf' addParams(options: params.concat_haplotypecaller_options) +include { STRELKA_GERMLINE as STRELKA } from '../../nf-core/software/strelka/germline' addParams(options: params.strelka_options) + +workflow GERMLINE_VARIANT_CALLING { + take: + bam // channel: [mandatory] bam + dbsnp // channel: [mandatory] dbsnp + dbsnp_tbi // channel: [mandatory] dbsnp_tbi + dict // channel: [mandatory] dict + fai // channel: [mandatory] fai + fasta // channel: [mandatory] fasta + intervals // channel: [mandatory] intervals + target_bed // channel: [optional] target_bed + tools // list: [mandatory] list of 
tools + + main: + + haplotypecaller_gvcf = Channel.empty() + haplotypecaller_vcf = Channel.empty() + strelka_vcf = Channel.empty() + + if ('haplotypecaller' in tools) { + haplotypecaller_interval_bam = bam.combine(intervals) + + // STEP GATK HAPLOTYPECALLER.1 + + HAPLOTYPECALLER( + haplotypecaller_interval_bam, + dbsnp, + dbsnp_tbi, + dict, + fasta, + fai) + + haplotypecaller_interval_gvcf = HAPLOTYPECALLER.out.gvcf.map{ meta, vcf -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [ patient, sample, gender, status, vcf] + }.groupTuple(by: [0,1]) + + haplotypecaller_interval_gvcf = haplotypecaller_interval_gvcf.map { patient, sample, gender, status, vcf -> + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = meta.sample + [ meta, vcf ] + } + + CONCAT_GVCF( + haplotypecaller_interval_gvcf, + fai, + target_bed) + + haplotypecaller_gvcf = CONCAT_GVCF.out.vcf + + // STEP GATK HAPLOTYPECALLER.2 + + GENOTYPEGVCF( + HAPLOTYPECALLER.out.interval_gvcf, + dbsnp, + dbsnp_tbi, + dict, + fasta, + fai) + + haplotypecaller_interval_vcf = GENOTYPEGVCF.out.map{ meta, vcf -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [ patient, sample, gender, status, vcf] + }.groupTuple(by: [0,1]) + + haplotypecaller_interval_vcf = haplotypecaller_interval_vcf.map { patient, sample, gender, status, vcf -> + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = meta.sample + [ meta, vcf ] + } + + CONCAT_HAPLOTYPECALLER( + haplotypecaller_interval_vcf, + fai, + target_bed) + + haplotypecaller_vcf = CONCAT_GVCF.out.vcf + } + + if ('strelka' in tools) { + STRELKA( + bam, + fasta, + fai, + target_bed) + + strelka_vcf = STRELKA.out.vcf + } + + emit: + haplotypecaller_gvcf = haplotypecaller_gvcf + haplotypecaller_vcf = haplotypecaller_vcf + strelka_vcf = strelka_vcf +} diff --git a/modules/local/subworkflow/mapping.nf b/modules/local/subworkflow/mapping.nf new file mode 100644 index 0000000000..4ca5effd2f --- /dev/null +++ b/modules/local/subworkflow/mapping.nf @@ -0,0 +1,139 @@ +/* +================================================================================ + MAPPING +================================================================================ +*/ + +params.bwamem1_mem_options = [:] +params.bwamem2_mem_options = [:] +params.merge_bam_options = [:] +params.qualimap_bamqc_options = [:] +params.samtools_index_options = [:] +params.samtools_stats_options = [:] + +include { BWA_MEM as BWAMEM1_MEM } from '../process/bwa_mem' addParams(options: params.bwamem1_mem_options) +include { BWAMEM2_MEM } from '../process/bwamem2_mem' addParams(options: params.bwamem2_mem_options) +include { MERGE_BAM } from '../process/merge_bam' addParams(options: params.merge_bam_options) +include { QUALIMAP_BAMQC } from '../../nf-core/software/qualimap_bamqc' addParams(options: params.qualimap_bamqc_options) +include { SAMTOOLS_INDEX } from '../../nf-core/software/samtools/index' addParams(options: params.samtools_index_options) +include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats' addParams(options: params.samtools_stats_options) + +workflow MAPPING { + take: + skip_bamqc // boolean: true/false + skip_samtools // boolean: true/false + bwa // channel: [mandatory] bwa + fai // channel: [mandatory] fai + fasta // channel: [mandatory] fasta + reads_input // channel: [mandatory] 
reads_input + save_bam_mapped // boolean: true/false + step // value: [mandatory] starting step + target_bed // channel: [optional] target_bed + + main: + + bam_mapped_index = Channel.empty() + bam_reports = Channel.empty() + + if (step == "mapping") { + bam_bwamem1 = Channel.empty() + bam_bwamem2 = Channel.empty() + + if (params.aligner == "bwa-mem") { + BWAMEM1_MEM(reads_input, bwa, fasta, fai) + bam_bwamem1 = BWAMEM1_MEM.out.bam + } else { + BWAMEM2_MEM(reads_input, bwa, fasta, fai) + bam_bwamem2 = BWAMEM2_MEM.out + } + + bam_bwa = bam_bwamem1.mix(bam_bwamem2) + + bam_bwa.map{ meta, bam -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, bam] + }.groupTuple(by: [0,1]) + .branch{ + single: it[4].size() == 1 + multiple: it[4].size() > 1 + }.set{ bam_bwa_to_sort } + + bam_bwa_single = bam_bwa_to_sort.single.map { + patient, sample, gender, status, bam -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, bam[0]] + } + + bam_bwa_multiple = bam_bwa_to_sort.multiple.map { + patient, sample, gender, status, bam -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, bam] + } + + // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES + + MERGE_BAM(bam_bwa_multiple) + bam_mapped = bam_bwa_single.mix(MERGE_BAM.out.bam) + bam_mapped_index = SAMTOOLS_INDEX(bam_mapped) + + qualimap_bamqc = Channel.empty() + samtools_stats = Channel.empty() + + if (!skip_bamqc) { + QUALIMAP_BAMQC(bam_mapped, target_bed) + qualimap_bamqc = QUALIMAP_BAMQC.out + } + + if (!skip_samtools) { + SAMTOOLS_STATS(bam_mapped) + samtools_stats = SAMTOOLS_STATS.out + } + + bam_reports = samtools_stats.mix(qualimap_bamqc) + + if (save_bam_mapped) { + tsv_bam_mapped = bam_mapped.map { meta, bam -> [meta] } + // Creating TSV files to restart from this step + tsv_bam_mapped.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" + bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" + ["mapped_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] + } + + tsv_bam_mapped.map { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" + bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" + }.collectFile(name: "mapped.tsv", sort: true, storeDir: "${params.outdir}/preprocessing/tsv") + } + } + + emit: + bam = bam_mapped_index + qc = bam_reports +} diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf new file mode 100644 index 0000000000..531e11b464 --- /dev/null +++ b/modules/local/subworkflow/markduplicates.nf @@ -0,0 +1,81 @@ +/* +================================================================================ + MARKDUPLICATES +================================================================================ +*/ + +params.markduplicates_options = [:] + +include { GATK_MARKDUPLICATES } from '../../nf-core/software/gatk/markduplicates' addParams(options: 
params.markduplicates_options) + +workflow MARKDUPLICATES { + take: + bam_mapped // channel: [mandatory] bam_mapped + step // value: [mandatory] starting step + + main: + + bam_markduplicates = bam_mapped + report_markduplicates = Channel.empty() + + if (step == "mapping") { + if (!params.skip_markduplicates) { + GATK_MARKDUPLICATES(bam_mapped) + report_markduplicates = GATK_MARKDUPLICATES.out.report + bam_markduplicates = GATK_MARKDUPLICATES.out.bam + tsv_markduplicates = GATK_MARKDUPLICATES.out.tsv + + // Creating TSV files to restart from this step + tsv_markduplicates.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam.bai" + table = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.recal.table" + ["markduplicates_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + } + + tsv_markduplicates.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam.bai" + table = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.recal.table" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" + }.collectFile(name: 'markduplicates.tsv', sort: true, storeDir: "${params.outdir}/preprocessing/tsv") + } else { + tsv_no_markduplicates = bam_markduplicates.map { meta, bam, bai -> [meta] } + + // Creating TSV files to restart from this step + tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" + bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" + table = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.recal.table" + ["mapped_no_markduplicates_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + } + + tsv_no_markduplicates.map { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" + bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" + table = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.recal.table" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" + }.collectFile(name: 'mapped_no_markduplicates.tsv', sort: true, storeDir: "${params.outdir}/preprocessing/tsv") + } + } + + emit: + bam = bam_markduplicates + report = report_markduplicates +} diff --git a/modules/local/subworkflow/prepare_recalibration.nf b/modules/local/subworkflow/prepare_recalibration.nf new file mode 100644 index 0000000000..510e4bbb55 --- /dev/null +++ b/modules/local/subworkflow/prepare_recalibration.nf @@ -0,0 +1,91 @@ +/* +================================================================================ + PREPARE RECALIBRATION +================================================================================ +*/ + +params.baserecalibrator_options = [:] 
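+// Note (editorial, descriptive comment): these *_options maps default to empty and are expected to be overridden by the calling workflow through addParams, mirroring the include statements below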
+params.gatherbqsrreports_options = [:] + +include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from '../../nf-core/software/gatk/baserecalibrator' addParams(options: params.baserecalibrator_options) +include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../nf-core/software/gatk/gatherbqsrreports' addParams(options: params.gatherbqsrreports_options) + +workflow PREPARE_RECALIBRATION { + take: + bam_markduplicates // channel: [mandatory] bam_markduplicates + dbsnp // channel: [optional] dbsnp + dbsnp_tbi // channel: [optional] dbsnp_tbi + dict // channel: [mandatory] dict + fai // channel: [mandatory] fai + fasta // channel: [mandatory] fasta + intervals // channel: [mandatory] intervals + known_indels // channel: [optional] known_indels + known_indels_tbi // channel: [optional] known_indels_tbi + step // value: [mandatory] starting step + + main: + + bam_baserecalibrator = bam_markduplicates.combine(intervals) + table_bqsr = Channel.empty() + tsv_bqsr = Channel.empty() + + if (step in ["mapping", "preparerecalibration"]) { + + BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) + table_bqsr = BASERECALIBRATOR.out.report + tsv_bqsr = BASERECALIBRATOR.out.tsv + + // STEP 3.5: MERGING RECALIBRATION TABLES + if (!params.no_intervals) { + BASERECALIBRATOR.out.report.map{ meta, table -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, table] + }.groupTuple(by: [0,1]).set{ recaltable } + + recaltable = recaltable.map { + patient, sample, gender, status, recal -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, recal] + } + + GATHERBQSRREPORTS(recaltable) + table_bqsr = GATHERBQSRREPORTS.out.table + tsv_bqsr = GATHERBQSRREPORTS.out.tsv + + } + + // Creating TSV files to restart from this step + tsv_bqsr.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam.bai" + ["markduplicates_no_table_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] + } + + tsv_bqsr.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam.bai" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" + }.collectFile(name: 'markduplicates_no_table.tsv', sort: true, storeDir: "${params.outdir}/preprocessing/tsv") + } + + emit: + table_bqsr = table_bqsr +} diff --git a/modules/local/subworkflow/recalibrate.nf b/modules/local/subworkflow/recalibrate.nf new file mode 100644 index 0000000000..5551199799 --- /dev/null +++ b/modules/local/subworkflow/recalibrate.nf @@ -0,0 +1,125 @@ +/* +================================================================================ + RECALIBRATE +================================================================================ +*/ + +params.applybqsr_options = [:] +params.merge_bam_options = [:] +params.qualimap_bamqc_options = [:] +params.samtools_index_options = [:] +params.samtools_stats_options = [:] + +include { GATK_APPLYBQSR as 
APPLYBQSR } from '../../nf-core/software/gatk/applybqsr' addParams(options: params.applybqsr_options) +include { MERGE_BAM } from '../process/merge_bam' addParams(options: params.merge_bam_options) +include { QUALIMAP_BAMQC } from '../../nf-core/software/qualimap_bamqc' addParams(options: params.qualimap_bamqc_options) +include { SAMTOOLS_INDEX } from '../../nf-core/software/samtools/index' addParams(options: params.samtools_index_options) +include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats' addParams(options: params.samtools_stats_options) + +workflow RECALIBRATE { + take: + skip_bamqc // boolean: true/false + skip_samtools // boolean: true/false + bam // channel: [mandatory] bam + dict // channel: [mandatory] dict + fai // channel: [mandatory] fai + fasta // channel: [mandatory] fasta + intervals // channel: [mandatory] intervals + step // value: [mandatory] starting step + target_bed // channel: [optional] target_bed + + main: + + bam_recalibrated_index = Channel.empty() + bam_recalibrated = Channel.empty() + bam_reports = Channel.empty() + + if (step in ["mapping", "preparerecalibration", "recalibrate"]) { + + bam_intervals = bam.combine(intervals) + + APPLYBQSR(bam_intervals, dict, fasta, fai) + + // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES + if (params.no_intervals) { + bam_recalibrated = APPLYBQSR.out.bam + tsv_recalibrated = APPLYBQSR.out.tsv + } else { + APPLYBQSR.out.bam.map{ meta, bam -> //, bai -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, bam] //, bai] + }.groupTuple(by: [0,1]).set{ bam_recalibrated_interval } + + bam_recalibrated_interval = bam_recalibrated_interval.map { + patient, sample, gender, status, bam -> //, bai -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, bam] + } + + MERGE_BAM(bam_recalibrated_interval) + bam_recalibrated = MERGE_BAM.out.bam + tsv_recalibrated = MERGE_BAM.out.tsv + } + + bam_recalibrated_index = SAMTOOLS_INDEX(bam_recalibrated) + + qualimap_bamqc = Channel.empty() + samtools_stats = Channel.empty() + + if (!skip_bamqc) { + QUALIMAP_BAMQC(bam_recalibrated, target_bed) + qualimap_bamqc = QUALIMAP_BAMQC.out + } + + if (!skip_samtools) { + SAMTOOLS_STATS(bam_recalibrated) + samtools_stats = SAMTOOLS_STATS.out + } + + bam_reports = samtools_stats.mix(qualimap_bamqc) + + //TODO: set bam_recalibrated with all these steps + // // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal + // if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal + + // // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked + // if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked + + // // When starting with variant calling, Channel bam_recalibrated is input_sample + // if (step == 'variantcalling') bam_recalibrated = input_sample + // Creating TSV files to restart from this step + tsv_recalibrated.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.recal.bam" + bai = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.recal.bam.bai" + ["recalibrated_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] + } + + 
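// Descriptive comment (added): in addition to the per-sample recalibrated_<sample>.tsv files written above, collect one aggregated recalibrated.tsv covering all samples
+ 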
tsv_recalibrated.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.recal.bam" + bai = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.recal.bam.bai" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" + }.collectFile(name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/preprocessing/tsv") + } + + emit: + bam = bam_recalibrated_index + qc = bam_reports +} diff --git a/modules/nf-core/software/bwa/index/main.nf b/modules/nf-core/software/bwa/index/main.nf new file mode 100644 index 0000000000..078cfb51d1 --- /dev/null +++ b/modules/nf-core/software/bwa/index/main.nf @@ -0,0 +1,35 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::bwa=0.7.17" : null +container = "quay.io/biocontainers/bwa:0.7.17--hed695b0_7" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7" + +process BWA_INDEX { + label 'process_high' + + tag "${fasta}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } + + conda environment + container container + + input: + path fasta + + output: + path "${fasta}.*" , emit: index + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) + """ + bwa index ${ioptions.args} ${fasta} + echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//' > ${software}.version.txt + """ +} diff --git a/modules/nf-core/software/bwa/index/meta.yml b/modules/nf-core/software/bwa/index/meta.yml new file mode 100644 index 0000000000..a2f5b1ed66 --- /dev/null +++ b/modules/nf-core/software/bwa/index/meta.yml @@ -0,0 +1,52 @@ +name: bwa_index +description: Create BWA index for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: http://www.htslib.org/doc/samtools.html + arxiv: arXiv:1303.3997 +params: + - outdir: + type: string + description: | + The pipeline's output directory. By default, the module will + output files into `$params.outdir/` + - publish_dir_mode: + type: string + description: | + Value for the Nextflow `publishDir` mode parameter. + Available: symlink, rellink, link, copy, copyNoFollow, move. + - conda: + type: boolean + description: | + Run the module with Conda using the software specified + via the `conda` directive +input: + - fasta: + type: file + description: Input genome fasta file + - options: + type: map + description: | + Groovy Map containing module options for passing command-line arguments and + output file paths. 
+output: + - index: + type: file + description: BWA genome index files + pattern: "*.{amb,ann,bwt,pac,sa}" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" +authors: + - "@drpatelh" + - "@maxulysse" diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf new file mode 100644 index 0000000000..c8e26dfd7e --- /dev/null +++ b/modules/nf-core/software/bwamem2_index.nf @@ -0,0 +1,33 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::bwa-mem2=2.0" : null +container = "quay.io/biocontainers/bwa-mem2:2.0--he513fc3_1" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/bwa-mem2:2.0--he513fc3_1" + +process BWAMEM2_INDEX { + tag "${fasta}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } + + conda environment + container container + + input: + path fasta + + output: + path "${fasta}.*" + + script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) + """ + bwa-mem2 index ${ioptions.args} ${fasta} + + echo \$(bwa-mem2 version 2>&1) > ${software}.version.txt + """ +} diff --git a/modules/nf-core/software/fastqc.nf b/modules/nf-core/software/fastqc.nf new file mode 100644 index 0000000000..80a5582a1f --- /dev/null +++ b/modules/nf-core/software/fastqc.nf @@ -0,0 +1,42 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::fastqc=0.11.9" : null +container = "quay.io/biocontainers/fastqc:0.11.9--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0" + +process FASTQC { + label 'process_medium' + label 'cpus_2' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(reads) + + output: + path "*.html", emit: html + path "*.version.txt", emit: version + path "*.zip", emit: zip + + script: + // Add soft-links to original FastQs for consistent naming in pipeline + prefix = options.suffix ? "${meta.id}.${options.suffix}" : "${meta.id}" + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! 
-f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + + fastqc ${options.args} --threads ${task.cpus} ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz + + fastqc --version | sed -n "s/.*\\(v.*\$\\)/\\1/p" > fastqc.version.txt + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/functions.nf b/modules/nf-core/software/functions.nf new file mode 100644 index 0000000000..ca46a99f5d --- /dev/null +++ b/modules/nf-core/software/functions.nf @@ -0,0 +1,57 @@ +/* + * ----------------------------------------------------- + * Utility functions used in nf-core DSL2 module files + * ----------------------------------------------------- + */ + +/* + * Extract name of software tool from process name using $task.process + */ +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +/* + * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules + */ +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.publish_by_id = args.publish_by_id ?: false + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +/* + * Tidy up and join elements of a list to return a path string + */ +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +/* + * Function to save/publish module results + */ +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_id) path_list.add(args.publish_id) + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/${args.filename}" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/${args.filename}" + } + } +} diff --git a/modules/nf-core/software/gatk/applybqsr.nf b/modules/nf-core/software/gatk/applybqsr.nf new file mode 100644 index 0000000000..d17bb10dda --- /dev/null +++ b/modules/nf-core/software/gatk/applybqsr.nf @@ -0,0 +1,45 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + +process GATK_APPLYBQSR { + label 'memory_singleCPU_2_task' + label 'cpus_2' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(bam), path(bai), path(recalibrationReport), path(interval) + path dict + path fasta + path fai + + output: + tuple val(meta), path("${prefix}${meta.sample}.recal.bam") , emit: bam + val meta, emit: tsv + + + script: + prefix = params.no_intervals ? "" : "${interval.baseName}_" + options_intervals = params.no_intervals ? "" : "-L ${interval}" + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + ApplyBQSR \ + -R ${fasta} \ + --input ${bam} \ + --output ${prefix}${meta.sample}.recal.bam \ + ${options_intervals} \ + --bqsr-recal-file ${recalibrationReport} + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk/baserecalibrator.nf b/modules/nf-core/software/gatk/baserecalibrator.nf new file mode 100644 index 0000000000..c6b8a35392 --- /dev/null +++ b/modules/nf-core/software/gatk/baserecalibrator.nf @@ -0,0 +1,55 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + +process GATK_BASERECALIBRATOR { + label 'cpus_1' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(bam), path(bai), path(interval) + path dbsnp + path dbsnp_tbi + path dict + path fai + path fasta + path known_indels + path known_indels_tbi + + output: + tuple val(meta), path("${prefix}${meta.sample}.recal.table"), emit: report + val meta, emit: tsv + + //when: params.known_indels + + script: + options_dbsnp = params.dbsnp ? "--known-sites ${dbsnp}" : "" + options_intervals = params.no_intervals ? "" : "-L ${interval}" + options_known_indels = params.known_indels ? known_indels.collect{"--known-sites ${it}"}.join(' ') : "" + prefix = params.no_intervals ? "" : "${interval.baseName}_" + // TODO: --use-original-qualities ??? + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + BaseRecalibrator \ + -I ${bam} \ + -O ${prefix}${meta.sample}.recal.table \ + --tmp-dir . 
\ + -R ${fasta} \ + ${options_dbsnp} \ + ${options_known_indels} \ + ${options_intervals} \ + --verbosity INFO + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk/createsequencedictionary.nf b/modules/nf-core/software/gatk/createsequencedictionary.nf new file mode 100644 index 0000000000..a740b91a64 --- /dev/null +++ b/modules/nf-core/software/gatk/createsequencedictionary.nf @@ -0,0 +1,36 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + +process GATK_CREATESEQUENCEDICTIONARY { + tag "${fasta}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } + + conda environment + container container + + input: + path fasta + + output: + path "${fasta.baseName}.dict" + + script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) + """ + gatk --java-options "-Xmx${task.memory.toGiga()}g" \ + CreateSequenceDictionary \ + --REFERENCE ${fasta} \ + --OUTPUT ${fasta.baseName}.dict + + echo \$(gatk CreateSequenceDictionary --version 2>&1) | sed 's/^.*(GATK) v//; s/ HTSJDK.*\$//' > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk/gatherbqsrreports.nf b/modules/nf-core/software/gatk/gatherbqsrreports.nf new file mode 100644 index 0000000000..7bcebc6db4 --- /dev/null +++ b/modules/nf-core/software/gatk/gatherbqsrreports.nf @@ -0,0 +1,38 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + +process GATK_GATHERBQSRREPORTS { + label 'memory_singleCPU_2_task' + label 'cpus_2' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(recal) + + output: + tuple val(meta), path("${meta.sample}.recal.table"), emit: table + path "${meta.sample}.recal.table", emit: report + val meta, emit: tsv + + script: + input = recal.collect{"-I ${it}"}.join(' ') + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + GatherBQSRReports \ + ${input} \ + -O ${meta.sample}.recal.table \ + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk/genotypegvcf.nf b/modules/nf-core/software/gatk/genotypegvcf.nf new file mode 100644 index 0000000000..07a009caa9 --- /dev/null +++ b/modules/nf-core/software/gatk/genotypegvcf.nf @@ -0,0 +1,47 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + +process GATK_GENOTYPEGVCF { + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(interval), path(gvcf) + path dbsnp + path dbsnpIndex + path dict + path fasta + path fai + + output: + tuple val(meta), path("${interval.baseName}_${meta.id}.vcf") + + script: + // Using -L is important for speed and we have to index the interval files also + intervalsOptions = params.no_intervals ? "" : "-L ${interval}" + dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + IndexFeatureFile \ + -I ${gvcf} + + gatk --java-options -Xmx${task.memory.toGiga()}g \ + GenotypeGVCFs \ + -R ${fasta} \ + ${intervalsOptions} \ + ${dbsnpOptions} \ + -V ${gvcf} \ + -O ${interval.baseName}_${meta.id}.vcf + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk/haplotypecaller.nf b/modules/nf-core/software/gatk/haplotypecaller.nf new file mode 100644 index 0000000000..fdcc259abc --- /dev/null +++ b/modules/nf-core/software/gatk/haplotypecaller.nf @@ -0,0 +1,48 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + +process GATK_HAPLOTYPECALLER { + label 'MEMORY_SINGLECPU_TASK_SQ' + label 'CPUS_2' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(bam), path(bai), file(interval) + path dbsnp + path dbsnpIndex + path dict + path fasta + path fai + + output: + tuple val(meta), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcf + tuple val(meta), path(interval), path("${interval.baseName}_${meta.id}.g.vcf"), emit: interval_gvcf + + + script: + intervalsOptions = params.no_intervals ? "" : "-L ${interval}" + dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" + """ + gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + HaplotypeCaller \ + -R ${fasta} \ + -I ${bam} \ + ${intervalsOptions} \ + ${dbsnpOptions} \ + -O ${interval.baseName}_${meta.id}.g.vcf \ + -ERC GVCF + """ +} diff --git a/modules/nf-core/software/gatk/markduplicates.nf b/modules/nf-core/software/gatk/markduplicates.nf new file mode 100644 index 0000000000..91bdfe76c8 --- /dev/null +++ b/modules/nf-core/software/gatk/markduplicates.nf @@ -0,0 +1,57 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + +process GATK_MARKDUPLICATES { + label 'cpus_16' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path("${meta.sample}.bam"), path("${meta.sample}.bam.bai") + + output: + tuple val(meta), path("${meta.sample}.md.bam"), path("${meta.sample}.md.bam.bai"), emit: bam + val meta, emit: tsv + path "${meta.sample}.bam.metrics", optional : true, emit: report + + script: + markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" + metrics = 'markduplicates' in params.skip_qc ? '' : "-M ${meta.sample}.bam.metrics" + + if (params.no_gatk_spark) + """ + gatk --java-options ${markdup_java_options} \ + MarkDuplicates \ + --MAX_RECORDS_IN_RAM 50000 \ + --INPUT ${meta.sample}.bam \ + --METRICS_FILE ${meta.sample}.bam.metrics \ + --TMP_DIR . \ + --ASSUME_SORT_ORDER coordinate \ + --CREATE_INDEX true \ + --OUTPUT ${meta.sample}.md.bam + mv ${meta.sample}.md.bai ${meta.sample}.md.bam.bai + """ + else + """ + gatk --java-options ${markdup_java_options} \ + MarkDuplicatesSpark \ + -I ${meta.sample}.bam \ + -O ${meta.sample}.md.bam \ + ${metrics} \ + --tmp-dir . \ + --create-output-bam-index true \ + --spark-master local[${task.cpus}] + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/htslib_tabix.nf b/modules/nf-core/software/htslib_tabix.nf new file mode 100644 index 0000000000..ce133a4b13 --- /dev/null +++ b/modules/nf-core/software/htslib_tabix.nf @@ -0,0 +1,33 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::tabix=0.2.6" : null +container = "quay.io/biocontainers/tabix:0.2.6--ha92aebf_0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/tabix:0.2.6--ha92aebf_0" + +process HTSLIB_TABIX { + tag "${vcf}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } + + conda environment + container container + + input: + path vcf + + output: + path "${vcf}.tbi" + + script: + def software = getSoftwareName(task.process) + """ + tabix -p vcf ${vcf} + + echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/(.*\$//' > ${software}.version.txt + """ +} diff --git a/modules/nf-core/software/multiqc.nf b/modules/nf-core/software/multiqc.nf new file mode 100644 index 0000000000..ed201b0f49 --- /dev/null +++ b/modules/nf-core/software/multiqc.nf @@ -0,0 +1,43 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"bioconda::multiqc=1.9" : null +container = "quay.io/biocontainers/multiqc:1.9--py_1" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/multiqc:1.9--py_1" + +// Has the run name been specified by the user? +// this has the bonus effect of catching both -name and --name +def custom_runName = params.name +if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { + custom_runName = workflow.runName +} + +process MULTIQC { + publishDir "${params.outdir}/multiqc", mode: params.publish_dir_mode + + conda environment + container container + + input: + // path software_versions + path multiqc_config + path multiqc_custom_config + val workflow_summary + path qc_reports + + output: + path "*multiqc_report.html" + path "*_data" + path "multiqc_plots" + + script: + title = custom_runName ? "--title \"${custom_runName}\"" : '' + filename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + custom_config_file = params.multiqc_config ? "--config ${multiqc_custom_config}" : '' + """ + echo '${workflow_summary}' > workflow_summary_mqc.yaml + multiqc -f ${title} ${filename} ${custom_config_file} . + """ +} diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf new file mode 100644 index 0000000000..25d3715d11 --- /dev/null +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -0,0 +1,44 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::qualimap=2.2.2d" : null +container = "quay.io/biocontainers/qualimap:2.2.2d--1" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/qualimap:2.2.2d--1" + +process QUALIMAP_BAMQC { + label 'memory_max' + label 'cpus_16' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(bam) + path(target_bed) + + output: + path("${bam.baseName}") + + script: + use_bed = params.target_bed ? "-gff ${target_bed}" : '' + """ + qualimap --java-mem-size=${task.memory.toGiga()}G \ + bamqc \ + -bam ${bam} \ + --paint-chromosome-limits \ + --genome-gc-distr HUMAN \ + ${use_bed} \ + -nt ${task.cpus} \ + -skip-duplicated \ + --skip-dup-mode 0 \ + -outdir ${bam.baseName} \ + -outformat HTML + """ +} diff --git a/modules/nf-core/software/samtools/faidx.nf b/modules/nf-core/software/samtools/faidx.nf new file mode 100644 index 0000000000..c60a62518c --- /dev/null +++ b/modules/nf-core/software/samtools/faidx.nf @@ -0,0 +1,33 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" + +process SAMTOOLS_FAIDX { + tag "${fasta}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } + + conda environment + container container + + input: + path fasta + + output: + path "${fasta}.fai" + + script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) + """ + samtools faidx ${fasta} + + echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/ Using.*\$//' > ${software}.version.txt + """ +} diff --git a/modules/nf-core/software/samtools/index.nf b/modules/nf-core/software/samtools/index.nf new file mode 100644 index 0000000000..0e5c3f11f7 --- /dev/null +++ b/modules/nf-core/software/samtools/index.nf @@ -0,0 +1,34 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" + +process SAMTOOLS_INDEX { + label 'cpus_8' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("${name}.bam"), path("*.bai") + + script: + name = options.suffix ? "${meta.id}.${options.suffix}" : "${meta.id}" + """ + [ ! -f ${name}.bam ] && ln -s ${bam} ${name}.bam + + samtools index ${name}.bam + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/samtools/stats.nf b/modules/nf-core/software/samtools/stats.nf new file mode 100644 index 0000000000..6302366db5 --- /dev/null +++ b/modules/nf-core/software/samtools/stats.nf @@ -0,0 +1,31 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" + +process SAMTOOLS_STATS { + label 'cpus_2' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(bam) + + output: + path ("${bam}.samtools.stats.out") + + script: + """ + samtools stats ${bam} > ${bam}.samtools.stats.out + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/strelka/germline.nf b/modules/nf-core/software/strelka/germline.nf new file mode 100644 index 0000000000..278af2adff --- /dev/null +++ b/modules/nf-core/software/strelka/germline.nf @@ -0,0 +1,62 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::strelka=2.9.10" : null +container = "quay.io/biocontainers/strelka:2.9.10--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/strelka:2.9.10--0" + +process STRELKA_GERMLINE { + tag "${meta.id}" + + label 'CPUS_MAX' + label 'MEMORY_MAX' + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(bam), path (bai) + path fasta + path fai + path target_bed + + output: + tuple val(meta), path("*_variants.vcf.gz"), path("*_variants.vcf.gz.tbi"), emit: vcf + tuple val(meta), path("*_genome.vcf.gz"), path("*_genome.vcf.gz.tbi"), emit: genome_vcf + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) + def prefix = ioptions.suffix ? "strelka_${meta.id}${ioptions.suffix}" : "strelka_${meta.id}" + // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "$ioptions.args" variable + // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter + // using the Nextflow "task" variable e.g. "--threads $task.cpus" + beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${target_bed} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" + options_strelka = params.target_bed ? 
ioptions.args : "" + """ + ${beforeScript} + configureStrelkaGermlineWorkflow.py \ + --bam ${bam} \ + --referenceFasta ${fasta} \ + ${options_strelka} \ + --runDir strelka + + python strelka/runWorkflow.py -m local -j ${task.cpus} + + mv strelka/results/variants/genome.*.vcf.gz ${prefix}_genome.vcf.gz + + mv strelka/results/variants/genome.*.vcf.gz.tbi ${prefix}_genome.vcf.gz.tbi + + mv strelka/results/variants/variants.vcf.gz ${prefix}_variants.vcf.gz + + mv strelka/results/variants/variants.vcf.gz.tbi ${prefix}_variants.vcf.gz.tbi + + echo configureStrelkaGermlineWorkflow.py --version &> ${software}.version.txt #2>&1 + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/trimgalore.nf b/modules/nf-core/software/trimgalore.nf new file mode 100644 index 0000000000..60369d80af --- /dev/null +++ b/modules/nf-core/software/trimgalore.nf @@ -0,0 +1,69 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::trim-galore=0.6.5" : null +container = "quay.io/biocontainers/trim-galore:0.6.5--0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/trim-galore:0.6.5--0" + +process TRIMGALORE { + label 'process_high' + + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda environment + container container + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*_1.fq.gz"), path("*_2.fq.gz"), emit: reads + path "*.html" , emit: html optional true + path "*.txt" , emit: log + path "*.version.txt", emit: version + path "*.zip" , emit: zip optional true + + script: + // Calculate number of --cores for TrimGalore based on value of task.cpus + // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019 + // See: https://github.com/nf-core/atacseq/pull/65 + def cores = 1 + if (task.cpus) { + cores = (task.cpus as int) - 4 + if (meta.single_end) cores = (task.cpus as int) - 3 + if (cores < 1) cores = 1 + if (cores > 4) cores = 4 + } + + // Clipping presets have to be evaluated in the context of SE/PE + c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : '' + c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : '' + tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : '' + tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : '' + + // Added soft-links to original fastqs for consistent naming in MultiQC + prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! 
diff --git a/modules/nf-core/software/trimgalore.nf b/modules/nf-core/software/trimgalore.nf
new file mode 100644
index 0000000000..60369d80af
--- /dev/null
+++ b/modules/nf-core/software/trimgalore.nf
@@ -0,0 +1,69 @@
+include { initOptions; saveFiles; getSoftwareName } from './functions'
+
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::trim-galore=0.6.5" : null
+container = "quay.io/biocontainers/trim-galore:0.6.5--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/trim-galore:0.6.5--0"
+
+process TRIMGALORE {
+    label 'process_high'
+
+    tag "${meta.id}"
+
+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }
+
+    conda environment
+    container container
+
+    input:
+        tuple val(meta), path(reads)
+
+    output:
+        tuple val(meta), path("*_1.fq.gz"), path("*_2.fq.gz"), emit: reads
+        path "*.html"       , emit: html optional true
+        path "*.txt"        , emit: log
+        path "*.version.txt", emit: version
+        path "*.zip"        , emit: zip optional true
+
+    script:
+    // Calculate number of --cores for TrimGalore based on value of task.cpus
+    // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019
+    // See: https://github.com/nf-core/atacseq/pull/65
+    def cores = 1
+    if (task.cpus) {
+        cores = (task.cpus as int) - 4
+        if (meta.single_end) cores = (task.cpus as int) - 3
+        if (cores < 1) cores = 1
+        if (cores > 4) cores = 4
+    }
+
+    // Clipping presets have to be evaluated in the context of SE/PE
+    c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : ''
+    c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : ''
+    tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : ''
+    tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : ''
+
+    // Added soft-links to original fastqs for consistent naming in MultiQC
+    prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
+    """
+    [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz
+    [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz
+
+    trim_galore \\
+        ${options.args} \\
+        --cores ${cores} \\
+        --paired \\
+        --gzip \\
+        ${c_r1} \\
+        ${c_r2} \\
+        ${tpc_r1} \\
+        ${tpc_r2} \\
+        ${prefix}_1.fastq.gz \\
+        ${prefix}_2.fastq.gz
+
+    trim_galore --version > trim_galore.version.txt
+    """
+}
diff --git a/modules/nf-core/subworkflow/qc_trim.nf b/modules/nf-core/subworkflow/qc_trim.nf
new file mode 100644
index 0000000000..b39476c5ef
--- /dev/null
+++ b/modules/nf-core/subworkflow/qc_trim.nf
@@ -0,0 +1,53 @@
+/*
+ * Read QC and trimming
+ */
+params.fastqc_options = [:]
+params.trimgalore_options = [:]
+
+include { FASTQC } from '../software/fastqc' addParams(options: params.fastqc_options)
+include { TRIMGALORE } from '../software/trimgalore' addParams(options: params.trimgalore_options)
+
+workflow QC_TRIM {
+    take:
+
+    reads         // channel: [ val(meta), [ reads ] ]
+    skip_fastqc   // boolean: true/false
+    skip_trimming // boolean: true/false
+
+    main:
+
+    fastqc_html = Channel.empty()
+    fastqc_version = Channel.empty()
+    fastqc_zip = Channel.empty()
+    if (!skip_fastqc) {
+        FASTQC(reads)
+        fastqc_html = FASTQC.out.html
+        fastqc_version = FASTQC.out.version
+        fastqc_zip = FASTQC.out.zip
+    }
+
+    trim_reads = reads
+    trimgalore_html = Channel.empty()
+    trimgalore_zip = Channel.empty()
+    trimgalore_log = Channel.empty()
+    trimgalore_version = Channel.empty()
+    if (!skip_trimming) {
+        TRIMGALORE(reads)
+        trim_reads = TRIMGALORE.out.reads
+        trimgalore_html = TRIMGALORE.out.html
+        trimgalore_zip = TRIMGALORE.out.zip
+        trimgalore_log = TRIMGALORE.out.log
+        trimgalore_version = TRIMGALORE.out.version
+    }
+
+    emit:
+
+    fastqc_html        // path: *.html
+    fastqc_zip         // path: *.zip
+    fastqc_version     // path: *.version.txt
+    reads = trim_reads // channel: [ val(meta), [ reads ] ]
+    trimgalore_html    // path: *.html
+    trimgalore_log     // path: *.txt
+    trimgalore_zip     // path: *.zip
+    trimgalore_version // path: *.version.txt
+}
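Illustrative only, not part of the patch: how a caller might use the QC_TRIM subworkflow. The addParams keys match the two params declared at the top of qc_trim.nf; the glob pattern and the hard-coded false flags are placeholders.

include { QC_TRIM } from './modules/nf-core/subworkflow/qc_trim' addParams(
    fastqc_options:     [:],
    trimgalore_options: [:]
)

workflow {
    reads = Channel.fromFilePairs('*_R{1,2}.fastq.gz')
        .map { id, fastqs -> [ [id: id, single_end: false], fastqs ] }
    QC_TRIM(reads, false, false) // run both FastQC and Trim Galore
    QC_TRIM.out.reads.view()     // trimmed reads, still keyed by the meta map
}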
diff --git a/nextflow.config b/nextflow.config
index fd0755e56f..28f8ac9a98 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -90,14 +90,16 @@ params {
     email_on_fail = false
     plaintext_email = false // Plaintext email disabled
     max_multiqc_email_size = 25.MB
-    hostnames = false
     name = false // No default name
     tracedir = "${params.outdir}/pipeline_info"

+    // Singularity containers
+    pull_docker_container = false // Pull default container by default
+
     // Base specifications
     // Defaults only, expecting to be overwritten
-    cpus = 8
+    cpus = 8
     max_cpus = 16
     max_memory = 128.GB
     max_time = 240.h
@@ -105,13 +107,16 @@
 }

 // Container slug
-// Stable releases should specify release tag (ie: `2.5.2`)
+// Stable releases should specify release tag (ie: `dsl2`)
 // Developmental code should specify dev
-process.container = 'nfcore/sarek:dev'
+// process.container = 'nfcore/sarek:dsl2'

 // Load base.config by default for all pipelines
 includeConfig 'conf/base.config'

+// Load modules.config by default for all pipelines
+includeConfig 'conf/modules.config'
+
 // Load nf-core custom profiles from different Institutions
 try {
     includeConfig "${params.custom_config_base}/nfcore_custom.config"
@@ -136,19 +141,21 @@ profiles {
     singularity {
         singularity.autoMounts = true
         singularity.enabled = true
+        params.enable_conda = false
     }
     podman {
         podman.enabled = true
     }
-    test { includeConfig 'conf/test.config' }
-    test_annotation { includeConfig 'conf/test_annotation.config' }
-    test_use_gatk_spark { includeConfig 'conf/test_use_gatk_spark.config' }
-    test_split_fastq { includeConfig 'conf/test_split_fastq.config' }
-    test_targeted { includeConfig 'conf/test_targeted.config' }
-    test_tool { includeConfig 'conf/test_tool.config' }
-    test_trimming { includeConfig 'conf/test_trimming.config' }
-    test_umi_tso { includeConfig 'conf/test_umi_tso.config' }
-    test_umi_qiaseq { includeConfig 'conf/test_umi_qiaseq.config' }
+    test                  { includeConfig 'conf/test.config' }
+    test_annotation       { includeConfig 'conf/test_annotation.config' }
+    test_use_gatk_spark   { includeConfig 'conf/test_use_gatk_spark.config' }
+    test_split_fastq      { includeConfig 'conf/test_split_fastq.config' }
+    test_targeted         { includeConfig 'conf/test_targeted.config' }
+    test_tool             { includeConfig 'conf/test_tool.config' }
+    test_trimming         { includeConfig 'conf/test_trimming.config' }
+    test_haplotypecaller  { includeConfig 'conf/test_germline_variantcalling.config' }
+    test_umi_tso          { includeConfig 'conf/test_umi_tso.config' }
+    test_umi_qiaseq       { includeConfig 'conf/test_umi_qiaseq.config' }
 }

 // Load genomes.config or igenomes.config
@@ -165,13 +172,6 @@ env {
     R_ENVIRON_USER = "/.Renviron"
 }

-// Export these variables to prevent local Python/R libraries from conflicting with those in the container
-env {
-    PYTHONNOUSERSITE = 1
-    R_PROFILE_USER = "/.Rprofile"
-    R_ENVIRON_USER = "/.Renviron"
-}
-
 // Capture exit codes from upstream processes when piping
 process.shell = ['/bin/bash', '-euo', 'pipefail']
@@ -198,7 +198,7 @@ manifest {
     homePage = 'https://github.com/nf-core/sarek'
     description = 'An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing'
     mainScript = 'main.nf'
-    nextflowVersion = '>=20.04.0'
+    nextflowVersion = '!>=20.11.0-edge'
     version = '3.0dev'
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 8f769263cd..4881d35470 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -784,6 +784,9 @@
         },
         {
             "$ref": "#/definitions/institutional_config_options"
+        },
+        {
+            "$ref": "#/definitions/modules"
         }
     ]
 }
\ No newline at end of file
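Illustrative only, not part of the patch: the manifest now uses the strict form of the Nextflow version requirement. Prefixing the constraint with '!' turns a version mismatch into a hard error instead of a warning, so runs on an older Nextflow stop immediately.

manifest {
    // '!' enforces the requirement: execution aborts instead of only warning
    nextflowVersion = '!>=20.11.0-edge'
}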