diff --git a/Sarek-data b/Sarek-data index d717855502..ba299d76c8 160000 --- a/Sarek-data +++ b/Sarek-data @@ -1 +1 @@ -Subproject commit d717855502f2e7501e124f2fe116733fc01ae114 +Subproject commit ba299d76c851dc916c051ef5f77d7a4ab39dcc9f diff --git a/annotate.nf b/annotate.nf index c1e085f633..d3571a15cd 100644 --- a/annotate.nf +++ b/annotate.nf @@ -73,23 +73,23 @@ vcfToAnnotate = Channel.create() vcfNotToAnnotate = Channel.create() if (annotateVCF == []) { +// we annote all available vcfs by default that we can find in the VariantCalling directory Channel.empty().mix( Channel.fromPath("${directoryMap.haplotypecaller}/*.vcf.gz") .flatten().map{vcf -> ['haplotypecaller', vcf]}, Channel.fromPath("${directoryMap.manta}/*SV.vcf.gz") .flatten().map{vcf -> ['manta', vcf]}, - Channel.fromPath("${directoryMap.mutect1}/*.vcf.gz") - .flatten().map{vcf -> ['mutect1', vcf]}, Channel.fromPath("${directoryMap.mutect2}/*.vcf.gz") .flatten().map{vcf -> ['mutect2', vcf]}, - Channel.fromPath("${directoryMap.strelka}/*{somatic,variants}*.vcf.gz") + Channel.fromPath("${directoryMap.strelka}/*{somatic,variants}*.vcf.gz") // Strelka only .flatten().map{vcf -> ['strelka', vcf]}, - Channel.fromPath("${directoryMap.strelkabp}/*{somatic,variants}*.vcf.gz") + Channel.fromPath("${directoryMap.strelkabp}/*{somatic,variants}*.vcf.gz") // Strelka with Manta indel candidates .flatten().map{vcf -> ['strelkabp', vcf]} ).choice(vcfToAnnotate, vcfNotToAnnotate) { annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 0 : 1 } } else if (annotateTools == []) { +// alternatively, annotate user-submitted VCFs list = "" annotateVCF.each{ list += ",${it}" } list = list.substring(1) @@ -101,6 +101,10 @@ if (annotateVCF == []) { vcfNotToAnnotate.close() +// as now have the list of VCFs to annotate, the first step is to annotate with allele frequencies, if there are any + + + (vcfForBCFtools, vcfForVCFtools, vcfForSnpeff, vcfForVep) = vcfToAnnotate.into(4) vcfForVep = vcfForVep.map { @@ -224,11 +228,12 @@ process RunVEP { finalannotator = annotator == "snpeff" ? 'merge' : 'vep' genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome """ - vep \ + vep --dir /opt/vep/.vep/ \ -i ${vcf} \ -o ${vcf.simpleName}_VEP.ann.vcf \ --assembly ${genome} \ --cache \ + --cache_version 91 \ --database \ --everything \ --filter_common \ @@ -346,7 +351,6 @@ def helpMessage() { log.info " Possible values are:" log.info " haplotypecaller (Annotate HaplotypeCaller output)" log.info " manta (Annotate Manta output)" - log.info " mutect1 (Annotate MuTect1 output)" log.info " mutect2 (Annotate MuTect2 output)" log.info " strelka (Annotate Strelka output)" log.info " --annotateVCF" diff --git a/buildContainers.nf b/buildContainers.nf index ba6d96ff32..92294ff432 100644 --- a/buildContainers.nf +++ b/buildContainers.nf @@ -177,13 +177,6 @@ def grabRevision() { def defineContainersList(){ // Return list of authorized containers return [ - 'freebayes', - 'gatk', - 'gatk4', - 'igvtools', - 'mutect1', - 'picard', - 'qctools', 'r-base', 'runallelecount', 'sarek', @@ -208,8 +201,7 @@ def helpMessage() { log.info " --containers: Choose which containers to build" log.info " Default: all" log.info " Possible values:" - log.info " all, freebayes, gatk, gatk4, igvtools, mutect1, picard" - log.info " qctools, r-base, runallelecount, sarek, snpeff" + log.info " all, r-base, runallelecount, sarek, snpeff" log.info " snpeffgrch37, snpeffgrch38, vepgrch37, vepgrch38" log.info " --docker: Build containers using Docker" log.info " --help" diff --git a/buildReferences.nf b/buildReferences.nf index eb185ce271..cfc7a567a2 100644 --- a/buildReferences.nf +++ b/buildReferences.nf @@ -29,7 +29,7 @@ kate: syntax groovy; space-indent on; indent-width 2; - ProcessReference - Download all references if needed - DecompressFile - Extract files if needed - BuildBWAindexes - Build indexes for BWA - - BuildPicardIndex - Build index with Picard + - BuildReferenceIndex - Build index for FASTA refs - BuildSAMToolsIndex - Build index with SAMTools - BuildVCFIndex - Build index for VCF files ================================================================================ @@ -98,7 +98,7 @@ if (params.verbose) ch_decompressedFiles = ch_decompressedFiles.view { ch_fastaFile = Channel.create() ch_fastaForBWA = Channel.create() -ch_fastaForPicard = Channel.create() +ch_fastaReference = Channel.create() ch_fastaForSAMTools = Channel.create() ch_otherFile = Channel.create() ch_vcfFile = Channel.create() @@ -108,7 +108,7 @@ ch_decompressedFiles it =~ ".fasta" ? 0 : it =~ ".vcf" ? 1 : 2} -(ch_fastaForBWA, ch_fastaForPicard, ch_fastaForSAMTools, ch_fastaFileToKeep) = ch_fastaFile.into(4) +(ch_fastaForBWA, ch_fastaReference, ch_fastaForSAMTools, ch_fastaFileToKeep) = ch_fastaFile.into(4) (ch_vcfFile, ch_vcfFileToKeep) = ch_vcfFile.into(2) ch_notCompressedfiles @@ -137,29 +137,28 @@ if (params.verbose) bwaIndexes.flatten().view { "BWA index : ${it.fileName}" } -process BuildPicardIndex { +process BuildReferenceIndex { tag {f_reference} publishDir params.outDir, mode: 'link' input: - file(f_reference) from ch_fastaForPicard + file(f_reference) from ch_fastaReference output: - file("*.dict") into ch_picardIndex + file("*.dict") into ch_referenceIndex script: """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$PICARD_HOME/picard.jar \ + gatk --java-options "-Xmx${task.memory.toGiga()}g" \ CreateSequenceDictionary \ - REFERENCE=${f_reference} \ - OUTPUT=${f_reference.baseName}.dict + --REFERENCE ${f_reference} \ + --OUTPUT ${f_reference.baseName}.dict """ } -if (params.verbose) ch_picardIndex.view { - "Picard index : ${it.fileName}" +if (params.verbose) ch_referenceIndex.view { + "Reference index : ${it.fileName}" } process BuildSAMToolsIndex { @@ -196,7 +195,7 @@ process BuildVCFIndex { script: """ - \$IGVTOOLS_HOME/igvtools index ${f_reference} + igvtools index ${f_reference} """ } diff --git a/configuration/containers.config b/configuration/containers.config index 07b862c441..5639a291fa 100644 --- a/configuration/containers.config +++ b/configuration/containers.config @@ -9,52 +9,48 @@ process { $BuildBWAindexes.container = "${params.repository}/sarek:${params.tag}" - $BuildPicardIndex.container = "${params.repository}/picard:${params.tag}" + $BuildReferenceIndex.container = "${params.repository}/sarek:${params.tag}" $BuildSAMToolsIndex.container = "${params.repository}/sarek:${params.tag}" - $BuildVCFIndex.container = "${params.repository}/igvtools:${params.tag}" + $BuildVCFIndex.container = "${params.repository}/sarek:${params.tag}" $CompressVCF.container = "${params.repository}/sarek:${params.tag}" $ConcatVCF.container = "${params.repository}/sarek:${params.tag}" - $CreateRecalibrationTable.container = "${params.repository}/gatk:${params.tag}" - $GetVersionAll.container = "${params.repository}/qctools:${params.tag}" + $CreateRecalibrationTable.container = "${params.repository}/sarek:${params.tag}" + $GetVersionAll.container = "${params.repository}/sarek:${params.tag}" $GetVersionAlleleCount.container = "${params.repository}/runallelecount:${params.tag}" $GetVersionASCAT.container = "${params.repository}/r-base:${params.tag}" - $GetVersionBamQC.container = "${params.repository}/qctools:${params.tag}" + $GetVersionBamQC.container = "${params.repository}/sarek:${params.tag}" $GetVersionBCFtools.container = "${params.repository}/sarek:${params.tag}" $GetVersionBWAsamtools.container = "${params.repository}/sarek:${params.tag}" - $GetVersionFastQC.container = "${params.repository}/qctools:${params.tag}" - $GetVersionFreeBayes.container = "${params.repository}/freebayes:${params.tag}" - $GetVersionGATK.container = "${params.repository}/gatk:${params.tag}" + $GetVersionFastQC.container = "${params.repository}/sarek:${params.tag}" + $GetVersionFreeBayes.container = "${params.repository}/sarek:${params.tag}" + $GetVersionGATK.container = "${params.repository}/sarek:${params.tag}" $GetVersionManta.container = "${params.repository}/sarek:${params.tag}" - $GetVersionPicard.container = "${params.repository}/picard:${params.tag}" $GetVersionSnpeff.container = {params.genome == 'GRCh38' ? "${params.repository}/snpeffgrch38:${params.tag}" : "${params.repository}/snpeffgrch37:${params.tag}"} $GetVersionStrelka.container = "${params.repository}/sarek:${params.tag}" - $GetVersionVCFtools.container = "${params.repository}/qctools:${params.tag}" + $GetVersionVCFtools.container = "${params.repository}/sarek:${params.tag}" $GetVersionVEP.container = {params.genome == 'GRCh38' ? "${params.repository}/vepgrch38:${params.tag}" : "${params.repository}/vepgrch37:${params.tag}"} - $IndelRealigner.container = "${params.repository}/gatk:${params.tag}" $MapReads.container = "${params.repository}/sarek:${params.tag}" - $MarkDuplicates.container = "${params.repository}/picard:${params.tag}" + $MarkDuplicates.container = "${params.repository}/sarek:${params.tag}" $MergeBams.container = "${params.repository}/sarek:${params.tag}" - $RealignerTargetCreator.container = "${params.repository}/gatk:${params.tag}" - $RecalibrateBam.container = "${params.repository}/gatk:${params.tag}" + $RecalibrateBam.container = "${params.repository}/sarek:${params.tag}" $RunAlleleCount.container = "${params.repository}/runallelecount:${params.tag}" $RunAscat.container = "${params.repository}/r-base:${params.tag}" - $RunBamQC.container = "${params.repository}/qctools:${params.tag}" + $RunBamQC.container = "${params.repository}/sarek:${params.tag}" $RunBcftoolsStats.container = "${params.repository}/sarek:${params.tag}" $RunConvertAlleleCounts.container = "${params.repository}/r-base:${params.tag}" - $RunFastQC.container = "${params.repository}/qctools:${params.tag}" - $RunFreeBayes.container = "${params.repository}/freebayes:${params.tag}" - $RunGenotypeGVCFs.container = "${params.repository}/gatk:${params.tag}" - $RunHaplotypecaller.container = "${params.repository}/gatk:${params.tag}" + $RunFastQC.container = "${params.repository}/sarek:${params.tag}" + $RunFreeBayes.container = "${params.repository}/sarek:${params.tag}" + $RunGenotypeGVCFs.container = "${params.repository}/sarek:${params.tag}" + $RunHaplotypecaller.container = "${params.repository}/sarek:${params.tag}" $RunManta.container = "${params.repository}/sarek:${params.tag}" - $RunMultiQC.container = "${params.repository}/qctools:${params.tag}" - $RunMutect1.container = "${params.repository}/mutect1:${params.tag}" - $RunMutect2.container = "${params.repository}/gatk:${params.tag}" + $RunMultiQC.container = "${params.repository}/sarek:${params.tag}" + $RunMutect2.container = "${params.repository}/sarek:${params.tag}" $RunSamtoolsStats.container = "${params.repository}/sarek:${params.tag}" $RunSingleManta.container = "${params.repository}/sarek:${params.tag}" $RunSingleStrelka.container = "${params.repository}/sarek:${params.tag}" $RunSnpeff.container = {params.genome == 'GRCh38' ? "${params.repository}/snpeffgrch38:${params.tag}" : "${params.repository}/snpeffgrch37:${params.tag}"} $RunStrelka.container = "${params.repository}/sarek:${params.tag}" $RunStrelkaBP.container = "${params.repository}/sarek:${params.tag}" - $RunVcftools.container = "${params.repository}/qctools:${params.tag}" + $RunVcftools.container = "${params.repository}/sarek:${params.tag}" $RunVEP.container = {params.genome == 'GRCh38' ? "${params.repository}/vepgrch38:${params.tag}" : "${params.repository}/vepgrch37:${params.tag}"} } diff --git a/configuration/genomes.config b/configuration/genomes.config index 56097e23fe..53ceecf9cf 100644 --- a/configuration/genomes.config +++ b/configuration/genomes.config @@ -42,6 +42,9 @@ params { knownIndels = "${params.genome_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" knownIndelsIndex = "${params.genome_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" snpeffDb = "GRCh38.86" + // This a nasty-looking list of allele-frequencies files. Add/remove files to match to your sets + //AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" + //AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" } 'smallGRCh37' { acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci" diff --git a/configuration/singularity-path.config b/configuration/singularity-path.config index 7efea0292f..9b51a46318 100644 --- a/configuration/singularity-path.config +++ b/configuration/singularity-path.config @@ -14,52 +14,50 @@ singularity { process { $BuildBWAindexes.container = "${params.containerPath}/sarek-${params.tag}.img" - $BuildPicardIndex.container = "${params.containerPath}/picard-${params.tag}.img" + $BuildReferenceIndex.container = "${params.containerPath}/sarek-${params.tag}.img" $BuildSAMToolsIndex.container = "${params.containerPath}/sarek-${params.tag}.img" - $BuildVCFIndex.container = "${params.containerPath}/igvtools-${params.tag}.img" + $BuildVCFIndex.container = "${params.containerPath}/sarek-${params.tag}.img" $CompressVCF.container = "${params.containerPath}/sarek-${params.tag}.img" $ConcatVCF.container = "${params.containerPath}/sarek-${params.tag}.img" - $CreateRecalibrationTable.container = "${params.containerPath}/gatk-${params.tag}.img" - $GetVersionAll.container = "${params.containerPath}/qctools-${params.tag}.img" + $CreateRecalibrationTable.container = "${params.containerPath}/sarek-${params.tag}.img" + $GetVersionAll.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionAlleleCount.container = "${params.containerPath}/runallelecount-${params.tag}.img" $GetVersionASCAT.container = "${params.containerPath}/r-base-${params.tag}.img" - $GetVersionBamQC.container = "${params.containerPath}/qctools-${params.tag}.img" + $GetVersionBamQC.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionBCFtools.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionBWAsamtools.container = "${params.containerPath}/sarek-${params.tag}.img" - $GetVersionFastQC.container = "${params.containerPath}/qctools-${params.tag}.img" - $GetVersionFreeBayes.container = "${params.containerPath}/freebayes-${params.tag}.img" - $GetVersionGATK.container = "${params.containerPath}/gatk-${params.tag}.img" + $GetVersionFastQC.container = "${params.containerPath}/sarek-${params.tag}.img" + $GetVersionFreeBayes.container = "${params.containerPath}/sarek-${params.tag}.img" + $GetVersionGATK.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionManta.container = "${params.containerPath}/sarek-${params.tag}.img" - $GetVersionPicard.container = "${params.containerPath}/picard-${params.tag}.img" $GetVersionSnpeff.container = {params.genome == 'GRCh38' ? "${params.containerPath}/snpeffgrch38-${params.tag}.img" : "${params.containerPath}/snpeffgrch37-${params.tag}.img"} $GetVersionStrelka.container = "${params.containerPath}/sarek-${params.tag}.img" - $GetVersionVCFtools.container = "${params.containerPath}/qctools-${params.tag}.img" + $GetVersionVCFtools.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionVEP.container = {params.genome == 'GRCh38' ? "${params.containerPath}/vepgrch38-${params.tag}.img" : "${params.containerPath}/vepgrch37-${params.tag}.img"} - $IndelRealigner.container = "${params.containerPath}/gatk-${params.tag}.img" + $IndelRealigner.container = "${params.containerPath}/sarek-${params.tag}.img" $MapReads.container = "${params.containerPath}/sarek-${params.tag}.img" - $MarkDuplicates.container = "${params.containerPath}/picard-${params.tag}.img" + $MarkDuplicates.container = "${params.containerPath}/sarek-${params.tag}.img" $MergeBams.container = "${params.containerPath}/sarek-${params.tag}.img" - $RealignerTargetCreator.container = "${params.containerPath}/gatk-${params.tag}.img" - $RecalibrateBam.container = "${params.containerPath}/gatk-${params.tag}.img" + $RealignerTargetCreator.container = "${params.containerPath}/sarek-${params.tag}.img" + $RecalibrateBam.container = "${params.containerPath}/sarek-${params.tag}.img" $RunAlleleCount.container = "${params.containerPath}/runallelecount-${params.tag}.img" $RunAscat.container = "${params.containerPath}/r-base-${params.tag}.img" - $RunBamQC.container = "${params.containerPath}/qctools-${params.tag}.img" + $RunBamQC.container = "${params.containerPath}/sarek-${params.tag}.img" $RunBcftoolsStats.container = "${params.containerPath}/sarek-${params.tag}.img" $RunConvertAlleleCounts.container = "${params.containerPath}/r-base-${params.tag}.img" - $RunFastQC.container = "${params.containerPath}/qctools-${params.tag}.img" - $RunFreeBayes.container = "${params.containerPath}/freebayes-${params.tag}.img" - $RunGenotypeGVCFs.container = "${params.containerPath}/gatk-${params.tag}.img" - $RunHaplotypecaller.container = "${params.containerPath}/gatk-${params.tag}.img" + $RunFastQC.container = "${params.containerPath}/sarek-${params.tag}.img" + $RunFreeBayes.container = "${params.containerPath}/sarek-${params.tag}.img" + $RunGenotypeGVCFs.container = "${params.containerPath}/sarek-${params.tag}.img" + $RunHaplotypecaller.container = "${params.containerPath}/sarek-${params.tag}.img" $RunManta.container = "${params.containerPath}/sarek-${params.tag}.img" - $RunMultiQC.container = "${params.containerPath}/qctools-${params.tag}.img" - $RunMutect1.container = "${params.containerPath}/mutect1-${params.tag}.img" - $RunMutect2.container = "${params.containerPath}/gatk-${params.tag}.img" + $RunMultiQC.container = "${params.containerPath}/sarek-${params.tag}.img" + $RunMutect2.container = "${params.containerPath}/sarek-${params.tag}.img" $RunSamtoolsStats.container = "${params.containerPath}/sarek-${params.tag}.img" $RunSingleManta.container = "${params.containerPath}/sarek-${params.tag}.img" $RunSingleStrelka.container = "${params.containerPath}/sarek-${params.tag}.img" $RunSnpeff.container = {params.genome == 'GRCh38' ? "${params.containerPath}/snpeffgrch38-${params.tag}.img" : "${params.containerPath}/snpeffgrch37-${params.tag}.img"} $RunStrelka.container = "${params.containerPath}/sarek-${params.tag}.img" $RunStrelkaBP.container = "${params.containerPath}/sarek-${params.tag}.img" - $RunVcftools.container = "${params.containerPath}/qctools-${params.tag}.img" + $RunVcftools.container = "${params.containerPath}/sarek-${params.tag}.img" $RunVEP.container = {params.genome == 'GRCh38' ? "${params.containerPath}/vepgrch38-${params.tag}.img" : "${params.containerPath}/vepgrch37-${params.tag}.img"} } diff --git a/configuration/uppmax-localhost.config b/configuration/uppmax-localhost.config index 1f4cd5bb56..f5907f6181 100644 --- a/configuration/uppmax-localhost.config +++ b/configuration/uppmax-localhost.config @@ -43,7 +43,7 @@ process { $BuildBWAindexes { memory = {params.totalMemory} // TODO This is likely too high } - $BuildPicardIndex { + $BuildReferenceIndex { memory = {params.totalMemory} // TODO This is likely too high } $BuildSAMToolsIndex { @@ -70,7 +70,9 @@ process { memory = {params.totalMemory} } $MarkDuplicates { - memory = {params.singleCPUMem * 2 * task.attempt} + // Actually the -Xmx value should be kept lower + cpus = 16 + memory = {2 * params.singleCPUMem} } $MergeBams { cpus = 16 @@ -117,10 +119,6 @@ process { } $RunMultiQC { } - $RunMutect1 { - cpus = 1 - memory = {params.singleCPUMem * task.attempt} - } $RunMutect2 { cpus = 1 memory = {params.singleCPUMem * task.attempt} diff --git a/configuration/uppmax-slurm.config b/configuration/uppmax-slurm.config index de1639964f..821a3855d8 100644 --- a/configuration/uppmax-slurm.config +++ b/configuration/uppmax-slurm.config @@ -27,7 +27,7 @@ process { $BuildBWAindexes { } - $BuildPicardIndex { + $BuildReferenceIndex { } $BuildSAMToolsIndex { } @@ -52,10 +52,9 @@ process { time = {params.runTime * task.attempt} } $MarkDuplicates { - cpus = 1 - memory = {params.singleCPUMem * 8 * task.attempt} - queue = 'core' - time = {params.runTime * task.attempt} + // Actually the -Xmx value should be kept lower + cpus = 16 + memory = {2 * params.singleCPUMem} } $MergeBams { memory = {params.singleCPUMem * task.attempt} @@ -116,12 +115,6 @@ process { $RunMultiQC { errorStrategy = { task.exitStatus == 143 ? 'retry' : 'ignore' } } - $RunMutect1 { - queue = 'core' - time = {params.runTime * task.attempt} - cpus = 1 - memory = {params.singleCPUMem * task.attempt} - } $RunMutect2 { cpus = 1 memory = {params.singleCPUMem * task.attempt} diff --git a/containers/freebayes/Dockerfile b/containers/freebayes/Dockerfile deleted file mode 100644 index efacdde8ee..0000000000 --- a/containers/freebayes/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM debian:8.9 - -LABEL \ - author="Maxime Garcia" \ - description="FreeBayes image for use in Sarek" \ - maintainer="maxime.garcia@scilifelab.se" - -# Install libraries -RUN \ - apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - cmake \ - git \ - wget \ - zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* - -# Setup ENV variables -ENV FREEBAYES_VERSION=1.1.0 - -# Install BCFTools -RUN \ - git clone --recursive git://github.com/ekg/freebayes.git freebayes \ - --branch v${FREEBAYES_VERSION} \ - && cd freebayes \ - && make \ - && make install \ - && cd .. \ -&& rm -rf freebayes diff --git a/containers/mutect1/Dockerfile b/containers/mutect1/Dockerfile deleted file mode 100644 index 7068fb963c..0000000000 --- a/containers/mutect1/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM openjdk:7-slim - -LABEL \ - author="Maxime Garcia" \ - description="MuTect 1.1.5 image for use in Sarek" \ - maintainer="maxime.garcia@scilifelab.se" - -# Install libraries -RUN \ - apt-get update && apt-get install -y --no-install-recommends \ - wget \ - && rm -rf /var/lib/apt/lists/* - -# Setup ENV variables -ENV \ - MUTECT_HOME=/opt/mutect-1.1.5 \ - MUTECT_VERSION=1.1.5 - -# Install MuTect1 -RUN \ - wget --quiet -O muTect-${MUTECT_VERSION}-bin.zip \ - https://github.com/broadinstitute/mutect/releases/download/${MUTECT_VERSION}/muTect-${MUTECT_VERSION}-bin.zip \ - && unzip muTect-${MUTECT_VERSION}-bin.zip -d ${MUTECT_HOME} \ - && rm muTect-${MUTECT_VERSION}-bin.zip \ -&& mv ${MUTECT_HOME}/muTect-${MUTECT_VERSION}.jar ${MUTECT_HOME}/muTect.jar diff --git a/containers/sarek/Dockerfile b/containers/sarek/Dockerfile index 7ef82bbd8a..340e699e71 100644 --- a/containers/sarek/Dockerfile +++ b/containers/sarek/Dockerfile @@ -1,18 +1,10 @@ -FROM debian:8.9 +FROM nfcore/base:latest LABEL \ - author="Maxime Garcia" \ + authors="Maxime.Garcia@scilifelab.se, Szilveszter.Juhos@scilifelab.se" \ description="Image with tools used in Sarek" \ - maintainer="maxime.garcia@scilifelab.se" + maintainer="Maxime Garcia , Szilveszter Juhos " -# Export PATH -ENV \ - MANTA_INSTALL_PATH=/opt/manta \ - PATH=/opt/bcftools/bin:/opt/htslib/bin:/opt/samtools/bin:$PATH \ - STRELKA_INSTALL_PATH=/opt/strelka - -# Build container -ADD \ - build.sh /usr/bin/ -RUN \ - build.sh +COPY environment.yml / +RUN conda env update -n root -f /environment.yml && conda clean -a +ENV PATH /opt/conda/bin:$PATH diff --git a/containers/sarek/build.sh b/containers/sarek/build.sh deleted file mode 100755 index fbe57b5aee..0000000000 --- a/containers/sarek/build.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/sh -set -eu pipefail - -# Local ENV variables -BCFTOOLS_VERSION=1.5 -BWA_VERSION=0.7.16 -HTSLIB_VERSION=1.5 -MANTA_VERSION=1.1.1 -SAMTOOLS_VERSION=1.5 -STRELKA_VERSION=2.8.2 - -# Install libraries -apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - bzip2 \ - ca-certificates \ - g++ \ - gcc \ - git \ - libbz2-dev \ - liblzma-dev \ - libncurses5-dev \ - libncursesw5-dev \ - make \ - python \ - python3 \ - unzip \ - wget \ - zlib1g-dev - -# Install tools -mkdir /build - -# Install BWA -cd /build -git clone http://github.com/lh3/bwa.git bwa \ - --branch v${BWA_VERSION} -cd bwa -make -cp bwa /usr/local/bin/bwa - -# Install HTSlib -cd /build -wget --quiet -O htslib-${HTSLIB_VERSION}.tar.bz2 \ - https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/htslib-${HTSLIB_VERSION}.tar.bz2 -tar xfj htslib-${HTSLIB_VERSION}.tar.bz2 -rm htslib-${HTSLIB_VERSION}.tar.bz2 -cd htslib-${HTSLIB_VERSION} -./configure --prefix=/opt/htslib -make && make install - -# Install BCFtools -cd /build -wget --quiet -O bcftools-${BCFTOOLS_VERSION}.tar.bz2 \ - https://github.com/samtools/bcftools/releases/download/${BCFTOOLS_VERSION}/bcftools-${BCFTOOLS_VERSION}.tar.bz2 -tar xfj bcftools-${BCFTOOLS_VERSION}.tar.bz2 -rm bcftools-${BCFTOOLS_VERSION}.tar.bz2 -cd bcftools-${BCFTOOLS_VERSION} -./configure --prefix=/opt/bcftools -make && make install - -# Install Samtools -cd /build -wget --quiet -O samtools-${SAMTOOLS_VERSION}.tar.bz2 \ - https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 -tar xfj samtools-${SAMTOOLS_VERSION}.tar.bz2 -rm samtools-${SAMTOOLS_VERSION}.tar.bz2 -cd samtools-${SAMTOOLS_VERSION} -./configure --prefix=/opt/samtools -make && make install - -# Install Manta -cd /build -wget --quiet -O manta-${MANTA_VERSION}.centos5_x86_64.tar.bz2 \ - https://github.com/Illumina/manta/releases/download/v${MANTA_VERSION}/manta-${MANTA_VERSION}.centos5_x86_64.tar.bz2 -tar xvjf manta-${MANTA_VERSION}.centos5_x86_64.tar.bz2 -mv manta-${MANTA_VERSION}.centos5_x86_64 $MANTA_INSTALL_PATH -rm manta-${MANTA_VERSION}.centos5_x86_64.tar.bz2 - -# Install Strelka -cd /build -wget --quiet -O strelka-${STRELKA_VERSION}.centos5_x86_64.tar.bz2 \ - https://github.com/Illumina/strelka/releases/download/v${STRELKA_VERSION}/strelka-${STRELKA_VERSION}.centos5_x86_64.tar.bz2 -tar xvjf strelka-${STRELKA_VERSION}.centos5_x86_64.tar.bz2 -mv strelka-${STRELKA_VERSION}.centos5_x86_64 $STRELKA_INSTALL_PATH -rm strelka-${STRELKA_VERSION}.centos5_x86_64.tar.bz2 - -# Clean up install -cd / -apt-get remove -y \ - build-essential \ - ca-certificates \ - gcc \ - git \ - libbz2-dev \ - liblzma-dev \ - libncurses5-dev \ - libncursesw5-dev \ - unzip \ - wget \ - zlib1g-dev -apt-get clean -rm -rf /build /var/lib/apt/lists/* /opt/get-pip.py diff --git a/containers/sarek/environment.yml b/containers/sarek/environment.yml new file mode 100644 index 0000000000..af3ea825f4 --- /dev/null +++ b/containers/sarek/environment.yml @@ -0,0 +1,24 @@ +# You can use this file to create a conda environment for this pipeline: +# conda env create -f environment.yml +name: sarek +channels: + - bioconda + - conda-forge + - defaults + +dependencies: + - bcftools=1.8 + - bwa=0.7.17 + - conda-forge::openjdk=8.0.144 # Needed for FastQC docker - see bioconda/bioconda-recipes#5026 + - fastqc=0.11.7 + - freebayes=1.2.0 + - gatk4=4.0.6.0 + - htslib=1.9 + - igvtools=2.3.93 + - manta=1.4.0 + - multiqc=1.5 + - qualimap=2.2.2a + - samtools=1.8 + - strelka=2.9.3 + - vcfanno=0.2.8 + - vcftools=0.1.15 diff --git a/doc/images/GitHub.QR.png b/doc/images/GitHub.QR.png new file mode 100644 index 0000000000..f20bee47fa Binary files /dev/null and b/doc/images/GitHub.QR.png differ diff --git a/germlineVC.nf b/germlineVC.nf index ea3ab75700..4dd71fb528 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -31,7 +31,7 @@ kate: syntax groovy; space-indent on; indent-width 2; - CreateIntervalBeds - Create and sort intervals into bed files - RunHaplotypecaller - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) - RunGenotypeGVCFs - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) - - ConcatVCF - Merge results from HaplotypeCaller, MuTect1 and MuTect2 + - ConcatVCF - Merge results from HaplotypeCaller, MuTect2 and other paralellized callers - RunSingleStrelka - Run Strelka for Germline Variant Calling - RunSingleManta - Run Manta for Single Structural Variant Calling - RunBcftoolsStats - Run BCFTools stats on vcf files @@ -71,9 +71,6 @@ if (params.test && params.genome in ['GRCh37', 'GRCh38']) { referenceMap.intervals = file("$workflow.projectDir/repeats/tiny_${params.genome}.list") } -// TODO -// MuTect and Mutect2 could be run without a recalibrated BAM (they support -// the --BQSR option), but this is not implemented, yet. // TODO // FreeBayes does not need recalibrated BAMs, but we need to test whether // the channels are set up correctly when we disable it @@ -287,8 +284,8 @@ bamsAll = bamsAll.map { bamsTumorNormalIntervals = bamsAll.spread(bedIntervals) -// MuTect1, MuTect2, FreeBayes -(bamsFMT1, bamsFMT2, bamsFFB) = bamsTumorNormalIntervals.into(3) +// MuTect2, FreeBayes +(bamsFMT2, bamsFFB) = bamsTumorNormalIntervals.into(2) process RunHaplotypecaller { tag {idSample + "-" + intervalBed.baseName} @@ -310,20 +307,15 @@ process RunHaplotypecaller { when: 'haplotypecaller' in tools && !params.onlyQC script: - BQSR = (recalTable != null) ? "--BQSR $recalTable" : '' """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T HaplotypeCaller \ - --emitRefConfidence GVCF \ - -pairHMM LOGLESS_CACHING \ - -R ${genomeFile} \ - --dbsnp ${dbsnp} \ - ${BQSR} \ - -I ${bam} \ - -L ${intervalBed} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - -o ${intervalBed.baseName}_${idSample}.g.vcf + gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + HaplotypeCaller \ + -R ${genomeFile} \ + -I ${bam} \ + -L ${intervalBed} \ + --dbsnp ${dbsnp} \ + -O ${intervalBed.baseName}_${idSample}.g.vcf \ + --emit-ref-confidence GVCF """ } hcGenomicVCF = hcGenomicVCF.groupTuple(by:[0,1,2,3]) @@ -349,17 +341,17 @@ process RunGenotypeGVCFs { when: 'haplotypecaller' in tools && !params.onlyQC script: - // Using -L is important for speed + // Using -L is important for speed and we have to index the interval files also """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T GenotypeGVCFs \ + gatk IndexFeatureFile -F ${gvcf} + + gatk --java-options -Xmx${task.memory.toGiga()}g \ + GenotypeGVCFs \ -R ${genomeFile} \ -L ${intervalBed} \ --dbsnp ${dbsnp} \ - --variant ${gvcf} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - -o ${intervalBed.baseName}_${idSample}.vcf + -V ${gvcf} \ + -O ${intervalBed.baseName}_${idSample}.vcf """ } hcGenotypedVCF = hcGenotypedVCF.groupTuple(by:[0,1,2,3]) @@ -387,7 +379,7 @@ process ConcatVCF { set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated - when: ( 'haplotypecaller' in tools || 'mutect1' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC + when: ( 'haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC script: if (variantCaller == 'haplotypecaller') outputFile = "${variantCaller}_${idSampleNormal}.vcf" @@ -395,6 +387,7 @@ process ConcatVCF { else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" """ + set -euo pipefail # first make a header from one of the VCF intervals # get rid of interval information only from the GATK command-line, but leave the rest FIRSTVCF=\$(ls *.vcf | head -n 1) @@ -462,7 +455,7 @@ process RunSingleStrelka { script: """ - \$STRELKA_INSTALL_PATH/bin/configureStrelkaGermlineWorkflow.py \ + configureStrelkaGermlineWorkflow.py \ --bam ${bam} \ --referenceFasta ${genomeFile} \ --runDir Strelka @@ -506,7 +499,7 @@ process RunSingleManta { script: """ - \$MANTA_INSTALL_PATH/bin/configManta.py \ + configManta.py \ --bam ${bam} \ --reference ${genomeFile} \ --runDir Manta @@ -670,7 +663,6 @@ def defineToolList() { 'freebayes', 'haplotypecaller', 'manta', - 'mutect1', 'mutect2', 'strelka' ] diff --git a/lib/QC.groovy b/lib/QC.groovy index 2b228a8b68..2f6885d84b 100644 --- a/lib/QC.groovy +++ b/lib/QC.groovy @@ -59,14 +59,14 @@ class QC { // Get GATK version static def getVersionGATK() { """ - echo "GATK version"\$(java -jar \$GATK_HOME/GenomeAnalysisTK.jar --version 2>&1) > v_gatk.txt + gatk ApplyBQSR --help 2>&1| awk -F/ '/java/{for(i=1;i<=NF;i++){if(\$i~/gatk4/){sub("gatk4-","",\$i);print \$i>"v_gatk.txt"}}}' """ } // Get Manta version static def getVersionManta() { """ - cat \$MANTA_INSTALL_PATH/lib/python/configBuildTimeInfo.py | grep workflowVersion > v_manta.txt + configManta.py --version > v_manta.txt """ } @@ -80,7 +80,7 @@ class QC { // Get Strelka version static def getVersionStrelka() { """ - cat \$STRELKA_INSTALL_PATH/lib/python/configBuildTimeInfo.py | grep workflowVersion > v_strelka.txt + configureStrelkaGermlineWorkflow.py --version > v_strelka.txt """ } diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index 2a95d198b1..43287f0388 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -100,13 +100,14 @@ class SarekUtils { } // Loop through all the references files to check their existence + static def checkRefExistence(referenceFile, fileToCheck) { if (fileToCheck instanceof List) return fileToCheck.every{ SarekUtils.checkRefExistence(referenceFile, it) } def f = file(fileToCheck) // this is an expanded wildcard: we can assume all files exist if (f instanceof List && f.size() > 0) return true - else if (!f.exists()) { - this.log.info "Missing references: ${referenceFile} ${fileToCheck}" + else if (!f.exists()) { + println "Missing references: ${referenceFile} ${fileToCheck}" return false } return true @@ -115,15 +116,13 @@ class SarekUtils { // Define map of directories static def defineDirectoryMap(outDir) { return [ - 'nonRealigned' : "${outDir}/Preprocessing/NonRealigned", - 'nonRecalibrated' : "${outDir}/Preprocessing/NonRecalibrated", + 'duplicateMarked' : "${outDir}/Preprocessing/DuplicateMarked", 'recalibrated' : "${outDir}/Preprocessing/Recalibrated", 'ascat' : "${outDir}/VariantCalling/Ascat", 'freebayes' : "${outDir}/VariantCalling/FreeBayes", 'gvcf-hc' : "${outDir}/VariantCalling/HaplotypeCallerGVCF", 'haplotypecaller' : "${outDir}/VariantCalling/HaplotypeCaller", 'manta' : "${outDir}/VariantCalling/Manta", - 'mutect1' : "${outDir}/VariantCalling/MuTect1", 'mutect2' : "${outDir}/VariantCalling/MuTect2", 'strelka' : "${outDir}/VariantCalling/Strelka", 'strelkabp' : "${outDir}/VariantCalling/StrelkaBP", diff --git a/main.nf b/main.nf index f683d397b6..ecd26d3461 100644 --- a/main.nf +++ b/main.nf @@ -29,8 +29,7 @@ kate: syntax groovy; space-indent on; indent-width 2; - RunFastQC - Run FastQC for QC on fastq files - MapReads - Map reads with BWA - MergeBams - Merge BAMs if multilane samples - - MarkDuplicates - Mark Duplicates with Picard - - RealignerTargetCreator - Create realignment target intervals + - MarkDuplicates - Mark Duplicates with GATK4 - IndelRealigner - Realign BAMs as T/N pair - CreateRecalibrationTable - Create Recalibration Table with BaseRecalibrator - RecalibrateBam - Recalibrate Bam with PrintReads @@ -83,8 +82,7 @@ if (params.sample) tsvPath = params.sample if (!params.sample && !params.sampleDir) { tsvPaths = [ 'mapping': "${workflow.projectDir}/Sarek-data/testdata/tsv/tiny.tsv", - 'realign': "${directoryMap.nonRealigned}/nonRealigned.tsv", - 'recalibrate': "${directoryMap.nonRecalibrated}/nonRecalibrated.tsv" + 'recalibrate': "${directoryMap.duplicateMarked}/duplicateMarked.tsv" ] if (params.test || step != 'mapping') tsvPath = tsvPaths[step] } @@ -97,7 +95,6 @@ if (tsvPath) { tsvFile = file(tsvPath) switch (step) { case 'mapping': fastqFiles = extractFastq(tsvFile); break - case 'realign': bamFiles = SarekUtils.extractBams(tsvFile, "somatic"); break case 'recalibrate': bamFiles = extractRecal(tsvFile); break default: exit 1, "Unknown step ${step}" } @@ -182,7 +179,7 @@ process MapReads { """ bwa mem -R \"${readGroup}\" ${extra} -t ${task.cpus} -M \ ${genomeFile} ${fastqFile1} ${fastqFile2} | \ - samtools sort --threads ${task.cpus} -m 4G - > ${idRun}.bam + samtools sort --threads ${task.cpus} -m 2G - > ${idRun}.bam """ } @@ -247,14 +244,14 @@ process MarkDuplicates { publishDir params.outDir, mode: 'link', saveAs: { if (it == "${bam}.metrics") "${directoryMap.markDuplicatesQC}/${it}" - else "${directoryMap.nonRealigned}/${it}" + else "${directoryMap.duplicateMarked}/${it}" } input: set idPatient, status, idSample, file(bam) from mergedBam output: - set idPatient, file("${idSample}_${status}.md.bam"), file("${idSample}_${status}.md.bai") into duplicates + set idPatient, file("${idSample}_${status}.md.bam"), file("${idSample}_${status}.md.bai") into duplicateMarkedBams set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai") into markDuplicatesTSV file ("${bam}.metrics") into markDuplicatesReport @@ -262,150 +259,27 @@ process MarkDuplicates { script: """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$PICARD_HOME/picard.jar MarkDuplicates \ - INPUT=${bam} \ - METRICS_FILE=${bam}.metrics \ - TMP_DIR=. \ - ASSUME_SORTED=true \ - VALIDATION_STRINGENCY=LENIENT \ - CREATE_INDEX=TRUE \ - OUTPUT=${idSample}_${status}.md.bam + gatk --java-options -Xmx${task.memory.toGiga()}g \ + MarkDuplicates \ + --MAX_RECORDS_IN_RAM 50000 \ + --INPUT ${bam} \ + --METRICS_FILE ${bam}.metrics \ + --TMP_DIR . \ + --ASSUME_SORT_ORDER coordinate \ + --CREATE_INDEX true \ + --OUTPUT ${idSample}_${status}.md.bam """ } // Creating a TSV file to restart from this step markDuplicatesTSV.map { idPatient, status, idSample, bam, bai -> gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.nonRealigned}/${bam}\t${directoryMap.nonRealigned}/${bai}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.duplicateMarked}/${bam}\t${directoryMap.duplicateMarked}/${bai}\n" }.collectFile( - name: 'nonRealigned.tsv', sort: true, storeDir: directoryMap.nonRealigned + name: 'duplicateMarked.tsv', sort: true, storeDir: directoryMap.duplicateMarked ) -// Create intervals for realignement using both tumor+normal as input -// Group the marked duplicates BAMs for intervals and realign by idPatient -// Grouping also by gender, to make a nicer channel -duplicatesGrouped = Channel.empty() -if (step == 'mapping') duplicatesGrouped = duplicates.groupTuple() -else if (step == 'realign') duplicatesGrouped = bamFiles.map{ - idPatient, status, idSample, bam, bai -> - [idPatient, bam, bai] -}.groupTuple() - -// The duplicatesGrouped channel is duplicated -// one copy goes to the RealignerTargetCreator process -// and the other to the IndelRealigner process -(duplicatesInterval, duplicatesRealign) = duplicatesGrouped.into(2) - -if (params.verbose) duplicatesInterval = duplicatesInterval.view { - "BAMs for RealignerTargetCreator:\n\ - ID : ${it[0]}\n\ - Files : ${it[1].fileName}\n\ - Files : ${it[2].fileName}" -} - -if (params.verbose) duplicatesRealign = duplicatesRealign.view { - "BAMs to join:\n\ - ID : ${it[0]}\n\ - Files : ${it[1].fileName}\n\ - Files : ${it[2].fileName}" -} - -if (params.verbose) markDuplicatesReport = markDuplicatesReport.view { - "MarkDuplicates report:\n\ - File : [${it.fileName}]" -} - -// VCF indexes are added so they will be linked, and not re-created on the fly -// -L "1:131941-141339" \ - -process RealignerTargetCreator { - tag {idPatient} - - input: - set idPatient, file(bam), file(bai) from duplicatesInterval - set file(genomeFile), file(genomeIndex), file(genomeDict), file(knownIndels), file(knownIndelsIndex), file(intervals) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, - referenceMap.knownIndels, - referenceMap.knownIndelsIndex, - referenceMap.intervals - ]) - - output: - set idPatient, file("${idPatient}.intervals") into intervals - - when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC - - script: - bams = bam.collect{"-I ${it}"}.join(' ') - known = knownIndels.collect{"-known ${it}"}.join(' ') - """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T RealignerTargetCreator \ - ${bams} \ - -R ${genomeFile} \ - ${known} \ - -nt ${task.cpus} \ - -L ${intervals} \ - -o ${idPatient}.intervals - """ -} - -if (params.verbose) intervals = intervals.view { - "Intervals to join:\n\ - ID : ${it[0]}\n\ - File : [${it[1].fileName}]" -} - -bamsAndIntervals = duplicatesRealign.join(intervals) - -if (params.verbose) bamsAndIntervals = bamsAndIntervals.view { - "BAMs and Intervals joined for IndelRealigner:\n\ - ID : ${it[0]}\n\ - Files : ${it[1].fileName}\n\ - Files : ${it[2].fileName}\n\ - File : [${it[3].fileName}]" -} - -// use nWayOut to split into T/N pair again -process IndelRealigner { - tag {idPatient} - - publishDir directoryMap.nonRecalibrated, mode: 'link' - - input: - set idPatient, file(bam), file(bai), file(intervals) from bamsAndIntervals - set file(genomeFile), file(genomeIndex), file(genomeDict), file(knownIndels), file(knownIndelsIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, - referenceMap.knownIndels, - referenceMap.knownIndelsIndex]) - - output: - set idPatient, file("*.real.bam"), file("*.real.bai") into realignedBam mode flatten - - when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC - - script: - bams = bam.collect{"-I ${it}"}.join(' ') - known = knownIndels.collect{"-known ${it}"}.join(' ') - """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T IndelRealigner \ - ${bams} \ - -R ${genomeFile} \ - -targetIntervals ${intervals} \ - ${known} \ - -nWayOut '.real.bam' - """ -} - -realignedBam = realignedBam.map { +duplicateMarkedBams = duplicateMarkedBams.map { idPatient, bam, bai -> tag = bam.baseName.tokenize('.')[0] status = tag[-1..-1].toInteger() @@ -413,21 +287,21 @@ realignedBam = realignedBam.map { [idPatient, status, idSample, bam, bai] } -if (params.verbose) realignedBam = realignedBam.view { +if (params.verbose) duplicateMarkedBams = duplicateMarkedBams.view { "Realigned BAM to CreateRecalibrationTable:\n\ ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ Files : [${it[3].fileName}, ${it[4].fileName}]" } -(realignedBam, realignedBamToJoin) = realignedBam.into(2) +(mdBam, mdBamToJoin) = duplicateMarkedBams.into(2) process CreateRecalibrationTable { tag {idPatient + "-" + idSample} - publishDir directoryMap.nonRecalibrated, mode: 'link', overwrite: false + publishDir directoryMap.duplicateMarked, mode: 'link', overwrite: false input: - set idPatient, status, idSample, file(bam), file(bai) from realignedBam + set idPatient, status, idSample, file(bam), file(bai) from mdBam // realignedBam set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex), file(knownIndels), file(knownIndelsIndex), file(intervals) from Channel.value([ referenceMap.genomeFile, referenceMap.genomeIndex, @@ -441,37 +315,35 @@ process CreateRecalibrationTable { output: set idPatient, status, idSample, file("${idSample}.recal.table") into recalibrationTable - set idPatient, status, idSample, val("${idSample}_${status}.md.real.bam"), val("${idSample}_${status}.md.real.bai"), val("${idSample}.recal.table") into recalibrationTableTSV + set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai"), val("${idSample}.recal.table") into recalibrationTableTSV - when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC + when: ( step == 'mapping' ) && !params.onlyQC script: - known = knownIndels.collect{ "-knownSites ${it}" }.join(' ') + known = knownIndels.collect{ "--known-sites ${it}" }.join(' ') """ - java -Xmx${task.memory.toGiga()}g \ - -Djava.io.tmpdir="/tmp" \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T BaseRecalibrator \ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + BaseRecalibrator \ + --input ${bam} \ + --output ${idSample}.recal.table \ + --TMP_DIR /tmp \ -R ${genomeFile} \ - -I ${bam} \ -L ${intervals} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - -knownSites ${dbsnp} \ + --known-sites ${dbsnp} \ ${known} \ - -nct ${task.cpus} \ - -l INFO \ - -o ${idSample}.recal.table + --verbosity INFO """ } + // Create a TSV file to restart from this step recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.nonRecalibrated}/${bam}\t${directoryMap.nonRecalibrated}/${bai}\t${directoryMap.nonRecalibrated}/${recalTable}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.duplicateMarked}/${bam}\t${directoryMap.duplicateMarked}/${bai}\t${directoryMap.duplicateMarked}/${recalTable}\n" }.collectFile( - name: 'nonRecalibrated.tsv', sort: true, storeDir: directoryMap.nonRecalibrated + name: 'duplicateMarked.tsv', sort: true, storeDir: directoryMap.duplicateMarked ) -recalibrationTable = realignedBamToJoin.join(recalibrationTable, by:[0,1,2]) +recalibrationTable = mdBamToJoin.join(recalibrationTable, by:[0,1,2]) if (step == 'recalibrate') recalibrationTable = bamFiles @@ -508,20 +380,20 @@ process RecalibrateBam { set idPatient, status, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bai") into recalibratedBam, recalibratedBamForStats set idPatient, status, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into recalibratedBamTSV - // HaplotypeCaller can do BQSR on the fly, so do not create a + // GATK4 HaplotypeCaller can not do BQSR on the fly, so we have to create a // recalibrated BAM explicitly. - when: params.explicitBqsrNeeded && !params.onlyQC + when: !params.onlyQC script: """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T PrintReads \ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + ApplyBQSR \ -R ${genomeFile} \ - -I ${bam} \ + --input ${bam} \ + --output ${idSample}.recal.bam \ -L ${intervals} \ - --BQSR ${recalibrationReport} \ - -o ${idSample}.recal.bam + --create-output-bam-index true \ + --bqsr-recal-file ${recalibrationReport} """ } // Creating a TSV file to restart from this step @@ -622,17 +494,6 @@ process GetVersionGATK { script: QC.getVersionGATK() } -process GetVersionPicard { - publishDir directoryMap.version, mode: 'link' - output: file("v_*.txt") - when: step == 'mapping' && !params.onlyQC - - script: - """ - echo "Picard version:"\$(java -jar \$PICARD_HOME/picard.jar MarkDuplicates --version 2>&1) > v_picard.txt - """ -} - /* ================================================================================ = F U N C T I O N S = @@ -679,7 +540,6 @@ def defineReferenceMap() { def defineStepList() { return [ 'mapping', - 'realign', 'recalibrate' ] } @@ -812,7 +672,6 @@ def helpMessage() { log.info " Option to start workflow" log.info " Possible values are:" log.info " mapping (default, will start workflow with FASTQ files)" - log.info " realign (will start workflow with non-realigned BAM files)" log.info " recalibrate (will start workflow with non-recalibrated BAM files)" log.info " --noReports" log.info " Disable QC tools and MultiQC to generate a HTML report" diff --git a/nextflow.config b/nextflow.config index 9fdfe3ea38..4f6196a9fe 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,7 +14,7 @@ manifest { } env { - NXF_OPTS="-Xms1g -Xmx4g" + NXF_OPTS="-Xms1g" } profiles { diff --git a/scripts/do_all.sh b/scripts/do_all.sh index e4780958a1..d0e693ff57 100755 --- a/scripts/do_all.sh +++ b/scripts/do_all.sh @@ -57,8 +57,8 @@ function toLower() { if [[ $TOOL = docker ]] && [[ GRCh37,GRCh38 =~ $GENOME ]] then - nextflow run buildContainers.nf -profile ${PROFILE} --verbose --docker ${PUSH} --repository ${REPOSITORY} --tag ${TAG} --containers freebayes,gatk,gatk4,igvtools,mutect1,picard,qctools,r-base,runallelecount,sarek,snpeff + nextflow run buildContainers.nf -profile ${PROFILE} --verbose --docker ${PUSH} --repository ${REPOSITORY} --tag ${TAG} --containers r-base,runallelecount,sarek,snpeff nextflow run buildContainers.nf -profile ${PROFILE} --verbose --docker ${PUSH} --repository ${REPOSITORY} --tag ${TAG} --containers snpeff$(toLower ${GENOME}),vep$(toLower ${GENOME}) else - nextflow run buildContainers.nf -profile ${PROFILE} --verbose --singularity --repository ${REPOSITORY} --tag ${TAG} --containerPath containers/ --containers freebayes,gatk,gatk4,igvtools,mutect1,picard,qctools,r-base,runallelecount,sarek,snpeff$(toLower ${GENOME}),vep$(toLower ${GENOME}) + nextflow run buildContainers.nf -profile ${PROFILE} --verbose --singularity --repository ${REPOSITORY} --tag ${TAG} --containerPath containers/ --containers r-base,runallelecount,sarek,snpeff$(toLower ${GENOME}),vep$(toLower ${GENOME}) fi diff --git a/scripts/test.sh b/scripts/test.sh index ac45d10ba8..bcd017d1f1 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -9,6 +9,18 @@ SAMPLE=Sarek-data/testdata/tsv/tiny.tsv TEST=ALL TRAVIS=${TRAVIS:-false} +TMPDIR=`pwd`/tmp +mkdir -p $TMPDIR +export NXF_SINGULARITY_CACHEDIR=$TMPDIR +export NXF_TEMP=$TMPDIR + +export SINGULARITY_TMPDIR=$TMPDIR +export SINGULARITY_CACHEDIR=$TMPDIR + + +# remove Reference directory +rm -rf References + while [[ $# -gt 0 ]] do key=$1 @@ -91,7 +103,6 @@ fi if [[ ALL,STEP =~ $TEST ]] then run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal - run_wrapper --germline --step realign --noReports run_wrapper --germline --step recalibrate --noReports clean_repo fi @@ -104,7 +115,7 @@ fi if [[ ALL,TOOLS =~ $TEST ]] then - run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,MuTect1,MuTect2 + run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2 fi if [[ ALL,MANTA =~ $TEST ]] diff --git a/somaticVC.nf b/somaticVC.nf index c282b8467e..0e2347bb0e 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -29,10 +29,9 @@ kate: syntax groovy; space-indent on; indent-width 2; - RunSamtoolsStats - Run Samtools stats on recalibrated BAM files - RunBamQC - Run qualimap BamQC on recalibrated BAM files - CreateIntervalBeds - Create and sort intervals into bed files - - RunMutect1 - Run MuTect1 for Variant Calling (Parallelized processes) - RunMutect2 - Run MuTect2 for Variant Calling (Parallelized processes) - RunFreeBayes - Run FreeBayes for Variant Calling (Parallelized processes) - - ConcatVCF - Merge results from MuTect1 and MuTect2 + - ConcatVCF - Merge results from paralellized variant callers - RunStrelka - Run Strelka for Variant Calling - RunManta - Run Manta for Structural Variant Calling - RunSingleManta - Run Manta for Single Structural Variant Calling @@ -70,15 +69,12 @@ referenceMap = defineReferenceMap() toolList = defineToolList() if (!SarekUtils.checkReferenceMap(referenceMap)) exit 1, 'Missing Reference file(s), see --help for more information' -if (!checkParameterList(tools,toolList)) exit 1, 'Unknown tool(s), see --help for more information' +if (!SarekUtils.checkParameterList(tools,toolList)) exit 1, 'Unknown tool(s), see --help for more information' if (params.test && params.genome in ['GRCh37', 'GRCh38']) { referenceMap.intervals = file("$workflow.projectDir/repeats/tiny_${params.genome}.list") } -// TODO -// MuTect and Mutect2 could be run without a recalibrated BAM (they support -// the --BQSR option), but this is not implemented, yet. // TODO // FreeBayes does not need recalibrated BAMs, but we need to test whether // the channels are set up correctly when we disable it @@ -200,10 +196,9 @@ bamsTumor = bamsTumor.map { idPatient, status, idSample, bam, bai -> [idPatient, // We know that MuTect2 (and other somatic callers) are notoriously slow. // To speed them up we are chopping the reference into smaller pieces. -// (see repeats/centromeres.list). // Do variant calling by this intervals, and re-merge the VCFs. -// Since we are on a cluster, this can parallelize the variant call processes. -// And push down the variant call wall clock time significanlty. +// Since we are on a cluster or a multi-CPU machine, this can parallelize the +// variant call processes and push down the variant call wall clock time significanlty. process CreateIntervalBeds { tag {intervals.fileName} @@ -285,48 +280,10 @@ bamsAll = bamsAll.map { bamsTumorNormalIntervals = bamsAll.spread(bedIntervals) -// MuTect1, MuTect2, FreeBayes -(bamsFMT1, bamsFMT2, bamsFFB) = bamsTumorNormalIntervals.into(3) - -process RunMutect1 { - tag {idSampleTumor + "_vs_" + idSampleNormal + "-" + intervalBed.baseName} - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from bamsFMT1 - set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex), file(cosmic), file(cosmicIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, - referenceMap.dbsnp, - referenceMap.dbsnpIndex, - referenceMap.cosmic, - referenceMap.cosmicIndex - ]) - - output: - set val("mutect1"), idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into mutect1Output - - when: 'mutect1' in tools && !params.onlyQC - - script: - """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$MUTECT_HOME/muTect.jar \ - -T MuTect \ - -R ${genomeFile} \ - --cosmic ${cosmic} \ - --dbsnp ${dbsnp} \ - -I:normal ${bamNormal} \ - -I:tumor ${bamTumor} \ - -L ${intervalBed} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - --out ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.call_stats.out \ - --vcf ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf - """ -} - -mutect1Output = mutect1Output.groupTuple(by:[0,1,2,3]) +// MuTect2, FreeBayes +( bamsFMT2, bamsFFB) = bamsTumorNormalIntervals.into(3) +// This will give as a list of unfiltered calls for MuTect2. process RunMutect2 { tag {idSampleTumor + "_vs_" + idSampleNormal + "-" + intervalBed.baseName} @@ -349,19 +306,18 @@ process RunMutect2 { script: """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T MuTect2 \ - -R ${genomeFile} \ - --cosmic ${cosmic} \ - --dbsnp ${dbsnp} \ - -I:normal ${bamNormal} \ - -I:tumor ${bamTumor} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - -L ${intervalBed} \ - -o ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf + gatk --java-options "-Xmx${task.memory.toGiga()}g" \ + Mutect2 \ + -R ${genomeFile}\ + -I ${bamTumor} -tumor ${idSampleTumor} \ + -I ${bamNormal} -normal ${idSampleNormal} \ + -L ${intervalBed} \ + -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf """ } +// --germline_resource af-only-gnomad.vcf.gz \ +// --normal_panel pon.vcf.gz \ +// --dbsnp ${dbsnp} \ mutect2Output = mutect2Output.groupTuple(by:[0,1,2,3]) @@ -371,6 +327,7 @@ process RunFreeBayes { input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from bamsFFB file(genomeFile) from Channel.value(referenceMap.genomeFile) + file(genomeIndex) from Channel.value(referenceMap.genomeIndex) output: set val("freebayes"), idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into freebayesOutput @@ -400,7 +357,7 @@ freebayesOutput = freebayesOutput.groupTuple(by:[0,1,2,3]) // we are merging the VCFs that are called separatelly for different intervals // so we can have a single sorted VCF containing all the calls for a given caller -vcfsToMerge = mutect1Output.mix(mutect2Output, freebayesOutput) +vcfsToMerge = mutect2Output.mix(freebayesOutput) if (params.verbose) vcfsToMerge = vcfsToMerge.view { "VCFs To be merged:\n\ Tool : ${it[0]}\tID : ${it[1]}\tSample: [${it[3]}, ${it[2]}]\n\ @@ -419,12 +376,13 @@ process ConcatVCF { output: set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated - when: ('mutect1' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC + when: ( 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC script: outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" """ + set -euo pipefail # first make a header from one of the VCF intervals # get rid of interval information only from the GATK command-line, but leave the rest FIRSTVCF=\$(ls *.vcf | head -n 1) @@ -492,7 +450,7 @@ process RunStrelka { script: """ - \$STRELKA_INSTALL_PATH/bin/configureStrelkaSomaticWorkflow.py \ + configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ --normal ${bamNormal} \ --referenceFasta ${genomeFile} \ @@ -538,7 +496,7 @@ process RunManta { script: """ - \$MANTA_INSTALL_PATH/bin/configManta.py \ + configManta.py \ --normalBam ${bamNormal} \ --tumorBam ${bamTumor} \ --reference ${genomeFile} \ @@ -591,7 +549,7 @@ process RunSingleManta { script: """ - \$MANTA_INSTALL_PATH/bin/configManta.py \ + configManta.py \ --tumorBam ${bam} \ --reference ${genomeFile} \ --runDir Manta @@ -620,6 +578,7 @@ if (params.verbose) singleMantaOutput = singleMantaOutput.view { Index : ${it[4].fileName}" } +// Running Strelka Best Practice with Manta indel candidates // For easier joining, remaping channels to idPatient, idSampleNormal, idSampleTumor... bamsForStrelkaBP = bamsForStrelkaBP.map { @@ -650,7 +609,7 @@ process RunStrelkaBP { script: """ - \$STRELKA_INSTALL_PATH/bin/configureStrelkaSomaticWorkflow.py \ + configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ --normal ${bamNormal} \ --referenceFasta ${genomeFile} \ @@ -932,36 +891,11 @@ def checkParameterExistence(it, list) { return true } -def checkParameterList(list, realList) { - // Loop through all parameters to check their existence and spelling - return list.every{ checkParameterExistence(it, realList) } -} - def checkParamReturnFile(item) { params."${item}" = params.genomes[params.genome]."${item}" return file(params."${item}") } -def checkReferenceMap(referenceMap) { - // Loop through all the references files to check their existence - referenceMap.every { - referenceFile, fileToCheck -> - checkRefExistence(referenceFile, fileToCheck) - } -} - -def checkRefExistence(referenceFile, fileToCheck) { - if (fileToCheck instanceof List) return fileToCheck.every{ checkRefExistence(referenceFile, it) } - def f = file(fileToCheck) - // this is an expanded wildcard: we can assume all files exist - if (f instanceof List && f.size() > 0) return true - else if (!f.exists()) { - log.info "Missing references: ${referenceFile} ${fileToCheck}" - return false - } - return true -} - def checkUppmaxProject() { // check if UPPMAX project number is specified return !(workflow.profile == 'slurm' && !params.project) @@ -994,7 +928,6 @@ def defineToolList() { 'freebayes', 'haplotypecaller', 'manta', - 'mutect1', 'mutect2', 'strelka' ] @@ -1027,7 +960,6 @@ def helpMessage() { log.info " Option to configure which tools to use in the workflow." log.info " Different tools to be separated by commas." log.info " Possible values are:" - log.info " mutect1 (use MuTect1 for VC)" log.info " mutect2 (use MuTect2 for VC)" log.info " freebayes (use FreeBayes for VC)" log.info " strelka (use Strelka for VC)"