From c9b2cd91214c5f37bc53cc8bea0b2e2aea091bfb Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 5 Jul 2018 09:21:10 +0200 Subject: [PATCH 01/38] QR for posters --- doc/images/GitHub.QR.png | Bin 0 -> 2228 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 doc/images/GitHub.QR.png diff --git a/doc/images/GitHub.QR.png b/doc/images/GitHub.QR.png new file mode 100644 index 0000000000000000000000000000000000000000..f20bee47faa7666cd911fe613a3c1730da673cc2 GIT binary patch literal 2228 zcmeAS@N?(olHy`uVBq!ia0y~yVAKI&4mO}jWo=(6kYY)9^mSxl*x1kgCy^D%XDkkK zcVbv~PUa<$qmb+oH9cpCNftuUEy5>`3@c?cy z9=*8uc=;q&WS;^p!!QNt5V#=;KnEDz*iqM*@axyl30hz)EbEbi1Im2Pj|f7T<~1#> z-s}G!sI03788{CTuIEACElRNE2Zi-#uc@FevLZw{9fnBy2mi5?6uqm_Z z0412Ffg*-8;(*Ww77z`JDb)lDn<=4RC z0Tx_%>^r~|#vXBAiM{dp}>4^@+F5dv%+aA~};Vv&b>ZXva(T zOTVPRR;Jbhqlldg>{S_!*W6$yUjrujq6Lu5)3|9L*wEBk@dHu+FQolyEIgR}tvwGE zvxsy?fSFn#((#=={W5>fFV6ozXufH{I`LTu)j*_CfQX6ShHS<)Uk@Jq_VxeQ|NCo64pp%OUHdE6{$bkxm0N_AL;x%# zTMq^{&or{&f~PuQ$pH))SZDwp3oblBr3zSV%{Ts)aqM4zJ;?d}OWKDGqXs|ua}UQPObVf34% zSFo|z7FcgHsFt`!l%yn~>+BeM_#3o8S2D-&aF0|P4qgDYm? 
zJt!J-^HVa@DsgLAn*APRg9gZk;<9wBq{QM>-O{2=hP2F_R4aXb{gT`Q{oKU#%;ap{ bNXw)Uui!v;cI60tpdJQKS3j3^P6 Date: Thu, 5 Jul 2018 09:52:55 +0200 Subject: [PATCH 02/38] Dockerand singularity images for vcfanno --- containers/vcfanno/Dockerfile | 15 +++++++++++++++ containers/vcfanno/environment.yml | 9 +++++++++ containers/vcfanno/to_build | 5 +++++ 3 files changed, 29 insertions(+) create mode 100644 containers/vcfanno/Dockerfile create mode 100644 containers/vcfanno/environment.yml create mode 100644 containers/vcfanno/to_build diff --git a/containers/vcfanno/Dockerfile b/containers/vcfanno/Dockerfile new file mode 100644 index 0000000000..80a6473d02 --- /dev/null +++ b/containers/vcfanno/Dockerfile @@ -0,0 +1,15 @@ +FROM nfcore/base:latest + +LABEL \ + author="Szilveszter Juhos" \ + description="vcfanno Image for Sarek" \ + maintainer="szilveszter.juhos@scilifelab.se" + +COPY environment.yml / + +RUN \ + conda env create -f /environment.yml && \ + conda clean -a + + # Export PATH +ENV PATH /opt/conda/envs/sarek-vcfanno-2.0/bin:$PATH diff --git a/containers/vcfanno/environment.yml b/containers/vcfanno/environment.yml new file mode 100644 index 0000000000..bd8418268f --- /dev/null +++ b/containers/vcfanno/environment.yml @@ -0,0 +1,9 @@ +# You can use this file to create a conda environment: +# conda env create -f environment.yml +name: sarek-vcfanno-2.0 +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - vcfanno=0.2.8 diff --git a/containers/vcfanno/to_build b/containers/vcfanno/to_build new file mode 100644 index 0000000000..c2d06a77f9 --- /dev/null +++ b/containers/vcfanno/to_build @@ -0,0 +1,5 @@ +docker login szilvajuhos +docker build -t szilvajuhos/sarek-vcfanno:latest . 
+docker images +docker push szilvajuhos/sarek-vcfanno:latest +singularity pull docker://szilvajuhos/sarek-vcfanno:latest From 6f438adc792710c9c226790e479634965d666acf Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Fri, 6 Jul 2018 10:45:49 +0200 Subject: [PATCH 03/38] removing superfluous routines, adding comments --- annotate.nf | 12 ++++++++---- configuration/genomes.config | 3 +++ somaticVC.nf | 28 ++-------------------------- 3 files changed, 13 insertions(+), 30 deletions(-) diff --git a/annotate.nf b/annotate.nf index c1e085f633..23c6b92790 100644 --- a/annotate.nf +++ b/annotate.nf @@ -73,23 +73,23 @@ vcfToAnnotate = Channel.create() vcfNotToAnnotate = Channel.create() if (annotateVCF == []) { +// by default we annotate both germline and somatic results that we can find in the VariantCalling directory Channel.empty().mix( Channel.fromPath("${directoryMap.haplotypecaller}/*.vcf.gz") .flatten().map{vcf -> ['haplotypecaller', vcf]}, Channel.fromPath("${directoryMap.manta}/*SV.vcf.gz") .flatten().map{vcf -> ['manta', vcf]}, - Channel.fromPath("${directoryMap.mutect1}/*.vcf.gz") - .flatten().map{vcf -> ['mutect1', vcf]}, Channel.fromPath("${directoryMap.mutect2}/*.vcf.gz") .flatten().map{vcf -> ['mutect2', vcf]}, - Channel.fromPath("${directoryMap.strelka}/*{somatic,variants}*.vcf.gz") + Channel.fromPath("${directoryMap.strelka}/*{somatic,variants}*.vcf.gz") // Strelka only .flatten().map{vcf -> ['strelka', vcf]}, - Channel.fromPath("${directoryMap.strelkabp}/*{somatic,variants}*.vcf.gz") + Channel.fromPath("${directoryMap.strelkabp}/*{somatic,variants}*.vcf.gz") // Strelka with Manta indel candidates .flatten().map{vcf -> ['strelkabp', vcf]} ).choice(vcfToAnnotate, vcfNotToAnnotate) { annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 
0 : 1 } } else if (annotateTools == []) { +// alternatively, annotate user-submitted VCFs list = "" annotateVCF.each{ list += ",${it}" } list = list.substring(1) @@ -101,6 +101,10 @@ if (annotateVCF == []) { vcfNotToAnnotate.close() +// as now have the list of VCFs to annotate, the first step is to annotate with allele frequencies, if there are any + + + (vcfForBCFtools, vcfForVCFtools, vcfForSnpeff, vcfForVep) = vcfToAnnotate.into(4) vcfForVep = vcfForVep.map { diff --git a/configuration/genomes.config b/configuration/genomes.config index 56097e23fe..1653879a08 100644 --- a/configuration/genomes.config +++ b/configuration/genomes.config @@ -42,6 +42,9 @@ params { knownIndels = "${params.genome_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" knownIndelsIndex = "${params.genome_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" snpeffDb = "GRCh38.86" + // This a nasty-looking list of allele-frequencies files. 
Add/remove files to match to your sets + AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" + AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" } 'smallGRCh37' { acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci" diff --git a/somaticVC.nf b/somaticVC.nf index c282b8467e..b9a0da1352 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -70,7 +70,7 @@ referenceMap = defineReferenceMap() toolList = defineToolList() if (!SarekUtils.checkReferenceMap(referenceMap)) exit 1, 'Missing Reference file(s), see --help for more information' -if (!checkParameterList(tools,toolList)) exit 1, 'Unknown tool(s), see --help for more information' +if (!SarekUtils.checkParameterList(tools,toolList)) exit 1, 'Unknown tool(s), see --help for more information' if (params.test && params.genome in ['GRCh37', 'GRCh38']) { referenceMap.intervals = file("$workflow.projectDir/repeats/tiny_${params.genome}.list") @@ -620,6 +620,7 @@ if (params.verbose) singleMantaOutput = singleMantaOutput.view { Index : ${it[4].fileName}" } +// Running Strelka Best Practice with Manta indel candidates // For easier joining, remaping channels to idPatient, idSampleNormal, idSampleTumor... 
bamsForStrelkaBP = bamsForStrelkaBP.map { @@ -932,36 +933,11 @@ def checkParameterExistence(it, list) { return true } -def checkParameterList(list, realList) { - // Loop through all parameters to check their existence and spelling - return list.every{ checkParameterExistence(it, realList) } -} - def checkParamReturnFile(item) { params."${item}" = params.genomes[params.genome]."${item}" return file(params."${item}") } -def checkReferenceMap(referenceMap) { - // Loop through all the references files to check their existence - referenceMap.every { - referenceFile, fileToCheck -> - checkRefExistence(referenceFile, fileToCheck) - } -} - -def checkRefExistence(referenceFile, fileToCheck) { - if (fileToCheck instanceof List) return fileToCheck.every{ checkRefExistence(referenceFile, it) } - def f = file(fileToCheck) - // this is an expanded wildcard: we can assume all files exist - if (f instanceof List && f.size() > 0) return true - else if (!f.exists()) { - log.info "Missing references: ${referenceFile} ${fileToCheck}" - return false - } - return true -} - def checkUppmaxProject() { // check if UPPMAX project number is specified return !(workflow.profile == 'slurm' && !params.project) From e00872b0b726730711132c50690e6caef6cd4a23 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Fri, 6 Jul 2018 21:53:51 +0200 Subject: [PATCH 04/38] logging improvements --- lib/SarekUtils.groovy | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index 076a5300f9..33716f2297 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -1,6 +1,9 @@ import static nextflow.Nextflow.file import nextflow.Channel +import groovy.util.logging.Slf4j +@Grab('ch.qos.logback:logback-classic:1.2.1') +@Slf4j class SarekUtils { // Check file extension @@ -100,13 +103,14 @@ class SarekUtils { } // Loop through all the references files to check their existence + static def checkRefExistence(referenceFile, fileToCheck) { if 
(fileToCheck instanceof List) return fileToCheck.every{ SarekUtils.checkRefExistence(referenceFile, it) } def f = file(fileToCheck) // this is an expanded wildcard: we can assume all files exist if (f instanceof List && f.size() > 0) return true else if (!f.exists()) { - this.log.info "Missing references: ${referenceFile} ${fileToCheck}" + log.info "Missing references: ${referenceFile} ${fileToCheck}" return false } return true From f6c60bfd4f1413801d2dfd1575f07df26134f99b Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Fri, 6 Jul 2018 21:54:55 +0200 Subject: [PATCH 05/38] nfcore based image file containing vcfanno and qc modules also --- containers/sarek/Dockerfile | 21 +++++++++------------ containers/sarek/environment.yml | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 12 deletions(-) create mode 100644 containers/sarek/environment.yml diff --git a/containers/sarek/Dockerfile b/containers/sarek/Dockerfile index 7ef82bbd8a..fd4102563b 100644 --- a/containers/sarek/Dockerfile +++ b/containers/sarek/Dockerfile @@ -1,18 +1,15 @@ -FROM debian:8.9 +FROM nfcore/base:latest LABEL \ - author="Maxime Garcia" \ + author="Szilveszter Juhos" \ description="Image with tools used in Sarek" \ - maintainer="maxime.garcia@scilifelab.se" + maintainer="szilveszter.juhos@scilifelab.se" -# Export PATH -ENV \ - MANTA_INSTALL_PATH=/opt/manta \ - PATH=/opt/bcftools/bin:/opt/htslib/bin:/opt/samtools/bin:$PATH \ - STRELKA_INSTALL_PATH=/opt/strelka +COPY environment.yml / -# Build container -ADD \ - build.sh /usr/bin/ RUN \ - build.sh + conda env create -f /environment.yml && \ + conda clean -a + + # Export PATH +ENV PATH /opt/conda/envs/sarek-core/bin:$PATH diff --git a/containers/sarek/environment.yml b/containers/sarek/environment.yml new file mode 100644 index 0000000000..c0b720030f --- /dev/null +++ b/containers/sarek/environment.yml @@ -0,0 +1,22 @@ +# You can use this file to create a conda environment for this pipeline: +# conda env create -f environment.yml 
+name: sarek-core +channels: + - bioconda + - conda-forge + - defaults + +dependencies: + - conda-forge::openjdk=8.0.144 # Needed for FastQC docker - see bioconda/bioconda-recipes#5026 + - bwa=0.7.17 + - fastqc=0.11.7 + - samtools=1.8 + - gatk4=4.0.3.0 + - qualimap=2.2.2a + - multiqc=1.5 + - manta=1.3.0 + - strelka=2.9.3 + - bcftools=1.8 + - htslib=1.5 + - vcftools=0.1.15 + - vcfanno=0.2.8 From a3f102daaf091c56be6519328608d26fe0f6d990 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Mon, 9 Jul 2018 20:06:09 +0200 Subject: [PATCH 06/38] on a way to unify to nf-core and GATK4 --- configuration/containers.config | 16 +++++------ configuration/singularity-path.config | 38 +++++++++++++-------------- lib/QC.groovy | 2 +- main.nf | 31 +++++++--------------- 4 files changed, 37 insertions(+), 50 deletions(-) diff --git a/configuration/containers.config b/configuration/containers.config index 07b862c441..780a5a47a4 100644 --- a/configuration/containers.config +++ b/configuration/containers.config @@ -15,20 +15,20 @@ process { $CompressVCF.container = "${params.repository}/sarek:${params.tag}" $ConcatVCF.container = "${params.repository}/sarek:${params.tag}" $CreateRecalibrationTable.container = "${params.repository}/gatk:${params.tag}" - $GetVersionAll.container = "${params.repository}/qctools:${params.tag}" + $GetVersionAll.container = "${params.repository}/sarek:${params.tag}" $GetVersionAlleleCount.container = "${params.repository}/runallelecount:${params.tag}" $GetVersionASCAT.container = "${params.repository}/r-base:${params.tag}" - $GetVersionBamQC.container = "${params.repository}/qctools:${params.tag}" + $GetVersionBamQC.container = "${params.repository}/sarek:${params.tag}" $GetVersionBCFtools.container = "${params.repository}/sarek:${params.tag}" $GetVersionBWAsamtools.container = "${params.repository}/sarek:${params.tag}" - $GetVersionFastQC.container = "${params.repository}/qctools:${params.tag}" + $GetVersionFastQC.container = 
"${params.repository}/sarek:${params.tag}" $GetVersionFreeBayes.container = "${params.repository}/freebayes:${params.tag}" $GetVersionGATK.container = "${params.repository}/gatk:${params.tag}" $GetVersionManta.container = "${params.repository}/sarek:${params.tag}" $GetVersionPicard.container = "${params.repository}/picard:${params.tag}" $GetVersionSnpeff.container = {params.genome == 'GRCh38' ? "${params.repository}/snpeffgrch38:${params.tag}" : "${params.repository}/snpeffgrch37:${params.tag}"} $GetVersionStrelka.container = "${params.repository}/sarek:${params.tag}" - $GetVersionVCFtools.container = "${params.repository}/qctools:${params.tag}" + $GetVersionVCFtools.container = "${params.repository}/sarek:${params.tag}" $GetVersionVEP.container = {params.genome == 'GRCh38' ? "${params.repository}/vepgrch38:${params.tag}" : "${params.repository}/vepgrch37:${params.tag}"} $IndelRealigner.container = "${params.repository}/gatk:${params.tag}" $MapReads.container = "${params.repository}/sarek:${params.tag}" @@ -38,15 +38,15 @@ process { $RecalibrateBam.container = "${params.repository}/gatk:${params.tag}" $RunAlleleCount.container = "${params.repository}/runallelecount:${params.tag}" $RunAscat.container = "${params.repository}/r-base:${params.tag}" - $RunBamQC.container = "${params.repository}/qctools:${params.tag}" + $RunBamQC.container = "${params.repository}/sarek:${params.tag}" $RunBcftoolsStats.container = "${params.repository}/sarek:${params.tag}" $RunConvertAlleleCounts.container = "${params.repository}/r-base:${params.tag}" - $RunFastQC.container = "${params.repository}/qctools:${params.tag}" + $RunFastQC.container = "${params.repository}/sarek:${params.tag}" $RunFreeBayes.container = "${params.repository}/freebayes:${params.tag}" $RunGenotypeGVCFs.container = "${params.repository}/gatk:${params.tag}" $RunHaplotypecaller.container = "${params.repository}/gatk:${params.tag}" $RunManta.container = "${params.repository}/sarek:${params.tag}" - $RunMultiQC.container 
= "${params.repository}/qctools:${params.tag}" + $RunMultiQC.container = "${params.repository}/sarek:${params.tag}" $RunMutect1.container = "${params.repository}/mutect1:${params.tag}" $RunMutect2.container = "${params.repository}/gatk:${params.tag}" $RunSamtoolsStats.container = "${params.repository}/sarek:${params.tag}" @@ -55,6 +55,6 @@ process { $RunSnpeff.container = {params.genome == 'GRCh38' ? "${params.repository}/snpeffgrch38:${params.tag}" : "${params.repository}/snpeffgrch37:${params.tag}"} $RunStrelka.container = "${params.repository}/sarek:${params.tag}" $RunStrelkaBP.container = "${params.repository}/sarek:${params.tag}" - $RunVcftools.container = "${params.repository}/qctools:${params.tag}" + $RunVcftools.container = "${params.repository}/sarek:${params.tag}" $RunVEP.container = {params.genome == 'GRCh38' ? "${params.repository}/vepgrch38:${params.tag}" : "${params.repository}/vepgrch37:${params.tag}"} } diff --git a/configuration/singularity-path.config b/configuration/singularity-path.config index 7efea0292f..d67138e4d2 100644 --- a/configuration/singularity-path.config +++ b/configuration/singularity-path.config @@ -14,52 +14,50 @@ singularity { process { $BuildBWAindexes.container = "${params.containerPath}/sarek-${params.tag}.img" - $BuildPicardIndex.container = "${params.containerPath}/picard-${params.tag}.img" + $BuildPicardIndex.container = "${params.containerPath}/sarek-${params.tag}.img" $BuildSAMToolsIndex.container = "${params.containerPath}/sarek-${params.tag}.img" $BuildVCFIndex.container = "${params.containerPath}/igvtools-${params.tag}.img" $CompressVCF.container = "${params.containerPath}/sarek-${params.tag}.img" $ConcatVCF.container = "${params.containerPath}/sarek-${params.tag}.img" - $CreateRecalibrationTable.container = "${params.containerPath}/gatk-${params.tag}.img" - $GetVersionAll.container = "${params.containerPath}/qctools-${params.tag}.img" + $CreateRecalibrationTable.container = 
"${params.containerPath}/sarek-${params.tag}.img" + $GetVersionAll.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionAlleleCount.container = "${params.containerPath}/runallelecount-${params.tag}.img" $GetVersionASCAT.container = "${params.containerPath}/r-base-${params.tag}.img" - $GetVersionBamQC.container = "${params.containerPath}/qctools-${params.tag}.img" + $GetVersionBamQC.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionBCFtools.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionBWAsamtools.container = "${params.containerPath}/sarek-${params.tag}.img" - $GetVersionFastQC.container = "${params.containerPath}/qctools-${params.tag}.img" + $GetVersionFastQC.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionFreeBayes.container = "${params.containerPath}/freebayes-${params.tag}.img" - $GetVersionGATK.container = "${params.containerPath}/gatk-${params.tag}.img" + $GetVersionGATK.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionManta.container = "${params.containerPath}/sarek-${params.tag}.img" - $GetVersionPicard.container = "${params.containerPath}/picard-${params.tag}.img" $GetVersionSnpeff.container = {params.genome == 'GRCh38' ? "${params.containerPath}/snpeffgrch38-${params.tag}.img" : "${params.containerPath}/snpeffgrch37-${params.tag}.img"} $GetVersionStrelka.container = "${params.containerPath}/sarek-${params.tag}.img" - $GetVersionVCFtools.container = "${params.containerPath}/qctools-${params.tag}.img" + $GetVersionVCFtools.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionVEP.container = {params.genome == 'GRCh38' ? 
"${params.containerPath}/vepgrch38-${params.tag}.img" : "${params.containerPath}/vepgrch37-${params.tag}.img"} - $IndelRealigner.container = "${params.containerPath}/gatk-${params.tag}.img" + $IndelRealigner.container = "${params.containerPath}/sarek-${params.tag}.img" $MapReads.container = "${params.containerPath}/sarek-${params.tag}.img" - $MarkDuplicates.container = "${params.containerPath}/picard-${params.tag}.img" + $MarkDuplicates.container = "${params.containerPath}/sarek-${params.tag}.img" $MergeBams.container = "${params.containerPath}/sarek-${params.tag}.img" - $RealignerTargetCreator.container = "${params.containerPath}/gatk-${params.tag}.img" - $RecalibrateBam.container = "${params.containerPath}/gatk-${params.tag}.img" + $RealignerTargetCreator.container = "${params.containerPath}/sarek-${params.tag}.img" + $RecalibrateBam.container = "${params.containerPath}/sarek-${params.tag}.img" $RunAlleleCount.container = "${params.containerPath}/runallelecount-${params.tag}.img" $RunAscat.container = "${params.containerPath}/r-base-${params.tag}.img" - $RunBamQC.container = "${params.containerPath}/qctools-${params.tag}.img" + $RunBamQC.container = "${params.containerPath}/sarek-${params.tag}.img" $RunBcftoolsStats.container = "${params.containerPath}/sarek-${params.tag}.img" $RunConvertAlleleCounts.container = "${params.containerPath}/r-base-${params.tag}.img" - $RunFastQC.container = "${params.containerPath}/qctools-${params.tag}.img" + $RunFastQC.container = "${params.containerPath}/sarek-${params.tag}.img" $RunFreeBayes.container = "${params.containerPath}/freebayes-${params.tag}.img" - $RunGenotypeGVCFs.container = "${params.containerPath}/gatk-${params.tag}.img" - $RunHaplotypecaller.container = "${params.containerPath}/gatk-${params.tag}.img" + $RunGenotypeGVCFs.container = "${params.containerPath}/sarek-${params.tag}.img" + $RunHaplotypecaller.container = "${params.containerPath}/sarek-${params.tag}.img" $RunManta.container = 
"${params.containerPath}/sarek-${params.tag}.img" - $RunMultiQC.container = "${params.containerPath}/qctools-${params.tag}.img" - $RunMutect1.container = "${params.containerPath}/mutect1-${params.tag}.img" - $RunMutect2.container = "${params.containerPath}/gatk-${params.tag}.img" + $RunMultiQC.container = "${params.containerPath}/sarek-${params.tag}.img" + $RunMutect2.container = "${params.containerPath}/sarek-${params.tag}.img" $RunSamtoolsStats.container = "${params.containerPath}/sarek-${params.tag}.img" $RunSingleManta.container = "${params.containerPath}/sarek-${params.tag}.img" $RunSingleStrelka.container = "${params.containerPath}/sarek-${params.tag}.img" $RunSnpeff.container = {params.genome == 'GRCh38' ? "${params.containerPath}/snpeffgrch38-${params.tag}.img" : "${params.containerPath}/snpeffgrch37-${params.tag}.img"} $RunStrelka.container = "${params.containerPath}/sarek-${params.tag}.img" $RunStrelkaBP.container = "${params.containerPath}/sarek-${params.tag}.img" - $RunVcftools.container = "${params.containerPath}/qctools-${params.tag}.img" + $RunVcftools.container = "${params.containerPath}/sarek-${params.tag}.img" $RunVEP.container = {params.genome == 'GRCh38' ? 
"${params.containerPath}/vepgrch38-${params.tag}.img" : "${params.containerPath}/vepgrch37-${params.tag}.img"} } diff --git a/lib/QC.groovy b/lib/QC.groovy index 2b228a8b68..c4474fa3ed 100644 --- a/lib/QC.groovy +++ b/lib/QC.groovy @@ -59,7 +59,7 @@ class QC { // Get GATK version static def getVersionGATK() { """ - echo "GATK version"\$(java -jar \$GATK_HOME/GenomeAnalysisTK.jar --version 2>&1) > v_gatk.txt + gatk-launch WhyCantWeHaveVersionEasily 2>&1| awk -F/ '/java/{for(i=1;i<=NF;i++){if(\$i~/gatk4/){sub("gatk4-","",\$i);print \$i>"v_gatk.txt"}}}' """ } diff --git a/main.nf b/main.nf index f683d397b6..dd38b36910 100644 --- a/main.nf +++ b/main.nf @@ -29,7 +29,7 @@ kate: syntax groovy; space-indent on; indent-width 2; - RunFastQC - Run FastQC for QC on fastq files - MapReads - Map reads with BWA - MergeBams - Merge BAMs if multilane samples - - MarkDuplicates - Mark Duplicates with Picard + - MarkDuplicates - Mark Duplicates with GATK4 - RealignerTargetCreator - Create realignment target intervals - IndelRealigner - Realign BAMs as T/N pair - CreateRecalibrationTable - Create Recalibration Table with BaseRecalibrator @@ -262,15 +262,14 @@ process MarkDuplicates { script: """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$PICARD_HOME/picard.jar MarkDuplicates \ - INPUT=${bam} \ - METRICS_FILE=${bam}.metrics \ - TMP_DIR=. \ - ASSUME_SORTED=true \ - VALIDATION_STRINGENCY=LENIENT \ - CREATE_INDEX=TRUE \ - OUTPUT=${idSample}_${status}.md.bam + gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ + MarkDuplicates \ + --INPUT ${bam} \ + --METRICS_FILE ${bam}.metrics \ + --TMP_DIR . 
\ + --ASSUME_SORT_ORDER coordinate \ + --CREATE_INDEX true \ + --OUTPUT ${idSample}_${status}.md.bam """ } @@ -617,22 +616,12 @@ process GetVersionFastQC { process GetVersionGATK { publishDir directoryMap.version, mode: 'link' + errorStrategy 'ignore' // there is no error-free way to get version from GATK4 output: file("v_*.txt") when: !params.onlyQC script: QC.getVersionGATK() } -process GetVersionPicard { - publishDir directoryMap.version, mode: 'link' - output: file("v_*.txt") - when: step == 'mapping' && !params.onlyQC - - script: - """ - echo "Picard version:"\$(java -jar \$PICARD_HOME/picard.jar MarkDuplicates --version 2>&1) > v_picard.txt - """ -} - /* ================================================================================ = F U N C T I O N S = From 576c1713527fb3c23bece1b70b1dc11d2647252e Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Mon, 9 Jul 2018 21:04:00 +0200 Subject: [PATCH 07/38] It works without indelrealignment until CreateRecalibrationTable --- main.nf | 390 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 201 insertions(+), 189 deletions(-) diff --git a/main.nf b/main.nf index dd38b36910..407d337dbb 100644 --- a/main.nf +++ b/main.nf @@ -254,7 +254,7 @@ process MarkDuplicates { set idPatient, status, idSample, file(bam) from mergedBam output: - set idPatient, file("${idSample}_${status}.md.bam"), file("${idSample}_${status}.md.bai") into duplicates + set idPatient, file("${idSample}_${status}.md.bam"), file("${idSample}_${status}.md.bai") into duplicateMarkedBams set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai") into markDuplicatesTSV file ("${bam}.metrics") into markDuplicatesReport @@ -281,130 +281,144 @@ markDuplicatesTSV.map { idPatient, status, idSample, bam, bai -> name: 'nonRealigned.tsv', sort: true, storeDir: directoryMap.nonRealigned ) -// Create intervals for realignement using both tumor+normal as input -// Group the marked duplicates 
BAMs for intervals and realign by idPatient -// Grouping also by gender, to make a nicer channel -duplicatesGrouped = Channel.empty() -if (step == 'mapping') duplicatesGrouped = duplicates.groupTuple() -else if (step == 'realign') duplicatesGrouped = bamFiles.map{ - idPatient, status, idSample, bam, bai -> - [idPatient, bam, bai] -}.groupTuple() - -// The duplicatesGrouped channel is duplicated -// one copy goes to the RealignerTargetCreator process -// and the other to the IndelRealigner process -(duplicatesInterval, duplicatesRealign) = duplicatesGrouped.into(2) - -if (params.verbose) duplicatesInterval = duplicatesInterval.view { - "BAMs for RealignerTargetCreator:\n\ - ID : ${it[0]}\n\ - Files : ${it[1].fileName}\n\ - Files : ${it[2].fileName}" -} - -if (params.verbose) duplicatesRealign = duplicatesRealign.view { - "BAMs to join:\n\ - ID : ${it[0]}\n\ - Files : ${it[1].fileName}\n\ - Files : ${it[2].fileName}" -} - -if (params.verbose) markDuplicatesReport = markDuplicatesReport.view { - "MarkDuplicates report:\n\ - File : [${it.fileName}]" -} - -// VCF indexes are added so they will be linked, and not re-created on the fly -// -L "1:131941-141339" \ - -process RealignerTargetCreator { - tag {idPatient} - - input: - set idPatient, file(bam), file(bai) from duplicatesInterval - set file(genomeFile), file(genomeIndex), file(genomeDict), file(knownIndels), file(knownIndelsIndex), file(intervals) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, - referenceMap.knownIndels, - referenceMap.knownIndelsIndex, - referenceMap.intervals - ]) - - output: - set idPatient, file("${idPatient}.intervals") into intervals - - when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC - - script: - bams = bam.collect{"-I ${it}"}.join(' ') - known = knownIndels.collect{"-known ${it}"}.join(' ') - """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T RealignerTargetCreator \ - ${bams} \ 
- -R ${genomeFile} \ - ${known} \ - -nt ${task.cpus} \ - -L ${intervals} \ - -o ${idPatient}.intervals - """ -} - -if (params.verbose) intervals = intervals.view { - "Intervals to join:\n\ - ID : ${it[0]}\n\ - File : [${it[1].fileName}]" -} - -bamsAndIntervals = duplicatesRealign.join(intervals) - -if (params.verbose) bamsAndIntervals = bamsAndIntervals.view { - "BAMs and Intervals joined for IndelRealigner:\n\ - ID : ${it[0]}\n\ - Files : ${it[1].fileName}\n\ - Files : ${it[2].fileName}\n\ - File : [${it[3].fileName}]" -} - -// use nWayOut to split into T/N pair again -process IndelRealigner { - tag {idPatient} - - publishDir directoryMap.nonRecalibrated, mode: 'link' - - input: - set idPatient, file(bam), file(bai), file(intervals) from bamsAndIntervals - set file(genomeFile), file(genomeIndex), file(genomeDict), file(knownIndels), file(knownIndelsIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, - referenceMap.knownIndels, - referenceMap.knownIndelsIndex]) - - output: - set idPatient, file("*.real.bam"), file("*.real.bai") into realignedBam mode flatten - - when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC - - script: - bams = bam.collect{"-I ${it}"}.join(' ') - known = knownIndels.collect{"-known ${it}"}.join(' ') - """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T IndelRealigner \ - ${bams} \ - -R ${genomeFile} \ - -targetIntervals ${intervals} \ - ${known} \ - -nWayOut '.real.bam' - """ -} - -realignedBam = realignedBam.map { +//// Create intervals for realignement using both tumor+normal as input +//// Group the marked duplicates BAMs for intervals and realign by idPatient +//// Grouping also by gender, to make a nicer channel +//duplicatesGrouped = Channel.empty() +//if (step == 'mapping') duplicatesGrouped = duplicates.groupTuple() +//else if (step == 'realign') duplicatesGrouped = bamFiles.map{ +// idPatient, status, idSample, bam, bai -> 
+// [idPatient, bam, bai] +//}.groupTuple() +// +//// The duplicatesGrouped channel is duplicated +//// one copy goes to the RealignerTargetCreator process +//// and the other to the IndelRealigner process +//(duplicatesInterval, duplicatesRealign) = duplicatesGrouped.into(2) +// +//if (params.verbose) duplicatesInterval = duplicatesInterval.view { +// "BAMs for RealignerTargetCreator:\n\ +// ID : ${it[0]}\n\ +// Files : ${it[1].fileName}\n\ +// Files : ${it[2].fileName}" +//} +// +//if (params.verbose) duplicatesRealign = duplicatesRealign.view { +// "BAMs to join:\n\ +// ID : ${it[0]}\n\ +// Files : ${it[1].fileName}\n\ +// Files : ${it[2].fileName}" +//} +// +//if (params.verbose) markDuplicatesReport = markDuplicatesReport.view { +// "MarkDuplicates report:\n\ +// File : [${it.fileName}]" +//} +// +//// VCF indexes are added so they will be linked, and not re-created on the fly +//// -L "1:131941-141339" \ +// +//process RealignerTargetCreator { +// tag {idPatient} +// +// input: +// set idPatient, file(bam), file(bai) from duplicatesInterval +// set file(genomeFile), file(genomeIndex), file(genomeDict), file(knownIndels), file(knownIndelsIndex), file(intervals) from Channel.value([ +// referenceMap.genomeFile, +// referenceMap.genomeIndex, +// referenceMap.genomeDict, +// referenceMap.knownIndels, +// referenceMap.knownIndelsIndex, +// referenceMap.intervals +// ]) +// +// output: +// set idPatient, file("${idPatient}.intervals") into intervals +// +// when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC +// +// script: +// bams = bam.collect{"-I ${it}"}.join(' ') +// known = knownIndels.collect{"-known ${it}"}.join(' ') +// """ +// gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ +// RealignerTargetCreator \ +// ${bams} \ +// -R ${genomeFile} \ +// ${known} \ +// -nt ${task.cpus} \ +// -L ${intervals} \ +// -o ${idPatient}.intervals +// """ +//} +// +//if (params.verbose) intervals = intervals.view { +// "Intervals to join:\n\ +// ID : 
${it[0]}\n\ +// File : [${it[1].fileName}]" +//} +// +//bamsAndIntervals = duplicatesRealign.join(intervals) +// +//if (params.verbose) bamsAndIntervals = bamsAndIntervals.view { +// "BAMs and Intervals joined for IndelRealigner:\n\ +// ID : ${it[0]}\n\ +// Files : ${it[1].fileName}\n\ +// Files : ${it[2].fileName}\n\ +// File : [${it[3].fileName}]" +//} +// +//// use nWayOut to split into T/N pair again +//process IndelRealigner { +// tag {idPatient} +// +// publishDir directoryMap.nonRecalibrated, mode: 'link' +// +// input: +// set idPatient, file(bam), file(bai), file(intervals) from bamsAndIntervals +// set file(genomeFile), file(genomeIndex), file(genomeDict), file(knownIndels), file(knownIndelsIndex) from Channel.value([ +// referenceMap.genomeFile, +// referenceMap.genomeIndex, +// referenceMap.genomeDict, +// referenceMap.knownIndels, +// referenceMap.knownIndelsIndex]) +// +// output: +// set idPatient, file("*.real.bam"), file("*.real.bai") into realignedBam mode flatten +// +// when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC +// +// script: +// bams = bam.collect{"-I ${it}"}.join(' ') +// known = knownIndels.collect{"-known ${it}"}.join(' ') +// """ +// java -Xmx${task.memory.toGiga()}g \ +// -jar \$GATK_HOME/GenomeAnalysisTK.jar \ +// -T IndelRealigner \ +// ${bams} \ +// -R ${genomeFile} \ +// -targetIntervals ${intervals} \ +// ${known} \ +// -nWayOut '.real.bam' +// """ +//} +// +//realignedBam = realignedBam.map { +// idPatient, bam, bai -> +// tag = bam.baseName.tokenize('.')[0] +// status = tag[-1..-1].toInteger() +// idSample = tag.take(tag.length()-2) +// [idPatient, status, idSample, bam, bai] +//} +// +//if (params.verbose) realignedBam = realignedBam.view { +// "Realigned BAM to CreateRecalibrationTable:\n\ +// ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ +// Files : [${it[3].fileName}, ${it[4].fileName}]" +//} +// +//(realignedBam, realignedBamToJoin) = realignedBam.into(2) +duplicateMarkedBams = 
duplicateMarkedBams.map { idPatient, bam, bai -> tag = bam.baseName.tokenize('.')[0] status = tag[-1..-1].toInteger() @@ -412,13 +426,13 @@ realignedBam = realignedBam.map { [idPatient, status, idSample, bam, bai] } -if (params.verbose) realignedBam = realignedBam.view { +if (params.verbose) duplicateMarkedBams = duplicateMarkedBams.view { "Realigned BAM to CreateRecalibrationTable:\n\ ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ Files : [${it[3].fileName}, ${it[4].fileName}]" } -(realignedBam, realignedBamToJoin) = realignedBam.into(2) +(mdBam, mdBamToJoin) = duplicateMarkedBams.into(2) process CreateRecalibrationTable { tag {idPatient + "-" + idSample} @@ -426,7 +440,7 @@ process CreateRecalibrationTable { publishDir directoryMap.nonRecalibrated, mode: 'link', overwrite: false input: - set idPatient, status, idSample, file(bam), file(bai) from realignedBam + set idPatient, status, idSample, file(bam), file(bai) from mdBam // realignedBam set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex), file(knownIndels), file(knownIndelsIndex), file(intervals) from Channel.value([ referenceMap.genomeFile, referenceMap.genomeIndex, @@ -445,23 +459,21 @@ process CreateRecalibrationTable { when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC script: - known = knownIndels.collect{ "-knownSites ${it}" }.join(' ') + known = knownIndels.collect{ "--known-sites ${it}" }.join(' ') """ - java -Xmx${task.memory.toGiga()}g \ - -Djava.io.tmpdir="/tmp" \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T BaseRecalibrator \ + gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ + BaseRecalibrator \ + --input ${bam} \ + --output ${idSample}.recal.table \ + --TMP_DIR /tmp \ -R ${genomeFile} \ - -I ${bam} \ -L ${intervals} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - -knownSites ${dbsnp} \ + --known-sites ${dbsnp} \ ${known} \ - -nct ${task.cpus} \ - -l INFO \ - -o ${idSample}.recal.table + --verbosity INFO """ 
} + // Create a TSV file to restart from this step recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> gender = patientGenders[idPatient] @@ -470,7 +482,7 @@ recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> name: 'nonRecalibrated.tsv', sort: true, storeDir: directoryMap.nonRecalibrated ) -recalibrationTable = realignedBamToJoin.join(recalibrationTable, by:[0,1,2]) +recalibrationTable = mdBamToJoin.join(recalibrationTable, by:[0,1,2]) if (step == 'recalibrate') recalibrationTable = bamFiles @@ -488,55 +500,55 @@ bamForBamQC = bamForBamQC.map { it[0..4] } bamForSamToolsStats = bamForSamToolsStats.map{ it[0..4] } recalTables = recalTables.map { [it[0]] + it[2..-1] } // remove status - -process RecalibrateBam { - tag {idPatient + "-" + idSample} - - publishDir directoryMap.recalibrated, mode: 'link' - - input: - set idPatient, status, idSample, file(bam), file(bai), file(recalibrationReport) from recalibrationTable - set file(genomeFile), file(genomeIndex), file(genomeDict), file(intervals) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, - referenceMap.intervals, - ]) - - output: - set idPatient, status, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bai") into recalibratedBam, recalibratedBamForStats - set idPatient, status, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into recalibratedBamTSV - - // HaplotypeCaller can do BQSR on the fly, so do not create a - // recalibrated BAM explicitly. 
- when: params.explicitBqsrNeeded && !params.onlyQC - - script: - """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T PrintReads \ - -R ${genomeFile} \ - -I ${bam} \ - -L ${intervals} \ - --BQSR ${recalibrationReport} \ - -o ${idSample}.recal.bam - """ -} -// Creating a TSV file to restart from this step -recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> - gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.recalibrated}/${bam}\t${directoryMap.recalibrated}/${bai}\n" -}.collectFile( - name: 'recalibrated.tsv', sort: true, storeDir: directoryMap.recalibrated -) - -if (params.verbose) recalibratedBam = recalibratedBam.view { - "Recalibrated BAM for variant Calling:\n\ - ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ - Files : [${it[3].fileName}, ${it[4].fileName}]" -} - +// +//process RecalibrateBam { +// tag {idPatient + "-" + idSample} +// +// publishDir directoryMap.recalibrated, mode: 'link' +// +// input: +// set idPatient, status, idSample, file(bam), file(bai), file(recalibrationReport) from recalibrationTable +// set file(genomeFile), file(genomeIndex), file(genomeDict), file(intervals) from Channel.value([ +// referenceMap.genomeFile, +// referenceMap.genomeIndex, +// referenceMap.genomeDict, +// referenceMap.intervals, +// ]) +// +// output: +// set idPatient, status, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bai") into recalibratedBam, recalibratedBamForStats +// set idPatient, status, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into recalibratedBamTSV +// +// // HaplotypeCaller can do BQSR on the fly, so do not create a +// // recalibrated BAM explicitly. 
+// when: params.explicitBqsrNeeded && !params.onlyQC +// +// script: +// """ +// java -Xmx${task.memory.toGiga()}g \ +// -jar \$GATK_HOME/GenomeAnalysisTK.jar \ +// -T PrintReads \ +// -R ${genomeFile} \ +// -I ${bam} \ +// -L ${intervals} \ +// --BQSR ${recalibrationReport} \ +// -o ${idSample}.recal.bam +// """ +//} +//// Creating a TSV file to restart from this step +//recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> +// gender = patientGenders[idPatient] +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.recalibrated}/${bam}\t${directoryMap.recalibrated}/${bai}\n" +//}.collectFile( +// name: 'recalibrated.tsv', sort: true, storeDir: directoryMap.recalibrated +//) +// +//if (params.verbose) recalibratedBam = recalibratedBam.view { +// "Recalibrated BAM for variant Calling:\n\ +// ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ +// Files : [${it[3].fileName}, ${it[4].fileName}]" +//} +// process RunSamtoolsStats { tag {idPatient + "-" + idSample} From 0aea380b8b9ba7a35e9e22defc5b27c48d6eaaa4 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Mon, 9 Jul 2018 22:07:41 +0200 Subject: [PATCH 08/38] new target directories for Prerocessing, works to the end of ApplyBQSR --- lib/QC.groovy | 2 +- lib/SarekUtils.groovy | 3 +- main.nf | 251 ++++++++++-------------------------------- 3 files changed, 58 insertions(+), 198 deletions(-) diff --git a/lib/QC.groovy b/lib/QC.groovy index c4474fa3ed..5ee03b83cf 100644 --- a/lib/QC.groovy +++ b/lib/QC.groovy @@ -59,7 +59,7 @@ class QC { // Get GATK version static def getVersionGATK() { """ - gatk-launch WhyCantWeHaveVersionEasily 2>&1| awk -F/ '/java/{for(i=1;i<=NF;i++){if(\$i~/gatk4/){sub("gatk4-","",\$i);print \$i>"v_gatk.txt"}}}' + gatk-launch ApplyBQSR --help 2>&1| awk -F/ '/java/{for(i=1;i<=NF;i++){if(\$i~/gatk4/){sub("gatk4-","",\$i);print \$i>"v_gatk.txt"}}}' """ } diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index 33716f2297..a21ec442bb 100644 --- 
a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -119,8 +119,7 @@ class SarekUtils { // Define map of directories static def defineDirectoryMap(outDir) { return [ - 'nonRealigned' : "${outDir}/Preprocessing/NonRealigned", - 'nonRecalibrated' : "${outDir}/Preprocessing/NonRecalibrated", + 'duplicateMarked' : "${outDir}/Preprocessing/DuplicateMarked", 'recalibrated' : "${outDir}/Preprocessing/Recalibrated", 'ascat' : "${outDir}/VariantCalling/Ascat", 'freebayes' : "${outDir}/VariantCalling/FreeBayes", diff --git a/main.nf b/main.nf index 407d337dbb..f2e137fb13 100644 --- a/main.nf +++ b/main.nf @@ -83,8 +83,7 @@ if (params.sample) tsvPath = params.sample if (!params.sample && !params.sampleDir) { tsvPaths = [ 'mapping': "${workflow.projectDir}/Sarek-data/testdata/tsv/tiny.tsv", - 'realign': "${directoryMap.nonRealigned}/nonRealigned.tsv", - 'recalibrate': "${directoryMap.nonRecalibrated}/nonRecalibrated.tsv" + 'recalibrate': "${directoryMap.duplicateMarked}/duplicateMarked.tsv" ] if (params.test || step != 'mapping') tsvPath = tsvPaths[step] } @@ -247,7 +246,7 @@ process MarkDuplicates { publishDir params.outDir, mode: 'link', saveAs: { if (it == "${bam}.metrics") "${directoryMap.markDuplicatesQC}/${it}" - else "${directoryMap.nonRealigned}/${it}" + else "${directoryMap.duplicateMarked}/${it}" } input: @@ -276,148 +275,11 @@ process MarkDuplicates { // Creating a TSV file to restart from this step markDuplicatesTSV.map { idPatient, status, idSample, bam, bai -> gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.nonRealigned}/${bam}\t${directoryMap.nonRealigned}/${bai}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.duplicateMarked}/${bam}\t${directoryMap.duplicateMarked}/${bai}\n" }.collectFile( - name: 'nonRealigned.tsv', sort: true, storeDir: directoryMap.nonRealigned + name: 'duplicateMarked.tsv', sort: true, storeDir: directoryMap.duplicateMarked ) -//// Create intervals for 
realignement using both tumor+normal as input -//// Group the marked duplicates BAMs for intervals and realign by idPatient -//// Grouping also by gender, to make a nicer channel -//duplicatesGrouped = Channel.empty() -//if (step == 'mapping') duplicatesGrouped = duplicates.groupTuple() -//else if (step == 'realign') duplicatesGrouped = bamFiles.map{ -// idPatient, status, idSample, bam, bai -> -// [idPatient, bam, bai] -//}.groupTuple() -// -//// The duplicatesGrouped channel is duplicated -//// one copy goes to the RealignerTargetCreator process -//// and the other to the IndelRealigner process -//(duplicatesInterval, duplicatesRealign) = duplicatesGrouped.into(2) -// -//if (params.verbose) duplicatesInterval = duplicatesInterval.view { -// "BAMs for RealignerTargetCreator:\n\ -// ID : ${it[0]}\n\ -// Files : ${it[1].fileName}\n\ -// Files : ${it[2].fileName}" -//} -// -//if (params.verbose) duplicatesRealign = duplicatesRealign.view { -// "BAMs to join:\n\ -// ID : ${it[0]}\n\ -// Files : ${it[1].fileName}\n\ -// Files : ${it[2].fileName}" -//} -// -//if (params.verbose) markDuplicatesReport = markDuplicatesReport.view { -// "MarkDuplicates report:\n\ -// File : [${it.fileName}]" -//} -// -//// VCF indexes are added so they will be linked, and not re-created on the fly -//// -L "1:131941-141339" \ -// -//process RealignerTargetCreator { -// tag {idPatient} -// -// input: -// set idPatient, file(bam), file(bai) from duplicatesInterval -// set file(genomeFile), file(genomeIndex), file(genomeDict), file(knownIndels), file(knownIndelsIndex), file(intervals) from Channel.value([ -// referenceMap.genomeFile, -// referenceMap.genomeIndex, -// referenceMap.genomeDict, -// referenceMap.knownIndels, -// referenceMap.knownIndelsIndex, -// referenceMap.intervals -// ]) -// -// output: -// set idPatient, file("${idPatient}.intervals") into intervals -// -// when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC -// -// script: -// bams = bam.collect{"-I 
${it}"}.join(' ') -// known = knownIndels.collect{"-known ${it}"}.join(' ') -// """ -// gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ -// RealignerTargetCreator \ -// ${bams} \ -// -R ${genomeFile} \ -// ${known} \ -// -nt ${task.cpus} \ -// -L ${intervals} \ -// -o ${idPatient}.intervals -// """ -//} -// -//if (params.verbose) intervals = intervals.view { -// "Intervals to join:\n\ -// ID : ${it[0]}\n\ -// File : [${it[1].fileName}]" -//} -// -//bamsAndIntervals = duplicatesRealign.join(intervals) -// -//if (params.verbose) bamsAndIntervals = bamsAndIntervals.view { -// "BAMs and Intervals joined for IndelRealigner:\n\ -// ID : ${it[0]}\n\ -// Files : ${it[1].fileName}\n\ -// Files : ${it[2].fileName}\n\ -// File : [${it[3].fileName}]" -//} -// -//// use nWayOut to split into T/N pair again -//process IndelRealigner { -// tag {idPatient} -// -// publishDir directoryMap.nonRecalibrated, mode: 'link' -// -// input: -// set idPatient, file(bam), file(bai), file(intervals) from bamsAndIntervals -// set file(genomeFile), file(genomeIndex), file(genomeDict), file(knownIndels), file(knownIndelsIndex) from Channel.value([ -// referenceMap.genomeFile, -// referenceMap.genomeIndex, -// referenceMap.genomeDict, -// referenceMap.knownIndels, -// referenceMap.knownIndelsIndex]) -// -// output: -// set idPatient, file("*.real.bam"), file("*.real.bai") into realignedBam mode flatten -// -// when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC -// -// script: -// bams = bam.collect{"-I ${it}"}.join(' ') -// known = knownIndels.collect{"-known ${it}"}.join(' ') -// """ -// java -Xmx${task.memory.toGiga()}g \ -// -jar \$GATK_HOME/GenomeAnalysisTK.jar \ -// -T IndelRealigner \ -// ${bams} \ -// -R ${genomeFile} \ -// -targetIntervals ${intervals} \ -// ${known} \ -// -nWayOut '.real.bam' -// """ -//} -// -//realignedBam = realignedBam.map { -// idPatient, bam, bai -> -// tag = bam.baseName.tokenize('.')[0] -// status = tag[-1..-1].toInteger() -// idSample = 
tag.take(tag.length()-2) -// [idPatient, status, idSample, bam, bai] -//} -// -//if (params.verbose) realignedBam = realignedBam.view { -// "Realigned BAM to CreateRecalibrationTable:\n\ -// ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ -// Files : [${it[3].fileName}, ${it[4].fileName}]" -//} -// -//(realignedBam, realignedBamToJoin) = realignedBam.into(2) duplicateMarkedBams = duplicateMarkedBams.map { idPatient, bam, bai -> tag = bam.baseName.tokenize('.')[0] @@ -437,7 +299,7 @@ if (params.verbose) duplicateMarkedBams = duplicateMarkedBams.view { process CreateRecalibrationTable { tag {idPatient + "-" + idSample} - publishDir directoryMap.nonRecalibrated, mode: 'link', overwrite: false + publishDir directoryMap.duplicateMarked, mode: 'link', overwrite: false input: set idPatient, status, idSample, file(bam), file(bai) from mdBam // realignedBam @@ -477,9 +339,9 @@ process CreateRecalibrationTable { // Create a TSV file to restart from this step recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.nonRecalibrated}/${bam}\t${directoryMap.nonRecalibrated}/${bai}\t${directoryMap.nonRecalibrated}/${recalTable}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.duplicateMarked}/${bam}\t${directoryMap.duplicateMarked}/${bai}\t${directoryMap.duplicateMarked}/${recalTable}\n" }.collectFile( - name: 'nonRecalibrated.tsv', sort: true, storeDir: directoryMap.nonRecalibrated + name: 'duplicateMarked.tsv', sort: true, storeDir: directoryMap.duplicateMarked ) recalibrationTable = mdBamToJoin.join(recalibrationTable, by:[0,1,2]) @@ -500,55 +362,55 @@ bamForBamQC = bamForBamQC.map { it[0..4] } bamForSamToolsStats = bamForSamToolsStats.map{ it[0..4] } recalTables = recalTables.map { [it[0]] + it[2..-1] } // remove status -// -//process RecalibrateBam { -// tag {idPatient + "-" + idSample} -// -// publishDir 
directoryMap.recalibrated, mode: 'link' -// -// input: -// set idPatient, status, idSample, file(bam), file(bai), file(recalibrationReport) from recalibrationTable -// set file(genomeFile), file(genomeIndex), file(genomeDict), file(intervals) from Channel.value([ -// referenceMap.genomeFile, -// referenceMap.genomeIndex, -// referenceMap.genomeDict, -// referenceMap.intervals, -// ]) -// -// output: -// set idPatient, status, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bai") into recalibratedBam, recalibratedBamForStats -// set idPatient, status, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into recalibratedBamTSV -// -// // HaplotypeCaller can do BQSR on the fly, so do not create a -// // recalibrated BAM explicitly. -// when: params.explicitBqsrNeeded && !params.onlyQC -// -// script: -// """ -// java -Xmx${task.memory.toGiga()}g \ -// -jar \$GATK_HOME/GenomeAnalysisTK.jar \ -// -T PrintReads \ -// -R ${genomeFile} \ -// -I ${bam} \ -// -L ${intervals} \ -// --BQSR ${recalibrationReport} \ -// -o ${idSample}.recal.bam -// """ -//} -//// Creating a TSV file to restart from this step -//recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> -// gender = patientGenders[idPatient] -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.recalibrated}/${bam}\t${directoryMap.recalibrated}/${bai}\n" -//}.collectFile( -// name: 'recalibrated.tsv', sort: true, storeDir: directoryMap.recalibrated -//) -// -//if (params.verbose) recalibratedBam = recalibratedBam.view { -// "Recalibrated BAM for variant Calling:\n\ -// ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ -// Files : [${it[3].fileName}, ${it[4].fileName}]" -//} -// + +process RecalibrateBam { + tag {idPatient + "-" + idSample} + + publishDir directoryMap.recalibrated, mode: 'link' + + input: + set idPatient, status, idSample, file(bam), file(bai), file(recalibrationReport) from recalibrationTable + set file(genomeFile), file(genomeIndex), 
file(genomeDict), file(intervals) from Channel.value([ + referenceMap.genomeFile, + referenceMap.genomeIndex, + referenceMap.genomeDict, + referenceMap.intervals, + ]) + + output: + set idPatient, status, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bai") into recalibratedBam, recalibratedBamForStats + set idPatient, status, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into recalibratedBamTSV + + // HaplotypeCaller can do BQSR on the fly, so do not create a + // recalibrated BAM explicitly. + when: params.explicitBqsrNeeded && !params.onlyQC + + script: + """ + gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ + ApplyBQSR \ + -R ${genomeFile} \ + --input ${bam} \ + --output ${idSample}.recal.bam \ + -L ${intervals} \ + --create-output-bam-index true \ + --bqsr-recal-file ${recalibrationReport} + """ +} +// Creating a TSV file to restart from this step +recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> + gender = patientGenders[idPatient] + "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.recalibrated}/${bam}\t${directoryMap.recalibrated}/${bai}\n" +}.collectFile( + name: 'recalibrated.tsv', sort: true, storeDir: directoryMap.recalibrated +) + +if (params.verbose) recalibratedBam = recalibratedBam.view { + "Recalibrated BAM for variant Calling:\n\ + ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ + Files : [${it[3].fileName}, ${it[4].fileName}]" +} + process RunSamtoolsStats { tag {idPatient + "-" + idSample} @@ -628,7 +490,6 @@ process GetVersionFastQC { process GetVersionGATK { publishDir directoryMap.version, mode: 'link' - errorStrategy 'ignore' // there is no error-free way to get version from GATK4 output: file("v_*.txt") when: !params.onlyQC script: QC.getVersionGATK() From 72a1b5012798ec623361a2e468aacd035a893fab Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 10 Jul 2018 21:40:01 +0200 Subject: [PATCH 09/38] HaplotypeCaller works with GATK4 --- germlineVC.nf 
| 35 +++++++++++++++-------------------- main.nf | 4 ++-- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index ea3ab75700..9124d30395 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -310,20 +310,15 @@ process RunHaplotypecaller { when: 'haplotypecaller' in tools && !params.onlyQC script: - BQSR = (recalTable != null) ? "--BQSR $recalTable" : '' """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T HaplotypeCaller \ - --emitRefConfidence GVCF \ - -pairHMM LOGLESS_CACHING \ - -R ${genomeFile} \ - --dbsnp ${dbsnp} \ - ${BQSR} \ - -I ${bam} \ - -L ${intervalBed} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - -o ${intervalBed.baseName}_${idSample}.g.vcf + gatk-launch --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + HaplotypeCaller \ + -R ${genomeFile} \ + -I ${bam} \ + -L ${intervalBed} \ + --dbsnp ${dbsnp} \ + -O ${intervalBed.baseName}_${idSample}.g.vcf \ + --emit-ref-confidence GVCF """ } hcGenomicVCF = hcGenomicVCF.groupTuple(by:[0,1,2,3]) @@ -349,17 +344,17 @@ process RunGenotypeGVCFs { when: 'haplotypecaller' in tools && !params.onlyQC script: - // Using -L is important for speed + // Using -L is important for speed and we have to index the interval files also """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T GenotypeGVCFs \ + gatk-launch IndexFeatureFile -F ${gvcf} + + gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ + GenotypeGVCFs \ -R ${genomeFile} \ -L ${intervalBed} \ --dbsnp ${dbsnp} \ - --variant ${gvcf} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - -o ${intervalBed.baseName}_${idSample}.vcf + -V ${gvcf} \ + -O ${intervalBed.baseName}_${idSample}.vcf """ } hcGenotypedVCF = hcGenotypedVCF.groupTuple(by:[0,1,2,3]) diff --git a/main.nf b/main.nf index f2e137fb13..a452c26550 100644 --- a/main.nf +++ b/main.nf @@ -381,9 +381,9 @@ process 
RecalibrateBam { set idPatient, status, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bai") into recalibratedBam, recalibratedBamForStats set idPatient, status, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into recalibratedBamTSV - // HaplotypeCaller can do BQSR on the fly, so do not create a + // GATK4 HaplotypeCaller can not do BQSR on the fly, so we have to create a // recalibrated BAM explicitly. - when: params.explicitBqsrNeeded && !params.onlyQC + when: !params.onlyQC script: """ From c6137bf07cc909a8362f2ff6dc226a35fcbf3485 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 10 Jul 2018 22:49:51 +0200 Subject: [PATCH 10/38] path and version fixed for both manta and strelka --- germlineVC.nf | 4 ++-- lib/QC.groovy | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index 9124d30395..99017563cd 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -457,7 +457,7 @@ process RunSingleStrelka { script: """ - \$STRELKA_INSTALL_PATH/bin/configureStrelkaGermlineWorkflow.py \ + configureStrelkaGermlineWorkflow.py \ --bam ${bam} \ --referenceFasta ${genomeFile} \ --runDir Strelka @@ -501,7 +501,7 @@ process RunSingleManta { script: """ - \$MANTA_INSTALL_PATH/bin/configManta.py \ + configManta.py \ --bam ${bam} \ --reference ${genomeFile} \ --runDir Manta diff --git a/lib/QC.groovy b/lib/QC.groovy index 5ee03b83cf..e3c6416f76 100644 --- a/lib/QC.groovy +++ b/lib/QC.groovy @@ -66,7 +66,7 @@ class QC { // Get Manta version static def getVersionManta() { """ - cat \$MANTA_INSTALL_PATH/lib/python/configBuildTimeInfo.py | grep workflowVersion > v_manta.txt + configManta.py --version > v_manta.txt """ } @@ -80,7 +80,7 @@ class QC { // Get Strelka version static def getVersionStrelka() { """ - cat \$STRELKA_INSTALL_PATH/lib/python/configBuildTimeInfo.py | grep workflowVersion > v_strelka.txt + configureStrelkaGermlineWorkflow.py --version > v_strelka.txt """ } From 
bba73f12cee5b86d5012cb8c140bdab7eb6203bb Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 11 Jul 2018 19:02:48 +0200 Subject: [PATCH 11/38] Mutect2 GATK4 initial dumb version --- somaticVC.nf | 87 +++++++++++++--------------------------------------- 1 file changed, 21 insertions(+), 66 deletions(-) diff --git a/somaticVC.nf b/somaticVC.nf index b9a0da1352..da7f17363d 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -29,10 +29,9 @@ kate: syntax groovy; space-indent on; indent-width 2; - RunSamtoolsStats - Run Samtools stats on recalibrated BAM files - RunBamQC - Run qualimap BamQC on recalibrated BAM files - CreateIntervalBeds - Create and sort intervals into bed files - - RunMutect1 - Run MuTect1 for Variant Calling (Parallelized processes) - RunMutect2 - Run MuTect2 for Variant Calling (Parallelized processes) - RunFreeBayes - Run FreeBayes for Variant Calling (Parallelized processes) - - ConcatVCF - Merge results from MuTect1 and MuTect2 + - ConcatVCF - Merge results from paralellized variant callers - RunStrelka - Run Strelka for Variant Calling - RunManta - Run Manta for Structural Variant Calling - RunSingleManta - Run Manta for Single Structural Variant Calling @@ -76,9 +75,6 @@ if (params.test && params.genome in ['GRCh37', 'GRCh38']) { referenceMap.intervals = file("$workflow.projectDir/repeats/tiny_${params.genome}.list") } -// TODO -// MuTect and Mutect2 could be run without a recalibrated BAM (they support -// the --BQSR option), but this is not implemented, yet. // TODO // FreeBayes does not need recalibrated BAMs, but we need to test whether // the channels are set up correctly when we disable it @@ -200,10 +196,9 @@ bamsTumor = bamsTumor.map { idPatient, status, idSample, bam, bai -> [idPatient, // We know that MuTect2 (and other somatic callers) are notoriously slow. // To speed them up we are chopping the reference into smaller pieces. -// (see repeats/centromeres.list). // Do variant calling by this intervals, and re-merge the VCFs. 
-// Since we are on a cluster, this can parallelize the variant call processes. -// And push down the variant call wall clock time significanlty. +// Since we are on a cluster or a multi-CPU machine, this can parallelize the +// variant call processes and push down the variant call wall clock time significanlty. process CreateIntervalBeds { tag {intervals.fileName} @@ -285,48 +280,10 @@ bamsAll = bamsAll.map { bamsTumorNormalIntervals = bamsAll.spread(bedIntervals) -// MuTect1, MuTect2, FreeBayes -(bamsFMT1, bamsFMT2, bamsFFB) = bamsTumorNormalIntervals.into(3) - -process RunMutect1 { - tag {idSampleTumor + "_vs_" + idSampleNormal + "-" + intervalBed.baseName} - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from bamsFMT1 - set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex), file(cosmic), file(cosmicIndex) from Channel.value([ - referenceMap.genomeFile, - referenceMap.genomeIndex, - referenceMap.genomeDict, - referenceMap.dbsnp, - referenceMap.dbsnpIndex, - referenceMap.cosmic, - referenceMap.cosmicIndex - ]) - - output: - set val("mutect1"), idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into mutect1Output - - when: 'mutect1' in tools && !params.onlyQC - - script: - """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$MUTECT_HOME/muTect.jar \ - -T MuTect \ - -R ${genomeFile} \ - --cosmic ${cosmic} \ - --dbsnp ${dbsnp} \ - -I:normal ${bamNormal} \ - -I:tumor ${bamTumor} \ - -L ${intervalBed} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - --out ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.call_stats.out \ - --vcf ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf - """ -} - -mutect1Output = mutect1Output.groupTuple(by:[0,1,2,3]) +// MuTect2, FreeBayes +( bamsFMT2, bamsFFB) = bamsTumorNormalIntervals.into(3) +// This 
will give as a list of unfiltered calls for MuTect2. process RunMutect2 { tag {idSampleTumor + "_vs_" + idSampleNormal + "-" + intervalBed.baseName} @@ -349,19 +306,18 @@ process RunMutect2 { script: """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$GATK_HOME/GenomeAnalysisTK.jar \ - -T MuTect2 \ - -R ${genomeFile} \ - --cosmic ${cosmic} \ - --dbsnp ${dbsnp} \ - -I:normal ${bamNormal} \ - -I:tumor ${bamTumor} \ - --disable_auto_index_creation_and_locking_when_reading_rods \ - -L ${intervalBed} \ - -o ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf + gatk-launch --java-options "-Xmx${task.memory.toGiga()}g" \ + Mutect2 \ + -R ${genomeFile}\ + -I ${bamTumor} -tumor ${idSampleTumor} \ + -I ${bamNormal} -normal ${idSampleNormal} \ + -L ${intervalBed} \ + -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf """ } +// --germline_resource af-only-gnomad.vcf.gz \ +// --normal_panel pon.vcf.gz \ +// --dbsnp ${dbsnp} \ mutect2Output = mutect2Output.groupTuple(by:[0,1,2,3]) @@ -400,7 +356,7 @@ freebayesOutput = freebayesOutput.groupTuple(by:[0,1,2,3]) // we are merging the VCFs that are called separatelly for different intervals // so we can have a single sorted VCF containing all the calls for a given caller -vcfsToMerge = mutect1Output.mix(mutect2Output, freebayesOutput) +vcfsToMerge = mutect2Output.mix(freebayesOutput) if (params.verbose) vcfsToMerge = vcfsToMerge.view { "VCFs To be merged:\n\ Tool : ${it[0]}\tID : ${it[1]}\tSample: [${it[3]}, ${it[2]}]\n\ @@ -492,7 +448,7 @@ process RunStrelka { script: """ - \$STRELKA_INSTALL_PATH/bin/configureStrelkaSomaticWorkflow.py \ + configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ --normal ${bamNormal} \ --referenceFasta ${genomeFile} \ @@ -538,7 +494,7 @@ process RunManta { script: """ - \$MANTA_INSTALL_PATH/bin/configManta.py \ + configManta.py \ --normalBam ${bamNormal} \ --tumorBam ${bamTumor} \ --reference ${genomeFile} \ @@ -591,7 +547,7 @@ process RunSingleManta { script: """ - 
\$MANTA_INSTALL_PATH/bin/configManta.py \ + configManta.py \ --tumorBam ${bam} \ --reference ${genomeFile} \ --runDir Manta @@ -651,7 +607,7 @@ process RunStrelkaBP { script: """ - \$STRELKA_INSTALL_PATH/bin/configureStrelkaSomaticWorkflow.py \ + configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ --normal ${bamNormal} \ --referenceFasta ${genomeFile} \ @@ -1003,7 +959,6 @@ def helpMessage() { log.info " Option to configure which tools to use in the workflow." log.info " Different tools to be separated by commas." log.info " Possible values are:" - log.info " mutect1 (use MuTect1 for VC)" log.info " mutect2 (use MuTect2 for VC)" log.info " freebayes (use FreeBayes for VC)" log.info " strelka (use Strelka for VC)" From 70a35842b7fc29322b764790b885516d04cba1bb Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 12 Jul 2018 01:26:12 +0200 Subject: [PATCH 12/38] Added genome index file --- somaticVC.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/somaticVC.nf b/somaticVC.nf index da7f17363d..f8d231582f 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -327,6 +327,7 @@ process RunFreeBayes { input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from bamsFFB file(genomeFile) from Channel.value(referenceMap.genomeFile) + file(genomeIndex) from Channel.value(referenceMap.genomeIndex) output: set val("freebayes"), idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into freebayesOutput From a8360571d60467014326cb680f8f05050ca5ed0c Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 12 Jul 2018 01:29:59 +0200 Subject: [PATCH 13/38] Changed Dockerfile to follow https://github.com/nf-core/methylseq/pull/29/files --- containers/sarek/Dockerfile | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/containers/sarek/Dockerfile b/containers/sarek/Dockerfile index fd4102563b..28db2cb0ee 
100644 --- a/containers/sarek/Dockerfile +++ b/containers/sarek/Dockerfile @@ -1,15 +1,9 @@ FROM nfcore/base:latest LABEL \ - author="Szilveszter Juhos" \ + authors="Maxime.Gracia@scilifelab.se, Szilveszter.Juhos@scilifelab.se" \ description="Image with tools used in Sarek" \ - maintainer="szilveszter.juhos@scilifelab.se" + maintainers="Maxime.Gracia@scilifelab.se, Szilveszter.Juhos@scilifelab.se" COPY environment.yml / - -RUN \ - conda env create -f /environment.yml && \ - conda clean -a - - # Export PATH -ENV PATH /opt/conda/envs/sarek-core/bin:$PATH +RUN conda env update -n root -f /environment.yml && conda clean -a From 733e2c47ff22205789cdd2b788fa793d25cea5cb Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 12 Jul 2018 01:31:02 +0200 Subject: [PATCH 14/38] removed -Xmx4g because it was ignored anyway and made a syntax/workflow error. Maybe NXF bug? --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 9fdfe3ea38..4f6196a9fe 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,7 +14,7 @@ manifest { } env { - NXF_OPTS="-Xms1g -Xmx4g" + NXF_OPTS="-Xms1g" } profiles { From 449f93906767589a6fd424bc5e1f2c02a71285aa Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 12 Jul 2018 01:31:43 +0200 Subject: [PATCH 15/38] alphabetical order and added freebayes and new htslib version --- containers/sarek/environment.yml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/containers/sarek/environment.yml b/containers/sarek/environment.yml index c0b720030f..7187785ab9 100644 --- a/containers/sarek/environment.yml +++ b/containers/sarek/environment.yml @@ -7,16 +7,18 @@ channels: - defaults dependencies: - - conda-forge::openjdk=8.0.144 # Needed for FastQC docker - see bioconda/bioconda-recipes#5026 + - bcftools=1.8 - bwa=0.7.17 + - conda-forge::openjdk=8.0.144 # Needed for FastQC docker - see bioconda/bioconda-recipes#5026 - fastqc=0.11.7 - - 
samtools=1.8 + - freebayes=1.2.0 - gatk4=4.0.3.0 - - qualimap=2.2.2a - - multiqc=1.5 + - htslib=1.7 + - igvtools=2.3.93 - manta=1.3.0 + - multiqc=1.5 + - qualimap=2.2.2a + - samtools=1.8 - strelka=2.9.3 - - bcftools=1.8 - - htslib=1.5 - - vcftools=0.1.15 - vcfanno=0.2.8 + - vcftools=0.1.15 From 3fb6dc6fd50d03d24317e421934cdfe40562fe00 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 12 Jul 2018 01:32:54 +0200 Subject: [PATCH 16/38] freebayes and igvtools in the core container --- configuration/singularity-path.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configuration/singularity-path.config b/configuration/singularity-path.config index d67138e4d2..5392f30717 100644 --- a/configuration/singularity-path.config +++ b/configuration/singularity-path.config @@ -16,7 +16,7 @@ process { $BuildBWAindexes.container = "${params.containerPath}/sarek-${params.tag}.img" $BuildPicardIndex.container = "${params.containerPath}/sarek-${params.tag}.img" $BuildSAMToolsIndex.container = "${params.containerPath}/sarek-${params.tag}.img" - $BuildVCFIndex.container = "${params.containerPath}/igvtools-${params.tag}.img" + $BuildVCFIndex.container = "${params.containerPath}/sarek-${params.tag}.img" $CompressVCF.container = "${params.containerPath}/sarek-${params.tag}.img" $ConcatVCF.container = "${params.containerPath}/sarek-${params.tag}.img" $CreateRecalibrationTable.container = "${params.containerPath}/sarek-${params.tag}.img" @@ -27,7 +27,7 @@ process { $GetVersionBCFtools.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionBWAsamtools.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionFastQC.container = "${params.containerPath}/sarek-${params.tag}.img" - $GetVersionFreeBayes.container = "${params.containerPath}/freebayes-${params.tag}.img" + $GetVersionFreeBayes.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionGATK.container = 
"${params.containerPath}/sarek-${params.tag}.img" $GetVersionManta.container = "${params.containerPath}/sarek-${params.tag}.img" $GetVersionSnpeff.container = {params.genome == 'GRCh38' ? "${params.containerPath}/snpeffgrch38-${params.tag}.img" : "${params.containerPath}/snpeffgrch37-${params.tag}.img"} @@ -46,7 +46,7 @@ process { $RunBcftoolsStats.container = "${params.containerPath}/sarek-${params.tag}.img" $RunConvertAlleleCounts.container = "${params.containerPath}/r-base-${params.tag}.img" $RunFastQC.container = "${params.containerPath}/sarek-${params.tag}.img" - $RunFreeBayes.container = "${params.containerPath}/freebayes-${params.tag}.img" + $RunFreeBayes.container = "${params.containerPath}/sarek-${params.tag}.img" $RunGenotypeGVCFs.container = "${params.containerPath}/sarek-${params.tag}.img" $RunHaplotypecaller.container = "${params.containerPath}/sarek-${params.tag}.img" $RunManta.container = "${params.containerPath}/sarek-${params.tag}.img" From 0c71f0adad1cebcd6469154822bc386dd18be573 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 12 Jul 2018 19:24:29 +0200 Subject: [PATCH 17/38] set -euo pipefail needed also for germline --- germlineVC.nf | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index 99017563cd..dc10956f9d 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -31,7 +31,7 @@ kate: syntax groovy; space-indent on; indent-width 2; - CreateIntervalBeds - Create and sort intervals into bed files - RunHaplotypecaller - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) - RunGenotypeGVCFs - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) - - ConcatVCF - Merge results from HaplotypeCaller, MuTect1 and MuTect2 + - ConcatVCF - Merge results from HaplotypeCaller, MuTect2 and other paralellized callers - RunSingleStrelka - Run Strelka for Germline Variant Calling - RunSingleManta - Run Manta for Single Structural Variant Calling - 
RunBcftoolsStats - Run BCFTools stats on vcf files @@ -71,9 +71,6 @@ if (params.test && params.genome in ['GRCh37', 'GRCh38']) { referenceMap.intervals = file("$workflow.projectDir/repeats/tiny_${params.genome}.list") } -// TODO -// MuTect and Mutect2 could be run without a recalibrated BAM (they support -// the --BQSR option), but this is not implemented, yet. // TODO // FreeBayes does not need recalibrated BAMs, but we need to test whether // the channels are set up correctly when we disable it @@ -287,8 +284,8 @@ bamsAll = bamsAll.map { bamsTumorNormalIntervals = bamsAll.spread(bedIntervals) -// MuTect1, MuTect2, FreeBayes -(bamsFMT1, bamsFMT2, bamsFFB) = bamsTumorNormalIntervals.into(3) +// MuTect2, FreeBayes +(bamsFMT2, bamsFFB) = bamsTumorNormalIntervals.into(2) process RunHaplotypecaller { tag {idSample + "-" + intervalBed.baseName} @@ -382,7 +379,7 @@ process ConcatVCF { set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated - when: ( 'haplotypecaller' in tools || 'mutect1' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC + when: ( 'haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC script: if (variantCaller == 'haplotypecaller') outputFile = "${variantCaller}_${idSampleNormal}.vcf" @@ -390,6 +387,7 @@ process ConcatVCF { else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" """ + set -euo pipefail # first make a header from one of the VCF intervals # get rid of interval information only from the GATK command-line, but leave the rest FIRSTVCF=\$(ls *.vcf | head -n 1) @@ -665,7 +663,6 @@ def defineToolList() { 'freebayes', 'haplotypecaller', 'manta', - 'mutect1', 'mutect2', 'strelka' ] From da3be400aea1663ff5361857ebf2802e8d8ebcef Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos <szilveszter.juhos@scilifelab.se> Date: Thu, 12 Jul 2018 19:24:53 +0200 Subject: [PATCH 18/38] die mutect1 --- annotate.nf | 1 - buildContainers.nf | 4 +---
configuration/containers.config | 23 +++++++++----------- configuration/uppmax-localhost.config | 4 ---- configuration/uppmax-slurm.config | 6 ------ containers/freebayes/Dockerfile | 30 --------------------------- containers/mutect1/Dockerfile | 25 ---------------------- lib/SarekUtils.groovy | 1 - scripts/do_all.sh | 4 ++-- scripts/test.sh | 2 +- somaticVC.nf | 4 ++-- 11 files changed, 16 insertions(+), 88 deletions(-) delete mode 100644 containers/freebayes/Dockerfile delete mode 100644 containers/mutect1/Dockerfile diff --git a/annotate.nf b/annotate.nf index 23c6b92790..8d45ae3fff 100644 --- a/annotate.nf +++ b/annotate.nf @@ -350,7 +350,6 @@ def helpMessage() { log.info " Possible values are:" log.info " haplotypecaller (Annotate HaplotypeCaller output)" log.info " manta (Annotate Manta output)" - log.info " mutect1 (Annotate MuTect1 output)" log.info " mutect2 (Annotate MuTect2 output)" log.info " strelka (Annotate Strelka output)" log.info " --annotateVCF" diff --git a/buildContainers.nf b/buildContainers.nf index ba6d96ff32..7122075e7b 100644 --- a/buildContainers.nf +++ b/buildContainers.nf @@ -181,7 +181,6 @@ def defineContainersList(){ 'gatk', 'gatk4', 'igvtools', - 'mutect1', 'picard', 'qctools', 'r-base', @@ -208,8 +207,7 @@ def helpMessage() { log.info " --containers: Choose which containers to build" log.info " Default: all" log.info " Possible values:" - log.info " all, freebayes, gatk, gatk4, igvtools, mutect1, picard" - log.info " qctools, r-base, runallelecount, sarek, snpeff" + log.info " all, r-base, runallelecount, sarek, snpeff" log.info " snpeffgrch37, snpeffgrch38, vepgrch37, vepgrch38" log.info " --docker: Build containers using Docker" log.info " --help" diff --git a/configuration/containers.config b/configuration/containers.config index 780a5a47a4..195c240326 100644 --- a/configuration/containers.config +++ b/configuration/containers.config @@ -11,10 +11,10 @@ process { $BuildBWAindexes.container = 
"${params.repository}/sarek:${params.tag}" $BuildPicardIndex.container = "${params.repository}/picard:${params.tag}" $BuildSAMToolsIndex.container = "${params.repository}/sarek:${params.tag}" - $BuildVCFIndex.container = "${params.repository}/igvtools:${params.tag}" + $BuildVCFIndex.container = "${params.repository}/sarek:${params.tag}" $CompressVCF.container = "${params.repository}/sarek:${params.tag}" $ConcatVCF.container = "${params.repository}/sarek:${params.tag}" - $CreateRecalibrationTable.container = "${params.repository}/gatk:${params.tag}" + $CreateRecalibrationTable.container = "${params.repository}/sarek:${params.tag}" $GetVersionAll.container = "${params.repository}/sarek:${params.tag}" $GetVersionAlleleCount.container = "${params.repository}/runallelecount:${params.tag}" $GetVersionASCAT.container = "${params.repository}/r-base:${params.tag}" @@ -22,33 +22,30 @@ process { $GetVersionBCFtools.container = "${params.repository}/sarek:${params.tag}" $GetVersionBWAsamtools.container = "${params.repository}/sarek:${params.tag}" $GetVersionFastQC.container = "${params.repository}/sarek:${params.tag}" - $GetVersionFreeBayes.container = "${params.repository}/freebayes:${params.tag}" - $GetVersionGATK.container = "${params.repository}/gatk:${params.tag}" + $GetVersionFreeBayes.container = "${params.repository}/sarek:${params.tag}" + $GetVersionGATK.container = "${params.repository}/sarek:${params.tag}" $GetVersionManta.container = "${params.repository}/sarek:${params.tag}" $GetVersionPicard.container = "${params.repository}/picard:${params.tag}" $GetVersionSnpeff.container = {params.genome == 'GRCh38' ? "${params.repository}/snpeffgrch38:${params.tag}" : "${params.repository}/snpeffgrch37:${params.tag}"} $GetVersionStrelka.container = "${params.repository}/sarek:${params.tag}" $GetVersionVCFtools.container = "${params.repository}/sarek:${params.tag}" $GetVersionVEP.container = {params.genome == 'GRCh38' ? 
"${params.repository}/vepgrch38:${params.tag}" : "${params.repository}/vepgrch37:${params.tag}"} - $IndelRealigner.container = "${params.repository}/gatk:${params.tag}" $MapReads.container = "${params.repository}/sarek:${params.tag}" - $MarkDuplicates.container = "${params.repository}/picard:${params.tag}" + $MarkDuplicates.container = "${params.repository}/sarek:${params.tag}" $MergeBams.container = "${params.repository}/sarek:${params.tag}" - $RealignerTargetCreator.container = "${params.repository}/gatk:${params.tag}" - $RecalibrateBam.container = "${params.repository}/gatk:${params.tag}" + $RecalibrateBam.container = "${params.repository}/sarek:${params.tag}" $RunAlleleCount.container = "${params.repository}/runallelecount:${params.tag}" $RunAscat.container = "${params.repository}/r-base:${params.tag}" $RunBamQC.container = "${params.repository}/sarek:${params.tag}" $RunBcftoolsStats.container = "${params.repository}/sarek:${params.tag}" $RunConvertAlleleCounts.container = "${params.repository}/r-base:${params.tag}" $RunFastQC.container = "${params.repository}/sarek:${params.tag}" - $RunFreeBayes.container = "${params.repository}/freebayes:${params.tag}" - $RunGenotypeGVCFs.container = "${params.repository}/gatk:${params.tag}" - $RunHaplotypecaller.container = "${params.repository}/gatk:${params.tag}" + $RunFreeBayes.container = "${params.repository}/sarek:${params.tag}" + $RunGenotypeGVCFs.container = "${params.repository}/sarek:${params.tag}" + $RunHaplotypecaller.container = "${params.repository}/sarek:${params.tag}" $RunManta.container = "${params.repository}/sarek:${params.tag}" $RunMultiQC.container = "${params.repository}/sarek:${params.tag}" - $RunMutect1.container = "${params.repository}/mutect1:${params.tag}" - $RunMutect2.container = "${params.repository}/gatk:${params.tag}" + $RunMutect2.container = "${params.repository}/sarek:${params.tag}" $RunSamtoolsStats.container = "${params.repository}/sarek:${params.tag}" $RunSingleManta.container = 
"${params.repository}/sarek:${params.tag}" $RunSingleStrelka.container = "${params.repository}/sarek:${params.tag}" diff --git a/configuration/uppmax-localhost.config b/configuration/uppmax-localhost.config index 1f4cd5bb56..58597c59b6 100644 --- a/configuration/uppmax-localhost.config +++ b/configuration/uppmax-localhost.config @@ -117,10 +117,6 @@ process { } $RunMultiQC { } - $RunMutect1 { - cpus = 1 - memory = {params.singleCPUMem * task.attempt} - } $RunMutect2 { cpus = 1 memory = {params.singleCPUMem * task.attempt} diff --git a/configuration/uppmax-slurm.config b/configuration/uppmax-slurm.config index de1639964f..d4de89d0fb 100644 --- a/configuration/uppmax-slurm.config +++ b/configuration/uppmax-slurm.config @@ -116,12 +116,6 @@ process { $RunMultiQC { errorStrategy = { task.exitStatus == 143 ? 'retry' : 'ignore' } } - $RunMutect1 { - queue = 'core' - time = {params.runTime * task.attempt} - cpus = 1 - memory = {params.singleCPUMem * task.attempt} - } $RunMutect2 { cpus = 1 memory = {params.singleCPUMem * task.attempt} diff --git a/containers/freebayes/Dockerfile b/containers/freebayes/Dockerfile deleted file mode 100644 index efacdde8ee..0000000000 --- a/containers/freebayes/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM debian:8.9 - -LABEL \ - author="Maxime Garcia" \ - description="FreeBayes image for use in Sarek" \ - maintainer="maxime.garcia@scilifelab.se" - -# Install libraries -RUN \ - apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - cmake \ - git \ - wget \ - zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* - -# Setup ENV variables -ENV FREEBAYES_VERSION=1.1.0 - -# Install BCFTools -RUN \ - git clone --recursive git://github.com/ekg/freebayes.git freebayes \ - --branch v${FREEBAYES_VERSION} \ - && cd freebayes \ - && make \ - && make install \ - && cd .. 
\ -&& rm -rf freebayes diff --git a/containers/mutect1/Dockerfile b/containers/mutect1/Dockerfile deleted file mode 100644 index 7068fb963c..0000000000 --- a/containers/mutect1/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM openjdk:7-slim - -LABEL \ - author="Maxime Garcia" \ - description="MuTect 1.1.5 image for use in Sarek" \ - maintainer="maxime.garcia@scilifelab.se" - -# Install libraries -RUN \ - apt-get update && apt-get install -y --no-install-recommends \ - wget \ - && rm -rf /var/lib/apt/lists/* - -# Setup ENV variables -ENV \ - MUTECT_HOME=/opt/mutect-1.1.5 \ - MUTECT_VERSION=1.1.5 - -# Install MuTect1 -RUN \ - wget --quiet -O muTect-${MUTECT_VERSION}-bin.zip \ - https://github.com/broadinstitute/mutect/releases/download/${MUTECT_VERSION}/muTect-${MUTECT_VERSION}-bin.zip \ - && unzip muTect-${MUTECT_VERSION}-bin.zip -d ${MUTECT_HOME} \ - && rm muTect-${MUTECT_VERSION}-bin.zip \ -&& mv ${MUTECT_HOME}/muTect-${MUTECT_VERSION}.jar ${MUTECT_HOME}/muTect.jar diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index a21ec442bb..54f41ca965 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -126,7 +126,6 @@ class SarekUtils { 'gvcf-hc' : "${outDir}/VariantCalling/HaplotypeCallerGVCF", 'haplotypecaller' : "${outDir}/VariantCalling/HaplotypeCaller", 'manta' : "${outDir}/VariantCalling/Manta", - 'mutect1' : "${outDir}/VariantCalling/MuTect1", 'mutect2' : "${outDir}/VariantCalling/MuTect2", 'strelka' : "${outDir}/VariantCalling/Strelka", 'strelkabp' : "${outDir}/VariantCalling/StrelkaBP", diff --git a/scripts/do_all.sh b/scripts/do_all.sh index e4780958a1..d0e693ff57 100755 --- a/scripts/do_all.sh +++ b/scripts/do_all.sh @@ -57,8 +57,8 @@ function toLower() { if [[ $TOOL = docker ]] && [[ GRCh37,GRCh38 =~ $GENOME ]] then - nextflow run buildContainers.nf -profile ${PROFILE} --verbose --docker ${PUSH} --repository ${REPOSITORY} --tag ${TAG} --containers freebayes,gatk,gatk4,igvtools,mutect1,picard,qctools,r-base,runallelecount,sarek,snpeff + 
nextflow run buildContainers.nf -profile ${PROFILE} --verbose --docker ${PUSH} --repository ${REPOSITORY} --tag ${TAG} --containers r-base,runallelecount,sarek,snpeff nextflow run buildContainers.nf -profile ${PROFILE} --verbose --docker ${PUSH} --repository ${REPOSITORY} --tag ${TAG} --containers snpeff$(toLower ${GENOME}),vep$(toLower ${GENOME}) else - nextflow run buildContainers.nf -profile ${PROFILE} --verbose --singularity --repository ${REPOSITORY} --tag ${TAG} --containerPath containers/ --containers freebayes,gatk,gatk4,igvtools,mutect1,picard,qctools,r-base,runallelecount,sarek,snpeff$(toLower ${GENOME}),vep$(toLower ${GENOME}) + nextflow run buildContainers.nf -profile ${PROFILE} --verbose --singularity --repository ${REPOSITORY} --tag ${TAG} --containerPath containers/ --containers r-base,runallelecount,sarek,snpeff$(toLower ${GENOME}),vep$(toLower ${GENOME}) fi diff --git a/scripts/test.sh b/scripts/test.sh index a9f340d7c4..0d9eefa3c6 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -104,7 +104,7 @@ fi if [[ ALL,TOOLS =~ $TEST ]] then - run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,MuTect1,MuTect2 + run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,MuTect2 fi if [[ ALL,MANTA =~ $TEST ]] diff --git a/somaticVC.nf b/somaticVC.nf index f8d231582f..d39ac23b5c 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -376,12 +376,13 @@ process ConcatVCF { output: set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated - when: ('mutect1' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC + when: ( 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC script: outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" """ + set -euo pipefail # first make a header from one of the VCF intervals # get rid of interval information only from the GATK command-line, but leave 
the rest FIRSTVCF=\$(ls *.vcf | head -n 1) @@ -927,7 +928,6 @@ def defineToolList() { 'freebayes', 'haplotypecaller', 'manta', - 'mutect1', 'mutect2', 'strelka' ] From 4db0d5d580b780ece815ef572e441a698b8da143 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 17 Jul 2018 20:15:37 +0200 Subject: [PATCH 19/38] commenting out AFs as it will be for a new sprint --- configuration/genomes.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configuration/genomes.config b/configuration/genomes.config index 1653879a08..53ceecf9cf 100644 --- a/configuration/genomes.config +++ b/configuration/genomes.config @@ -43,8 +43,8 @@ params { knownIndelsIndex = "${params.genome_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" snpeffDb = "GRCh38.86" // This a nasty-looking list of allele-frequencies files. Add/remove files to match to your sets - AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" - AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" + //AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" + //AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" } 'smallGRCh37' { acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci" From efd11c67d815141fa756c64ded71b523763c4acf Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 17 Jul 2018 21:29:07 +0200 Subject: [PATCH 20/38] vcfanno is merged to the core --- containers/vcfanno/Dockerfile | 15 --------------- containers/vcfanno/environment.yml | 9 --------- 
containers/vcfanno/to_build | 5 ----- 3 files changed, 29 deletions(-) delete mode 100644 containers/vcfanno/Dockerfile delete mode 100644 containers/vcfanno/environment.yml delete mode 100644 containers/vcfanno/to_build diff --git a/containers/vcfanno/Dockerfile b/containers/vcfanno/Dockerfile deleted file mode 100644 index 80a6473d02..0000000000 --- a/containers/vcfanno/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM nfcore/base:latest - -LABEL \ - author="Szilveszter Juhos" \ - description="vcfanno Image for Sarek" \ - maintainer="szilveszter.juhos@scilifelab.se" - -COPY environment.yml / - -RUN \ - conda env create -f /environment.yml && \ - conda clean -a - - # Export PATH -ENV PATH /opt/conda/envs/sarek-vcfanno-2.0/bin:$PATH diff --git a/containers/vcfanno/environment.yml b/containers/vcfanno/environment.yml deleted file mode 100644 index bd8418268f..0000000000 --- a/containers/vcfanno/environment.yml +++ /dev/null @@ -1,9 +0,0 @@ -# You can use this file to create a conda environment: -# conda env create -f environment.yml -name: sarek-vcfanno-2.0 -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - vcfanno=0.2.8 diff --git a/containers/vcfanno/to_build b/containers/vcfanno/to_build deleted file mode 100644 index c2d06a77f9..0000000000 --- a/containers/vcfanno/to_build +++ /dev/null @@ -1,5 +0,0 @@ -docker login szilvajuhos -docker build -t szilvajuhos/sarek-vcfanno:latest . 
-docker images -docker push szilvajuhos/sarek-vcfanno:latest -singularity pull docker://szilvajuhos/sarek-vcfanno:latest From e81e10035f3abf3095c46f90a971710ea7bc1b40 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 17 Jul 2018 21:44:29 +0200 Subject: [PATCH 21/38] removed igvtools (as went to core) --- buildContainers.nf | 6 ------ buildReferences.nf | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/buildContainers.nf b/buildContainers.nf index 7122075e7b..92294ff432 100644 --- a/buildContainers.nf +++ b/buildContainers.nf @@ -177,12 +177,6 @@ def grabRevision() { def defineContainersList(){ // Return list of authorized containers return [ - 'freebayes', - 'gatk', - 'gatk4', - 'igvtools', - 'picard', - 'qctools', 'r-base', 'runallelecount', 'sarek', diff --git a/buildReferences.nf b/buildReferences.nf index eb185ce271..caa47761f0 100644 --- a/buildReferences.nf +++ b/buildReferences.nf @@ -196,7 +196,7 @@ process BuildVCFIndex { script: """ - \$IGVTOOLS_HOME/igvtools index ${f_reference} + igvtools index ${f_reference} """ } From 7cf77b607eac835aa5bbb3b3a3e3f7ecd6e14ae4 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Mon, 30 Jul 2018 11:53:07 +0200 Subject: [PATCH 22/38] Not using the log util, as it is not working --- lib/SarekUtils.groovy | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index 884efe8e11..43287f0388 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -1,9 +1,6 @@ import static nextflow.Nextflow.file import nextflow.Channel -import groovy.util.logging.Slf4j -@Grab('ch.qos.logback:logback-classic:1.2.1') -@Slf4j class SarekUtils { // Check file extension @@ -109,8 +106,8 @@ class SarekUtils { def f = file(fileToCheck) // this is an expanded wildcard: we can assume all files exist if (f instanceof List && f.size() > 0) return true - else if (!f.exists()) { - log.info "Missing references: ${referenceFile} ${fileToCheck}" + else if 
(!f.exists()) { + println "Missing references: ${referenceFile} ${fileToCheck}" return false } return true From e2cacd7075304be41042875c353478369d4d8074 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 31 Jul 2018 18:22:34 +0200 Subject: [PATCH 23/38] fixed indent --- germlineVC.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index dc10956f9d..4ffa66ae97 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -313,9 +313,9 @@ process RunHaplotypecaller { -R ${genomeFile} \ -I ${bam} \ -L ${intervalBed} \ - --dbsnp ${dbsnp} \ - -O ${intervalBed.baseName}_${idSample}.g.vcf \ - --emit-ref-confidence GVCF + --dbsnp ${dbsnp} \ + -O ${intervalBed.baseName}_${idSample}.g.vcf \ + --emit-ref-confidence GVCF """ } hcGenomicVCF = hcGenomicVCF.groupTuple(by:[0,1,2,3]) From 735463b1b0b73f6d5569951c9499249c8459e0e6 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 31 Jul 2018 18:42:59 +0200 Subject: [PATCH 24/38] gatk-launch rename --- germlineVC.nf | 6 +++--- main.nf | 6 +++--- somaticVC.nf | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index 4ffa66ae97..4dd71fb528 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -308,7 +308,7 @@ process RunHaplotypecaller { script: """ - gatk-launch --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ HaplotypeCaller \ -R ${genomeFile} \ -I ${bam} \ @@ -343,9 +343,9 @@ process RunGenotypeGVCFs { script: // Using -L is important for speed and we have to index the interval files also """ - gatk-launch IndexFeatureFile -F ${gvcf} + gatk IndexFeatureFile -F ${gvcf} - gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ + gatk --java-options -Xmx${task.memory.toGiga()}g \ GenotypeGVCFs \ -R ${genomeFile} \ -L ${intervalBed} \ diff --git a/main.nf b/main.nf index 
a452c26550..ac7b102b24 100644 --- a/main.nf +++ b/main.nf @@ -261,7 +261,7 @@ process MarkDuplicates { script: """ - gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ + gatk --java-options -Xmx${task.memory.toGiga()}g \ MarkDuplicates \ --INPUT ${bam} \ --METRICS_FILE ${bam}.metrics \ @@ -323,7 +323,7 @@ process CreateRecalibrationTable { script: known = knownIndels.collect{ "--known-sites ${it}" }.join(' ') """ - gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ + gatk --java-options -Xmx${task.memory.toGiga()}g \ BaseRecalibrator \ --input ${bam} \ --output ${idSample}.recal.table \ @@ -387,7 +387,7 @@ process RecalibrateBam { script: """ - gatk-launch --java-options -Xmx${task.memory.toGiga()}g \ + gatk --java-options -Xmx${task.memory.toGiga()}g \ ApplyBQSR \ -R ${genomeFile} \ --input ${bam} \ diff --git a/somaticVC.nf b/somaticVC.nf index d39ac23b5c..0e2347bb0e 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -306,7 +306,7 @@ process RunMutect2 { script: """ - gatk-launch --java-options "-Xmx${task.memory.toGiga()}g" \ + gatk --java-options "-Xmx${task.memory.toGiga()}g" \ Mutect2 \ -R ${genomeFile}\ -I ${bamTumor} -tumor ${idSampleTumor} \ From a64dd3c87e74835e715fe7c8f2a4ce683666faf2 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 31 Jul 2018 18:44:42 +0200 Subject: [PATCH 25/38] MarkDuplicates got all the memory --- configuration/uppmax-localhost.config | 4 ++-- configuration/uppmax-slurm.config | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/configuration/uppmax-localhost.config b/configuration/uppmax-localhost.config index 58597c59b6..92466106ae 100644 --- a/configuration/uppmax-localhost.config +++ b/configuration/uppmax-localhost.config @@ -14,7 +14,7 @@ env { params { genome_base = params.genome == 'GRCh37' ? '/sw/data/uppnex/ToolBox/ReferenceAssemblies/hg38make/bundle/2.8/b37' : params.genome == 'GRCh38' ? 
'/sw/data/uppnex/ToolBox/hg38bundle' : 'References/smallGRCh37' singleCPUMem = 8.GB - totalMemory = 104.GB // change to 240 on irma + totalMemory = 92.GB // change to 240 on irma } executor { @@ -70,7 +70,7 @@ process { memory = {params.totalMemory} } $MarkDuplicates { - memory = {params.singleCPUMem * 2 * task.attempt} + // for deep sequencing we do need all the memory } $MergeBams { cpus = 16 diff --git a/configuration/uppmax-slurm.config b/configuration/uppmax-slurm.config index d4de89d0fb..9858ca35cd 100644 --- a/configuration/uppmax-slurm.config +++ b/configuration/uppmax-slurm.config @@ -52,10 +52,7 @@ process { time = {params.runTime * task.attempt} } $MarkDuplicates { - cpus = 1 - memory = {params.singleCPUMem * 8 * task.attempt} - queue = 'core' - time = {params.runTime * task.attempt} + // looks like overkill, but for deep sequencing we do need all the memory } $MergeBams { memory = {params.singleCPUMem * task.attempt} From 5ab4491ad9a6048a389b086369e1dd34a1b028e2 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 1 Aug 2018 17:19:31 +0200 Subject: [PATCH 26/38] Latest versions --- containers/sarek/Dockerfile | 5 +++-- containers/sarek/environment.yml | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/containers/sarek/Dockerfile b/containers/sarek/Dockerfile index 28db2cb0ee..340e699e71 100644 --- a/containers/sarek/Dockerfile +++ b/containers/sarek/Dockerfile @@ -1,9 +1,10 @@ FROM nfcore/base:latest LABEL \ - authors="Maxime.Gracia@scilifelab.se, Szilveszter.Juhos@scilifelab.se" \ + authors="Maxime.Garcia@scilifelab.se, Szilveszter.Juhos@scilifelab.se" \ description="Image with tools used in Sarek" \ - maintainers="Maxime.Gracia@scilifelab.se, Szilveszter.Juhos@scilifelab.se" + maintainer="Maxime Garcia , Szilveszter Juhos " COPY environment.yml / RUN conda env update -n root -f /environment.yml && conda clean -a +ENV PATH /opt/conda/bin:$PATH diff --git a/containers/sarek/environment.yml b/containers/sarek/environment.yml 
index 7187785ab9..25f08613f0 100644 --- a/containers/sarek/environment.yml +++ b/containers/sarek/environment.yml @@ -12,10 +12,10 @@ dependencies: - conda-forge::openjdk=8.0.144 # Needed for FastQC docker - see bioconda/bioconda-recipes#5026 - fastqc=0.11.7 - freebayes=1.2.0 - - gatk4=4.0.3.0 - - htslib=1.7 + - gatk4=4.0.6.0 + - htslib=1.9 - igvtools=2.3.93 - - manta=1.3.0 + - manta=1.4.0 - multiqc=1.5 - qualimap=2.2.2a - samtools=1.8 From 537faa83656a291588bf9719901f04c88f96dc9b Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 1 Aug 2018 17:21:04 +0200 Subject: [PATCH 27/38] MarkDuplicates MAX_RECORDS_IN_RAM decreased to 50000 --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index ac7b102b24..44f8384ccf 100644 --- a/main.nf +++ b/main.nf @@ -263,6 +263,7 @@ process MarkDuplicates { """ gatk --java-options -Xmx${task.memory.toGiga()}g \ MarkDuplicates \ + --MAX_RECORDS_IN_RAM 50000 \ --INPUT ${bam} \ --METRICS_FILE ${bam}.metrics \ --TMP_DIR . \ From 24b1ffb1bef4ef02117ca97dd20fa87140250597 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 1 Aug 2018 17:21:41 +0200 Subject: [PATCH 28/38] Picard removed from buildReferences.nf --- buildReferences.nf | 23 +++++++++++------------ configuration/containers.config | 3 +-- configuration/singularity-path.config | 2 +- configuration/uppmax-localhost.config | 2 +- configuration/uppmax-slurm.config | 2 +- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/buildReferences.nf b/buildReferences.nf index caa47761f0..cfc7a567a2 100644 --- a/buildReferences.nf +++ b/buildReferences.nf @@ -29,7 +29,7 @@ kate: syntax groovy; space-indent on; indent-width 2; - ProcessReference - Download all references if needed - DecompressFile - Extract files if needed - BuildBWAindexes - Build indexes for BWA - - BuildPicardIndex - Build index with Picard + - BuildReferenceIndex - Build index for FASTA refs - BuildSAMToolsIndex - Build index with SAMTools - BuildVCFIndex - Build 
index for VCF files ================================================================================ @@ -98,7 +98,7 @@ if (params.verbose) ch_decompressedFiles = ch_decompressedFiles.view { ch_fastaFile = Channel.create() ch_fastaForBWA = Channel.create() -ch_fastaForPicard = Channel.create() +ch_fastaReference = Channel.create() ch_fastaForSAMTools = Channel.create() ch_otherFile = Channel.create() ch_vcfFile = Channel.create() @@ -108,7 +108,7 @@ ch_decompressedFiles it =~ ".fasta" ? 0 : it =~ ".vcf" ? 1 : 2} -(ch_fastaForBWA, ch_fastaForPicard, ch_fastaForSAMTools, ch_fastaFileToKeep) = ch_fastaFile.into(4) +(ch_fastaForBWA, ch_fastaReference, ch_fastaForSAMTools, ch_fastaFileToKeep) = ch_fastaFile.into(4) (ch_vcfFile, ch_vcfFileToKeep) = ch_vcfFile.into(2) ch_notCompressedfiles @@ -137,29 +137,28 @@ if (params.verbose) bwaIndexes.flatten().view { "BWA index : ${it.fileName}" } -process BuildPicardIndex { +process BuildReferenceIndex { tag {f_reference} publishDir params.outDir, mode: 'link' input: - file(f_reference) from ch_fastaForPicard + file(f_reference) from ch_fastaReference output: - file("*.dict") into ch_picardIndex + file("*.dict") into ch_referenceIndex script: """ - java -Xmx${task.memory.toGiga()}g \ - -jar \$PICARD_HOME/picard.jar \ + gatk --java-options "-Xmx${task.memory.toGiga()}g" \ CreateSequenceDictionary \ - REFERENCE=${f_reference} \ - OUTPUT=${f_reference.baseName}.dict + --REFERENCE ${f_reference} \ + --OUTPUT ${f_reference.baseName}.dict """ } -if (params.verbose) ch_picardIndex.view { - "Picard index : ${it.fileName}" +if (params.verbose) ch_referenceIndex.view { + "Reference index : ${it.fileName}" } process BuildSAMToolsIndex { diff --git a/configuration/containers.config b/configuration/containers.config index 195c240326..5639a291fa 100644 --- a/configuration/containers.config +++ b/configuration/containers.config @@ -9,7 +9,7 @@ process { $BuildBWAindexes.container = "${params.repository}/sarek:${params.tag}" - 
$BuildPicardIndex.container = "${params.repository}/picard:${params.tag}" + $BuildReferenceIndex.container = "${params.repository}/sarek:${params.tag}" $BuildSAMToolsIndex.container = "${params.repository}/sarek:${params.tag}" $BuildVCFIndex.container = "${params.repository}/sarek:${params.tag}" $CompressVCF.container = "${params.repository}/sarek:${params.tag}" @@ -25,7 +25,6 @@ process { $GetVersionFreeBayes.container = "${params.repository}/sarek:${params.tag}" $GetVersionGATK.container = "${params.repository}/sarek:${params.tag}" $GetVersionManta.container = "${params.repository}/sarek:${params.tag}" - $GetVersionPicard.container = "${params.repository}/picard:${params.tag}" $GetVersionSnpeff.container = {params.genome == 'GRCh38' ? "${params.repository}/snpeffgrch38:${params.tag}" : "${params.repository}/snpeffgrch37:${params.tag}"} $GetVersionStrelka.container = "${params.repository}/sarek:${params.tag}" $GetVersionVCFtools.container = "${params.repository}/sarek:${params.tag}" diff --git a/configuration/singularity-path.config b/configuration/singularity-path.config index 5392f30717..9b51a46318 100644 --- a/configuration/singularity-path.config +++ b/configuration/singularity-path.config @@ -14,7 +14,7 @@ singularity { process { $BuildBWAindexes.container = "${params.containerPath}/sarek-${params.tag}.img" - $BuildPicardIndex.container = "${params.containerPath}/sarek-${params.tag}.img" + $BuildReferenceIndex.container = "${params.containerPath}/sarek-${params.tag}.img" $BuildSAMToolsIndex.container = "${params.containerPath}/sarek-${params.tag}.img" $BuildVCFIndex.container = "${params.containerPath}/sarek-${params.tag}.img" $CompressVCF.container = "${params.containerPath}/sarek-${params.tag}.img" diff --git a/configuration/uppmax-localhost.config b/configuration/uppmax-localhost.config index 92466106ae..4e6abb1f64 100644 --- a/configuration/uppmax-localhost.config +++ b/configuration/uppmax-localhost.config @@ -43,7 +43,7 @@ process { $BuildBWAindexes { 
memory = {params.totalMemory} // TODO This is likely too high } - $BuildPicardIndex { + $BuildReferenceIndex { memory = {params.totalMemory} // TODO This is likely too high } $BuildSAMToolsIndex { diff --git a/configuration/uppmax-slurm.config b/configuration/uppmax-slurm.config index 9858ca35cd..07f95a74be 100644 --- a/configuration/uppmax-slurm.config +++ b/configuration/uppmax-slurm.config @@ -27,7 +27,7 @@ process { $BuildBWAindexes { } - $BuildPicardIndex { + $BuildReferenceIndex { } $BuildSAMToolsIndex { } From a5b3f7c08036d9f651d5130bc941cc28bc460763 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 1 Aug 2018 17:32:03 +0200 Subject: [PATCH 29/38] adding all the cpus and mem to MarkDuplicates --- configuration/uppmax-localhost.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/configuration/uppmax-localhost.config b/configuration/uppmax-localhost.config index 4e6abb1f64..c83bab64ea 100644 --- a/configuration/uppmax-localhost.config +++ b/configuration/uppmax-localhost.config @@ -71,6 +71,8 @@ process { } $MarkDuplicates { // for deep sequencing we do need all the memory + cpus = 16 + memory = {params.totalMemory} } $MergeBams { cpus = 16 From 6ebed7ab113b8db041d14860ab62166a5e9881fd Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 2 Aug 2018 15:29:34 +0200 Subject: [PATCH 30/38] update submodule Sarek-data a.k.a. 
#609 --- Sarek-data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sarek-data b/Sarek-data index d717855502..ba299d76c8 160000 --- a/Sarek-data +++ b/Sarek-data @@ -1 +1 @@ -Subproject commit d717855502f2e7501e124f2fe116733fc01ae114 +Subproject commit ba299d76c851dc916c051ef5f77d7a4ab39dcc9f From 58bab1363b4d97e27fa8e7372803138698b334ae Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 2 Aug 2018 15:30:34 +0200 Subject: [PATCH 31/38] s/gatk-launch/gatk/ --- lib/QC.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/QC.groovy b/lib/QC.groovy index e3c6416f76..2f6885d84b 100644 --- a/lib/QC.groovy +++ b/lib/QC.groovy @@ -59,7 +59,7 @@ class QC { // Get GATK version static def getVersionGATK() { """ - gatk-launch ApplyBQSR --help 2>&1| awk -F/ '/java/{for(i=1;i<=NF;i++){if(\$i~/gatk4/){sub("gatk4-","",\$i);print \$i>"v_gatk.txt"}}}' + gatk ApplyBQSR --help 2>&1| awk -F/ '/java/{for(i=1;i<=NF;i++){if(\$i~/gatk4/){sub("gatk4-","",\$i);print \$i>"v_gatk.txt"}}}' """ } From 12826fddae3b96a1251f262f0aebf569c8a5032e Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 2 Aug 2018 15:30:56 +0200 Subject: [PATCH 32/38] removing realign parts --- main.nf | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 44f8384ccf..85f22d8a8e 100644 --- a/main.nf +++ b/main.nf @@ -30,7 +30,6 @@ kate: syntax groovy; space-indent on; indent-width 2; - MapReads - Map reads with BWA - MergeBams - Merge BAMs if multilane samples - MarkDuplicates - Mark Duplicates with GATK4 - - RealignerTargetCreator - Create realignment target intervals - IndelRealigner - Realign BAMs as T/N pair - CreateRecalibrationTable - Create Recalibration Table with BaseRecalibrator - RecalibrateBam - Recalibrate Bam with PrintReads @@ -96,7 +95,6 @@ if (tsvPath) { tsvFile = file(tsvPath) switch (step) { case 'mapping': fastqFiles = extractFastq(tsvFile); break - case 'realign': bamFiles = 
SarekUtils.extractBams(tsvFile, "somatic"); break case 'recalibrate': bamFiles = extractRecal(tsvFile); break default: exit 1, "Unknown step ${step}" } @@ -317,9 +315,9 @@ process CreateRecalibrationTable { output: set idPatient, status, idSample, file("${idSample}.recal.table") into recalibrationTable - set idPatient, status, idSample, val("${idSample}_${status}.md.real.bam"), val("${idSample}_${status}.md.real.bai"), val("${idSample}.recal.table") into recalibrationTableTSV + set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai"), val("${idSample}.recal.table") into recalibrationTableTSV - when: ( step == 'mapping' || step == 'realign' ) && !params.onlyQC + when: ( step == 'mapping' ) && !params.onlyQC script: known = knownIndels.collect{ "--known-sites ${it}" }.join(' ') @@ -542,7 +540,6 @@ def defineReferenceMap() { def defineStepList() { return [ 'mapping', - 'realign', 'recalibrate' ] } @@ -675,7 +672,6 @@ def helpMessage() { log.info " Option to start workflow" log.info " Possible values are:" log.info " mapping (default, will start workflow with FASTQ files)" - log.info " realign (will start workflow with non-realigned BAM files)" log.info " recalibrate (will start workflow with non-recalibrated BAM files)" log.info " --noReports" log.info " Disable QC tools and MultiQC to generate a HTML report" From b21dc688a1bb7e8c25c65aa9835a0c43a72a7c24 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 2 Aug 2018 15:31:56 +0200 Subject: [PATCH 33/38] removing realign parts --- scripts/test.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/test.sh b/scripts/test.sh index c57abbcabd..6fda73b9a5 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -91,7 +91,6 @@ fi if [[ ALL,STEP =~ $TEST ]] then run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal - run_wrapper --germline --step realign --noReports run_wrapper --germline --step recalibrate --noReports clean_repo fi @@ 
-104,7 +103,7 @@ fi if [[ ALL,TOOLS =~ $TEST ]] then - run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,MuTect2 + run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2 fi if [[ ALL,MANTA =~ $TEST ]] From db137ce23d851439f5d7b7b353cac9ca9298a8f5 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 2 Aug 2018 15:38:09 +0200 Subject: [PATCH 34/38] small review changes --- annotate.nf | 2 +- configuration/uppmax-localhost.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/annotate.nf b/annotate.nf index 8d45ae3fff..a87f252fbc 100644 --- a/annotate.nf +++ b/annotate.nf @@ -73,7 +73,7 @@ vcfToAnnotate = Channel.create() vcfNotToAnnotate = Channel.create() if (annotateVCF == []) { -// by default we annotate both germline and somatic results that we can find in the VariantCalling directory +// we annote all available vcfs by default that we can find in the VariantCalling directory Channel.empty().mix( Channel.fromPath("${directoryMap.haplotypecaller}/*.vcf.gz") .flatten().map{vcf -> ['haplotypecaller', vcf]}, diff --git a/configuration/uppmax-localhost.config b/configuration/uppmax-localhost.config index 4e6abb1f64..d10f5a57e1 100644 --- a/configuration/uppmax-localhost.config +++ b/configuration/uppmax-localhost.config @@ -14,7 +14,7 @@ env { params { genome_base = params.genome == 'GRCh37' ? '/sw/data/uppnex/ToolBox/ReferenceAssemblies/hg38make/bundle/2.8/b37' : params.genome == 'GRCh38' ? 
'/sw/data/uppnex/ToolBox/hg38bundle' : 'References/smallGRCh37' singleCPUMem = 8.GB - totalMemory = 92.GB // change to 240 on irma + totalMemory = 104.GB // change to 240 on irma } executor { From 97d3fc5c991b9ce0b5ce9ce0d79425cc2800618a Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Sat, 4 Aug 2018 02:27:41 +0200 Subject: [PATCH 35/38] adding singularty environments for tests --- scripts/test.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/test.sh b/scripts/test.sh index 6fda73b9a5..bcd017d1f1 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -9,6 +9,18 @@ SAMPLE=Sarek-data/testdata/tsv/tiny.tsv TEST=ALL TRAVIS=${TRAVIS:-false} +TMPDIR=`pwd`/tmp +mkdir -p $TMPDIR +export NXF_SINGULARITY_CACHEDIR=$TMPDIR +export NXF_TEMP=$TMPDIR + +export SINGULARITY_TMPDIR=$TMPDIR +export SINGULARITY_CACHEDIR=$TMPDIR + + +# remove Reference directory +rm -rf References + while [[ $# -gt 0 ]] do key=$1 From 98cf879e1375d10ca75f72daa6cc2e4af203274d Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Sat, 4 Aug 2018 02:28:13 +0200 Subject: [PATCH 36/38] VEP fancy parametrisation --- annotate.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/annotate.nf b/annotate.nf index a87f252fbc..d3571a15cd 100644 --- a/annotate.nf +++ b/annotate.nf @@ -228,11 +228,12 @@ process RunVEP { finalannotator = annotator == "snpeff" ? 'merge' : 'vep' genome = params.genome == 'smallGRCh37' ? 
'GRCh37' : params.genome """ - vep \ + vep --dir /opt/vep/.vep/ \ -i ${vcf} \ -o ${vcf.simpleName}_VEP.ann.vcf \ --assembly ${genome} \ --cache \ + --cache_version 91 \ --database \ --everything \ --filter_common \ From 393da8b72d4e58f439008696d628222e9f729e7e Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 14 Aug 2018 20:27:06 +0200 Subject: [PATCH 37/38] memory adjustments for samtools and MarkDuplicates --- configuration/uppmax-localhost.config | 4 ++-- configuration/uppmax-slurm.config | 4 +++- main.nf | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/configuration/uppmax-localhost.config b/configuration/uppmax-localhost.config index bb234a8762..f5907f6181 100644 --- a/configuration/uppmax-localhost.config +++ b/configuration/uppmax-localhost.config @@ -70,9 +70,9 @@ process { memory = {params.totalMemory} } $MarkDuplicates { - // for deep sequencing we do need all the memory + // Actually the -Xmx value should be kept lower cpus = 16 - memory = {params.totalMemory} + memory = {2 * params.singleCPUMem} } $MergeBams { cpus = 16 diff --git a/configuration/uppmax-slurm.config b/configuration/uppmax-slurm.config index 07f95a74be..821a3855d8 100644 --- a/configuration/uppmax-slurm.config +++ b/configuration/uppmax-slurm.config @@ -52,7 +52,9 @@ process { time = {params.runTime * task.attempt} } $MarkDuplicates { - // looks like overkill, but for deep sequencing we do need all the memory + // Actually the -Xmx value should be kept lower + cpus = 16 + memory = {2 * params.singleCPUMem} } $MergeBams { memory = {params.singleCPUMem * task.attempt} diff --git a/main.nf b/main.nf index 85f22d8a8e..ecd26d3461 100644 --- a/main.nf +++ b/main.nf @@ -179,7 +179,7 @@ process MapReads { """ bwa mem -R \"${readGroup}\" ${extra} -t ${task.cpus} -M \ ${genomeFile} ${fastqFile1} ${fastqFile2} | \ - samtools sort --threads ${task.cpus} -m 4G - > ${idRun}.bam + samtools sort --threads ${task.cpus} -m 2G - > ${idRun}.bam """ } From 
9d20c68e5298d380485586b40f28deb65d7d7bf9 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 15 Aug 2018 00:26:37 +0200 Subject: [PATCH 38/38] s/sarek-core/sarek/ also deleting build.sh --- containers/sarek/build.sh | 103 ------------------------------- containers/sarek/environment.yml | 2 +- 2 files changed, 1 insertion(+), 104 deletions(-) delete mode 100755 containers/sarek/build.sh diff --git a/containers/sarek/build.sh b/containers/sarek/build.sh deleted file mode 100755 index fbe57b5aee..0000000000 --- a/containers/sarek/build.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/sh -set -eu pipefail - -# Local ENV variables -BCFTOOLS_VERSION=1.5 -BWA_VERSION=0.7.16 -HTSLIB_VERSION=1.5 -MANTA_VERSION=1.1.1 -SAMTOOLS_VERSION=1.5 -STRELKA_VERSION=2.8.2 - -# Install libraries -apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - bzip2 \ - ca-certificates \ - g++ \ - gcc \ - git \ - libbz2-dev \ - liblzma-dev \ - libncurses5-dev \ - libncursesw5-dev \ - make \ - python \ - python3 \ - unzip \ - wget \ - zlib1g-dev - -# Install tools -mkdir /build - -# Install BWA -cd /build -git clone http://github.com/lh3/bwa.git bwa \ - --branch v${BWA_VERSION} -cd bwa -make -cp bwa /usr/local/bin/bwa - -# Install HTSlib -cd /build -wget --quiet -O htslib-${HTSLIB_VERSION}.tar.bz2 \ - https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/htslib-${HTSLIB_VERSION}.tar.bz2 -tar xfj htslib-${HTSLIB_VERSION}.tar.bz2 -rm htslib-${HTSLIB_VERSION}.tar.bz2 -cd htslib-${HTSLIB_VERSION} -./configure --prefix=/opt/htslib -make && make install - -# Install BCFtools -cd /build -wget --quiet -O bcftools-${BCFTOOLS_VERSION}.tar.bz2 \ - https://github.com/samtools/bcftools/releases/download/${BCFTOOLS_VERSION}/bcftools-${BCFTOOLS_VERSION}.tar.bz2 -tar xfj bcftools-${BCFTOOLS_VERSION}.tar.bz2 -rm bcftools-${BCFTOOLS_VERSION}.tar.bz2 -cd bcftools-${BCFTOOLS_VERSION} -./configure --prefix=/opt/bcftools -make && make install - -# Install Samtools -cd 
/build -wget --quiet -O samtools-${SAMTOOLS_VERSION}.tar.bz2 \ - https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 -tar xfj samtools-${SAMTOOLS_VERSION}.tar.bz2 -rm samtools-${SAMTOOLS_VERSION}.tar.bz2 -cd samtools-${SAMTOOLS_VERSION} -./configure --prefix=/opt/samtools -make && make install - -# Install Manta -cd /build -wget --quiet -O manta-${MANTA_VERSION}.centos5_x86_64.tar.bz2 \ - https://github.com/Illumina/manta/releases/download/v${MANTA_VERSION}/manta-${MANTA_VERSION}.centos5_x86_64.tar.bz2 -tar xvjf manta-${MANTA_VERSION}.centos5_x86_64.tar.bz2 -mv manta-${MANTA_VERSION}.centos5_x86_64 $MANTA_INSTALL_PATH -rm manta-${MANTA_VERSION}.centos5_x86_64.tar.bz2 - -# Install Strelka -cd /build -wget --quiet -O strelka-${STRELKA_VERSION}.centos5_x86_64.tar.bz2 \ - https://github.com/Illumina/strelka/releases/download/v${STRELKA_VERSION}/strelka-${STRELKA_VERSION}.centos5_x86_64.tar.bz2 -tar xvjf strelka-${STRELKA_VERSION}.centos5_x86_64.tar.bz2 -mv strelka-${STRELKA_VERSION}.centos5_x86_64 $STRELKA_INSTALL_PATH -rm strelka-${STRELKA_VERSION}.centos5_x86_64.tar.bz2 - -# Clean up install -cd / -apt-get remove -y \ - build-essential \ - ca-certificates \ - gcc \ - git \ - libbz2-dev \ - liblzma-dev \ - libncurses5-dev \ - libncursesw5-dev \ - unzip \ - wget \ - zlib1g-dev -apt-get clean -rm -rf /build /var/lib/apt/lists/* /opt/get-pip.py diff --git a/containers/sarek/environment.yml b/containers/sarek/environment.yml index 25f08613f0..af3ea825f4 100644 --- a/containers/sarek/environment.yml +++ b/containers/sarek/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: sarek-core +name: sarek channels: - bioconda - conda-forge