Skip to content
This repository has been archived by the owner on Jan 27, 2020. It is now read-only.

Awsbatch cpu and memory config #682

Merged
merged 19 commits into from
Dec 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- [#671](https://github.com/SciLifeLab/Sarek/pull/671) - publishDir modes are now params
- [#677](https://github.com/SciLifeLab/Sarek/pull/677) - Update docs
- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Update old awsbatch configuration
- [#682](https://github.com/SciLifeLab/Sarek/pull/682) - Specifications for memory and cpus for awsbatch
- [#693](https://github.com/SciLifeLab/Sarek/pull/693) - Qualimap bamQC is now run after mapping and after recalibration for better QC
- [#700](https://github.com/SciLifeLab/Sarek/pull/700) - Update GATK to `4.0.9.0`
- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - update FastQC to `0.11.8`
Expand Down
33 changes: 33 additions & 0 deletions conf/aws-batch.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,21 @@
params {
genome_base = params.genome == 'GRCh37' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'GRCh38' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small"
publishDirMode = 'copy'
singleCPUMem = 7.GB // To make the uppmax slurm copy paste work.
localReportDir = 'Reports'
}

executor {
name = 'awsbatch'
awscli = '/home/ec2-user/miniconda/bin/aws'
}

/* Rolling files are currently not supported on s3 */
report.file = "${params.localReportDir}/Sarek_report.html"
timeline.file = "${params.localReportDir}/Sarek_timeline.html"
dag.file = "${params.localReportDir}/Sarek_DAG.svg"
trace.file = "${params.localReportDir}/Sarek_trace.txt"

process {
queue = params.awsqueue

Expand All @@ -26,4 +34,29 @@ process {
cpus = 2
memory = 8.GB

withName:RunBcftoolsStats {
cpus = 1
memory = {params.singleCPUMem * 2} // Memory is doubled so that it won't run two on the same instance
// Use a tiny queue for this one, so storage doesn't run out
queue = params.awsqueue_tiny
}
withName:RunVcftools {
cpus = 1
memory = {params.singleCPUMem * 2} // Memory is doubled so that it won't run two on the same instance
// Use a tiny queue for this one, so storage doesn't run out
queue = params.awsqueue_tiny
}
withName:RunHaplotypecaller {
cpus = 1
// Increase memory quadratically
memory = {params.singleCPUMem * 2} // Memory is doubled so that it won't run two on the same instance
// Use a tiny queue for this one, so storage doesn't run out
queue = params.awsqueue_tiny
}
withName:RunGenotypeGVCFs {
cpus = 1
memory = {params.singleCPUMem * 2} // Memory is doubled so that it won't run two on the same instance
// Use a tiny queue for this one, so storage doesn't run out
queue = params.awsqueue_tiny
}
}
4 changes: 3 additions & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ params {
test = false // Not testing by default
verbose = false // Enable for more verbose information
awsqueue = false // Queue has to be provided when using awsbatch executor
awsqueue_tiny = params.awsqueue // A separate queue with smaller instance types
localReportDir = false // Used by AWS since reporting is not fully supported on s3 buckets
}

process {
Expand Down Expand Up @@ -67,6 +69,6 @@ dag { // Turning on dag by default

trace { // Turning on trace tracking by default
enabled = true
fields = 'process,task_id,hash,name,attempt,status,exit,realtime,%cpu,vmem,rss,submit,start,complete,duration,realtime,rchar,wchar'
fields = 'process,task_id,hash,name,attempt,status,exit,realtime,cpus,memory,%cpu,vmem,rss,submit,start,complete,duration,realtime,rchar,wchar'
file = "${params.outDir}/Reports/Sarek_trace.txt"
}
40 changes: 27 additions & 13 deletions conf/resources.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,27 @@ process {

withName:MapReads {
memory = { check_max( 60.GB * task.attempt, 'memory' ) }
cpus = { check_max( 10, 'cpus' ) }
cpus = { check_max( 16, 'cpus' ) }
}
withName:CreateRecalibrationTable {
cpus = { check_max( 12, 'cpus' ) }
memory = {params.singleCPUMem * 8 * task.attempt}
cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 60.GB * task.attempt, 'memory') }
}
withName:MarkDuplicates {
// Actually the -Xmx value should be kept lower
// Actually the -Xmx value should be kept lower,
// and is set through the markdup_java_options
cpus = { check_max( 8, 'cpus' ) }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
}
withName:MergeBams {
cpus = { check_max( 4, 'cpus') }
memory = {params.singleCPUMem * task.attempt}
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RecalibrateBam {
cpus = { check_max( 12, 'cpus' ) }
memory = { check_max( 7.GB * 8 * task.attempt, 'memory' ) }
time = { check_max( 5.h * task.attempt, 'time' ) }
cpus = { check_max( 2, 'cpus' ) }
memory = { check_max( 7.GB * 2 * task.attempt, 'memory' ) }
time = { check_max( 10.h * task.attempt, 'time' ) }
}
withName:RunAlleleCount {
cpus = { check_max( 1, 'cpus' ) }
Expand All @@ -49,6 +51,14 @@ process {
cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 14.GB * task.attempt, 'memory' ) }
}
withName:RunBamQCmapped {
cpus = { check_max( 6, 'cpus' ) }
memory = { check_max( 70.GB, 'memory' ) }
}
withName:RunBamQCrecalibrated {
cpus = { check_max( 6, 'cpus' ) }
memory = { check_max( 70.GB, 'memory' ) }
}
withName:RunBcftoolsStats {
cpus = { check_max( 1, 'cpus' ) }
}
Expand All @@ -65,13 +75,13 @@ process {
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
}
withName:RunHaplotypecaller {
cpus = { check_max( 20, 'cpus' ) }
cpus = { check_max( 1, 'cpus' ) }
// Increase memory quadratically
memory = { check_max( 7.GB * 2 * task.attempt, 'memory' ) }
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RunGenotypeGVCFs {
cpus = { check_max( 20, 'cpus' ) }
cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 7.GB * task.attempt, 'memory' ) }
}
withName:RunMultiQC {
Expand All @@ -86,20 +96,24 @@ process {
cpus = { check_max( 2, 'cpus' ) }
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RunSingleManta {
cpus = { check_max( 20, 'cpus' ) }
memory = { check_max( 16.GB, 'memory') }
}
withName:RunSingleStrelka {
cpus = { check_max( 20, 'cpus' ) }
memory = { check_max( 16.GB, 'memory') }
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RunSnpeff {
cpus = { check_max( 1, 'cpus' ) }
errorStrategy = { task.exitStatus == 143 ? 'retry' : 'ignore' }
}
withName:RunStrelka {
cpus = { check_max( 1, 'cpus' ) }
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RunVEP {
cpus = { check_max( 1, 'cpus' ) }
cpus = { check_max( 16, 'cpus' ) }
memory = {check_max (32.GB * task.attempt, 'memory' ) }
errorStrategy = { task.exitStatus == 143 ? 'retry' : 'ignore' }
}
}
}
3 changes: 3 additions & 0 deletions conf/uppmax-slurm.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ params {
singleCPUMem = 7.GB // for processes that are using more memory but a single CPU only. Use the 'core' queue for these
}

// Extended set of fields, e.g. native_id, cpu and memory:
trace.fields = 'process,task_id,hash,name,native_id,attempt,status,exit,realtime,cpus,memory,%cpu,vmem,rss,submit,start,complete,duration,realtime,rchar,wchar'

process {
clusterOptions = {"-A $params.project"}
cpus = 16
Expand Down
8 changes: 8 additions & 0 deletions docs/PARAMETERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ So you can write `--tools mutect2,ascat` or `--tools MuTect2,ASCAT` without worr

Only required if you use the awsbatch profile. This parameter specifies the queue to which jobs are submitted in AWS Batch.

### --awsqueue_tiny `BatchQueueName`

Only used if you use the awsbatch profile. This parameter specifies a queue used for certain small jobs that might still require a significant amount of disk storage.

### --localReportDir `Directory`

Only used if you use the awsbatch profile. This parameter specifies a local output directory for Nextflow reports, such as Sarek_timeline.html, since writing these reports directly to S3 is currently not fully supported.

### --verbose

Display more information about files being processed.
Expand Down
3 changes: 3 additions & 0 deletions lib/SarekUtils.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class SarekUtils {
'annotateTools',
'annotateVCF',
'awsqueue',
'awsqueue_tiny',
'build',
'call-name',
'callName',
Expand All @@ -52,6 +53,8 @@ class SarekUtils {
'genome',
'genomes',
'help',
'localReportDir',
'local-report-dir',
'markdup_java_options',
'max_cpus',
'max_memory',
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ profiles {
includeConfig 'conf/igenomes.config'
includeConfig 'conf/aws-batch.config'
includeConfig 'conf/docker.config'
includeConfig 'conf/resources.config'
includeConfig 'conf/containers.config'
}
// Small testing with Singularity profile
Expand Down